From: Sasha Levin <sashal@kernel.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Chunguang Xu <brookxu@tencent.com>, Tejun Heo <tj@kernel.org>,
Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>,
linux-block@vger.kernel.org, cgroups@vger.kernel.org
Subject: [PATCH AUTOSEL 5.10 18/39] blk-throtl: optimize IOPS throttle for large IO scenarios
Date: Sun, 5 Sep 2021 21:21:32 -0400 [thread overview]
Message-ID: <20210906012153.929962-18-sashal@kernel.org> (raw)
In-Reply-To: <20210906012153.929962-1-sashal@kernel.org>
From: Chunguang Xu <brookxu@tencent.com>
[ Upstream commit 4f1e9630afe6332de7286820fedd019f19eac057 ]
After commit 54efd50 (block: make generic_make_request handle
arbitrarily sized bios), bios passing through io-throttle may be
larger, and they may later be split into several smaller IOs.
However, the IOPS throttle is not aware of these splits, so the
IOPS of large IOs is undercounted and the disk-side IOPS exceeds
the configured limit. Fix this by also charging each split against
the IOPS accounting of the throttle group.
To reproduce, set max_sectors_kb of the disk to 128, set
blkio.write_iops_throttle to 100, run a dd instance inside the
blkio cgroup, and use iostat to watch IOPS:
dd if=/dev/zero of=/dev/sdb bs=1M count=1000 oflag=direct
Without this change the average IOPS is 1995; with this change
it is 98.
Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/65869aaad05475797d63b4c3fed4f529febe3c26.1627876014.git.brookxu@tencent.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
block/blk-merge.c | 2 ++
block/blk-throttle.c | 32 ++++++++++++++++++++++++++++++++
block/blk.h | 2 ++
3 files changed, 36 insertions(+)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 349cd7d3af81..110db636d230 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -341,6 +341,8 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
submit_bio_noacct(*bio);
*bio = split;
+
+ blk_throtl_charge_bio_split(*bio);
}
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b771c4299982..63e9d00a0832 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -178,6 +178,9 @@ struct throtl_grp {
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
unsigned long bio_cnt_reset_time;
+ atomic_t io_split_cnt[2];
+ atomic_t last_io_split_cnt[2];
+
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
@@ -771,6 +774,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
+ atomic_set(&tg->io_split_cnt[rw], 0);
+
/*
* Previous slice has expired. We must have trimmed it after last
* bio dispatch. That means since start of last slice, we never used
@@ -793,6 +798,9 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
tg->io_disp[rw] = 0;
tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
+
+ atomic_set(&tg->io_split_cnt[rw], 0);
+
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -1025,6 +1033,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
jiffies + tg->td->throtl_slice);
}
+ if (iops_limit != UINT_MAX)
+ tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0);
+
if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) &&
tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) {
if (wait)
@@ -2046,12 +2057,14 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
}
if (tg->iops[READ][LIMIT_LOW]) {
+ tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0);
iops = tg->last_io_disp[READ] * HZ / elapsed_time;
if (iops >= tg->iops[READ][LIMIT_LOW])
tg->last_low_overflow_time[READ] = now;
}
if (tg->iops[WRITE][LIMIT_LOW]) {
+ tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0);
iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
if (iops >= tg->iops[WRITE][LIMIT_LOW])
tg->last_low_overflow_time[WRITE] = now;
@@ -2170,6 +2183,25 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
}
#endif
+void blk_throtl_charge_bio_split(struct bio *bio)
+{
+ struct blkcg_gq *blkg = bio->bi_blkg;
+ struct throtl_grp *parent = blkg_to_tg(blkg);
+ struct throtl_service_queue *parent_sq;
+ bool rw = bio_data_dir(bio);
+
+ do {
+ if (!parent->has_rules[rw])
+ break;
+
+ atomic_inc(&parent->io_split_cnt[rw]);
+ atomic_inc(&parent->last_io_split_cnt[rw]);
+
+ parent_sq = parent->service_queue.parent_sq;
+ parent = sq_to_tg(parent_sq);
+ } while (parent);
+}
+
bool blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bio->bi_disk->queue;
diff --git a/block/blk.h b/block/blk.h
index dfab98465db9..a15f0b65dee4 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -303,11 +303,13 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
extern void blk_throtl_register_queue(struct request_queue *q);
+extern void blk_throtl_charge_bio_split(struct bio *bio);
bool blk_throtl_bio(struct bio *bio);
#else /* CONFIG_BLK_DEV_THROTTLING */
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
+static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
#endif /* CONFIG_BLK_DEV_THROTTLING */
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
--
2.30.2
next prev parent reply other threads:[~2021-09-06 1:25 UTC|newest]
Thread overview: 39+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-09-06 1:21 [PATCH AUTOSEL 5.10 01/39] locking/mutex: Fix HANDOFF condition Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 02/39] regmap: fix the offset of register error log Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 03/39] regulator: tps65910: Silence deferred probe error Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 04/39] crypto: mxs-dcp - Check for DMA mapping errors Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 05/39] sched/deadline: Fix reset_on_fork reporting of DL tasks Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 06/39] power: supply: axp288_fuel_gauge: Report register-address on readb / writeb errors Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 07/39] crypto: omap-sham - clear dma flags only after omap_sham_update_dma_stop() Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 08/39] sched/deadline: Fix missing clock update in migrate_task_rq_dl() Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 09/39] rcu/tree: Handle VM stoppage in stall detection Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 10/39] EDAC/mce_amd: Do not load edac_mce_amd module on guests Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 11/39] posix-cpu-timers: Force next expiration recalc after itimer reset Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 12/39] hrtimer: Avoid double reprogramming in __hrtimer_start_range_ns() Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 13/39] hrtimer: Ensure timerfd notification for HIGHRES=n Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 14/39] udf: Check LVID earlier Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 15/39] udf: Fix iocharset=utf8 mount option Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 16/39] isofs: joliet: " Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 17/39] bcache: add proper error unwinding in bcache_device_init Sasha Levin
2021-09-06 1:21 ` Sasha Levin [this message]
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 19/39] nvme-tcp: don't update queue count when failing to set io queues Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 20/39] nvme-rdma: " Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 21/39] nvmet: pass back cntlid on successful completion Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 22/39] power: supply: smb347-charger: Add missing pin control activation Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 23/39] power: supply: max17042_battery: fix typo in MAx17042_TOFF Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 24/39] s390/cio: add dev_busid sysfs entry for each subchannel Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 25/39] s390/zcrypt: fix wrong offset index for APKA master key valid state Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 26/39] libata: fix ata_host_start() Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 27/39] crypto: omap - Fix inconsistent locking of device lists Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 28/39] crypto: qat - do not ignore errors from enable_vf2pf_comms() Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 29/39] crypto: qat - handle both source of interrupt in VF ISR Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 30/39] crypto: qat - fix reuse of completion variable Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 31/39] crypto: qat - fix naming for init/shutdown VF to PF notifications Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 32/39] crypto: qat - do not export adf_iov_putmsg() Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 33/39] fcntl: fix potential deadlock for &fasync_struct.fa_lock Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 34/39] udf_get_extendedattr() had no boundary checks Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 35/39] s390/kasan: fix large PMD pages address alignment check Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 36/39] s390/pci: fix misleading rc in clp_set_pci_fn() Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 37/39] s390/debug: keep debug data on resize Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 38/39] s390/debug: fix debug area life cycle Sasha Levin
2021-09-06 1:21 ` [PATCH AUTOSEL 5.10 39/39] s390/ap: fix state machine hang after failure to enable irq Sasha Levin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210906012153.929962-18-sashal@kernel.org \
--to=sashal@kernel.org \
--cc=axboe@kernel.dk \
--cc=brookxu@tencent.com \
--cc=cgroups@vger.kernel.org \
--cc=linux-block@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=stable@vger.kernel.org \
--cc=tj@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox