From: Sasha Levin <sashal@kernel.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Liu Bo <bo.liu@linux.alibaba.com>, Jens Axboe <axboe@kernel.dk>,
Sasha Levin <sashal@kernel.org>,
linux-block@vger.kernel.org
Subject: [PATCH AUTOSEL 4.19 64/64] blk-iolatency: fix IO hang due to negative inflight counter
Date: Thu, 28 Feb 2019 10:11:05 -0500 [thread overview]
Message-ID: <20190228151105.11277-64-sashal@kernel.org> (raw)
In-Reply-To: <20190228151105.11277-1-sashal@kernel.org>
From: Liu Bo <bo.liu@linux.alibaba.com>
[ Upstream commit 8c772a9bfc7c07c76f4a58b58910452fbb20843b ]
Our test reported the following stack, and vmcore showed that
->inflight counter is -1.
[ffffc9003fcc38d0] __schedule at ffffffff8173d95d
[ffffc9003fcc3958] schedule at ffffffff8173de26
[ffffc9003fcc3970] io_schedule at ffffffff810bb6b6
[ffffc9003fcc3988] blkcg_iolatency_throttle at ffffffff813911cb
[ffffc9003fcc3a20] rq_qos_throttle at ffffffff813847f3
[ffffc9003fcc3a48] blk_mq_make_request at ffffffff8137468a
[ffffc9003fcc3b08] generic_make_request at ffffffff81368b49
[ffffc9003fcc3b68] submit_bio at ffffffff81368d7d
[ffffc9003fcc3bb8] ext4_io_submit at ffffffffa031be00 [ext4]
[ffffc9003fcc3c00] ext4_writepages at ffffffffa03163de [ext4]
[ffffc9003fcc3d68] do_writepages at ffffffff811c49ae
[ffffc9003fcc3d78] __filemap_fdatawrite_range at ffffffff811b6188
[ffffc9003fcc3e30] filemap_write_and_wait_range at ffffffff811b6301
[ffffc9003fcc3e60] ext4_sync_file at ffffffffa030cee8 [ext4]
[ffffc9003fcc3ea8] vfs_fsync_range at ffffffff8128594b
[ffffc9003fcc3ee8] do_fsync at ffffffff81285abd
[ffffc9003fcc3f18] sys_fsync at ffffffff81285d50
[ffffc9003fcc3f28] do_syscall_64 at ffffffff81003c04
[ffffc9003fcc3f50] entry_SYSCALL_64_after_swapgs at ffffffff81742b8e
The ->inflight counter may be negative (-1) if
1) blk-iolatency was disabled when the IO was issued,
2) blk-iolatency was enabled before this IO reached its endio,
3) the ->inflight counter is decreased from 0 to -1 in endio()
In fact the hang can be easily reproduced by the below script,
H=/sys/fs/cgroup/unified/
P=/sys/fs/cgroup/unified/test
echo "+io" > $H/cgroup.subtree_control
mkdir -p $P
echo $$ > $P/cgroup.procs
xfs_io -f -d -c "pwrite 0 4k" /dev/sdg
echo "`cat /sys/block/sdg/dev` target=1000000" > $P/io.latency
xfs_io -f -d -c "pwrite 0 4k" /dev/sdg
This fixes the problem by freezing the queue so that while
enabling/disabling iolatency, there is no inflight rq running.
Note that quiesce_queue is not needed as this only updating iolatency
configuration about which dispatching request_queue doesn't care.
Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
block/blk-iolatency.c | 52 +++++++++++++++++++++++++++++++++++++------
1 file changed, 45 insertions(+), 7 deletions(-)
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 19923f8a029dd..b154e057ca67c 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -72,6 +72,7 @@
#include <linux/sched/loadavg.h>
#include <linux/sched/signal.h>
#include <trace/events/block.h>
+#include <linux/blk-mq.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
@@ -568,6 +569,9 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
return;
enabled = blk_iolatency_enabled(iolat->blkiolat);
+ if (!enabled)
+ return;
+
while (blkg && blkg->parent) {
iolat = blkg_to_lat(blkg);
if (!iolat) {
@@ -577,7 +581,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
rqw = &iolat->rq_wait;
atomic_dec(&rqw->inflight);
- if (!enabled || iolat->min_lat_nsec == 0)
+ if (iolat->min_lat_nsec == 0)
goto next;
iolatency_record_time(iolat, &bio->bi_issue, now,
issue_as_root);
@@ -721,10 +725,13 @@ int blk_iolatency_init(struct request_queue *q)
return 0;
}
-static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
+/*
+ * return 1 for enabling iolatency, return -1 for disabling iolatency, otherwise
+ * return 0.
+ */
+static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
{
struct iolatency_grp *iolat = blkg_to_lat(blkg);
- struct blk_iolatency *blkiolat = iolat->blkiolat;
u64 oldval = iolat->min_lat_nsec;
iolat->min_lat_nsec = val;
@@ -733,9 +740,10 @@ static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
BLKIOLATENCY_MAX_WIN_SIZE);
if (!oldval && val)
- atomic_inc(&blkiolat->enabled);
+ return 1;
if (oldval && !val)
- atomic_dec(&blkiolat->enabled);
+ return -1;
+ return 0;
}
static void iolatency_clear_scaling(struct blkcg_gq *blkg)
@@ -768,6 +776,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
u64 lat_val = 0;
u64 oldval;
int ret;
+ int enable = 0;
ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
if (ret)
@@ -803,7 +812,12 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
blkg = ctx.blkg;
oldval = iolat->min_lat_nsec;
- iolatency_set_min_lat_nsec(blkg, lat_val);
+ enable = iolatency_set_min_lat_nsec(blkg, lat_val);
+ if (enable) {
+ WARN_ON_ONCE(!blk_get_queue(blkg->q));
+ blkg_get(blkg);
+ }
+
if (oldval != iolat->min_lat_nsec) {
iolatency_clear_scaling(blkg);
}
@@ -811,6 +825,24 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
ret = 0;
out:
blkg_conf_finish(&ctx);
+ if (ret == 0 && enable) {
+ struct iolatency_grp *tmp = blkg_to_lat(blkg);
+ struct blk_iolatency *blkiolat = tmp->blkiolat;
+
+ blk_mq_freeze_queue(blkg->q);
+
+ if (enable == 1)
+ atomic_inc(&blkiolat->enabled);
+ else if (enable == -1)
+ atomic_dec(&blkiolat->enabled);
+ else
+ WARN_ON_ONCE(1);
+
+ blk_mq_unfreeze_queue(blkg->q);
+
+ blkg_put(blkg);
+ blk_put_queue(blkg->q);
+ }
return ret ?: nbytes;
}
@@ -910,8 +942,14 @@ static void iolatency_pd_offline(struct blkg_policy_data *pd)
{
struct iolatency_grp *iolat = pd_to_lat(pd);
struct blkcg_gq *blkg = lat_to_blkg(iolat);
+ struct blk_iolatency *blkiolat = iolat->blkiolat;
+ int ret;
- iolatency_set_min_lat_nsec(blkg, 0);
+ ret = iolatency_set_min_lat_nsec(blkg, 0);
+ if (ret == 1)
+ atomic_inc(&blkiolat->enabled);
+ if (ret == -1)
+ atomic_dec(&blkiolat->enabled);
iolatency_clear_scaling(blkg);
}
--
2.19.1
prev parent reply other threads:[~2019-02-28 15:22 UTC|newest]
Thread overview: 68+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-02-28 15:10 [PATCH AUTOSEL 4.19 01/64] ARM: OMAP: dts: N950/N9: fix onenand timings Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 02/64] ARM: dts: omap4-droid4: Fix typo in cpcap IRQ flags Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 03/64] ARM: dts: sun8i: h3: Add ethernet0 alias to Beelink X2 Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 04/64] arm: dts: meson: Fix IRQ trigger type for macirq Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 05/64] ARM: dts: meson8b: odroidc1: mark the SD card detection GPIO active-low Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 06/64] ARM: dts: meson8m2: mxiii-plus: " Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 07/64] ARM: dts: imx6sx: correct backward compatible of gpt Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 08/64] arm64: dts: renesas: r8a7796: Enable DMA for SCIF2 Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 09/64] arm64: dts: renesas: r8a77965: " Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 10/64] soc: fsl: qbman: avoid race in clearing QMan interrupt Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 11/64] pinctrl: mcp23s08: spi: Fix regmap allocation for mcp23s18 Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 12/64] wlcore: sdio: Fixup power on/off sequence Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 13/64] bpftool: Fix prog dump by tag Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 14/64] bpftool: fix percpu maps updating Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 15/64] bpf: sock recvbuff must be limited by rmem_max in bpf_setsockopt() Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 16/64] ARM: pxa: ssp: unneeded to free devm_ allocated data Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 17/64] arm64: dts: add msm8996 compatible to gicv3 Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 18/64] batman-adv: release station info tidstats Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 19/64] DTS: CI20: Fix bugs in ci20's device tree Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 20/64] usb: phy: fix link errors Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 21/64] irqchip/gic-v4: Fix occasional VLPI drop Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 22/64] irqchip/gic-v3-its: Gracefully fail on LPI exhaustion Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 23/64] irqchip/mmp: Only touch the PJ4 IRQ & FIQ bits on enable/disable Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 24/64] drm/amdgpu: Add missing power attribute to APU check Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 25/64] drm/radeon: check if device is root before getting pci speed caps Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 26/64] debugfs: return error values, not NULL Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 27/64] debugfs: debugfs_lookup() should return NULL if not found Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 28/64] drm/amdgpu: Transfer fences to dmabuf importer Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 29/64] net: stmmac: Fallback to Platform Data clock in Watchdog conversion Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 30/64] net: stmmac: Send TSO packets always from Queue 0 Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 31/64] net: stmmac: Disable EEE mode earlier in XMIT callback Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 32/64] irqchip/gic-v3-its: Fix ITT_entry_size accessor Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 33/64] relay: check return of create_buf_file() properly Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 34/64] blk-mq: protect debugfs_create_files() from failures Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 35/64] bpf, selftests: fix handling of sparse CPU allocations Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 36/64] bpf: fix lockdep false positive in percpu_freelist Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 37/64] bpf: fix potential deadlock in bpf_prog_register Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 38/64] bpf: Fix syscall's stackmap lookup potential deadlock Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 39/64] drm/sun4i: tcon: Prepare and enable TCON channel 0 clock at init Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 40/64] dmaengine: at_xdmac: Fix wrongfull report of a channel as in use Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 41/64] vsock/virtio: fix kernel panic after device hot-unplug Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 42/64] vsock/virtio: reset connected sockets on device removal Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 43/64] dmaengine: dmatest: Abort test in case of mapping error Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 44/64] selftests: netfilter: fix config fragment CONFIG_NF_TABLES_INET Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 45/64] selftests: netfilter: add simple masq/redirect test cases Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 46/64] netfilter: nf_nat: skip nat clash resolution for same-origin entries Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 47/64] arm64: ptdump: Don't iterate kernel page tables using PTRS_PER_PXX Sasha Levin
2019-02-28 15:18 ` Will Deacon
2019-03-11 17:16 ` Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 48/64] s390/qeth: release cmd buffer in error paths Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 49/64] s390/qeth: fix use-after-free in error path Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 50/64] s390/qeth: cancel close_dev work before removing a card Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 51/64] perf symbols: Filter out hidden symbols from labels Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 52/64] perf trace: Support multiple "vfs_getname" probes Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 53/64] MIPS: Loongson: Introduce and use loongson_llsc_mb() Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 54/64] MIPS: Remove function size check in get_frame_info() Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 55/64] Revert "scsi: libfc: Add WARN_ON() when deleting rports" Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 56/64] i2c: omap: Use noirq system sleep pm ops to idle device for suspend Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 57/64] drm/amdgpu: use spin_lock_irqsave to protect vm_manager.pasid_idr Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 58/64] nvme: lock NS list changes while handling command effects Sasha Levin
2019-02-28 15:11 ` [PATCH AUTOSEL 4.19 59/64] nvme-pci: fix rapid add remove sequence Sasha Levin
2019-02-28 15:16 ` Keith Busch
2019-03-11 17:21 ` Sasha Levin
2019-02-28 15:11 ` [PATCH AUTOSEL 4.19 60/64] fs: ratelimit __find_get_block_slow() failure message Sasha Levin
2019-02-28 15:11 ` [PATCH AUTOSEL 4.19 61/64] qed: Fix EQ full firmware assert Sasha Levin
2019-02-28 15:11 ` [PATCH AUTOSEL 4.19 62/64] qed: Consider TX tcs while deriving the max num_queues for PF Sasha Levin
2019-02-28 15:11 ` [PATCH AUTOSEL 4.19 63/64] qede: Fix system crash on configuring channels Sasha Levin
2019-02-28 15:11 ` Sasha Levin [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190228151105.11277-64-sashal@kernel.org \
--to=sashal@kernel.org \
--cc=axboe@kernel.dk \
--cc=bo.liu@linux.alibaba.com \
--cc=linux-block@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).