From: Chris Mason <clm@fb.com>
To: <linux-fsdevel@vger.kernel.org>, <linux-aio@kvack.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Subject: [PATCH RFC] fs/aio: fix sleeping while TASK_INTERRUPTIBLE
Date: Mon, 22 Dec 2014 19:16:25 -0500 [thread overview]
Message-ID: <20141223001619.GA26385@ret.masoncoding.com> (raw)
The 3.19 merge window brought in a great new warning to catch someone
calling might_sleep with their state != TASK_RUNNING. The idea was to
find buggy code locking mutexes after calling prepare_to_wait(), kind
of like this:
[ 445.464634] WARNING: CPU: 22 PID: 4367 at kernel/sched/core.c:7303 __might_sleep+0x9a/0xb0()
[ 445.481699] do not call blocking ops when !TASK_RUNNING; state=1 set at [<ffffffff81093f68>] prepare_to_wait_event+0x68/0x120
[ 445.504494] Modules linked in: btrfs raid6_pq zlib_deflate lzo_compress xor xfs exportfs libcrc32c nfsv4 k10temp coretemp hwmon loop tcp_diag inet_diag ip6table_filter ip6_tables xt_NFLOG nfnetlink_log nfnetlink xt_comment xt_statistic iptable_filter ip_tables x_tables mptctl netconsole autofs4 nfsv3 nfs lockd grace rpcsec_gss_krb5 auth_rpcgss oid_registry sunrpc ipv6 ext3 jbd dm_mod iTCO_wdt iTCO_vendor_support ipmi_si ipmi_msghandler rtc_cmos pcspkr i2c_i801 lpc_ich mfd_core shpchp ehci_pci ehci_hcd mlx4_en ptp pps_core mlx4_core ses enclosure sg button megaraid_sas
[ 445.612013] CPU: 22 PID: 4367 Comm: mysqld Tainted: G W 3.18.0-mason+ #21
[ 445.627862] Hardware name: ZTSYSTEMS Echo Ridge T4 /A9DRPF-10D, BIOS 1.07 05/10/2012
[ 445.643727] 0000000000001c87 ffff880e18ae7c58 ffffffff81658fc9 0000000000001c87
[ 445.659194] ffff880e18ae7ca8 ffff880e18ae7c98 ffffffff81054b25 0000000000000000
[ 445.674695] 0000000000000000 000000000000026d ffffffff81a13c0b 0000000000000000
[ 445.690091] Call Trace:
[ 445.695129] [<ffffffff81658fc9>] dump_stack+0x4f/0x6e
[ 445.705545] [<ffffffff81054b25>] warn_slowpath_common+0x95/0xe0
[ 445.717699] [<ffffffff81054c26>] warn_slowpath_fmt+0x46/0x50
[ 445.729341] [<ffffffff81093f68>] ? prepare_to_wait_event+0x68/0x120
[ 445.742191] [<ffffffff81093f68>] ? prepare_to_wait_event+0x68/0x120
[ 445.755037] [<ffffffff8107b03a>] __might_sleep+0x9a/0xb0
[ 445.765975] [<ffffffff8165b5df>] mutex_lock_nested+0x2f/0x3b0
[ 445.777783] [<ffffffff81093f5e>] ? prepare_to_wait_event+0x5e/0x120
[ 445.790617] [<ffffffff81093f5e>] ? prepare_to_wait_event+0x5e/0x120
[ 445.803473] [<ffffffff811f794b>] aio_read_events+0x4b/0x2a0
[ 445.814930] [<ffffffff811f7d36>] read_events+0x196/0x240
[ 445.825873] [<ffffffff810bb0f0>] ? update_rmtp+0x80/0x80
[ 445.836799] [<ffffffff810bba54>] ? hrtimer_start_range_ns+0x14/0x20
[ 445.849656] [<ffffffff810939e0>] ? wait_woken+0xa0/0xa0
[ 445.860419] [<ffffffff811f7867>] ? lookup_ioctx+0x47/0xe0
[ 445.871519] [<ffffffff811f8c34>] SyS_io_getevents+0x64/0x110
[ 445.883140] [<ffffffff810f433c>] ? __audit_syscall_entry+0xac/0x110
[ 445.895986] [<ffffffff8165fcd2>] system_call_fastpath+0x12/0x17
[ 445.908131] ---[ end trace 1df697a0d5d1d796 ]---
The patch below fixes things by pushing the mutex out of
aio_read_events and into its caller. I ended up open coding the
waiting and hrtimer loop because I couldn't see a cleaner way to avoid
taking the mutex extra times.
I also had to set the state to running before calling kmap or
copy_to_user(), which means we might blow through schedule() if we do end up
copying out a single event.
This has been lightly tested and hasn't been benchmarked, so RFC for
now.
Signed-off-by: Chris Mason <clm@fb.com>
Reported-by: The code of Peter Zijlstra <peterz@infradead.org>
---
fs/aio.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 93 insertions(+), 24 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 1b7893e..b3f9fd5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1131,6 +1131,8 @@ EXPORT_SYMBOL(aio_complete);
/* aio_read_events_ring
* Pull an event off of the ioctx's event ring. Returns the number of
* events fetched
+ *
+ * must be called with ctx->ring locked
*/
static long aio_read_events_ring(struct kioctx *ctx,
struct io_event __user *event, long nr)
@@ -1139,8 +1141,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
unsigned head, tail, pos;
long ret = 0;
int copy_ret;
-
- mutex_lock(&ctx->ring_lock);
+ int running = current->state == TASK_RUNNING;
/* Access to ->ring_pages here is protected by ctx->ring_lock. */
ring = kmap_atomic(ctx->ring_pages[0]);
@@ -1179,6 +1180,17 @@ static long aio_read_events_ring(struct kioctx *ctx,
page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
pos %= AIO_EVENTS_PER_PAGE;
+ /* we're called after calling prepare_to_wait, which means
+ * our current state might not be TASK_RUNNING. Set it
+ * to running here so sleeps in kmap or copy_to_user don't
+ * last forever. If we don't copy enough events out, we'll
+ * loop through schedule() one time before sleeping.
+ */
+ if (!running) {
+ __set_current_state(TASK_RUNNING);
+ running = 1;
+ }
+
ev = kmap(page);
copy_ret = copy_to_user(event + ret, ev + pos,
sizeof(*ev) * avail);
@@ -1201,11 +1213,10 @@ static long aio_read_events_ring(struct kioctx *ctx,
pr_debug("%li h%u t%u\n", ret, head, tail);
out:
- mutex_unlock(&ctx->ring_lock);
-
return ret;
}
+/* must be called with ctx->ring locked */
static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
struct io_event __user *event, long *i)
{
@@ -1223,6 +1234,73 @@ static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
return ret < 0 || *i >= min_nr;
}
+/*
+ * will take aio ring lock
+ */
+static long aio_wait_events(struct kioctx *ctx, long min_nr, long nr,
+ struct io_event __user *event,
+ long *i_ret, ktime_t until)
+{
+ struct hrtimer_sleeper t;
+ long ret;
+ DEFINE_WAIT(wait);
+
+ mutex_lock(&ctx->ring_lock);
+
+ /*
+ * this is an open coding of wait_event_interruptible_hrtimeout
+ * so we can deal with the ctx->mutex nicely. It starts with the
+ * fast path for existing events:
+ */
+ ret = aio_read_events(ctx, min_nr, nr, event, i_ret);
+ if (ret) {
+ mutex_unlock(&ctx->ring_lock);
+ return ret;
+ }
+
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init_sleeper(&t, current);
+ if (until.tv64 != KTIME_MAX) {
+ hrtimer_start_range_ns(&t.timer, until, current->timer_slack_ns,
+ HRTIMER_MODE_REL);
+ }
+
+ while (1) {
+ ret = prepare_to_wait_event(&ctx->wait, &wait,
+ TASK_INTERRUPTIBLE);
+ if (ret)
+ break;
+
+ ret = aio_read_events(ctx, min_nr, nr, event, i_ret);
+ if (ret)
+ break;
+
+ /* make sure we didn't timeout taking the mutex */
+ if (!t.task) {
+ ret = -ETIME;
+ break;
+ }
+
+ mutex_unlock(&ctx->ring_lock);
+ schedule();
+
+ if (!t.task) {
+ ret = -ETIME;
+ goto out;
+ }
+ mutex_lock(&ctx->ring_lock);
+
+ }
+ mutex_unlock(&ctx->ring_lock);
+
+out:
+ finish_wait(&ctx->wait, &wait);
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+ return ret;
+
+}
+
static long read_events(struct kioctx *ctx, long min_nr, long nr,
struct io_event __user *event,
struct timespec __user *timeout)
@@ -1239,27 +1317,18 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
until = timespec_to_ktime(ts);
}
- /*
- * Note that aio_read_events() is being called as the conditional - i.e.
- * we're calling it after prepare_to_wait() has set task state to
- * TASK_INTERRUPTIBLE.
- *
- * But aio_read_events() can block, and if it blocks it's going to flip
- * the task state back to TASK_RUNNING.
- *
- * This should be ok, provided it doesn't flip the state back to
- * TASK_RUNNING and return 0 too much - that causes us to spin. That
- * will only happen if the mutex_lock() call blocks, and we then find
- * the ringbuffer empty. So in practice we should be ok, but it's
- * something to be aware of when touching this code.
- */
- if (until.tv64 == 0)
- aio_read_events(ctx, min_nr, nr, event, &ret);
- else
- wait_event_interruptible_hrtimeout(ctx->wait,
- aio_read_events(ctx, min_nr, nr, event, &ret),
- until);
+ if (until.tv64 == 0) {
+ mutex_lock(&ctx->ring_lock);
+ aio_read_events(ctx, min_nr, nr, event, &ret);
+ mutex_unlock(&ctx->ring_lock);
+ } else {
+ /*
+ * we're pitching the -ETIME and -ERESTARTSYS return values
+ * here. It'll turn into -EINTR? ick
+ */
+ aio_wait_events(ctx, min_nr, nr, event, &ret, until);
+ }
if (!ret && signal_pending(current))
ret = -EINTR;
--
1.8.1
--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org. For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>
next reply other threads:[~2014-12-23 0:16 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-12-23 0:16 Chris Mason [this message]
2014-12-23 18:43 ` [PATCH RFC] fs/aio: fix sleeping while TASK_INTERRUPTIBLE Benjamin LaHaise
2014-12-23 18:55 ` Chris Mason
2014-12-23 21:58 ` Benjamin LaHaise
2014-12-25 2:59 ` Kent Overstreet
2014-12-25 3:11 ` Benjamin LaHaise
2014-12-25 3:29 ` Kent Overstreet
2014-12-29 1:24 ` Chris Mason
2014-12-25 2:56 ` Kent Overstreet
2014-12-25 14:27 ` Sedat Dilek
2015-01-04 10:16 ` Sedat Dilek
2014-12-29 15:08 ` Chris Mason
2014-12-29 22:08 ` Kent Overstreet
2015-01-13 16:06 ` Benjamin LaHaise
2015-01-13 16:20 ` Chris Mason
2015-01-21 10:13 ` Dave Chinner
2015-01-21 21:42 ` Chris Mason
2015-02-03 9:14 ` Sedat Dilek
2015-02-03 9:54 ` Sedat Dilek
2015-02-09 3:08 ` Sedat Dilek
2015-02-09 4:21 ` Sedat Dilek
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20141223001619.GA26385@ret.masoncoding.com \
--to=clm@fb.com \
--cc=linux-aio@kvack.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=peterz@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).