From: Chris Mason <clm@fb.com>
To: <linux-fsdevel@vger.kernel.org>, <linux-aio@kvack.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Subject: [PATCH RFC] fs/aio: fix sleeping while TASK_INTERRUPTIBLE
Date: Mon, 22 Dec 2014 19:16:25 -0500 [thread overview]
Message-ID: <20141223001619.GA26385@ret.masoncoding.com> (raw)
The 3.19 merge window brought in a great new warning to catch someone
calling might_sleep with their state != TASK_RUNNING. The idea was to
find buggy code locking mutexes after calling prepare_to_wait(), kind
of like this:
[ 445.464634] WARNING: CPU: 22 PID: 4367 at kernel/sched/core.c:7303 __might_sleep+0x9a/0xb0()
[ 445.481699] do not call blocking ops when !TASK_RUNNING; state=1 set at [<ffffffff81093f68>] prepare_to_wait_event+0x68/0x120
[ 445.504494] Modules linked in: btrfs raid6_pq zlib_deflate lzo_compress xor xfs exportfs libcrc32c nfsv4 k10temp coretemp hwmon loop tcp_diag inet_diag ip6table_filter ip6_tables xt_NFLOG nfnetlink_log nfnetlink xt_comment xt_statistic iptable_filter ip_tables x_tables mptctl netconsole autofs4 nfsv3 nfs lockd grace rpcsec_gss_krb5 auth_rpcgss oid_registry sunrpc ipv6 ext3 jbd dm_mod iTCO_wdt iTCO_vendor_support ipmi_si ipmi_msghandler rtc_cmos pcspkr i2c_i801 lpc_ich mfd_core shpchp ehci_pci ehci_hcd mlx4_en ptp pps_core mlx4_core ses enclosure sg button megaraid_sas
[ 445.612013] CPU: 22 PID: 4367 Comm: mysqld Tainted: G W 3.18.0-mason+ #21
[ 445.627862] Hardware name: ZTSYSTEMS Echo Ridge T4 /A9DRPF-10D, BIOS 1.07 05/10/2012
[ 445.643727] 0000000000001c87 ffff880e18ae7c58 ffffffff81658fc9 0000000000001c87
[ 445.659194] ffff880e18ae7ca8 ffff880e18ae7c98 ffffffff81054b25 0000000000000000
[ 445.674695] 0000000000000000 000000000000026d ffffffff81a13c0b 0000000000000000
[ 445.690091] Call Trace:
[ 445.695129] [<ffffffff81658fc9>] dump_stack+0x4f/0x6e
[ 445.705545] [<ffffffff81054b25>] warn_slowpath_common+0x95/0xe0
[ 445.717699] [<ffffffff81054c26>] warn_slowpath_fmt+0x46/0x50
[ 445.729341] [<ffffffff81093f68>] ? prepare_to_wait_event+0x68/0x120
[ 445.742191] [<ffffffff81093f68>] ? prepare_to_wait_event+0x68/0x120
[ 445.755037] [<ffffffff8107b03a>] __might_sleep+0x9a/0xb0
[ 445.765975] [<ffffffff8165b5df>] mutex_lock_nested+0x2f/0x3b0
[ 445.777783] [<ffffffff81093f5e>] ? prepare_to_wait_event+0x5e/0x120
[ 445.790617] [<ffffffff81093f5e>] ? prepare_to_wait_event+0x5e/0x120
[ 445.803473] [<ffffffff811f794b>] aio_read_events+0x4b/0x2a0
[ 445.814930] [<ffffffff811f7d36>] read_events+0x196/0x240
[ 445.825873] [<ffffffff810bb0f0>] ? update_rmtp+0x80/0x80
[ 445.836799] [<ffffffff810bba54>] ? hrtimer_start_range_ns+0x14/0x20
[ 445.849656] [<ffffffff810939e0>] ? wait_woken+0xa0/0xa0
[ 445.860419] [<ffffffff811f7867>] ? lookup_ioctx+0x47/0xe0
[ 445.871519] [<ffffffff811f8c34>] SyS_io_getevents+0x64/0x110
[ 445.883140] [<ffffffff810f433c>] ? __audit_syscall_entry+0xac/0x110
[ 445.895986] [<ffffffff8165fcd2>] system_call_fastpath+0x12/0x17
[ 445.908131] ---[ end trace 1df697a0d5d1d796 ]---
The patch below fixes things by pushing the mutex out of
aio_read_events and into it's caller. I ended up open coding the
waiting and hrtimer loop because I couldn't see a cleaner way to avoid
taking the mutex extra times.
I also had to set the state to running before calling kmap or
copy_to_user(), which means we might blow through schedule() if we do end up
copying out a single event.
This has been lightly tested and hasn't been benchmarked, so RFC for
now.
Signed-off-by: Chris Mason <clm@fb.com>
Reported-by: The code of Peter Zijlstra <peterz@infradead.org>
---
fs/aio.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 93 insertions(+), 24 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 1b7893e..b3f9fd5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1131,6 +1131,8 @@ EXPORT_SYMBOL(aio_complete);
/* aio_read_events_ring
* Pull an event off of the ioctx's event ring. Returns the number of
* events fetched
+ *
+ * must be called with ctx->ring locked
*/
static long aio_read_events_ring(struct kioctx *ctx,
struct io_event __user *event, long nr)
@@ -1139,8 +1141,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
unsigned head, tail, pos;
long ret = 0;
int copy_ret;
-
- mutex_lock(&ctx->ring_lock);
+ int running = current->state == TASK_RUNNING;
/* Access to ->ring_pages here is protected by ctx->ring_lock. */
ring = kmap_atomic(ctx->ring_pages[0]);
@@ -1179,6 +1180,17 @@ static long aio_read_events_ring(struct kioctx *ctx,
page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
pos %= AIO_EVENTS_PER_PAGE;
+ /* we're called after calling prepare_to_wait, which means
+ * our current state might not be TASK_RUNNING. Set it
+ * to running here to sleeps in kmap or copy_to_user don't
+ * last forever. If we don't copy enough events out, we'll
+ * loop through schedule() one time before sleeping.
+ */
+ if (!running) {
+ __set_current_state(TASK_RUNNING);
+ running = 1;
+ }
+
ev = kmap(page);
copy_ret = copy_to_user(event + ret, ev + pos,
sizeof(*ev) * avail);
@@ -1201,11 +1213,10 @@ static long aio_read_events_ring(struct kioctx *ctx,
pr_debug("%li h%u t%u\n", ret, head, tail);
out:
- mutex_unlock(&ctx->ring_lock);
-
return ret;
}
+/* must be called with ctx->ring locked */
static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
struct io_event __user *event, long *i)
{
@@ -1223,6 +1234,73 @@ static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
return ret < 0 || *i >= min_nr;
}
+/*
+ * will take aio ring lock
+ */
+static long aio_wait_events(struct kioctx *ctx, long min_nr, long nr,
+ struct io_event __user *event,
+ long *i_ret, ktime_t until)
+{
+ struct hrtimer_sleeper t;
+ long ret;
+ DEFINE_WAIT(wait);
+
+ mutex_lock(&ctx->ring_lock);
+
+ /*
+ * this is an open coding of wait_event_interruptible_hrtimeout
+ * so we can deal with the ctx->mutex nicely. It starts with the
+ * fast path for existing events:
+ */
+ ret = aio_read_events(ctx, min_nr, nr, event, i_ret);
+ if (ret) {
+ mutex_unlock(&ctx->ring_lock);
+ return ret;
+ }
+
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init_sleeper(&t, current);
+ if (until.tv64 != KTIME_MAX) {
+ hrtimer_start_range_ns(&t.timer, until, current->timer_slack_ns,
+ HRTIMER_MODE_REL);
+ }
+
+ while (1) {
+ ret = prepare_to_wait_event(&ctx->wait, &wait,
+ TASK_INTERRUPTIBLE);
+ if (ret)
+ break;
+
+ ret = aio_read_events(ctx, min_nr, nr, event, i_ret);
+ if (ret)
+ break;
+
+ /* make sure we didn't timeout taking the mutex */
+ if (!t.task) {
+ ret = -ETIME;
+ break;
+ }
+
+ mutex_unlock(&ctx->ring_lock);
+ schedule();
+
+ if (!t.task) {
+ ret = -ETIME;
+ goto out;
+ }
+ mutex_lock(&ctx->ring_lock);
+
+ }
+ mutex_unlock(&ctx->ring_lock);
+
+out:
+ finish_wait(&ctx->wait, &wait);
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+ return ret;
+
+}
+
static long read_events(struct kioctx *ctx, long min_nr, long nr,
struct io_event __user *event,
struct timespec __user *timeout)
@@ -1239,27 +1317,18 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
until = timespec_to_ktime(ts);
}
- /*
- * Note that aio_read_events() is being called as the conditional - i.e.
- * we're calling it after prepare_to_wait() has set task state to
- * TASK_INTERRUPTIBLE.
- *
- * But aio_read_events() can block, and if it blocks it's going to flip
- * the task state back to TASK_RUNNING.
- *
- * This should be ok, provided it doesn't flip the state back to
- * TASK_RUNNING and return 0 too much - that causes us to spin. That
- * will only happen if the mutex_lock() call blocks, and we then find
- * the ringbuffer empty. So in practice we should be ok, but it's
- * something to be aware of when touching this code.
- */
- if (until.tv64 == 0)
- aio_read_events(ctx, min_nr, nr, event, &ret);
- else
- wait_event_interruptible_hrtimeout(ctx->wait,
- aio_read_events(ctx, min_nr, nr, event, &ret),
- until);
+ if (until.tv64 == 0) {
+ mutex_lock(&ctx->ring_lock);
+ aio_read_events(ctx, min_nr, nr, event, &ret);
+ mutex_unlock(&ctx->ring_lock);
+ } else {
+ /*
+ * we're pitching the -ETIME and -ERESTARTSYS return values
+ * here. It'll turn into -EINTR? ick
+ */
+ aio_wait_events(ctx, min_nr, nr, event, &ret, until);
+ }
if (!ret && signal_pending(current))
ret = -EINTR;
--
1.8.1
--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org. For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>
next reply other threads:[~2014-12-23 0:16 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-12-23 0:16 Chris Mason [this message]
2014-12-23 18:43 ` [PATCH RFC] fs/aio: fix sleeping while TASK_INTERRUPTIBLE Benjamin LaHaise
2014-12-23 18:55 ` Chris Mason
2014-12-23 21:58 ` Benjamin LaHaise
2014-12-25 2:59 ` Kent Overstreet
2014-12-25 3:11 ` Benjamin LaHaise
2014-12-25 3:29 ` Kent Overstreet
2014-12-29 1:24 ` Chris Mason
2014-12-25 2:56 ` Kent Overstreet
2014-12-25 14:27 ` Sedat Dilek
2015-01-04 10:16 ` Sedat Dilek
2014-12-29 15:08 ` Chris Mason
2014-12-29 22:08 ` Kent Overstreet
2015-01-13 16:06 ` Benjamin LaHaise
2015-01-13 16:20 ` Chris Mason
2015-01-21 10:13 ` Dave Chinner
2015-01-21 21:42 ` Chris Mason
2015-02-03 9:14 ` Sedat Dilek
2015-02-03 9:54 ` Sedat Dilek
2015-02-09 3:08 ` Sedat Dilek
2015-02-09 4:21 ` Sedat Dilek
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20141223001619.GA26385@ret.masoncoding.com \
--to=clm@fb.com \
--cc=linux-aio@kvack.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=peterz@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.