public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	Andreas Gruenbacher <agruenba@redhat.com>,
	Anand Jain <anand.jain@oracle.com>
Subject: [PATCH 5.15 23/33] gfs2: Fix mmap + page fault deadlocks for buffered I/O
Date: Fri, 29 Apr 2022 12:42:10 +0200	[thread overview]
Message-ID: <20220429104053.010361868@linuxfoundation.org> (raw)
In-Reply-To: <20220429104052.345760505@linuxfoundation.org>

From: Andreas Gruenbacher <agruenba@redhat.com>

commit 00bfe02f479688a67a29019d1228f1470e26f014 upstream

In the .read_iter and .write_iter file operations, we're accessing
user-space memory while holding the inode glock.  There is a possibility
that the memory is mapped to the same file, in which case we'd recurse
on the same glock.

We could detect and work around this simple case of recursive locking,
but more complex scenarios exist that involve multiple glocks,
processes, and cluster nodes, and working around all of those cases
isn't practical or even possible.

Avoid these kinds of problems by disabling page faults while holding the
inode glock.  If a page fault would occur, we either end up with a
partial read or write or with -EFAULT if nothing could be read or
written.  In either case, we know that we're not done with the
operation, so we indicate that we're willing to give up the inode glock
and then we fault in the missing pages.  If that made us lose the inode
glock, we return a partial read or write.  Otherwise, we resume the
operation.

This locking problem was originally reported by Jan Kara.  Linus came up
with the idea of disabling page faults.  Many thanks to Al Viro and
Matthew Wilcox for their feedback.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/gfs2/file.c |   99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 94 insertions(+), 5 deletions(-)

--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -777,6 +777,36 @@ static int gfs2_fsync(struct file *file,
 	return ret ? ret : ret1;
 }
 
+/* After a short read/write, decide whether to fault in pages and how many. */
+static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
+					 size_t *prev_count,
+					 size_t *window_size)
+{
+	size_t count = iov_iter_count(i);
+	int pages = 1;	/* single page unless the window is recomputed below */
+
+	if (likely(!count))
+		return false;
+	if (ret <= 0 && ret != -EFAULT)
+		return false;
+	if (!iter_is_iovec(i))	/* i->iov is only meaningful for iovec iters */
+		return false;
+
+	if (*prev_count != count || !*window_size) {
+		int nr_dirtied = max(current->nr_dirtied_pause -
+				     current->nr_dirtied, 1);
+
+		pages = min_t(int, BIO_MAX_VECS,
+			      DIV_ROUND_UP(count, PAGE_SIZE));
+		pages = min(pages, nr_dirtied);
+	}
+
+	*prev_count = count;
+	*window_size = (size_t)PAGE_SIZE * pages -
+		       offset_in_page(i->iov[0].iov_base + i->iov_offset);
+	return true;
+}
+
 static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
 				     struct gfs2_holder *gh)
 {
@@ -841,9 +871,17 @@ static ssize_t gfs2_file_read_iter(struc
 {
 	struct gfs2_inode *ip;
 	struct gfs2_holder gh;
+	size_t prev_count = 0, window_size = 0;
 	size_t written = 0;
 	ssize_t ret;
 
+	/*
+	 * In this function, we disable page faults when we're holding the
+	 * inode glock while doing I/O.  If a page fault occurs, we indicate
+	 * that the inode glock may be dropped, fault in the pages manually,
+	 * and retry.
+	 */
+
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		ret = gfs2_file_direct_read(iocb, to, &gh);
 		if (likely(ret != -ENOTBLK))
@@ -865,13 +903,34 @@ static ssize_t gfs2_file_read_iter(struc
 	}
 	ip = GFS2_I(iocb->ki_filp->f_mapping->host);
 	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+retry:
 	ret = gfs2_glock_nq(&gh);
 	if (ret)
 		goto out_uninit;
+retry_under_glock:
+	pagefault_disable();
 	ret = generic_file_read_iter(iocb, to);
+	pagefault_enable();
 	if (ret > 0)
 		written += ret;
-	gfs2_glock_dq(&gh);
+
+	if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
+		size_t leftover;
+
+		gfs2_holder_allow_demote(&gh);
+		leftover = fault_in_iov_iter_writeable(to, window_size);
+		gfs2_holder_disallow_demote(&gh);
+		if (leftover != window_size) {
+			if (!gfs2_holder_queued(&gh)) {
+				if (written)
+					goto out_uninit;
+				goto retry;
+			}
+			goto retry_under_glock;
+		}
+	}
+	if (gfs2_holder_queued(&gh))
+		gfs2_glock_dq(&gh);
 out_uninit:
 	gfs2_holder_uninit(&gh);
 	return written ? written : ret;
@@ -886,8 +945,17 @@ static ssize_t gfs2_file_buffered_write(
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_holder *statfs_gh = NULL;
+	size_t prev_count = 0, window_size = 0;
+	size_t read = 0;
 	ssize_t ret;
 
+	/*
+	 * In this function, we disable page faults when we're holding the
+	 * inode glock while doing I/O.  If a page fault occurs, we indicate
+	 * that the inode glock may be dropped, fault in the pages manually,
+	 * and retry.
+	 */
+
 	if (inode == sdp->sd_rindex) {
 		statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS);
 		if (!statfs_gh)
@@ -895,10 +963,11 @@ static ssize_t gfs2_file_buffered_write(
 	}
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
+retry:
 	ret = gfs2_glock_nq(gh);
 	if (ret)
 		goto out_uninit;
-
+retry_under_glock:
 	if (inode == sdp->sd_rindex) {
 		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 
@@ -909,21 +978,41 @@ static ssize_t gfs2_file_buffered_write(
 	}
 
 	current->backing_dev_info = inode_to_bdi(inode);
+	pagefault_disable();
 	ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+	pagefault_enable();
 	current->backing_dev_info = NULL;
-	if (ret > 0)
+	if (ret > 0) {
 		iocb->ki_pos += ret;
+		read += ret;
+	}
 
 	if (inode == sdp->sd_rindex)
 		gfs2_glock_dq_uninit(statfs_gh);
 
+	if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
+		size_t leftover;
+
+		gfs2_holder_allow_demote(gh);
+		leftover = fault_in_iov_iter_readable(from, window_size);
+		gfs2_holder_disallow_demote(gh);
+		if (leftover != window_size) {
+			if (!gfs2_holder_queued(gh)) {
+				if (read)
+					goto out_uninit;
+				goto retry;
+			}
+			goto retry_under_glock;
+		}
+	}
 out_unlock:
-	gfs2_glock_dq(gh);
+	if (gfs2_holder_queued(gh))
+		gfs2_glock_dq(gh);
 out_uninit:
 	gfs2_holder_uninit(gh);
 	if (statfs_gh)
 		kfree(statfs_gh);
-	return ret;
+	return read ? read : ret;
 }
 
 /**



  parent reply	other threads:[~2022-04-29 10:45 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-29 10:41 [PATCH 5.15 00/33] 5.15.37-rc1 review Greg Kroah-Hartman
2022-04-29 10:41 ` [PATCH 5.15 01/33] floppy: disable FDRAWCMD by default Greg Kroah-Hartman
2022-04-29 10:41 ` [PATCH 5.15 02/33] bpf: Introduce composable reg, ret and arg types Greg Kroah-Hartman
2022-04-29 10:41 ` [PATCH 5.15 03/33] bpf: Replace ARG_XXX_OR_NULL with ARG_XXX | PTR_MAYBE_NULL Greg Kroah-Hartman
2022-04-29 10:41 ` [PATCH 5.15 04/33] bpf: Replace RET_XXX_OR_NULL with RET_XXX " Greg Kroah-Hartman
2022-04-29 10:41 ` [PATCH 5.15 05/33] bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX " Greg Kroah-Hartman
2022-04-29 10:41 ` [PATCH 5.15 06/33] bpf: Introduce MEM_RDONLY flag Greg Kroah-Hartman
2022-04-29 10:41 ` [PATCH 5.15 07/33] bpf: Convert PTR_TO_MEM_OR_NULL to composable types Greg Kroah-Hartman
2022-04-29 10:41 ` [PATCH 5.15 08/33] bpf: Make per_cpu_ptr return rdonly PTR_TO_MEM Greg Kroah-Hartman
2022-04-29 10:41 ` [PATCH 5.15 09/33] bpf: Add MEM_RDONLY for helper args that are pointers to rdonly mem Greg Kroah-Hartman
2022-04-29 10:41 ` [PATCH 5.15 10/33] bpf/selftests: Test PTR_TO_RDONLY_MEM Greg Kroah-Hartman
2022-04-29 10:41 ` [PATCH 5.15 11/33] bpf: Fix crash due to out of bounds access into reg2btf_ids Greg Kroah-Hartman
2022-04-29 10:41 ` [PATCH 5.15 12/33] spi: cadence-quadspi: fix write completion support Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 13/33] ARM: dts: socfpga: change qspi to "intel,socfpga-qspi" Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 14/33] mm: kfence: fix objcgs vector allocation Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 15/33] gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable} Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 16/33] iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 17/33] iov_iter: Introduce fault_in_iov_iter_writeable Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 18/33] gfs2: Add wrapper for iomap_file_buffered_write Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 19/33] gfs2: Clean up function may_grant Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 20/33] gfs2: Introduce flag for glock holder auto-demotion Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 21/33] gfs2: Move the inode glock locking to gfs2_file_buffered_write Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 22/33] gfs2: Eliminate ip->i_gh Greg Kroah-Hartman
2022-04-29 10:42 ` Greg Kroah-Hartman [this message]
2022-04-29 10:42 ` [PATCH 5.15 24/33] iomap: Fix iomap_dio_rw return value for user copies Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 25/33] iomap: Support partial direct I/O on user copy failures Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 26/33] iomap: Add done_before argument to iomap_dio_rw Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 27/33] gup: Introduce FOLL_NOFAULT flag to disable page faults Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 28/33] iov_iter: Introduce nofault " Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 29/33] gfs2: Fix mmap + page fault deadlocks for direct I/O Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 30/33] btrfs: fix deadlock due to page faults during direct IO reads and writes Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 31/33] btrfs: fallback to blocking mode when doing async dio over multiple extents Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 32/33] mm: gup: make fault_in_safe_writeable() use fixup_user_fault() Greg Kroah-Hartman
2022-04-29 10:42 ` [PATCH 5.15 33/33] selftests/bpf: Add test for reg2btf_ids out of bounds access Greg Kroah-Hartman
2022-06-24 10:33   ` Po-Hsu Lin
2022-06-24 11:09     ` Greg Kroah-Hartman
2022-07-01 12:51       ` Po-Hsu Lin
2022-04-29 16:39 ` [PATCH 5.15 00/33] 5.15.37-rc1 review Florian Fainelli
2022-04-29 18:36 ` Shuah Khan
2022-04-29 21:14 ` Naresh Kamboju
2022-04-29 23:47 ` Guenter Roeck
2022-04-29 23:54 ` Ron Economos
2022-04-30 10:17 ` Sudip Mukherjee

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220429104053.010361868@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=agruenba@redhat.com \
    --cc=anand.jain@oracle.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox