linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] ext4: revert i_data_sum locking cleanups for dioread_nolock
@ 2016-02-12 18:25 Eric Whitney
  2016-02-16  5:08 ` Theodore Ts'o
  0 siblings, 1 reply; 5+ messages in thread
From: Eric Whitney @ 2016-02-12 18:25 UTC (permalink / raw)
  To: linux-ext4; +Cc: tytso, jack

Commit 2bcba4781fa3 ("ext4: get rid of EXT4_GET_BLOCKS_NO_LOCK flag")
can cause a kernel panic when xfstest generic/300 is run on a file
system mounted with dioread_nolock.  The panic is typically triggered
from check_irqs_on() (fs/buffer.c: 1272), and happens because
ext4_end_io_dio() is being called in an interrupt context rather than
from a workqueue for AIO.  This suggests that buffer_defer_completion
may not be set properly when creating an unwritten extent for async
direct I/O.

Revert the locking changes until this problem can be resolved.  Patch
applied to 4.5-rc3 and tested with a full xfstest-bld auto group run.

Signed-off-by: Eric Whitney <enwlinux@gmail.com>
---
 fs/ext4/ext4.h              |  6 ++++--
 fs/ext4/inode.c             | 43 +++++++++++++++++++++++--------------------
 include/trace/events/ext4.h |  1 +
 3 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0662b28..36fcf2a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -563,10 +563,12 @@ enum {
 #define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
 	/* Request will not result in inode size update (user for fallocate) */
 #define EXT4_GET_BLOCKS_KEEP_SIZE		0x0080
+	/* Do not take i_data_sem locking in ext4_map_blocks */
+#define EXT4_GET_BLOCKS_NO_LOCK			0x0100
 	/* Convert written extents to unwritten */
-#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0100
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0200
 	/* Write zeros to newly created written extents */
-#define EXT4_GET_BLOCKS_ZERO			0x0200
+#define EXT4_GET_BLOCKS_ZERO			0x0300
 #define EXT4_GET_BLOCKS_CREATE_ZERO		(EXT4_GET_BLOCKS_CREATE |\
 					EXT4_GET_BLOCKS_ZERO)
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 83bc8bf..f72dc04 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -418,7 +418,8 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
 	 * out taking i_data_sem.  So at the time the unwritten extent
 	 * could be converted.
 	 */
-	down_read(&EXT4_I(inode)->i_data_sem);
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		down_read(&EXT4_I(inode)->i_data_sem);
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		retval = ext4_ext_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -426,7 +427,8 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
 		retval = ext4_ind_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
 	}
-	up_read((&EXT4_I(inode)->i_data_sem));
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		up_read((&EXT4_I(inode)->i_data_sem));
 
 	/*
 	 * We don't check m_len because extent will be collpased in status
@@ -522,7 +524,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
 	 */
-	down_read(&EXT4_I(inode)->i_data_sem);
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		down_read(&EXT4_I(inode)->i_data_sem);
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		retval = ext4_ext_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -553,7 +556,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		if (ret < 0)
 			retval = ret;
 	}
-	up_read((&EXT4_I(inode)->i_data_sem));
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		up_read((&EXT4_I(inode)->i_data_sem));
 
 found:
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -703,7 +707,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 	map.m_lblk = iblock;
 	map.m_len = bh->b_size >> inode->i_blkbits;
 
-	if (flags && !handle) {
+	if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
 		/* Direct IO write... */
 		if (map.m_len > DIO_MAX_BLOCKS)
 			map.m_len = DIO_MAX_BLOCKS;
@@ -898,6 +902,9 @@ int do_journal_get_write_access(handle_t *handle,
 	return ret;
 }
 
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create);
+
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
 				  get_block_t *get_block)
@@ -3070,21 +3077,13 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock,
 			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
 
-static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create)
 {
-	int ret;
-
-	ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",
+	ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
-	ret = _ext4_get_block(inode, iblock, bh_result, 0);
-	/*
-	 * Blocks should have been preallocated! ext4_file_write_iter() checks
-	 * that.
-	 */
-	WARN_ON_ONCE(!buffer_mapped(bh_result));
-
-	return ret;
+	return _ext4_get_block(inode, iblock, bh_result,
+			       EXT4_GET_BLOCKS_NO_LOCK);
 }
 
 #ifdef CONFIG_FS_DAX
@@ -3230,8 +3229,10 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	/* If we do a overwrite dio, i_mutex locking can be released */
 	overwrite = *((int *)iocb->private);
 
-	if (overwrite)
+	if (overwrite) {
+		down_read(&EXT4_I(inode)->i_data_sem);
 		inode_unlock(inode);
+	}
 
 	/*
 	 * We could direct write to holes and fallocate.
@@ -3274,7 +3275,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	}
 
 	if (overwrite) {
-		get_block_func = ext4_get_block_overwrite;
+		get_block_func = ext4_get_block_write_nolock;
 	} else {
 		get_block_func = ext4_get_block_write;
 		dio_flags = DIO_LOCKING;
@@ -3330,8 +3331,10 @@ retake_lock:
 	if (iov_iter_rw(iter) == WRITE)
 		inode_dio_end(inode);
 	/* take i_mutex locking again if we do a ovewrite dio */
-	if (overwrite)
+	if (overwrite) {
+		up_read(&EXT4_I(inode)->i_data_sem);
 		inode_lock(inode);
+	}
 
 	return ret;
 }
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 4e4b2fa..6599e75 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -43,6 +43,7 @@ struct extent_status;
 	{ EXT4_GET_BLOCKS_METADATA_NOFAIL,	"METADATA_NOFAIL" },	\
 	{ EXT4_GET_BLOCKS_NO_NORMALIZE,		"NO_NORMALIZE" },	\
 	{ EXT4_GET_BLOCKS_KEEP_SIZE,		"KEEP_SIZE" },		\
+	{ EXT4_GET_BLOCKS_NO_LOCK,		"NO_LOCK" },		\
 	{ EXT4_GET_BLOCKS_ZERO,			"ZERO" })
 
 #define show_mflags(flags) __print_flags(flags, "",	\
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] ext4: revert i_data_sum locking cleanups for dioread_nolock
  2016-02-12 18:25 [PATCH] ext4: revert i_data_sum locking cleanups for dioread_nolock Eric Whitney
@ 2016-02-16  5:08 ` Theodore Ts'o
  2016-02-18 22:09   ` Jan Kara
  0 siblings, 1 reply; 5+ messages in thread
From: Theodore Ts'o @ 2016-02-16  5:08 UTC (permalink / raw)
  To: Eric Whitney; +Cc: linux-ext4, jack

On Fri, Feb 12, 2016 at 01:25:06PM -0500, Eric Whitney wrote:
> Commit 2bcba4781fa3 ("ext4: get rid of EXT4_GET_BLOCKS_NO_LOCK flag")
> can cause a kernel panic when xfstest generic/300 is run on a file
> system mounted with dioread_nolock.  The panic is typically triggered
> from check_irqs_on() (fs/buffer.c: 1272), and happens because
> ext4_end_io_dio() is being called in an interrupt context rather than
> from a workqueue for AIO.  This suggests that buffer_defer_completion
> may not be set properly when creating an unwritten extent for async
> direct I/O.
> 
> Revert the locking changes until this problem can be resolved.  Patch
> applied to 4.5-rc3 and tested with a full xfstest-bld auto group run.
> 
> Signed-off-by: Eric Whitney <enwlinux@gmail.com>

Applied, thanks.

I was able to reliably reproduce the problem without this revert patch
using a 32-bit x86 kvm-xfstests test appliance:

   ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/kvm-xfstests/testing/root_fs.img.i386

Using the command: "kvm-xfstests -c dioread_nolock generic/300"

With this patch, the problem goes away, so reverting the patch is
clearly the right thing for now.  There is something screwy going on,
since the original commit shouldn't have made a difference, but let's
revert it first, and figure it out when we have time to investigate
more deeply.

						- Ted

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] ext4: revert i_data_sum locking cleanups for dioread_nolock
  2016-02-16  5:08 ` Theodore Ts'o
@ 2016-02-18 22:09   ` Jan Kara
  2016-02-19  5:30     ` Theodore Ts'o
  0 siblings, 1 reply; 5+ messages in thread
From: Jan Kara @ 2016-02-18 22:09 UTC (permalink / raw)
  To: Theodore Ts'o; +Cc: Eric Whitney, linux-ext4, jack

[-- Attachment #1: Type: text/plain, Size: 2283 bytes --]

On Tue 16-02-16 00:08:40, Ted Tso wrote:
> On Fri, Feb 12, 2016 at 01:25:06PM -0500, Eric Whitney wrote:
> > Commit 2bcba4781fa3 ("ext4: get rid of EXT4_GET_BLOCKS_NO_LOCK flag")
> > can cause a kernel panic when xfstest generic/300 is run on a file
> > system mounted with dioread_nolock.  The panic is typically triggered
> > from check_irqs_on() (fs/buffer.c: 1272), and happens because
> > ext4_end_io_dio() is being called in an interrupt context rather than
> > from a workqueue for AIO.  This suggests that buffer_defer_completion
> > may not be set properly when creating an unwritten extent for async
> > direct I/O.
> > 
> > Revert the locking changes until this problem can be resolved.  Patch
> > applied to 4.5-rc3 and tested with a full xfstest-bld auto group run.
> > 
> > Signed-off-by: Eric Whitney <enwlinux@gmail.com>
> 
> Applied, thanks.
> 
> I was able to reliably reproduce the problem without this revert patch
> using a 32-bit x86 kvm-xfstests test appliance:
> 
>    ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/kvm-xfstests/testing/root_fs.img.i386
> 
> Using the command: "kvm-xfstests -c dioread_nolock generic/300"
> 
> With this patch, the problem goes away, so reverting the patch is
> clearly the right thing for now.  There is something screwy going on,
> since the original commit shouldn't have made a difference, but let's
> revert it first, and figure it out when we have time to investigate
> more deeply.

OK, I had a look into this. So I'm not 100% what has happened but the
following looks likely: Current io_end handling can overwrite io_end
pointer in the inode in dioread_nolock mode (nothing prevents unlocked DIO
to overwrite pointer of locked DIO and then clear it out). I suspect that
the change in i_data_sem locking made this race more visible. Attached
patch should fix the issue (I don't see failures of generic/300 with it in
dioread_nolock mode). Can you consider this instead of a revert Eric sent?

I have also a more complete rewrite of io_end handling which makes the code
more comprehensible and avoids storing io_end pointer in the inode (thus
avoids similar pitfalls in future) but that is a 4.6 matter. I'll submit
the rewrite once xfstests runs complete.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

[-- Attachment #2: 0001-ext4-Fix-crashes-in-dioread_nolock-mode.patch --]
[-- Type: text/x-patch, Size: 2609 bytes --]

>From 09eae61bb0433f8424f606fde00db35dbd72615d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 18 Feb 2016 22:50:07 +0100
Subject: [PATCH] ext4: Fix crashes in dioread_nolock mode

Competing overwrite DIO in dioread_nolock mode will just overwrite
pointer to io_end in the inode. This may result in data corruption or
extent conversion happening from IO completion interrupt because we
don't properly set buffer_defer_completion() when unlocked DIO races
with locked DIO to unwritten extent.

Since unlocked DIO doesn't need io_end for anything, just avoid
allocating it and corrupting pointer from inode for locked DIO.
A cleaner fix would be to avoid these games with io_end pointer from the
inode but that requires more intrusive changes so we leave that for
later.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/inode.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 83bc8bfb3bea..ee8ca1ff4023 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3253,29 +3253,29 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 * case, we allocate an io_end structure to hook to the iocb.
 	 */
 	iocb->private = NULL;
-	ext4_inode_aio_set(inode, NULL);
-	if (!is_sync_kiocb(iocb)) {
-		io_end = ext4_init_io_end(inode, GFP_NOFS);
-		if (!io_end) {
-			ret = -ENOMEM;
-			goto retake_lock;
-		}
-		/*
-		 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
-		 */
-		iocb->private = ext4_get_io_end(io_end);
-		/*
-		 * we save the io structure for current async direct
-		 * IO, so that later ext4_map_blocks() could flag the
-		 * io structure whether there is a unwritten extents
-		 * needs to be converted when IO is completed.
-		 */
-		ext4_inode_aio_set(inode, io_end);
-	}
-
 	if (overwrite) {
 		get_block_func = ext4_get_block_overwrite;
 	} else {
+		ext4_inode_aio_set(inode, NULL);
+		if (!is_sync_kiocb(iocb)) {
+			io_end = ext4_init_io_end(inode, GFP_NOFS);
+			if (!io_end) {
+				ret = -ENOMEM;
+				goto retake_lock;
+			}
+			/*
+			 * Grab reference for DIO. Will be dropped in
+			 * ext4_end_io_dio()
+			 */
+			iocb->private = ext4_get_io_end(io_end);
+			/*
+			 * we save the io structure for current async direct
+			 * IO, so that later ext4_map_blocks() could flag the
+			 * io structure whether there is a unwritten extents
+			 * needs to be converted when IO is completed.
+			 */
+			ext4_inode_aio_set(inode, io_end);
+		}
 		get_block_func = ext4_get_block_write;
 		dio_flags = DIO_LOCKING;
 	}
-- 
2.6.2


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] ext4: revert i_data_sum locking cleanups for dioread_nolock
  2016-02-18 22:09   ` Jan Kara
@ 2016-02-19  5:30     ` Theodore Ts'o
  2016-02-19 14:19       ` Eric Whitney
  0 siblings, 1 reply; 5+ messages in thread
From: Theodore Ts'o @ 2016-02-19  5:30 UTC (permalink / raw)
  To: Jan Kara; +Cc: Eric Whitney, linux-ext4

On Thu, Feb 18, 2016 at 11:09:56PM +0100, Jan Kara wrote:
> OK, I had a look into this. So I'm not 100% what has happened but the
> following looks likely: Current io_end handling can overwrite io_end
> pointer in the inode in dioread_nolock mode (nothing prevents unlocked DIO
> to overwrite pointer of locked DIO and then clear it out). I suspect that
> the change in i_data_sem locking made this race more visible. Attached
> patch should fix the issue (I don't see failures of generic/300 with it in
> dioread_nolock mode). Can you consider this instead of a revert Eric sent?

Thanks!  That does appear to be it. I dropped the revert, confirmed
that I could still trivially reproduce the failure, applied patch,
and ran the test 10 times ("kvm-xfstests -C 10 -c dioread_nolock
generic/300") and it passed with flying colors.

> I have also a more complete rewrite of io_end handling which makes the code
> more comprehensible and avoids storing io_end pointer in the inode (thus
> avoids similar pitfalls in future) but that is a 4.6 matter. I'll submit
> the rewrite once xfstests runs complete.

Great, thanks!

					- Ted

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] ext4: revert i_data_sum locking cleanups for dioread_nolock
  2016-02-19  5:30     ` Theodore Ts'o
@ 2016-02-19 14:19       ` Eric Whitney
  0 siblings, 0 replies; 5+ messages in thread
From: Eric Whitney @ 2016-02-19 14:19 UTC (permalink / raw)
  To: Theodore Ts'o; +Cc: Jan Kara, Eric Whitney, linux-ext4

* Theodore Ts'o <tytso@mit.edu>:
> On Thu, Feb 18, 2016 at 11:09:56PM +0100, Jan Kara wrote:
> > OK, I had a look into this. So I'm not 100% what has happened but the
> > following looks likely: Current io_end handling can overwrite io_end
> > pointer in the inode in dioread_nolock mode (nothing prevents unlocked DIO
> > to overwrite pointer of locked DIO and then clear it out). I suspect that
> > the change in i_data_sem locking made this race more visible. Attached
> > patch should fix the issue (I don't see failures of generic/300 with it in
> > dioread_nolock mode). Can you consider this instead of a revert Eric sent?
> 
> Thanks!  That does appear to be it. I dropped the revert, confirmed
> that I could still trivially reproduce the failure, applied patch,
> and ran the test 10 times ("kvm-xfstests -C 10 -c dioread_nolock
> generic/300") and it passed with flying colors.
> 
> > I have also a more complete rewrite of io_end handling which makes the code
> > more comprehensible and avoids storing io_end pointer in the inode (thus
> > avoids similar pitfalls in future) but that is a 4.6 matter. I'll submit
> > the rewrite once xfstests runs complete.
> 
> Great, thanks!
> 
> 					- Ted

I ran the same ten test runs (kvm-xfstests -c dioread_nolock generic/300) on
x86_64 and a full test run (kvm-xfstests -g auto) with the patch applied to
4.5-rc4 without regressions relative to my -rc4 baseline results.  Looks
good to me.

Tested-by: Eric Whitney <enwlinux@gmail.com>



^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2016-02-19 14:19 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-02-12 18:25 [PATCH] ext4: revert i_data_sum locking cleanups for dioread_nolock Eric Whitney
2016-02-16  5:08 ` Theodore Ts'o
2016-02-18 22:09   ` Jan Kara
2016-02-19  5:30     ` Theodore Ts'o
2016-02-19 14:19       ` Eric Whitney

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).