linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] ext4: Prevent race while walking extent tree
@ 2012-11-12 14:57 Lukas Czerner
  2012-11-13  8:22 ` [PATCH v3] " Lukas Czerner
  0 siblings, 1 reply; 16+ messages in thread
From: Lukas Czerner @ 2012-11-12 14:57 UTC (permalink / raw)
  To: linux-ext4; +Cc: tytso, zab, dmonakhov, Lukas Czerner

Currently ext4_ext_walk_space() only takes i_data_sem for read when
searching for the extent at given block with ext4_ext_find_extent().
Then it drops the lock and the extent tree can be changed at will.
However later on we're searching for the 'next' extent, but the extent
tree might already have changed, so the information might not be
accurate.

In fact we can hit BUG_ON(end <= start) if an extent gets inserted into
the tree after the one we found and before the block we were searching
for. This has been reproduced by running xfstests 225 in a loop on the
s390x architecture; theoretically we could hit this on any other
architecture as well, though probably not as often.

Fix this by extending the critical section to include
ext4_ext_next_allocated_block() as well. It means that if there are any
operations going on on the particular inode, the fiemap will return
inaccurate data. However this will also fix the concerns about starving
writers to the extent tree, because we will put and reacquire the
semaphore with every iteration. This will not be particularly fast, but
fiemap is not a critical operation.

However we also need to limit the access to the extent structure to the
critical section, because outside of it the content can change. So we
remove extent and next block parameters from ext4_ext_fiemap_cb()
function and pass just flags instead.

Also we have to move path reinitialization inside the critical section.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/ext4_extents.h |    5 ++---
 fs/ext4/extents.c      |   40 +++++++++++++++++++++-------------------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index cb1b2c9..356ad9f 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -149,9 +149,8 @@ struct ext4_ext_path {
  * positive retcode - signal for ext4_ext_walk_space(), see below
  * callback must return valid extent (passed or newly created)
  */
-typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
-					struct ext4_ext_cache *,
-					struct ext4_extent *, void *);
+typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_cache *,
+				    unsigned int, void *);
 
 #define EXT_CONTINUE   0
 #define EXT_BREAK      1
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7011ac9..c097acf 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1968,7 +1968,8 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 	struct ext4_extent *ex;
 	ext4_lblk_t next, start = 0, end = 0;
 	ext4_lblk_t last = block + num;
-	int depth, exists, err = 0;
+	int exists, depth = 0, err = 0;
+	unsigned int flags = 0;
 
 	BUG_ON(func == NULL);
 	BUG_ON(inode == NULL);
@@ -1977,9 +1978,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 		num = last - block;
 		/* find extent for this block */
 		down_read(&EXT4_I(inode)->i_data_sem);
+
+		if (path && ext_depth(inode) != depth) {
+			/* depth was changed. we have to realloc path */
+			kfree(path);
+			path = NULL;
+		}
+
 		path = ext4_ext_find_extent(inode, block, path);
-		up_read(&EXT4_I(inode)->i_data_sem);
 		if (IS_ERR(path)) {
+			up_read(&EXT4_I(inode)->i_data_sem);
 			err = PTR_ERR(path);
 			path = NULL;
 			break;
@@ -1987,6 +1995,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 
 		depth = ext_depth(inode);
 		if (unlikely(path[depth].p_hdr == NULL)) {
+			up_read(&EXT4_I(inode)->i_data_sem);
 			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
 			err = -EIO;
 			break;
@@ -2037,14 +2046,21 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 			cbex.ec_block = le32_to_cpu(ex->ee_block);
 			cbex.ec_len = ext4_ext_get_actual_len(ex);
 			cbex.ec_start = ext4_ext_pblock(ex);
+			if (ext4_ext_is_uninitialized(ex))
+				flags |= FIEMAP_EXTENT_UNWRITTEN;
 		}
+		up_read(&EXT4_I(inode)->i_data_sem);
 
 		if (unlikely(cbex.ec_len == 0)) {
 			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
 			err = -EIO;
 			break;
 		}
-		err = func(inode, next, &cbex, ex, cbdata);
+
+		if (next == EXT_MAX_BLOCKS)
+			flags |= FIEMAP_EXTENT_LAST;
+
+		err = func(inode, &cbex, flags, cbdata);
 		ext4_ext_drop_refs(path);
 
 		if (err < 0)
@@ -2057,12 +2073,6 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 			break;
 		}
 
-		if (ext_depth(inode) != depth) {
-			/* depth was changed. we have to realloc path */
-			kfree(path);
-			path = NULL;
-		}
-
 		block = cbex.ec_block + cbex.ec_len;
 	}
 
@@ -4574,14 +4584,12 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 /*
  * Callback function called for each extent to gather FIEMAP information.
  */
-static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
-		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
-		       void *data)
+static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_cache *newex,
+			      unsigned int flags, void *data)
 {
 	__u64	logical;
 	__u64	physical;
 	__u64	length;
-	__u32	flags = 0;
 	int		ret = 0;
 	struct fiemap_extent_info *fieinfo = data;
 	unsigned char blksize_bits;
@@ -4759,12 +4767,6 @@ found_delayed_extent:
 	physical = (__u64)newex->ec_start << blksize_bits;
 	length =   (__u64)newex->ec_len << blksize_bits;
 
-	if (ex && ext4_ext_is_uninitialized(ex))
-		flags |= FIEMAP_EXTENT_UNWRITTEN;
-
-	if (next == EXT_MAX_BLOCKS)
-		flags |= FIEMAP_EXTENT_LAST;

^ permalink raw reply related	[flat|nested] 16+ messages in thread
* [PATCH] ext4: Prevent race while walking extent tree
@ 2012-11-08 11:08 Lukas Czerner
  2012-11-08 12:01 ` Dmitry Monakhov
  2012-11-08 21:52 ` Zach Brown
  0 siblings, 2 replies; 16+ messages in thread
From: Lukas Czerner @ 2012-11-08 11:08 UTC (permalink / raw)
  To: linux-ext4; +Cc: tytso, Lukas Czerner

Currently ext4_ext_walk_space() only takes i_data_sem for read when
searching for the extent at given block with ext4_ext_find_extent().
Then it drops the lock and the extent tree can be changed at will.
However later on we're searching for the 'next' extent, but the extent
tree might already have changed, so the information might not be
accurate.

In fact we can hit BUG_ON(end <= start) if an extent gets inserted into
the tree after the one we found and before the block we were searching
for. This has been reproduced by running xfstests 225 in a loop on the
s390x architecture; theoretically we could hit this on any other
architecture as well, though probably not as often.

ext4_ext_walk_space() is currently only used from ext4_fiemap() and even
if we do not hit the BUG_ON() fiemap might return scrambled information
to the user.

Fix this by requiring ext4_ext_walk_space() to be called with i_data_sem
held. When calling it from ext4_fiemap() we only need to take i_data_sem
for read, but other possible users might want to modify the extents, so
they will be able to take the write lock.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/extents.c |    9 +++++++--
 1 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7011ac9..f1aca06 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1959,6 +1959,11 @@ cleanup:
 	return err;
 }
 
+/*
+ * ext4_ext_walk_space() should be called with i_data_sem locked. If we're
+ * not modifying found extents, or extent tree in callback function, then
+ * read lock is ok.
+ */
 static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 			       ext4_lblk_t num, ext_prepare_callback func,
 			       void *cbdata)
@@ -1976,9 +1981,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 	while (block < last && block != EXT_MAX_BLOCKS) {
 		num = last - block;
 		/* find extent for this block */
-		down_read(&EXT4_I(inode)->i_data_sem);
 		path = ext4_ext_find_extent(inode, block, path);
-		up_read(&EXT4_I(inode)->i_data_sem);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
 			path = NULL;
@@ -5021,8 +5024,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		 * Walk the extent tree gathering extent information.
 		 * ext4_ext_fiemap_cb will push extents back to user.
 		 */
+		down_read(&EXT4_I(inode)->i_data_sem);
 		error = ext4_ext_walk_space(inode, start_blk, len_blks,
 					  ext4_ext_fiemap_cb, fieinfo);
+		up_read(&EXT4_I(inode)->i_data_sem);
 	}
 
 	return error;
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2012-11-19 11:11 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-11-12 14:57 [PATCH] ext4: Prevent race while walking extent tree Lukas Czerner
2012-11-13  8:22 ` [PATCH v3] " Lukas Czerner
2012-11-13 11:34   ` Peng Tao
2012-11-13 12:07     ` Lukáš Czerner
2012-11-13 14:19       ` Peng Tao
2012-11-13 18:51         ` Zach Brown
2012-11-15 16:39           ` Lukáš Czerner
2012-11-15 19:10             ` Zach Brown
2012-11-19  3:24             ` Theodore Ts'o
2012-11-19 11:11               ` Lukáš Czerner
  -- strict thread matches above, loose matches on Subject: below --
2012-11-08 11:08 [PATCH] " Lukas Czerner
2012-11-08 12:01 ` Dmitry Monakhov
2012-11-08 13:43   ` Lukáš Czerner
2012-11-08 16:07     ` Lukáš Czerner
2012-11-08 21:52 ` Zach Brown
2012-11-09  9:19   ` Lukáš Czerner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).