public inbox for linux-ext4@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] ext4: Fix data corruption with EXT4_GET_BLOCKS_ZERO
@ 2017-05-25  8:43 Jan Kara
  2017-05-26 21:44 ` Theodore Ts'o
  0 siblings, 1 reply; 2+ messages in thread
From: Jan Kara @ 2017-05-25  8:43 UTC (permalink / raw)
  To: Ted Tso; +Cc: linux-ext4, Ross Zwisler, Jan Kara, stable

When ext4_map_blocks() is called with EXT4_GET_BLOCKS_ZERO to zero-out
allocated blocks and these blocks are actually converted from unwritten
extent the following race can happen:

CPU0					CPU1

page fault				page fault
...					...
ext4_map_blocks()
  ext4_ext_map_blocks()
    ext4_ext_handle_unwritten_extents()
      ext4_ext_convert_to_initialized()
	- zero out converted extent
	ext4_zeroout_es()
	  - inserts extent as initialized in status tree

					ext4_map_blocks()
					  ext4_es_lookup_extent()
					    - finds initialized extent
					write data
  ext4_issue_zeroout()
    - zeroes out new extent overwriting data

This problem can be reproduced by generic/340 for the fallocated case
for the last block in the file.

Fix the problem by avoiding zeroing out the area we are mapping with
ext4_map_blocks() in ext4_ext_convert_to_initialized(). It is pointless
to zero out this area in the first place as the caller asked us to
convert the area to initialized because he is just going to write data
there before the transaction finishes. To achieve this we delete the
special case of zeroing out full extent as that will be handled by the
cases below zeroing only the part of the extent that needs it. We also
instruct ext4_split_extent() that the middle of extent being split
contains data so that ext4_split_extent_at() cannot zero out full extent
in case of ENOSPC.

CC: stable@vger.kernel.org
Fixes: 12735f881952c32b31bc4e433768f18489f79ec9
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/extents.c | 80 +++++++++++++++++++++++++------------------------------
 1 file changed, 37 insertions(+), 43 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2a97dff87b96..83a513efc824 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3413,13 +3413,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	struct ext4_sb_info *sbi;
 	struct ext4_extent_header *eh;
 	struct ext4_map_blocks split_map;
-	struct ext4_extent zero_ex;
+	struct ext4_extent zero_ex1, zero_ex2;
 	struct ext4_extent *ex, *abut_ex;
 	ext4_lblk_t ee_block, eof_block;
 	unsigned int ee_len, depth, map_len = map->m_len;
 	int allocated = 0, max_zeroout = 0;
 	int err = 0;
-	int split_flag = 0;
+	int split_flag = EXT4_EXT_DATA_VALID2;
 
 	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
 		"block %llu, max_blocks %u\n", inode->i_ino,
@@ -3436,7 +3436,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
-	zero_ex.ee_len = 0;
+	zero_ex1.ee_len = 0;
+	zero_ex2.ee_len = 0;
 
 	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
 
@@ -3576,62 +3577,52 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	if (ext4_encrypted_inode(inode))
 		max_zeroout = 0;
 
-	/* If extent is less than s_max_zeroout_kb, zeroout directly */
-	if (max_zeroout && (ee_len <= max_zeroout)) {
-		err = ext4_ext_zeroout(inode, ex);
-		if (err)
-			goto out;
-		zero_ex.ee_block = ex->ee_block;
-		zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
-		ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
-
-		err = ext4_ext_get_access(handle, inode, path + depth);
-		if (err)
-			goto out;
-		ext4_ext_mark_initialized(ex);
-		ext4_ext_try_to_merge(handle, inode, path, ex);
-		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
-		goto out;
-	}
-
 	/*
-	 * four cases:
+	 * five cases:
 	 * 1. split the extent into three extents.
-	 * 2. split the extent into two extents, zeroout the first half.
-	 * 3. split the extent into two extents, zeroout the second half.
+	 * 2. split the extent into two extents, zeroout the head of the first
+	 *    extent.
+	 * 3. split the extent into two extents, zeroout the tail of the second
+	 *    extent.
 	 * 4. split the extent into two extents with out zeroout.
+	 * 5. no splitting needed, just possibly zeroout the head and / or the
+	 *    tail of the extent.
 	 */
 	split_map.m_lblk = map->m_lblk;
 	split_map.m_len = map->m_len;
 
-	if (max_zeroout && (allocated > map->m_len)) {
+	if (max_zeroout && (allocated > split_map.m_len)) {
 		if (allocated <= max_zeroout) {
-			/* case 3 */
-			zero_ex.ee_block =
-					 cpu_to_le32(map->m_lblk);
-			zero_ex.ee_len = cpu_to_le16(allocated);
-			ext4_ext_store_pblock(&zero_ex,
-				ext4_ext_pblock(ex) + map->m_lblk - ee_block);
-			err = ext4_ext_zeroout(inode, &zero_ex);
+			/* case 3 or 5 */
+			zero_ex1.ee_block =
+				 cpu_to_le32(split_map.m_lblk +
+					     split_map.m_len);
+			zero_ex1.ee_len =
+				cpu_to_le16(allocated - split_map.m_len);
+			ext4_ext_store_pblock(&zero_ex1,
+				ext4_ext_pblock(ex) + split_map.m_lblk +
+				split_map.m_len - ee_block);
+			err = ext4_ext_zeroout(inode, &zero_ex1);
 			if (err)
 				goto out;
-			split_map.m_lblk = map->m_lblk;
 			split_map.m_len = allocated;
-		} else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
-			/* case 2 */
-			if (map->m_lblk != ee_block) {
-				zero_ex.ee_block = ex->ee_block;
-				zero_ex.ee_len = cpu_to_le16(map->m_lblk -
+		}
+		if (split_map.m_lblk - ee_block + split_map.m_len <
+								max_zeroout) {
+			/* case 2 or 5 */
+			if (split_map.m_lblk != ee_block) {
+				zero_ex2.ee_block = ex->ee_block;
+				zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
 							ee_block);
-				ext4_ext_store_pblock(&zero_ex,
+				ext4_ext_store_pblock(&zero_ex2,
 						      ext4_ext_pblock(ex));
-				err = ext4_ext_zeroout(inode, &zero_ex);
+				err = ext4_ext_zeroout(inode, &zero_ex2);
 				if (err)
 					goto out;
 			}
 
+			split_map.m_len += split_map.m_lblk - ee_block;
 			split_map.m_lblk = ee_block;
-			split_map.m_len = map->m_lblk - ee_block + map->m_len;
 			allocated = map->m_len;
 		}
 	}
@@ -3642,8 +3633,11 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		err = 0;
 out:
 	/* If we have gotten a failure, don't zero out status tree */
-	if (!err)
-		err = ext4_zeroout_es(inode, &zero_ex);
+	if (!err) {
+		err = ext4_zeroout_es(inode, &zero_ex1);
+		if (!err)
+			err = ext4_zeroout_es(inode, &zero_ex2);
+	}
 	return err ? err : allocated;
 }
 
-- 
2.12.0

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH] ext4: Fix data corruption with EXT4_GET_BLOCKS_ZERO
  2017-05-25  8:43 [PATCH] ext4: Fix data corruption with EXT4_GET_BLOCKS_ZERO Jan Kara
@ 2017-05-26 21:44 ` Theodore Ts'o
  0 siblings, 0 replies; 2+ messages in thread
From: Theodore Ts'o @ 2017-05-26 21:44 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-ext4, Ross Zwisler, stable

On Thu, May 25, 2017 at 10:43:13AM +0200, Jan Kara wrote:
> When ext4_map_blocks() is called with EXT4_GET_BLOCKS_ZERO to zero-out
> allocated blocks and these blocks are actually converted from unwritten
> extent the following race can happen:
> 
> CPU0					CPU1
> 
> page fault				page fault
> ...					...
> ext4_map_blocks()
>   ext4_ext_map_blocks()
>     ext4_ext_handle_unwritten_extents()
>       ext4_ext_convert_to_initialized()
> 	- zero out converted extent
> 	ext4_zeroout_es()
> 	  - inserts extent as initialized in status tree
> 
> 					ext4_map_blocks()
> 					  ext4_es_lookup_extent()
> 					    - finds initialized extent
> 					write data
>   ext4_issue_zeroout()
>     - zeroes out new extent overwriting data
> 
> This problem can be reproduced by generic/340 for the fallocated case
> for the last block in the file.
> 
> Fix the problem by avoiding zeroing out the area we are mapping with
> ext4_map_blocks() in ext4_ext_convert_to_initialized(). It is pointless
> to zero out this area in the first place as the caller asked us to
> convert the area to initialized because he is just going to write data
> there before the transaction finishes. To achieve this we delete the
> special case of zeroing out full extent as that will be handled by the
> cases below zeroing only the part of the extent that needs it. We also
> instruct ext4_split_extent() that the middle of extent being split
> contains data so that ext4_split_extent_at() cannot zero out full extent
> in case of ENOSPC.
> 
> CC: stable@vger.kernel.org
> Fixes: 12735f881952c32b31bc4e433768f18489f79ec9
> Signed-off-by: Jan Kara <jack@suse.cz>

Thanks, applied.

						- Ted

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2017-05-26 21:45 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-05-25  8:43 [PATCH] ext4: Fix data corruption with EXT4_GET_BLOCKS_ZERO Jan Kara
2017-05-26 21:44 ` Theodore Ts'o

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox