public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Greg KH <gregkh@suse.de>
To: linux-kernel@vger.kernel.org, stable@kernel.org
Cc: stable-review@kernel.org, torvalds@linux-foundation.org,
	akpm@linux-foundation.org, alan@lxorguk.ukuu.org.uk,
	Akira Fujita <a-fujita@rs.jp.nec.com>,
	"Theodore Tso" <tytso@mit.edu>,
	Greg Kroah-Hartman <gregkh@suse.de>
Subject: [08/34] ext4: fix lock order problem in ext4_move_extents()
Date: Thu, 10 Dec 2009 21:23:20 -0800	[thread overview]
Message-ID: <20091211052544.287395070@linux.site> (raw)
In-Reply-To: <20091211052858.GA23229@kroah.com>

[-- Attachment #1: 0004-ext4-fix-lock-order-problem-in-ext4_move_extents.patch --]
[-- Type: text/plain, Size: 10473 bytes --]

2.6.32-stable review patch.  If anyone has any objections, please let us know.

------------------

(cherry picked from commit fc04cb49a898c372a22b21fffc47f299d8710801)

ext4_move_extents() checks the logical block contiguousness
of original file with ext4_find_extent() and mext_next_extent().
Therefore the extent which ext4_ext_path structure indicates
must not be changed between above functions.

But in current implementation, there is no i_data_sem protection
between ext4_ext_find_extent() and mext_next_extent().  So the extent
which ext4_ext_path structure indicates may be overwritten by
delalloc.  As a result, ext4_move_extents() will exchange wrong blocks
between original and donor files.  I change the place where
acquire/release i_data_sem to solve this problem.

Moreover, I changed move_extent_per_page() to start transaction first,
and then acquire i_data_sem.  Without this change, there is a
possibility of the deadlock between mmap() and ext4_move_extents():

* NOTE: "A", "B" and "C" mean different processes

A-1: ext4_ext_move_extents() acquires i_data_sem of two inodes.

B:   do_page_fault() starts the transaction (T),
     and then tries to acquire i_data_sem.
     But process "A" is already holding it, so it is kept waiting.

C:   While "A" and "B" running, kjournald2 tries to commit transaction (T)
     but it is under updating, so kjournald2 waits for it.

A-2: Call ext4_journal_start with holding i_data_sem,
     but transaction (T) is locked.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/ext4/move_extent.c |  117 ++++++++++++++++++++++----------------------------
 1 file changed, 53 insertions(+), 64 deletions(-)

--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -77,12 +77,14 @@ static int
 mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 		      struct ext4_extent **extent)
 {
+	struct ext4_extent_header *eh;
 	int ppos, leaf_ppos = path->p_depth;
 
 	ppos = leaf_ppos;
 	if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
 		/* leaf block */
 		*extent = ++path[ppos].p_ext;
+		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
 		return 0;
 	}
 
@@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, st
 					ext_block_hdr(path[cur_ppos+1].p_bh);
 			}
 
+			path[leaf_ppos].p_ext = *extent = NULL;
+
+			eh = path[leaf_ppos].p_hdr;
+			if (le16_to_cpu(eh->eh_entries) == 0)
+				/* empty leaf is found */
+				return -ENODATA;
+
 			/* leaf block */
 			path[leaf_ppos].p_ext = *extent =
 				EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+			path[leaf_ppos].p_block =
+					ext_pblock(path[leaf_ppos].p_ext);
 			return 0;
 		}
 	}
@@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inod
 }
 
 /**
- * mext_double_down_read - Acquire two inodes' read semaphore
- *
- * @orig_inode:		original inode structure
- * @donor_inode:	donor inode structure
- * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
- */
-static void
-mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
-{
-	struct inode *first = orig_inode, *second = donor_inode;
-
-	/*
-	 * Use the inode number to provide the stable locking order instead
-	 * of its address, because the C language doesn't guarantee you can
-	 * compare pointers that don't come from the same array.
-	 */
-	if (donor_inode->i_ino < orig_inode->i_ino) {
-		first = donor_inode;
-		second = orig_inode;
-	}
-
-	down_read(&EXT4_I(first)->i_data_sem);
-	down_read(&EXT4_I(second)->i_data_sem);
-}
-
-/**
- * mext_double_down_write - Acquire two inodes' write semaphore
+ * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
  *
  * @orig_inode:		original inode structure
  * @donor_inode:	donor inode structure
- * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
+ * i_ino order.
  */
 static void
-mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
 {
 	struct inode *first = orig_inode, *second = donor_inode;
 
@@ -207,28 +193,14 @@ mext_double_down_write(struct inode *ori
 }
 
 /**
- * mext_double_up_read - Release two inodes' read semaphore
+ * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
  *
  * @orig_inode:		original inode structure to be released its lock first
  * @donor_inode:	donor inode structure to be released its lock second
- * Release read semaphore of two inodes (orig and donor).
+ * Release write lock of i_data_sem of two inodes (orig and donor).
  */
 static void
-mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
-{
-	up_read(&EXT4_I(orig_inode)->i_data_sem);
-	up_read(&EXT4_I(donor_inode)->i_data_sem);
-}
-
-/**
- * mext_double_up_write - Release two inodes' write semaphore
- *
- * @orig_inode:		original inode structure to be released its lock first
- * @donor_inode:	donor inode structure to be released its lock second
- * Release write semaphore of two inodes (orig and donor).
- */
-static void
-mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
 {
 	up_write(&EXT4_I(orig_inode)->i_data_sem);
 	up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -688,8 +660,6 @@ mext_replace_branches(handle_t *handle,
 	int replaced_count = 0;
 	int dext_alen;
 
-	mext_double_down_write(orig_inode, donor_inode);
-
 	/* Get the original extent for the block "orig_off" */
 	*err = get_ext_path(orig_inode, orig_off, &orig_path);
 	if (*err)
@@ -785,7 +755,6 @@ out:
 		kfree(donor_path);
 	}
 
-	mext_double_up_write(orig_inode, donor_inode);
 	return replaced_count;
 }
 
@@ -851,6 +820,11 @@ move_extent_per_page(struct file *o_filp
 	 * Just swap data blocks between orig and donor.
 	 */
 	if (uninit) {
+		/*
+		 * Protect extent trees against block allocations
+		 * via delalloc
+		 */
+		double_down_write_data_sem(orig_inode, donor_inode);
 		replaced_count = mext_replace_branches(handle, orig_inode,
 						donor_inode, orig_blk_offset,
 						block_len_in_page, err);
@@ -858,6 +832,7 @@ move_extent_per_page(struct file *o_filp
 		/* Clear the inode cache not to refer to the old data */
 		ext4_ext_invalidate_cache(orig_inode);
 		ext4_ext_invalidate_cache(donor_inode);
+		double_up_write_data_sem(orig_inode, donor_inode);
 		goto out2;
 	}
 
@@ -905,6 +880,8 @@ move_extent_per_page(struct file *o_filp
 	/* Release old bh and drop refs */
 	try_to_release_page(page, 0);
 
+	/* Protect extent trees against block allocations via delalloc */
+	double_down_write_data_sem(orig_inode, donor_inode);
 	replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
 					orig_blk_offset, block_len_in_page,
 					&err2);
@@ -913,14 +890,18 @@ move_extent_per_page(struct file *o_filp
 			block_len_in_page = replaced_count;
 			replaced_size =
 				block_len_in_page << orig_inode->i_blkbits;
-		} else
+		} else {
+			double_up_write_data_sem(orig_inode, donor_inode);
 			goto out;
+		}
 	}
 
 	/* Clear the inode cache not to refer to the old data */
 	ext4_ext_invalidate_cache(orig_inode);
 	ext4_ext_invalidate_cache(donor_inode);
 
+	double_up_write_data_sem(orig_inode, donor_inode);
+
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
 
@@ -1236,16 +1217,16 @@ ext4_move_extents(struct file *o_filp, s
 		return -EINVAL;
 	}
 
-	/* protect orig and donor against a truncate */
+	/* Protect orig and donor inodes against a truncate */
 	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
 	if (ret1 < 0)
 		return ret1;
 
-	mext_double_down_read(orig_inode, donor_inode);
+	/* Protect extent tree against block allocations via delalloc */
+	double_down_write_data_sem(orig_inode, donor_inode);
 	/* Check the filesystem environment whether move_extent can be done */
 	ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
 					donor_start, &len, *moved_len);
-	mext_double_up_read(orig_inode, donor_inode);
 	if (ret1)
 		goto out;
 
@@ -1308,6 +1289,10 @@ ext4_move_extents(struct file *o_filp, s
 			 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
 		     max(le32_to_cpu(ext_cur->ee_block), block_start);
 
+	/* Discard preallocations of two inodes */
+	ext4_discard_preallocations(orig_inode);
+	ext4_discard_preallocations(donor_inode);
+
 	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
 		seq_blocks += add_blocks;
 
@@ -1359,14 +1344,14 @@ ext4_move_extents(struct file *o_filp, s
 		seq_start = le32_to_cpu(ext_cur->ee_block);
 		rest_blocks = seq_blocks;
 
-		/* Discard preallocations of two inodes */
-		down_write(&EXT4_I(orig_inode)->i_data_sem);
-		ext4_discard_preallocations(orig_inode);
-		up_write(&EXT4_I(orig_inode)->i_data_sem);
-
-		down_write(&EXT4_I(donor_inode)->i_data_sem);
-		ext4_discard_preallocations(donor_inode);
-		up_write(&EXT4_I(donor_inode)->i_data_sem);
+		/*
+		 * Up semaphore to avoid following problems:
+		 * a. transaction deadlock among ext4_journal_start,
+		 *    ->write_begin via pagefault, and jbd2_journal_commit
+		 * b. racing with ->readpage, ->write_begin, and ext4_get_block
+		 *    in move_extent_per_page
+		 */
+		double_up_write_data_sem(orig_inode, donor_inode);
 
 		while (orig_page_offset <= seq_end_page) {
 
@@ -1381,14 +1366,14 @@ ext4_move_extents(struct file *o_filp, s
 			/* Count how many blocks we have exchanged */
 			*moved_len += block_len_in_page;
 			if (ret1 < 0)
-				goto out;
+				break;
 			if (*moved_len > len) {
 				ext4_error(orig_inode->i_sb, __func__,
 					"We replaced blocks too much! "
 					"sum of replaced: %llu requested: %llu",
 					*moved_len, len);
 				ret1 = -EIO;
-				goto out;
+				break;
 			}
 
 			orig_page_offset++;
@@ -1400,6 +1385,10 @@ ext4_move_extents(struct file *o_filp, s
 				block_len_in_page = rest_blocks;
 		}
 
+		double_down_write_data_sem(orig_inode, donor_inode);
+		if (ret1 < 0)
+			break;
+
 		/* Decrease buffer counter */
 		if (holecheck_path)
 			ext4_ext_drop_refs(holecheck_path);
@@ -1429,7 +1418,7 @@ out:
 		ext4_ext_drop_refs(holecheck_path);
 		kfree(holecheck_path);
 	}
-
+	double_up_write_data_sem(orig_inode, donor_inode);
 	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
 
 	if (ret1)



  parent reply	other threads:[~2009-12-11  5:29 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20091211052312.805428372@linux.site>
2009-12-11  5:28 ` [00/34] 2.6.32.1-stable review Greg KH
2009-12-11  5:23   ` [01/34] signal: Fix alternate signal stack check Greg KH
2009-12-11  5:23   ` [02/34] SCSI: scsi_lib_dma: fix bug with dma maps on nested scsi objects Greg KH
2009-12-11  5:23   ` [03/34] SCSI: osd_protocol.h: Add missing #include Greg KH
2009-12-11  5:23   ` [04/34] SCSI: megaraid_sas: fix 64 bit sense pointer truncation Greg KH
2009-12-11  5:23   ` [05/34] ext4: fix potential buffer head leak when add_dirent_to_buf() returns ENOSPC Greg KH
2009-12-11  5:23   ` [06/34] ext4: avoid divide by zero when trying to mount a corrupted file system Greg KH
2009-12-11  5:23   ` [07/34] ext4: fix the returned block count if EXT4_IOC_MOVE_EXT fails Greg KH
2009-12-11  5:23   ` Greg KH [this message]
2009-12-11  5:23   ` [09/34] ext4: fix possible recursive locking warning in EXT4_IOC_MOVE_EXT Greg KH
2009-12-11  5:23   ` [10/34] ext4: plug a buffer_head leak in an error path of ext4_iget() Greg KH
2009-12-11  5:23   ` [11/34] ext4: make sure directory and symlink blocks are revoked Greg KH
2009-12-11  5:23   ` [12/34] ext4: fix i_flags access in ext4_da_writepages_trans_blocks() Greg KH
2009-12-11  5:23   ` [13/34] ext4: journal all modifications in ext4_xattr_set_handle Greg KH
2009-12-11  5:23   ` [14/34] ext4: dont update the superblock in ext4_statfs() Greg KH
2009-12-11  5:23   ` [15/34] ext4: fix uninit block bitmap initialization when s_meta_first_bg is non-zero Greg KH
2009-12-11  5:23   ` [16/34] ext4: fix block validity checks so they work correctly with meta_bg Greg KH
2009-12-11  5:23   ` [17/34] ext4: avoid issuing unnecessary barriers Greg KH
2009-12-11  5:23   ` [18/34] ext4: fix error handling in ext4_ind_get_blocks() Greg KH
2009-12-11  5:23   ` [19/34] ext4: make trim/discard optional (and off by default) Greg KH
2009-12-11  5:23   ` [20/34] ext4: make "norecovery" an alias for "noload" Greg KH
2009-12-11  5:23   ` [21/34] ext4: Fix double-free of blocks with EXT4_IOC_MOVE_EXT Greg KH
2009-12-11  5:23   ` [22/34] ext4: initialize moved_len before calling ext4_move_extents() Greg KH
2009-12-11  5:23   ` [23/34] ext4: move_extent_per_page() cleanup Greg KH
2009-12-11  5:23   ` [24/34] jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer() Greg KH
2009-12-11  5:23   ` [25/34] ext4: Return the PTR_ERR of the correct pointer in setup_new_group_blocks() Greg KH
2009-12-11  5:23   ` [26/34] ext4: Avoid data / filesystem corruption when write fails to copy data Greg KH
2009-12-11  5:23   ` [27/34] ext4: wait for log to commit when umounting Greg KH
2009-12-11  5:23   ` [28/34] ext4: remove blocks from inode prealloc list on failure Greg KH
2009-12-11  5:23   ` [29/34] ext4: ext4_get_reserved_space() must return bytes instead of blocks Greg KH
2009-12-11  5:23   ` [30/34] ext4: quota macros cleanup Greg KH
2009-12-11  5:23   ` [31/34] ext4: fix incorrect block reservation on quota transfer Greg KH
2009-12-11  5:23   ` [32/34] ext4: Wait for proper transaction commit on fsync Greg KH
2009-12-11  5:23   ` [33/34] ext4: Fix insufficient checks in EXT4_IOC_MOVE_EXT Greg KH
2009-12-11  5:23   ` [34/34] ext4: Fix potential fiemap deadlock (mmap_sem vs. i_data_sem) Greg KH

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20091211052544.287395070@linux.site \
    --to=gregkh@suse.de \
    --cc=a-fujita@rs.jp.nec.com \
    --cc=akpm@linux-foundation.org \
    --cc=alan@lxorguk.ukuu.org.uk \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable-review@kernel.org \
    --cc=stable@kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox