All of lore.kernel.org
 help / color / mirror / Atom feed
From: Greg KH <gregkh@suse.de>
To: linux-kernel@vger.kernel.org, stable@kernel.org
Cc: stable-review@kernel.org, torvalds@linux-foundation.org,
	akpm@linux-foundation.org, alan@lxorguk.ukuu.org.uk,
	Akira Fujita <a-fujita@rs.jp.nec.com>,
	"Theodore Tso" <tytso@mit.edu>,
	Greg Kroah-Hartman <gregkh@suse.de>
Subject: [08/34] ext4: fix lock order problem in ext4_move_extents()
Date: Thu, 10 Dec 2009 21:23:20 -0800	[thread overview]
Message-ID: <20091211052544.287395070@linux.site> (raw)
In-Reply-To: <20091211052858.GA23229@kroah.com>

[-- Attachment #1: 0004-ext4-fix-lock-order-problem-in-ext4_move_extents.patch --]
[-- Type: text/plain, Size: 10473 bytes --]

2.6.32-stable review patch.  If anyone has any objections, please let us know.

------------------

(cherry picked from commit fc04cb49a898c372a22b21fffc47f299d8710801)

ext4_move_extents() checks the logical block contiguousness
of original file with ext4_find_extent() and mext_next_extent().
Therefore the extent which ext4_ext_path structure indicates
must not be changed between above functions.

But in current implementation, there is no i_data_sem protection
between ext4_ext_find_extent() and mext_next_extent().  So the extent
which ext4_ext_path structure indicates may be overwritten by
delalloc.  As a result, ext4_move_extents() will exchange wrong blocks
between original and donor files.  I change the place where
acquire/release i_data_sem to solve this problem.

Moreover, I changed move_extent_per_page() to start transaction first,
and then acquire i_data_sem.  Without this change, there is a
possibility of the deadlock between mmap() and ext4_move_extents():

* NOTE: "A", "B" and "C" mean different processes

A-1: ext4_ext_move_extents() acquires i_data_sem of two inodes.

B:   do_page_fault() starts the transaction (T),
     and then tries to acquire i_data_sem.
     But process "A" is already holding it, so it is kept waiting.

C:   While "A" and "B" running, kjournald2 tries to commit transaction (T)
     but it is under updating, so kjournald2 waits for it.

A-2: Call ext4_journal_start with holding i_data_sem,
     but transaction (T) is locked.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/ext4/move_extent.c |  117 ++++++++++++++++++++++----------------------------
 1 file changed, 53 insertions(+), 64 deletions(-)

--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -77,12 +77,14 @@ static int
 mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 		      struct ext4_extent **extent)
 {
+	struct ext4_extent_header *eh;
 	int ppos, leaf_ppos = path->p_depth;
 
 	ppos = leaf_ppos;
 	if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
 		/* leaf block */
 		*extent = ++path[ppos].p_ext;
+		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
 		return 0;
 	}
 
@@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, st
 					ext_block_hdr(path[cur_ppos+1].p_bh);
 			}
 
+			path[leaf_ppos].p_ext = *extent = NULL;
+
+			eh = path[leaf_ppos].p_hdr;
+			if (le16_to_cpu(eh->eh_entries) == 0)
+				/* empty leaf is found */
+				return -ENODATA;
+
 			/* leaf block */
 			path[leaf_ppos].p_ext = *extent =
 				EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+			path[leaf_ppos].p_block =
+					ext_pblock(path[leaf_ppos].p_ext);
 			return 0;
 		}
 	}
@@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inod
 }
 
 /**
- * mext_double_down_read - Acquire two inodes' read semaphore
- *
- * @orig_inode:		original inode structure
- * @donor_inode:	donor inode structure
- * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
- */
-static void
-mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
-{
-	struct inode *first = orig_inode, *second = donor_inode;
-
-	/*
-	 * Use the inode number to provide the stable locking order instead
-	 * of its address, because the C language doesn't guarantee you can
-	 * compare pointers that don't come from the same array.
-	 */
-	if (donor_inode->i_ino < orig_inode->i_ino) {
-		first = donor_inode;
-		second = orig_inode;
-	}
-
-	down_read(&EXT4_I(first)->i_data_sem);
-	down_read(&EXT4_I(second)->i_data_sem);
-}
-
-/**
- * mext_double_down_write - Acquire two inodes' write semaphore
+ * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
  *
  * @orig_inode:		original inode structure
  * @donor_inode:	donor inode structure
- * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
+ * i_ino order.
  */
 static void
-mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
 {
 	struct inode *first = orig_inode, *second = donor_inode;
 
@@ -207,28 +193,14 @@ mext_double_down_write(struct inode *ori
 }
 
 /**
- * mext_double_up_read - Release two inodes' read semaphore
+ * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
  *
  * @orig_inode:		original inode structure to be released its lock first
  * @donor_inode:	donor inode structure to be released its lock second
- * Release read semaphore of two inodes (orig and donor).
+ * Release write lock of i_data_sem of two inodes (orig and donor).
  */
 static void
-mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
-{
-	up_read(&EXT4_I(orig_inode)->i_data_sem);
-	up_read(&EXT4_I(donor_inode)->i_data_sem);
-}
-
-/**
- * mext_double_up_write - Release two inodes' write semaphore
- *
- * @orig_inode:		original inode structure to be released its lock first
- * @donor_inode:	donor inode structure to be released its lock second
- * Release write semaphore of two inodes (orig and donor).
- */
-static void
-mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
 {
 	up_write(&EXT4_I(orig_inode)->i_data_sem);
 	up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -688,8 +660,6 @@ mext_replace_branches(handle_t *handle,
 	int replaced_count = 0;
 	int dext_alen;
 
-	mext_double_down_write(orig_inode, donor_inode);
-
 	/* Get the original extent for the block "orig_off" */
 	*err = get_ext_path(orig_inode, orig_off, &orig_path);
 	if (*err)
@@ -785,7 +755,6 @@ out:
 		kfree(donor_path);
 	}
 
-	mext_double_up_write(orig_inode, donor_inode);
 	return replaced_count;
 }
 
@@ -851,6 +820,11 @@ move_extent_per_page(struct file *o_filp
 	 * Just swap data blocks between orig and donor.
 	 */
 	if (uninit) {
+		/*
+		 * Protect extent trees against block allocations
+		 * via delalloc
+		 */
+		double_down_write_data_sem(orig_inode, donor_inode);
 		replaced_count = mext_replace_branches(handle, orig_inode,
 						donor_inode, orig_blk_offset,
 						block_len_in_page, err);
@@ -858,6 +832,7 @@ move_extent_per_page(struct file *o_filp
 		/* Clear the inode cache not to refer to the old data */
 		ext4_ext_invalidate_cache(orig_inode);
 		ext4_ext_invalidate_cache(donor_inode);
+		double_up_write_data_sem(orig_inode, donor_inode);
 		goto out2;
 	}
 
@@ -905,6 +880,8 @@ move_extent_per_page(struct file *o_filp
 	/* Release old bh and drop refs */
 	try_to_release_page(page, 0);
 
+	/* Protect extent trees against block allocations via delalloc */
+	double_down_write_data_sem(orig_inode, donor_inode);
 	replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
 					orig_blk_offset, block_len_in_page,
 					&err2);
@@ -913,14 +890,18 @@ move_extent_per_page(struct file *o_filp
 			block_len_in_page = replaced_count;
 			replaced_size =
 				block_len_in_page << orig_inode->i_blkbits;
-		} else
+		} else {
+			double_up_write_data_sem(orig_inode, donor_inode);
 			goto out;
+		}
 	}
 
 	/* Clear the inode cache not to refer to the old data */
 	ext4_ext_invalidate_cache(orig_inode);
 	ext4_ext_invalidate_cache(donor_inode);
 
+	double_up_write_data_sem(orig_inode, donor_inode);
+
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
 
@@ -1236,16 +1217,16 @@ ext4_move_extents(struct file *o_filp, s
 		return -EINVAL;
 	}
 
-	/* protect orig and donor against a truncate */
+	/* Protect orig and donor inodes against a truncate */
 	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
 	if (ret1 < 0)
 		return ret1;
 
-	mext_double_down_read(orig_inode, donor_inode);
+	/* Protect extent tree against block allocations via delalloc */
+	double_down_write_data_sem(orig_inode, donor_inode);
 	/* Check the filesystem environment whether move_extent can be done */
 	ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
 					donor_start, &len, *moved_len);
-	mext_double_up_read(orig_inode, donor_inode);
 	if (ret1)
 		goto out;
 
@@ -1308,6 +1289,10 @@ ext4_move_extents(struct file *o_filp, s
 			 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
 		     max(le32_to_cpu(ext_cur->ee_block), block_start);
 
+	/* Discard preallocations of two inodes */
+	ext4_discard_preallocations(orig_inode);
+	ext4_discard_preallocations(donor_inode);
+
 	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
 		seq_blocks += add_blocks;
 
@@ -1359,14 +1344,14 @@ ext4_move_extents(struct file *o_filp, s
 		seq_start = le32_to_cpu(ext_cur->ee_block);
 		rest_blocks = seq_blocks;
 
-		/* Discard preallocations of two inodes */
-		down_write(&EXT4_I(orig_inode)->i_data_sem);
-		ext4_discard_preallocations(orig_inode);
-		up_write(&EXT4_I(orig_inode)->i_data_sem);
-
-		down_write(&EXT4_I(donor_inode)->i_data_sem);
-		ext4_discard_preallocations(donor_inode);
-		up_write(&EXT4_I(donor_inode)->i_data_sem);
+		/*
+		 * Up semaphore to avoid following problems:
+		 * a. transaction deadlock among ext4_journal_start,
+		 *    ->write_begin via pagefault, and jbd2_journal_commit
+		 * b. racing with ->readpage, ->write_begin, and ext4_get_block
+		 *    in move_extent_per_page
+		 */
+		double_up_write_data_sem(orig_inode, donor_inode);
 
 		while (orig_page_offset <= seq_end_page) {
 
@@ -1381,14 +1366,14 @@ ext4_move_extents(struct file *o_filp, s
 			/* Count how many blocks we have exchanged */
 			*moved_len += block_len_in_page;
 			if (ret1 < 0)
-				goto out;
+				break;
 			if (*moved_len > len) {
 				ext4_error(orig_inode->i_sb, __func__,
 					"We replaced blocks too much! "
 					"sum of replaced: %llu requested: %llu",
 					*moved_len, len);
 				ret1 = -EIO;
-				goto out;
+				break;
 			}
 
 			orig_page_offset++;
@@ -1400,6 +1385,10 @@ ext4_move_extents(struct file *o_filp, s
 				block_len_in_page = rest_blocks;
 		}
 
+		double_down_write_data_sem(orig_inode, donor_inode);
+		if (ret1 < 0)
+			break;
+
 		/* Decrease buffer counter */
 		if (holecheck_path)
 			ext4_ext_drop_refs(holecheck_path);
@@ -1429,7 +1418,7 @@ out:
 		ext4_ext_drop_refs(holecheck_path);
 		kfree(holecheck_path);
 	}
-
+	double_up_write_data_sem(orig_inode, donor_inode);
 	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
 
 	if (ret1)



  parent reply	other threads:[~2009-12-11  5:29 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20091211052312.805428372@linux.site>
2009-12-11  5:28 ` [00/34] 2.6.32.1-stable review Greg KH
2009-12-11  5:23   ` [01/34] signal: Fix alternate signal stack check Greg KH
2009-12-11  5:23   ` [02/34] SCSI: scsi_lib_dma: fix bug with dma maps on nested scsi objects Greg KH
2009-12-11  5:23   ` [03/34] SCSI: osd_protocol.h: Add missing #include Greg KH
2009-12-11  5:23   ` [04/34] SCSI: megaraid_sas: fix 64 bit sense pointer truncation Greg KH
2009-12-11  5:23   ` [05/34] ext4: fix potential buffer head leak when add_dirent_to_buf() returns ENOSPC Greg KH
2009-12-11  5:23   ` [06/34] ext4: avoid divide by zero when trying to mount a corrupted file system Greg KH
2009-12-11  5:23   ` [07/34] ext4: fix the returned block count if EXT4_IOC_MOVE_EXT fails Greg KH
2009-12-11  5:23   ` Greg KH [this message]
2009-12-11  5:23   ` [09/34] ext4: fix possible recursive locking warning in EXT4_IOC_MOVE_EXT Greg KH
2009-12-11  5:23   ` [10/34] ext4: plug a buffer_head leak in an error path of ext4_iget() Greg KH
2009-12-11  5:23   ` [11/34] ext4: make sure directory and symlink blocks are revoked Greg KH
2009-12-11  5:23   ` [12/34] ext4: fix i_flags access in ext4_da_writepages_trans_blocks() Greg KH
2009-12-11  5:23   ` [13/34] ext4: journal all modifications in ext4_xattr_set_handle Greg KH
2009-12-11  5:23   ` [14/34] ext4: dont update the superblock in ext4_statfs() Greg KH
2009-12-11  5:23   ` [15/34] ext4: fix uninit block bitmap initialization when s_meta_first_bg is non-zero Greg KH
2009-12-11  5:23   ` [16/34] ext4: fix block validity checks so they work correctly with meta_bg Greg KH
2009-12-11  5:23   ` [17/34] ext4: avoid issuing unnecessary barriers Greg KH
2009-12-11  5:23   ` [18/34] ext4: fix error handling in ext4_ind_get_blocks() Greg KH
2009-12-11  5:23   ` [19/34] ext4: make trim/discard optional (and off by default) Greg KH
2009-12-11  5:23   ` [20/34] ext4: make "norecovery" an alias for "noload" Greg KH
2009-12-11  5:23   ` [21/34] ext4: Fix double-free of blocks with EXT4_IOC_MOVE_EXT Greg KH
2009-12-11  5:23   ` [22/34] ext4: initialize moved_len before calling ext4_move_extents() Greg KH
2009-12-11  5:23   ` [23/34] ext4: move_extent_per_page() cleanup Greg KH
2009-12-11  5:23   ` [24/34] jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer() Greg KH
2009-12-11  5:23   ` [25/34] ext4: Return the PTR_ERR of the correct pointer in setup_new_group_blocks() Greg KH
2009-12-11  5:23   ` [26/34] ext4: Avoid data / filesystem corruption when write fails to copy data Greg KH
2009-12-11  5:23   ` [27/34] ext4: wait for log to commit when umounting Greg KH
2009-12-11  5:23   ` [28/34] ext4: remove blocks from inode prealloc list on failure Greg KH
2009-12-11  5:23   ` [29/34] ext4: ext4_get_reserved_space() must return bytes instead of blocks Greg KH
2009-12-11  5:23   ` [30/34] ext4: quota macros cleanup Greg KH
2009-12-11  5:23   ` [31/34] ext4: fix incorrect block reservation on quota transfer Greg KH
2009-12-11  5:23   ` [32/34] ext4: Wait for proper transaction commit on fsync Greg KH
2009-12-11  5:23   ` [33/34] ext4: Fix insufficient checks in EXT4_IOC_MOVE_EXT Greg KH
2009-12-11  5:23   ` [34/34] ext4: Fix potential fiemap deadlock (mmap_sem vs. i_data_sem) Greg KH

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20091211052544.287395070@linux.site \
    --to=gregkh@suse.de \
    --cc=a-fujita@rs.jp.nec.com \
    --cc=akpm@linux-foundation.org \
    --cc=alan@lxorguk.ukuu.org.uk \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable-review@kernel.org \
    --cc=stable@kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.