linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* delalloc and journal locking order inversion fixes.
@ 2008-05-21 17:44 Aneesh Kumar K.V
  2008-05-21 17:44 ` [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage Aneesh Kumar K.V
  0 siblings, 1 reply; 17+ messages in thread
From: Aneesh Kumar K.V @ 2008-05-21 17:44 UTC (permalink / raw)
  To: cmm, tytso, sandeen; +Cc: linux-ext4

The below set of patches gets the delalloc work with journal locking
order inversion patches.

The series file look like

+ ext4-new-defm-options
+ ext4-call-blkdev_issue_flush-on-fsync.patch
+ ext4-page-mkwrite.patch
+ ext4_ialloc-flexbg.patch
+ ext4-inverse-pagelock-vs-transaction.patch
+ 005-lock-inversion.patch
+ delalloc-vfs.patch
+ ext4-fix-fs-corruption-with-delalloc.patch
+ delalloc-ext4.patch
+ delalloc-ext4-release-page-when-write_begin-failed.patch
+ delalloc-ext4-preallocation-handling.patch
...
...
.....
+ delalloc-ext4-lock-reverse.patch
> delalloc-fix.patch

Most of these patche will have to folded into other patches before
we push them to the patch queue.

Patch and their subject line matching:

ext4-page-mkwrite.patch: (repost with changes)
[PATCH] ext4: Use page_mkwrite vma_operations to get mmap write notification

ext4-inverse-pagelock-vs-transaction.patch:(repost due to moving patch up in the stack)
[PATCH] ext4: Inverse locking order of page_lock and transaction start.

005-lock-inversion.patch: (New changes needs review )
[PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage
  and page_mkwrite calls.

delalloc-ext4-lock-reverse.patch:(repost due to changes, VFS change dropped)
[PATCH] ext4:  inverse locking ordering of page_lock and transaction start in delalloc

delalloc-fix.patch:(New changes to fix the hang)
[PATCH] ext4: Fix delalloc sync hang with journal lock inversion

NOTE: The patches are only for review and not for patchqueue.

-aneesh


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage
  2008-05-21 17:44 delalloc and journal locking order inversion fixes Aneesh Kumar K.V
@ 2008-05-21 17:44 ` Aneesh Kumar K.V
  2008-05-21 17:44   ` [PATCH] ext4: inverse locking ordering of page_lock and transaction start in delalloc Aneesh Kumar K.V
  0 siblings, 1 reply; 17+ messages in thread
From: Aneesh Kumar K.V @ 2008-05-21 17:44 UTC (permalink / raw)
  To: cmm, tytso, sandeen; +Cc: linux-ext4, Aneesh Kumar K.V

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/inode.c |  180 ++++++++++++++++++++++++++++++++++++++++++++++--------
 1 files changed, 153 insertions(+), 27 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d099f55..26a7df3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1479,6 +1479,11 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+	return !buffer_mapped(bh);
+}
+
 /*
  * Note that we don't need to start a transaction unless we're journaling
  * data because we should have holes filled from ext4_page_mkwrite(). If
@@ -1531,18 +1536,26 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
 static int __ext4_ordered_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
-	struct inode *inode = page->mapping->host;
-	struct buffer_head *page_bufs;
+	int ret = 0, err;
+	unsigned long len;
 	handle_t *handle = NULL;
-	int ret = 0;
-	int err;
+	struct buffer_head *page_bufs;
+	struct inode *inode = page->mapping->host;
+	loff_t size = i_size_read(inode);
 
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, inode->i_sb->s_blocksize,
-				(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
 	page_bufs = page_buffers(page);
-	walk_page_buffers(handle, page_bufs, 0,
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	if (walk_page_buffers(NULL, page_bufs, 0,
+					len, NULL, ext4_bh_unmapped)) {
+		printk(KERN_CRIT "%s called with unmapped buffer\n",
+								__func__);
+		BUG();
+	}
+	walk_page_buffers(NULL, page_bufs, 0,
 			PAGE_CACHE_SIZE, NULL, bget_one);
 
 	ret = block_write_full_page(page, ext4_get_block, wbc);
@@ -1574,8 +1587,8 @@ static int __ext4_ordered_writepage(struct page *page,
 			ret = err;
 	}
 out_put:
-	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
-			  bput_one);
+	walk_page_buffers(handle, page_bufs, 0,
+				PAGE_CACHE_SIZE, NULL, bput_one);
 	return ret;
 }
 
@@ -1583,7 +1596,7 @@ static int ext4_ordered_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
 	J_ASSERT(PageLocked(page));
-
+	BUG_ON(!page_has_buffers(page));
 	/*
 	 * We give up here if we're reentered, because it might be for a
 	 * different filesystem.
@@ -1599,18 +1612,34 @@ static int ext4_ordered_writepage(struct page *page,
 static int __ext4_writeback_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
+	unsigned long len;
+	struct buffer_head *page_bufs;
 	struct inode *inode = page->mapping->host;
+	loff_t size = i_size_read(inode);
 
+	page_bufs = page_buffers(page);
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	if (walk_page_buffers(NULL, page_bufs, 0,
+					len, NULL, ext4_bh_unmapped)) {
+		printk(KERN_CRIT "%s called with unmapped buffer\n",
+								__func__);
+		BUG();
+	}
 	if (test_opt(inode->i_sb, NOBH))
 		return nobh_writepage(page, ext4_get_block, wbc);
 	else
 		return block_write_full_page(page, ext4_get_block, wbc);
 }
 
-
 static int ext4_writeback_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
+	BUG_ON(!page_has_buffers(page));
+
 	if (!ext4_journal_current_handle())
 		return __ext4_writeback_writepage(page, wbc);
 
@@ -1622,18 +1651,31 @@ static int ext4_writeback_writepage(struct page *page,
 static int __ext4_journalled_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
+	int ret = 0, err;
+	unsigned long len;
+	handle_t *handle = NULL;
 	struct address_space *mapping = page->mapping;
 	struct inode *inode = mapping->host;
 	struct buffer_head *page_bufs;
-	handle_t *handle = NULL;
-	int ret = 0;
-	int err;
+	loff_t size = i_size_read(inode);
+
+	page_bufs = page_buffers(page);
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
 
+	if (walk_page_buffers(NULL, page_bufs, 0,
+					len, NULL, ext4_bh_unmapped)) {
+		printk(KERN_CRIT "%s called with unmapped buffer\n",
+								__func__);
+		BUG();
+	}
+	/* FIXME!! do we need to call prepare_write for a mapped buffer */
 	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
 	if (ret != 0)
 		goto out_unlock;
 
-	page_bufs = page_buffers(page);
 	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
 								bget_one);
 	/* As soon as we unlock the page, it can go away, but we have
@@ -1671,14 +1713,13 @@ static int __ext4_journalled_writepage(struct page *page,
 static int ext4_journalled_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
+	BUG_ON(!page_has_buffers(page));
+
 	if (ext4_journal_current_handle())
 		goto no_write;
 
-	if (!page_has_buffers(page) || PageChecked(page)) {
-		/*
-		 * It's mmapped pagecache.  Add buffers and journal it.  There
-		 * doesn't seem much point in redirtying the page here.
-		 */
+	if (PageChecked(page)) {
+		/* dirty pages in data=journal mode */
 		ClearPageChecked(page);
 		return __ext4_journalled_writepage(page, wbc);
 	} else {
@@ -3520,9 +3561,94 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	return err;
 }
 
-static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+static int __ext4_journalled_allocpage(struct page *page,
+				struct writeback_control *wbc)
 {
-	return !buffer_mapped(bh);
+	int ret = 0, err;
+	handle_t *handle = NULL;
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct buffer_head *page_bufs;
+
+	/* if alloc we are called after statring a journal */
+	handle = ext4_journal_current_handle();
+	BUG_ON(!handle);
+
+	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
+	if (ret != 0)
+		goto out_unlock;
+
+	/* FIXME!! should we do a bget_one */
+	page_bufs = page_buffers(page);
+	ret = walk_page_buffers(handle, page_bufs, 0,
+			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+
+	err = walk_page_buffers(handle, page_bufs, 0,
+				PAGE_CACHE_SIZE, NULL, write_end_fn);
+	if (ret == 0)
+		ret = err;
+	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+
+out_unlock:
+	unlock_page(page);
+	return ret;
+}
+
+static int __ext4_ordered_allocpage(struct page *page,
+				struct writeback_control *wbc)
+{
+	int ret = 0;
+	handle_t *handle = NULL;
+	struct buffer_head *page_bufs;
+	struct inode *inode = page->mapping->host;
+
+	/* if alloc we are called after statring a journal */
+	handle = ext4_journal_current_handle();
+	BUG_ON(!handle);
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, inode->i_sb->s_blocksize,
+				(1 << BH_Dirty)|(1 << BH_Uptodate));
+	}
+	page_bufs = page_buffers(page);
+	walk_page_buffers(handle, page_bufs, 0,
+			PAGE_CACHE_SIZE, NULL, bget_one);
+
+	ret = block_write_full_page(page, ext4_get_block, wbc);
+
+	/*
+	 * The page can become unlocked at any point now, and
+	 * truncate can then come in and change things.  So we
+	 * can't touch *page from now on.  But *page_bufs is
+	 * safe due to elevated refcount.
+	 */
+
+	/*
+	 * And attach them to the current transaction.  But only if
+	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
+	 * and generally junk.
+	 */
+	if (ret == 0) {
+		ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+					NULL, jbd2_journal_dirty_data_fn);
+	}
+	walk_page_buffers(handle, page_bufs, 0,
+				PAGE_CACHE_SIZE, NULL, bput_one);
+	return ret;
+}
+
+static int __ext4_writeback_allocpage(struct page *page,
+				struct writeback_control *wbc)
+{
+	handle_t *handle = NULL;
+	struct inode *inode = page->mapping->host;
+	/* if alloc we are called after statring a journal */
+	handle = ext4_journal_current_handle();
+	BUG_ON(!handle);
+
+	if (test_opt(inode->i_sb, NOBH))
+		return nobh_writepage(page, ext4_get_block, wbc);
+	else
+		return block_write_full_page(page, ext4_get_block, wbc);
 }
 
 int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
@@ -3578,11 +3704,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	wbc.range_start = page_offset(page);
 	wbc.range_end = page_offset(page) + PAGE_CACHE_SIZE;
 	if (ext4_should_writeback_data(inode))
-		ret = __ext4_writeback_writepage(page, &wbc);
+		ret = __ext4_writeback_allocpage(page, &wbc);
 	else if (ext4_should_order_data(inode))
-		ret = __ext4_ordered_writepage(page, &wbc);
+		ret = __ext4_ordered_allocpage(page, &wbc);
 	else
-		ret = __ext4_journalled_writepage(page, &wbc);
+		ret = __ext4_journalled_allocpage(page, &wbc);
 	/* Page got unlocked in writepage */
 	err = ext4_journal_stop(handle);
 	if (!ret)
-- 
1.5.5.1.211.g65ea3.dirty


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH] ext4:  inverse locking ordering of page_lock and transaction start in delalloc
  2008-05-21 17:44 ` [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage Aneesh Kumar K.V
@ 2008-05-21 17:44   ` Aneesh Kumar K.V
  2008-05-21 17:44     ` [PATCH] ext4: Fix delalloc sync hang with journal lock inversion Aneesh Kumar K.V
  0 siblings, 1 reply; 17+ messages in thread
From: Aneesh Kumar K.V @ 2008-05-21 17:44 UTC (permalink / raw)
  To: cmm, tytso, sandeen; +Cc: linux-ext4, Aneesh Kumar K.V

From: Mingming Cao <cmm@us.ibm.com>

Inverse locking ordering of page_lock and transaction start in delalloc

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/inode.c |   95 ++++++++++++++++++++++++++++++++++++++----------------
 1 files changed, 67 insertions(+), 28 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5e893a5..46cc610 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1448,18 +1448,14 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
-	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+	int ret;
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 	loff_t disksize = EXT4_I(inode)->i_disksize;
 	handle_t *handle = NULL;
 
-	if (create) {
-		handle = ext4_journal_start(inode, needed_blocks);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			goto out;
-		}
-	}
+	handle = ext4_journal_current_handle();
+	BUG_ON(handle == 0);
+	BUG_ON(create == 0);
 
 	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
 				   bh_result, create, 0);
@@ -1494,29 +1490,17 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 		ret = 0;
 	}
 
-out:
-	if (handle && !IS_ERR(handle))
-		ext4_journal_stop(handle);
-
 	return ret;
 }
 /* FIXME!! only support data=writeback mode */
-static int ext4_da_writepage(struct page *page,
+static int __ext4_da_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
 	handle_t *handle = NULL;
 	int ret = 0;
-	int err;
-
-	if (ext4_journal_current_handle())
-		goto out_fail;
 
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out_fail;
-	}
+	handle = ext4_journal_current_handle();
 
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
 		ret = nobh_writepage(page, ext4_get_block, wbc);
@@ -1528,21 +1512,76 @@ static int ext4_da_writepage(struct page *page,
 		ext4_mark_inode_dirty(handle, inode);
 	}
 
-	err = ext4_journal_stop(handle);
-	if (!ret)
-		ret = err;
 	return ret;
+}
+static int ext4_da_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	if (!ext4_journal_current_handle())
+		return __ext4_da_writepage(page, wbc);
 
-out_fail:
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
-	return ret;
+	return 0;
 }
 
+/*
+ * For now just follow the DIO way to estimate the max credits
+ * needed to write out EXT4_MAX_WRITEBACK_PAGES.
+ * todo: need to calculate the max credits need for
+ * extent based files, currently the DIO credits is based on
+ * indirect-blocks mapping way.
+ *
+ * Probably should have a generic way to calculate credits
+ * for DIO, writepages, and truncate
+ */
+#define EXT4_MAX_WRITEBACK_PAGES      DIO_MAX_BLOCKS
+#define EXT4_MAX_WRITEBACK_CREDITS    DIO_CREDITS
+
 static int ext4_da_writepages(struct address_space *mapping,
 				struct writeback_control *wbc)
 {
-	return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
+	struct inode *inode = mapping->host;
+	handle_t *handle = NULL;
+	int needed_blocks;
+	int ret = 0;
+	unsigned range_cyclic;
+	long to_write;
+
+	/*
+	 *  Estimate the worse case needed credits to write out
+	 * EXT4_MAX_BUF_BLOCKS pages
+	 */
+	needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+
+	to_write = wbc->nr_to_write;
+	range_cyclic = wbc->range_cyclic;
+	wbc->range_cyclic = 1;
+
+	while (!ret && to_write) {
+		/* start a new transaction*/
+		handle = ext4_journal_start(inode, needed_blocks);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out_writepages;
+		}
+		/*
+		 * set the max dirty pages could be write at a time
+		 * to fit into the reserved transaction credits
+		 */
+		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+		to_write -= wbc->nr_to_write;
+
+		ret = mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
+		ext4_journal_stop(handle);
+		to_write +=wbc->nr_to_write;
+	}
+
+out_writepages:
+	wbc->nr_to_write = to_write;
+	wbc->range_cyclic = range_cyclic;
+	return ret;
 }
 
 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
-- 
1.5.5.1.211.g65ea3.dirty


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH] ext4: Fix delalloc sync hang with journal lock inversion
  2008-05-21 17:44   ` [PATCH] ext4: inverse locking ordering of page_lock and transaction start in delalloc Aneesh Kumar K.V
@ 2008-05-21 17:44     ` Aneesh Kumar K.V
  2008-05-21 17:44       ` [PATCH] ext4: Inverse locking order of page_lock and transaction start Aneesh Kumar K.V
                         ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Aneesh Kumar K.V @ 2008-05-21 17:44 UTC (permalink / raw)
  To: cmm, tytso, sandeen; +Cc: linux-ext4, Aneesh Kumar K.V

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/inode.c |   10 +++++++---
 1 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 46cc610..076d00f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1571,13 +1571,17 @@ static int ext4_da_writepages(struct address_space *mapping,
 		 */
 		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
 			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
-		to_write -= wbc->nr_to_write;
 
+		to_write -= wbc->nr_to_write;
 		ret = mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
 		ext4_journal_stop(handle);
-		to_write +=wbc->nr_to_write;
+		if (wbc->nr_to_write) {
+			/* We failed to write what we requested for */
+			to_write += wbc->nr_to_write;
+			break;
+		}
+		wbc->nr_to_write = to_write;
 	}

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH] ext4: Inverse locking order of page_lock and transaction start.
  2008-05-21 17:44     ` [PATCH] ext4: Fix delalloc sync hang with journal lock inversion Aneesh Kumar K.V
@ 2008-05-21 17:44       ` Aneesh Kumar K.V
  2008-05-21 17:44         ` [PATCH] ext4: Use page_mkwrite vma_operations to get mmap write notification Aneesh Kumar K.V
  2008-05-22 10:25       ` [PATCH] ext4: Fix delalloc sync hang with journal lock inversion Aneesh Kumar K.V
  2008-05-22 18:10       ` Mingming
  2 siblings, 1 reply; 17+ messages in thread
From: Aneesh Kumar K.V @ 2008-05-21 17:44 UTC (permalink / raw)
  To: cmm, tytso, sandeen; +Cc: linux-ext4, Jan Kara, Aneesh Kumar K.V

From: Jan Kara <jack@ghost.suse.cz>

Signed-off-by: Jan Kara <jack@ghost.suse.cz>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/ext4.h    |    4 +-
 fs/ext4/extents.c |   15 +--
 fs/ext4/inode.c   |  274 ++++++++++++++++++++++++-----------------------------
 3 files changed, 132 insertions(+), 161 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 66cdd5c..e7da2cb 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1063,7 +1063,7 @@ extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 
@@ -1222,7 +1222,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			ext4_lblk_t iblock,
 			unsigned long max_blocks, struct buffer_head *bh_result,
 			int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c58ebd8..440b094 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2744,7 +2744,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	return err ? err : allocated;
 }
 
-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(struct inode * inode)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct super_block *sb = inode->i_sb;
@@ -2757,18 +2757,11 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
 	 */
 	err = ext4_writepage_trans_blocks(inode) + 3;
 	handle = ext4_journal_start(inode, err);
-	if (IS_ERR(handle)) {
-		if (page) {
-			clear_highpage(page);
-			flush_dcache_page(page);
-			unlock_page(page);
-			page_cache_release(page);
-		}
+	if (IS_ERR(handle))
 		return;
-	}
 
-	if (page)
-		ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+	if (inode->i_size & (sb->s_blocksize - 1))
+		ext4_block_truncate_page(handle, mapping, inode->i_size);
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_ext_invalidate_cache(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d361693..d099f55 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1239,19 +1239,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
  	to = from + len;
 
 retry:
- 	page = __grab_cache_page(mapping, index);
- 	if (!page)
- 		return -ENOMEM;
- 	*pagep = page;
-
   	handle = ext4_journal_start(inode, needed_blocks);
   	if (IS_ERR(handle)) {
- 		unlock_page(page);
- 		page_cache_release(page);
   		ret = PTR_ERR(handle);
   		goto out;
 	}
 
+	page = __grab_cache_page(mapping, index);
+	if (!page) {
+		ext4_journal_stop(handle);
+		ret = -ENOMEM;
+		goto out;
+	}
+	*pagep = page;
+
 	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 							ext4_get_block);
 
@@ -1261,8 +1262,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 	}
 
 	if (ret) {
-		ext4_journal_stop(handle);
  		unlock_page(page);
+		ext4_journal_stop(handle);
  		page_cache_release(page);
 	}
 
@@ -1291,29 +1292,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
-{
-	struct inode *inode = file->f_mapping->host;
-
-	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-	if (pos+copied > inode->i_size) {
-		i_size_write(inode, pos+copied);
-		mark_inode_dirty(inode);
-	}
-
-	return copied;
-}
-
-/*
  * We need to pick up the new inode size which generic_commit_write gave us
  * `file' can be NULL - eg, when called from page_symlink().
  *
@@ -1326,7 +1304,7 @@ static int ext4_ordered_write_end(struct file *file,
 				struct page *page, void *fsdata)
 {
 	handle_t *handle = ext4_journal_current_handle();
-	struct inode *inode = file->f_mapping->host;
+	struct inode *inode = mapping->host;
 	unsigned from, to;
 	int ret = 0, ret2;
 
@@ -1347,7 +1325,7 @@ static int ext4_ordered_write_end(struct file *file,
 		new_i_size = pos + copied;
 		if (new_i_size > EXT4_I(inode)->i_disksize)
 			EXT4_I(inode)->i_disksize = new_i_size;
-		ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+		ret2 = generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 		copied = ret2;
 		if (ret2 < 0)
@@ -1356,8 +1334,6 @@ static int ext4_ordered_write_end(struct file *file,
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-	unlock_page(page);
-	page_cache_release(page);
 
 	return ret ? ret : copied;
 }
@@ -1368,7 +1344,7 @@ static int ext4_writeback_write_end(struct file *file,
 				struct page *page, void *fsdata)
 {
 	handle_t *handle = ext4_journal_current_handle();
-	struct inode *inode = file->f_mapping->host;
+	struct inode *inode = mapping->host;
 	int ret = 0, ret2;
 	loff_t new_i_size;
 
@@ -1376,7 +1352,7 @@ static int ext4_writeback_write_end(struct file *file,
 	if (new_i_size > EXT4_I(inode)->i_disksize)
 		EXT4_I(inode)->i_disksize = new_i_size;
 
-	ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+	ret2 = generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 	copied = ret2;
 	if (ret2 < 0)
@@ -1385,8 +1361,6 @@ static int ext4_writeback_write_end(struct file *file,
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-	unlock_page(page);
-	page_cache_release(page);
 
 	return ret ? ret : copied;
 }
@@ -1425,10 +1399,10 @@ static int ext4_journalled_write_end(struct file *file,
 			ret = ret2;
 	}
 
+	unlock_page(page);
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-	unlock_page(page);
 	page_cache_release(page);
 
 	return ret ? ret : copied;
@@ -1506,11 +1480,10 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
- * Note that we always start a transaction even if we're not journalling
- * data.  This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext4_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
- * refers to old data.
+ * Note that we don't need to start a transaction unless we're journaling
+ * data because we should have holes filled from ext4_page_mkwrite(). If
+ * we are journaling data, we cannot start transaction directly because
+ * transaction start ranks above page lock so we have to do some magic...
  *
  * In all journalling modes block_write_full_page() will start the I/O.
  *
@@ -1554,10 +1527,8 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
  * disastrous.  Any write() or metadata operation will sync the fs for
  * us.
  *
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
  */
-static int ext4_ordered_writepage(struct page *page,
+static int __ext4_ordered_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
@@ -1566,22 +1537,6 @@ static int ext4_ordered_writepage(struct page *page,
 	int ret = 0;
 	int err;
 
-	J_ASSERT(PageLocked(page));
-
-	/*
-	 * We give up here if we're reentered, because it might be for a
-	 * different filesystem.
-	 */
-	if (ext4_journal_current_handle())
-		goto out_fail;
-
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
-
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out_fail;
-	}
-
 	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, inode->i_sb->s_blocksize,
 				(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1605,114 +1560,139 @@ static int ext4_ordered_writepage(struct page *page,
 	 * and generally junk.
 	 */
 	if (ret == 0) {
-		err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+		handle = ext4_journal_start(inode,
+					ext4_writepage_trans_blocks(inode));
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out_put;
+		}
+
+		ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
 					NULL, jbd2_journal_dirty_data_fn);
+		err = ext4_journal_stop(handle);
 		if (!ret)
 			ret = err;
 	}
-	walk_page_buffers(handle, page_bufs, 0,
-			PAGE_CACHE_SIZE, NULL, bput_one);
-	err = ext4_journal_stop(handle);
-	if (!ret)
-		ret = err;
+out_put:
+	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+			  bput_one);
 	return ret;
+}
+
+static int ext4_ordered_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	J_ASSERT(PageLocked(page));
+
+	/*
+	 * We give up here if we're reentered, because it might be for a
+	 * different filesystem.
+	 */
+	if (!ext4_journal_current_handle())
+		return __ext4_ordered_writepage(page, wbc);
 
-out_fail:
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
-	return ret;
+	return 0;
 }
 
-static int ext4_writeback_writepage(struct page *page,
+static int __ext4_writeback_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
+
+	if (test_opt(inode->i_sb, NOBH))
+		return nobh_writepage(page, ext4_get_block, wbc);
+	else
+		return block_write_full_page(page, ext4_get_block, wbc);
+}
+
+
+static int ext4_writeback_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	if (!ext4_journal_current_handle())
+		return __ext4_writeback_writepage(page, wbc);
+
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return 0;
+}
+
+static int __ext4_journalled_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct buffer_head *page_bufs;
 	handle_t *handle = NULL;
 	int ret = 0;
 	int err;
 
-	if (ext4_journal_current_handle())
-		goto out_fail;
+	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
+	if (ret != 0)
+		goto out_unlock;
+
+	page_bufs = page_buffers(page);
+	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+								bget_one);
+	/* As soon as we unlock the page, it can go away, but we have
+	 * references to buffers so we are safe */
+	unlock_page(page);
 
 	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
-		goto out_fail;
+		goto out;
 	}
 
-	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-		ret = nobh_writepage(page, ext4_get_block, wbc);
-	else
-		ret = block_write_full_page(page, ext4_get_block, wbc);
+	ret = walk_page_buffers(handle, page_bufs, 0,
+			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
 
+	err = walk_page_buffers(handle, page_bufs, 0,
+				PAGE_CACHE_SIZE, NULL, write_end_fn);
+	if (ret == 0)
+		ret = err;
 	err = ext4_journal_stop(handle);
 	if (!ret)
 		ret = err;
-	return ret;
 
-out_fail:
-	redirty_page_for_writepage(wbc, page);
+	walk_page_buffers(handle, page_bufs, 0,
+				PAGE_CACHE_SIZE, NULL, bput_one);
+	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+	goto out;
+
+out_unlock:
 	unlock_page(page);
+out:
 	return ret;
 }
 
 static int ext4_journalled_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
-	struct inode *inode = page->mapping->host;
-	handle_t *handle = NULL;
-	int ret = 0;
-	int err;
-
 	if (ext4_journal_current_handle())
 		goto no_write;
 
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto no_write;
-	}
-
 	if (!page_has_buffers(page) || PageChecked(page)) {
 		/*
 		 * It's mmapped pagecache.  Add buffers and journal it.  There
 		 * doesn't seem much point in redirtying the page here.
 		 */
 		ClearPageChecked(page);
-		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-					ext4_get_block);
-		if (ret != 0) {
-			ext4_journal_stop(handle);
-			goto out_unlock;
-		}
-		ret = walk_page_buffers(handle, page_buffers(page), 0,
-			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
-		err = walk_page_buffers(handle, page_buffers(page), 0,
-				PAGE_CACHE_SIZE, NULL, write_end_fn);
-		if (ret == 0)
-			ret = err;
-		EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
-		unlock_page(page);
+		return __ext4_journalled_writepage(page, wbc);
 	} else {
 		/*
 		 * It may be a page full of checkpoint-mode buffers.  We don't
 		 * really know unless we go poke around in the buffer_heads.
 		 * But block_write_full_page will do the right thing.
 		 */
-		ret = block_write_full_page(page, ext4_get_block, wbc);
+		return block_write_full_page(page, ext4_get_block, wbc);
 	}
-	err = ext4_journal_stop(handle);
-	if (!ret)
-		ret = err;
-out:
-	return ret;
-
 no_write:
 	redirty_page_for_writepage(wbc, page);
-out_unlock:
 	unlock_page(page);
-	goto out;
+	return 0;
 }
 
 static int ext4_readpage(struct file *file, struct page *page)
@@ -1922,7 +1902,7 @@ void ext4_set_aops(struct inode *inode)
  * This required during truncate. We need to physically zero the tail end
  * of that block so it doesn't yield old data if the file is later grown.
  */
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
+int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from)
 {
 	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1931,8 +1911,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
 	ext4_lblk_t iblock;
 	struct inode *inode = mapping->host;
 	struct buffer_head *bh;
+	struct page *page;
 	int err = 0;
 
+	page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+	if (!page)
+		return -EINVAL;
+
 	blocksize = inode->i_sb->s_blocksize;
 	length = blocksize - (offset & (blocksize - 1));
 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -2396,7 +2381,6 @@ void ext4_truncate(struct inode *inode)
 	int n;
 	ext4_lblk_t last_block;
 	unsigned blocksize = inode->i_sb->s_blocksize;
-	struct page *page;
 
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 	    S_ISLNK(inode->i_mode)))
@@ -2406,41 +2390,21 @@ void ext4_truncate(struct inode *inode)
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return;
 
-	/*
-	 * We have to lock the EOF page here, because lock_page() nests
-	 * outside jbd2_journal_start().
-	 */
-	if ((inode->i_size & (blocksize - 1)) == 0) {
-		/* Block boundary? Nothing to do */
-		page = NULL;
-	} else {
-		page = grab_cache_page(mapping,
-				inode->i_size >> PAGE_CACHE_SHIFT);
-		if (!page)
-			return;
-	}
-
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-		ext4_ext_truncate(inode, page);
+		ext4_ext_truncate(inode);
 		return;
 	}
 
 	handle = start_transaction(inode);
-	if (IS_ERR(handle)) {
-		if (page) {
-			clear_highpage(page);
-			flush_dcache_page(page);
-			unlock_page(page);
-			page_cache_release(page);
-		}
+	if (IS_ERR(handle))
 		return;		/* AKPM: return what? */
-	}
 
 	last_block = (inode->i_size + blocksize-1)
 					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 
-	if (page)
-		ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+	if (inode->i_size & (blocksize - 1))
+		if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+			goto out_stop;
 
 	n = ext4_block_to_path(inode, last_block, offsets, NULL);
 	if (n == 0)
@@ -3565,7 +3529,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
 	loff_t size;
 	unsigned long len;
-	int ret = -EINVAL;
+	int err, ret = -EINVAL;
+	handle_t *handle;
 	struct file *file = vma->vm_file;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
@@ -3604,11 +3569,24 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	 * have inode_mutex and that allow parallel write_begin, write_end call.
 	 * (lock_page prevent this from happening on the same page though)
 	 */
+	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_unlock;
+	}
 	lock_page(page);
 	wbc.range_start = page_offset(page);
 	wbc.range_end = page_offset(page) + PAGE_CACHE_SIZE;
-	ret = mapping->a_ops->writepage(page, &wbc);
-	/* writepage unlocks the page */
+	if (ext4_should_writeback_data(inode))
+		ret = __ext4_writeback_writepage(page, &wbc);
+	else if (ext4_should_order_data(inode))
+		ret = __ext4_ordered_writepage(page, &wbc);
+	else
+		ret = __ext4_journalled_writepage(page, &wbc);
+	/* Page got unlocked in writepage */
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
 out_unlock:
 	up_read(&inode->i_alloc_sem);
 	return ret;
-- 
1.5.5.1.211.g65ea3.dirty


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH] ext4: Use page_mkwrite vma_operations to get mmap write notification.
  2008-05-21 17:44       ` [PATCH] ext4: Inverse locking order of page_lock and transaction start Aneesh Kumar K.V
@ 2008-05-21 17:44         ` Aneesh Kumar K.V
  0 siblings, 0 replies; 17+ messages in thread
From: Aneesh Kumar K.V @ 2008-05-21 17:44 UTC (permalink / raw)
  To: cmm, tytso, sandeen; +Cc: linux-ext4, Aneesh Kumar K.V

We would like to get notified when we are doing a write on mmap section.
This is needed with respect to preallocated area. We split the preallocated
area into initialzed extent and uninitialzed extent in the call back. This
let us handle ENOSPC better. Otherwise we get ENOSPC in the writepage and
that would result in data loss. The changes are also needed to handle ENOSPC
when writing to an mmap section of files with holes.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  |    1 +
 fs/ext4/file.c  |   19 +++++++++++++++++-
 fs/ext4/inode.c |   58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6605076..77cbb28 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1053,6 +1053,7 @@ extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
 		struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6..b9510ba 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 	return ret;
 }
 
+static struct vm_operations_struct ext4_file_vm_ops = {
+	.fault		= filemap_fault,
+	.page_mkwrite   = ext4_page_mkwrite,
+};
+
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct address_space *mapping = file->f_mapping;
+
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+	file_accessed(file);
+	vma->vm_ops = &ext4_file_vm_ops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+	return 0;
+}
+
 const struct file_operations ext4_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -133,7 +150,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext4_compat_ioctl,
 #endif
-	.mmap		= generic_file_mmap,
+	.mmap		= ext4_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext4_release_file,
 	.fsync		= ext4_sync_file,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4a7ed29..d361693 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3555,3 +3555,61 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 
 	return err;
 }
+
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+	return !buffer_mapped(bh);
+}
+
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	loff_t size;
+	unsigned long len;
+	int ret = -EINVAL;
+	struct file *file = vma->vm_file;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE,
+					 .nr_to_write = 1 };
+
+	/*
+	 * Get i_alloc_sem to stop truncates messing with the inode. We cannot
+	 * get i_mutex because we are already holding mmap_sem.
+	 */
+	down_read(&inode->i_alloc_sem);
+	size = i_size_read(inode);
+	if (page->mapping != mapping || size <= page_offset(page)
+	    || !PageUptodate(page)) {
+		/* page got truncated from under us? */
+		goto out_unlock;
+	}
+	ret = 0;
+	if (PageMappedToDisk(page))
+		goto out_unlock;
+
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	if (page_has_buffers(page)) {
+		/* return if we have all the buffers mapped */
+		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+				       ext4_bh_unmapped))
+			goto out_unlock;
+	}
+	/*
+	 * OK, we need to fill the hole... Lock the page and do writepage.
+	 * We can't do write_begin and write_end here because we don't
+	 * have inode_mutex and that allow parallel write_begin, write_end call.
+	 * (lock_page prevent this from happening on the same page though)
+	 */
+	lock_page(page);
+	wbc.range_start = page_offset(page);
+	wbc.range_end = page_offset(page) + PAGE_CACHE_SIZE;
+	ret = mapping->a_ops->writepage(page, &wbc);
+	/* writepage unlocks the page */
+out_unlock:
+	up_read(&inode->i_alloc_sem);
+	return ret;
+}
-- 
1.5.5.1.211.g65ea3.dirty


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion
  2008-05-21 17:44     ` [PATCH] ext4: Fix delalloc sync hang with journal lock inversion Aneesh Kumar K.V
  2008-05-21 17:44       ` [PATCH] ext4: Inverse locking order of page_lock and transaction start Aneesh Kumar K.V
@ 2008-05-22 10:25       ` Aneesh Kumar K.V
  2008-05-22 17:58         ` Mingming
  2008-05-22 18:10       ` Mingming
  2 siblings, 1 reply; 17+ messages in thread
From: Aneesh Kumar K.V @ 2008-05-22 10:25 UTC (permalink / raw)
  To: cmm; +Cc: linux-ext4, tytso, sandeen

On Wed, May 21, 2008 at 11:14:17PM +0530, Aneesh Kumar K.V wrote:
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  fs/ext4/inode.c |   10 +++++++---
>  1 files changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 46cc610..076d00f 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1571,13 +1571,17 @@ static int ext4_da_writepages(struct address_space *mapping,
>  		 */
>  		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
>  			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
> -		to_write -= wbc->nr_to_write;
> 
> +		to_write -= wbc->nr_to_write;
>  		ret = mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
>  		ext4_journal_stop(handle);
> -		to_write +=wbc->nr_to_write;
> +		if (wbc->nr_to_write) {
> +			/* We failed to write what we requested for */
> +			to_write += wbc->nr_to_write;
> +			break;
> +		}
> +		wbc->nr_to_write = to_write;
>  	}
> -
>  out_writepages:
>  	wbc->nr_to_write = to_write;
>  	wbc->range_cyclic = range_cyclic;

We need related fix for ext4_da_writepage. We need to allocate blocks in
ext4_da_writepage and we are called with page_lock. The handle
will be NULL in the below case and that would result in
ext4_get_block starting a new transaction when allocating blocks.


static int __ext4_da_writepage(struct page *page,
				struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	handle_t *handle = NULL;
	int ret = 0;

	handle = ext4_journal_current_handle();

	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
		ret = nobh_writepage(page, ext4_get_block, wbc);
	else
		ret = block_write_full_page(page, ext4_get_block, wbc);

	if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
		EXT4_I(inode)->i_disksize = inode->i_size;
		ext4_mark_inode_dirty(handle, inode);
	}

	return ret;
}

-aneesh

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion
  2008-05-22 10:25       ` [PATCH] ext4: Fix delalloc sync hang with journal lock inversion Aneesh Kumar K.V
@ 2008-05-22 17:58         ` Mingming
  2008-05-22 18:23           ` Aneesh Kumar K.V
  0 siblings, 1 reply; 17+ messages in thread
From: Mingming @ 2008-05-22 17:58 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linux-ext4, tytso, sandeen


On Thu, 2008-05-22 at 15:55 +0530, Aneesh Kumar K.V wrote:
> On Wed, May 21, 2008 at 11:14:17PM +0530, Aneesh Kumar K.V wrote:
> > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> > ---
> >  fs/ext4/inode.c |   10 +++++++---
> >  1 files changed, 7 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index 46cc610..076d00f 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -1571,13 +1571,17 @@ static int ext4_da_writepages(struct address_space *mapping,
> >  		 */
> >  		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
> >  			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
> > -		to_write -= wbc->nr_to_write;
> > 
> > +		to_write -= wbc->nr_to_write;
> >  		ret = mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
> >  		ext4_journal_stop(handle);
> > -		to_write +=wbc->nr_to_write;
> > +		if (wbc->nr_to_write) {
> > +			/* We failed to write what we requested for */
> > +			to_write += wbc->nr_to_write;
> > +			break;
> > +		}
> > +		wbc->nr_to_write = to_write;
> >  	}
> > -
> >  out_writepages:
> >  	wbc->nr_to_write = to_write;
> >  	wbc->range_cyclic = range_cyclic;
> 
> We need related fix for ext4_da_writepage. We need to allocate blocks in
> ext4_da_writepage and we are called with page_lock. The handle
> will be NULL in the below case and that would result in
> ext4_get_block starting a new transaction when allocating blocks.
> 

Hi Aneesh, the blocks are not allocated at ext4_da_writepage() time, 

the block allocation has been done in this path: 

ext4_da_writepages()->mpage_da_writepages()->write_cache_pages()->
__mpage_da_writepage()->mpage_da_map_blocks() will ensure blocks are all
mapped before mpage_da_submit_io() calling
__mpage_writepage()->ext4_da_writepage() to submit the IO.

> 
> static int __ext4_da_writepage(struct page *page,
> 				struct writeback_control *wbc)
> {
> 	struct inode *inode = page->mapping->host;
> 	handle_t *handle = NULL;
> 	int ret = 0;
> 
> 	handle = ext4_journal_current_handle();
> 
> 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
> 		ret = nobh_writepage(page, ext4_get_block, wbc);
> 	else
> 		ret = block_write_full_page(page, ext4_get_block, wbc);
> 
> 	if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
> 		EXT4_I(inode)->i_disksize = inode->i_size;
> 		ext4_mark_inode_dirty(handle, inode);
> 	}
> 
> 	return ret;
> }
> 
> -aneesh


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion
  2008-05-21 17:44     ` [PATCH] ext4: Fix delalloc sync hang with journal lock inversion Aneesh Kumar K.V
  2008-05-21 17:44       ` [PATCH] ext4: Inverse locking order of page_lock and transaction start Aneesh Kumar K.V
  2008-05-22 10:25       ` [PATCH] ext4: Fix delalloc sync hang with journal lock inversion Aneesh Kumar K.V
@ 2008-05-22 18:10       ` Mingming
  2008-05-22 18:26         ` Aneesh Kumar K.V
  2 siblings, 1 reply; 17+ messages in thread
From: Mingming @ 2008-05-22 18:10 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: tytso, sandeen, linux-ext4


On Wed, 2008-05-21 at 23:14 +0530, Aneesh Kumar K.V wrote:
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  fs/ext4/inode.c |   10 +++++++---
>  1 files changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 46cc610..076d00f 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1571,13 +1571,17 @@ static int ext4_da_writepages(struct address_space *mapping,
>  		 */
>  		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
>  			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
> -		to_write -= wbc->nr_to_write;
> 
> +		to_write -= wbc->nr_to_write;
>  		ret = mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
>  		ext4_journal_stop(handle);
> -		to_write +=wbc->nr_to_write;
> +		if (wbc->nr_to_write) {
> +			/* We failed to write what we requested for */
> +			to_write += wbc->nr_to_write;
> +			break;
> +		}

Not sure about the break here...
> +		wbc->nr_to_write = to_write;

Looks right. thanks.

>  	}
> -
>  out_writepages:
>  	wbc->nr_to_write = to_write;
>  	wbc->range_cyclic = range_cyclic;


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion
  2008-05-22 17:58         ` Mingming
@ 2008-05-22 18:23           ` Aneesh Kumar K.V
  2008-05-22 19:45             ` Mingming
  0 siblings, 1 reply; 17+ messages in thread
From: Aneesh Kumar K.V @ 2008-05-22 18:23 UTC (permalink / raw)
  To: Mingming; +Cc: linux-ext4, tytso, sandeen

On Thu, May 22, 2008 at 10:58:35AM -0700, Mingming wrote:
> 
> On Thu, 2008-05-22 at 15:55 +0530, Aneesh Kumar K.V wrote:
> > On Wed, May 21, 2008 at 11:14:17PM +0530, Aneesh Kumar K.V wrote:
> > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> > > ---
> > >  fs/ext4/inode.c |   10 +++++++---
> > >  1 files changed, 7 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > index 46cc610..076d00f 100644
> > > --- a/fs/ext4/inode.c
> > > +++ b/fs/ext4/inode.c
> > > @@ -1571,13 +1571,17 @@ static int ext4_da_writepages(struct address_space *mapping,
> > >  		 */
> > >  		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
> > >  			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
> > > -		to_write -= wbc->nr_to_write;
> > > 
> > > +		to_write -= wbc->nr_to_write;
> > >  		ret = mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
> > >  		ext4_journal_stop(handle);
> > > -		to_write +=wbc->nr_to_write;
> > > +		if (wbc->nr_to_write) {
> > > +			/* We failed to write what we requested for */
> > > +			to_write += wbc->nr_to_write;
> > > +			break;
> > > +		}
> > > +		wbc->nr_to_write = to_write;
> > >  	}
> > > -
> > >  out_writepages:
> > >  	wbc->nr_to_write = to_write;
> > >  	wbc->range_cyclic = range_cyclic;
> > 
> > We need related fix for ext4_da_writepage. We need to allocate blocks in
> > ext4_da_writepage and we are called with page_lock. The handle
> > will be NULL in the below case and that would result in
> > ext4_get_block starting a new transaction when allocating blocks.
> > 
> 
> Hi Aneesh, the blocks are not allocated at ext4_da_writepage() time, 
> 
> the block allocation has been done in this path: 
> 
> ext4_da_writepages()->mpage_da_writepages()->write_cache_pages()->
> __mpage_da_writepage()->mpage_da_map_blocks() will ensure blocks are all
> mapped before mpage_da_submit_io() calling
> __mpage_writepage()->ext4_da_writepage() to submit the IO.
> 

Does that mean we don't allocate new blocks at all in ext4_da_writepage.
Then I will put a BUG() if we get passed a page that doesn't have all
the buffer head mapped in ext4_da_writepage.

We still need a diff as below

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 46cc610..8327796 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1498,9 +1498,8 @@ static int __ext4_da_writepage(struct page *page,
 {
 	struct inode *inode = page->mapping->host;
 	handle_t *handle = NULL;
-	int ret = 0;
+	int ret = 0, err;
 
-	handle = ext4_journal_current_handle();
 
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
 		ret = nobh_writepage(page, ext4_get_block, wbc);
@@ -1508,12 +1507,21 @@ static int __ext4_da_writepage(struct page *page,
 		ret = block_write_full_page(page, ext4_get_block, wbc);
 
 	if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
+		handle = ext4_journal_start(inode, 1);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
 		EXT4_I(inode)->i_disksize = inode->i_size;
-		ext4_mark_inode_dirty(handle, inode);
+		ret = ext4_mark_inode_dirty(handle, inode);
+		err = ext4_journal_stop(handle);
+		if (!ret)
+			ret = err;
 	}

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion
  2008-05-22 18:10       ` Mingming
@ 2008-05-22 18:26         ` Aneesh Kumar K.V
  2008-05-22 19:26           ` Mingming
  0 siblings, 1 reply; 17+ messages in thread
From: Aneesh Kumar K.V @ 2008-05-22 18:26 UTC (permalink / raw)
  To: Mingming; +Cc: tytso, sandeen, linux-ext4

On Thu, May 22, 2008 at 11:10:17AM -0700, Mingming wrote:
> 
> On Wed, 2008-05-21 at 23:14 +0530, Aneesh Kumar K.V wrote:
> > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> > ---
> >  fs/ext4/inode.c |   10 +++++++---
> >  1 files changed, 7 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index 46cc610..076d00f 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -1571,13 +1571,17 @@ static int ext4_da_writepages(struct address_space *mapping,
> >  		 */
> >  		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
> >  			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
> > -		to_write -= wbc->nr_to_write;
> > 
> > +		to_write -= wbc->nr_to_write;
> >  		ret = mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
> >  		ext4_journal_stop(handle);
> > -		to_write +=wbc->nr_to_write;
> > +		if (wbc->nr_to_write) {
> > +			/* We failed to write what we requested for */
> > +			to_write += wbc->nr_to_write;
> > +			break;
> > +		}
> 
> Not sure about the break here...
> > +		wbc->nr_to_write = to_write;
> 
> Looks right. thanks.
> 
> >  	}
> > -
> >  out_writepages:
> >  	wbc->nr_to_write = to_write;
> >  	wbc->range_cyclic = range_cyclic;
> 

The call chain that made me look at this was 

#0  ext4_da_writepages (mapping=0xc76dc244, wbc=0xc790bf70) at fs/ext4/inode.c:1557
#1  0xc0150176 in do_writepages (mapping=0xc76dc244, wbc=0xc790bf70) at mm/page-writeback.c:1004
#2  0xc0180fe6 in __writeback_single_inode (inode=0xc76dc11c, wbc=0xc790bf70) at fs/fs-writeback.c:285
#3  0xc018146c in sync_sb_inodes (sb=0xc7abac00, wbc=0xc790bf70) at fs/fs-writeback.c:502
#4  0xc0181701 in writeback_inodes (wbc=0xc790bf70) at fs/fs-writeback.c:570
#5  0xc01509f8 in background_writeout (_min_pages=<value optimized out>) at mm/page-writeback.c:639
#6  0xc0150f57 in pdflush (dummy=<value optimized out>) at mm/pdflush.c:127
#7  0xc01324af in kthread (_create=<value optimized out>) at kernel/kthread.c:79
#8  0xc0104633 in kernel_thread_helper () at include/asm/string_32.h:238

ext4_da_writepages gets called with nr_to_write MAX_WRITEBACK_PAGES. the
file size is only 4K. ie there is only one page to write. With these
value we get stuck in the above loop because to_write will never decrement
below 1023.


-aneesh

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion
  2008-05-22 18:26         ` Aneesh Kumar K.V
@ 2008-05-22 19:26           ` Mingming
  0 siblings, 0 replies; 17+ messages in thread
From: Mingming @ 2008-05-22 19:26 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: tytso, sandeen, linux-ext4


On Thu, 2008-05-22 at 23:56 +0530, Aneesh Kumar K.V wrote:
> On Thu, May 22, 2008 at 11:10:17AM -0700, Mingming wrote:
> > 
> > On Wed, 2008-05-21 at 23:14 +0530, Aneesh Kumar K.V wrote:
> > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> > > ---
> > >  fs/ext4/inode.c |   10 +++++++---
> > >  1 files changed, 7 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > index 46cc610..076d00f 100644
> > > --- a/fs/ext4/inode.c
> > > +++ b/fs/ext4/inode.c
> > > @@ -1571,13 +1571,17 @@ static int ext4_da_writepages(struct address_space *mapping,
> > >  		 */
> > >  		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
> > >  			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
> > > -		to_write -= wbc->nr_to_write;
> > > 
> > > +		to_write -= wbc->nr_to_write;
> > >  		ret = mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
> > >  		ext4_journal_stop(handle);
> > > -		to_write +=wbc->nr_to_write;
> > > +		if (wbc->nr_to_write) {
> > > +			/* We failed to write what we requested for */
> > > +			to_write += wbc->nr_to_write;
> > > +			break;
> > > +		}
> > 
> > Not sure about the break here...
> > > +		wbc->nr_to_write = to_write;
> > 
> > Looks right. thanks.
> > 
> > >  	}
> > > -
> > >  out_writepages:
> > >  	wbc->nr_to_write = to_write;
> > >  	wbc->range_cyclic = range_cyclic;
> > 
> 
> The call chain that made me look at this was 
> 
> #0  ext4_da_writepages (mapping=0xc76dc244, wbc=0xc790bf70) at fs/ext4/inode.c:1557
> #1  0xc0150176 in do_writepages (mapping=0xc76dc244, wbc=0xc790bf70) at mm/page-writeback.c:1004
> #2  0xc0180fe6 in __writeback_single_inode (inode=0xc76dc11c, wbc=0xc790bf70) at fs/fs-writeback.c:285
> #3  0xc018146c in sync_sb_inodes (sb=0xc7abac00, wbc=0xc790bf70) at fs/fs-writeback.c:502
> #4  0xc0181701 in writeback_inodes (wbc=0xc790bf70) at fs/fs-writeback.c:570
> #5  0xc01509f8 in background_writeout (_min_pages=<value optimized out>) at mm/page-writeback.c:639
> #6  0xc0150f57 in pdflush (dummy=<value optimized out>) at mm/pdflush.c:127
> #7  0xc01324af in kthread (_create=<value optimized out>) at kernel/kthread.c:79
> #8  0xc0104633 in kernel_thread_helper () at include/asm/string_32.h:238
> 
> ext4_da_writepages gets called with nr_to_write MAX_WRITEBACK_PAGES. the
> file size is only 4K. ie there is only one page to write. With these
> value we get stuck in the above loop because to_write will never decrement
> below 1023.
> 

Ah, So I guess the comment should be reflect that the nr_to_write
returned after mpage_da_writepages() means there is no more dirty page
need to flush, then quit the loop.

Mingming
> 
> -aneesh


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion
  2008-05-22 18:23           ` Aneesh Kumar K.V
@ 2008-05-22 19:45             ` Mingming
  0 siblings, 0 replies; 17+ messages in thread
From: Mingming @ 2008-05-22 19:45 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linux-ext4, tytso, sandeen


On Thu, 2008-05-22 at 23:53 +0530, Aneesh Kumar K.V wrote:
> On Thu, May 22, 2008 at 10:58:35AM -0700, Mingming wrote:
> > 
> > On Thu, 2008-05-22 at 15:55 +0530, Aneesh Kumar K.V wrote:
> > > On Wed, May 21, 2008 at 11:14:17PM +0530, Aneesh Kumar K.V wrote:
> > > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> > > > ---
> > > >  fs/ext4/inode.c |   10 +++++++---
> > > >  1 files changed, 7 insertions(+), 3 deletions(-)
> > > > 
> > > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > > index 46cc610..076d00f 100644
> > > > --- a/fs/ext4/inode.c
> > > > +++ b/fs/ext4/inode.c
> > > > @@ -1571,13 +1571,17 @@ static int ext4_da_writepages(struct address_space *mapping,
> > > >  		 */
> > > >  		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
> > > >  			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
> > > > -		to_write -= wbc->nr_to_write;
> > > > 
> > > > +		to_write -= wbc->nr_to_write;
> > > >  		ret = mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
> > > >  		ext4_journal_stop(handle);
> > > > -		to_write +=wbc->nr_to_write;
> > > > +		if (wbc->nr_to_write) {
> > > > +			/* We failed to write what we requested for */
> > > > +			to_write += wbc->nr_to_write;
> > > > +			break;
> > > > +		}
> > > > +		wbc->nr_to_write = to_write;
> > > >  	}
> > > > -
> > > >  out_writepages:
> > > >  	wbc->nr_to_write = to_write;
> > > >  	wbc->range_cyclic = range_cyclic;
> > > 
> > > We need related fix for ext4_da_writepage. We need to allocate blocks in
> > > ext4_da_writepage and we are called with page_lock. The handle
> > > will be NULL in the below case and that would result in
> > > ext4_get_block starting a new transaction when allocating blocks.
> > > 
> > 
> > Hi Aneesh, the blocks are not allocated at ext4_da_writepage() time, 
> > 
> > the block allocation has been done in this path: 
> > 
> > ext4_da_writepages()->mpage_da_writepages()->write_cache_pages()->
> > __mpage_da_writepage()->mpage_da_map_blocks() will ensure blocks are all
> > mapped before mpage_da_submit_io() calling
> > __mpage_writepage()->ext4_da_writepage() to submit the IO.
> > 
> 
> Does that mean we don't allocate new blocks at all in ext4_da_writepage.
> Then I will put a BUG() if we get passed a page that doesn't have all
> the buffer head mapped in ext4_da_writepage.
> 
Yes. that would be nice to have.

> We still need a diff as below
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 46cc610..8327796 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1498,9 +1498,8 @@ static int __ext4_da_writepage(struct page *page,
>  {
>  	struct inode *inode = page->mapping->host;
>  	handle_t *handle = NULL;
> -	int ret = 0;
> +	int ret = 0, err;
> 
> -	handle = ext4_journal_current_handle();
> 
>  	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
>  		ret = nobh_writepage(page, ext4_get_block, wbc);
> @@ -1508,12 +1507,21 @@ static int __ext4_da_writepage(struct page *page,
>  		ret = block_write_full_page(page, ext4_get_block, wbc);
> 
>  	if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
> +		handle = ext4_journal_start(inode, 1);
> +		if (IS_ERR(handle)) {
> +			ret = PTR_ERR(handle);
> +			goto out;
> +		}
>  		EXT4_I(inode)->i_disksize = inode->i_size;
> -		ext4_mark_inode_dirty(handle, inode);
> +		ret = ext4_mark_inode_dirty(handle, inode);
> +		err = ext4_journal_stop(handle);
> +		if (!ret)
> +			ret = err;
>  	}
> -
> +out:
>  	return ret;
>  }
> +
> 
> 

As we have discussed on IRC, I think there is bug in
ext4_da_write_page(), since ext4_da_write_page()/__ext4_da_write_page()
is always called with a journal started (by ext4_da_writepages()), so we
don't need to start a new journal in __ext4_da_write_page().

Mingming
> -aneesh
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage
  2008-05-30 13:39     ` [PATCH] vfs: Move mark_inode_dirty() from under page lock in generic_write_end() Aneesh Kumar K.V
@ 2008-05-30 13:39       ` Aneesh Kumar K.V
  2008-06-02  9:31         ` Jan Kara
  0 siblings, 1 reply; 17+ messages in thread
From: Aneesh Kumar K.V @ 2008-05-30 13:39 UTC (permalink / raw)
  To: cmm, jack; +Cc: linux-ext4, Aneesh Kumar K.V

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/inode.c |  181 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 files changed, 156 insertions(+), 25 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a96c325..b122425 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1479,6 +1479,11 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+	return (!buffer_mapped(bh) || buffer_delay(bh));
+}
+
 /*
  * Note that we don't need to start a transaction unless we're journaling
  * data because we should have holes filled from ext4_page_mkwrite(). If
@@ -1531,18 +1536,26 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
 static int __ext4_ordered_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
-	struct inode *inode = page->mapping->host;
-	struct buffer_head *page_bufs;
+	int ret = 0, err;
+	unsigned long len;
 	handle_t *handle = NULL;
-	int ret = 0;
-	int err;
+	struct buffer_head *page_bufs;
+	struct inode *inode = page->mapping->host;
+	loff_t size = i_size_read(inode);
 
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, inode->i_sb->s_blocksize,
-				(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
 	page_bufs = page_buffers(page);
-	walk_page_buffers(handle, page_bufs, 0,
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	if (walk_page_buffers(NULL, page_bufs, 0,
+				len, NULL, ext4_bh_unmapped_or_delay)) {
+		printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
+								__func__);
+		BUG();
+	}
+	walk_page_buffers(NULL, page_bufs, 0,
 			PAGE_CACHE_SIZE, NULL, bget_one);
 
 	ret = block_write_full_page(page, ext4_get_block, wbc);
@@ -1574,8 +1587,8 @@ static int __ext4_ordered_writepage(struct page *page,
 			ret = err;
 	}
 out_put:
-	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
-			  bput_one);
+	walk_page_buffers(handle, page_bufs, 0,
+				PAGE_CACHE_SIZE, NULL, bput_one);
 	return ret;
 }
 
@@ -1583,7 +1596,7 @@ static int ext4_ordered_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
 	J_ASSERT(PageLocked(page));
-
+	BUG_ON(!page_has_buffers(page));
 	/*
 	 * We give up here if we're reentered, because it might be for a
 	 * different filesystem.
@@ -1599,18 +1612,34 @@ static int ext4_ordered_writepage(struct page *page,
 static int __ext4_writeback_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
+	unsigned long len;
+	struct buffer_head *page_bufs;
 	struct inode *inode = page->mapping->host;
+	loff_t size = i_size_read(inode);
+
+	page_bufs = page_buffers(page);
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
 
+	if (walk_page_buffers(NULL, page_bufs, 0,
+				len, NULL, ext4_bh_unmapped_or_delay)) {
+		printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
+								__func__);
+		BUG();
+	}
 	if (test_opt(inode->i_sb, NOBH))
 		return nobh_writepage(page, ext4_get_block, wbc);
 	else
 		return block_write_full_page(page, ext4_get_block, wbc);
 }
 
-
 static int ext4_writeback_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
+	BUG_ON(!page_has_buffers(page));
+
 	if (!ext4_journal_current_handle())
 		return __ext4_writeback_writepage(page, wbc);
 
@@ -1622,18 +1651,31 @@ static int ext4_writeback_writepage(struct page *page,
 static int __ext4_journalled_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
+	int ret = 0, err;
+	unsigned long len;
+	handle_t *handle = NULL;
 	struct address_space *mapping = page->mapping;
 	struct inode *inode = mapping->host;
 	struct buffer_head *page_bufs;
-	handle_t *handle = NULL;
-	int ret = 0;
-	int err;
+	loff_t size = i_size_read(inode);
+
+	page_bufs = page_buffers(page);
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
 
+	if (walk_page_buffers(NULL, page_bufs, 0,
+				len, NULL, ext4_bh_unmapped_or_delay)) {
+		printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
+								__func__);
+		BUG();
+	}
+	/* FIXME!! do we need to call prepare_write for a mapped buffer */
 	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
 	if (ret != 0)
 		goto out_unlock;
 
-	page_bufs = page_buffers(page);
 	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
 								bget_one);
 	/* As soon as we unlock the page, it can go away, but we have
@@ -1671,14 +1713,13 @@ static int __ext4_journalled_writepage(struct page *page,
 static int ext4_journalled_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
+	BUG_ON(!page_has_buffers(page));
+
 	if (ext4_journal_current_handle())
 		goto no_write;
 
-	if (!page_has_buffers(page) || PageChecked(page)) {
-		/*
-		 * It's mmapped pagecache.  Add buffers and journal it.  There
-		 * doesn't seem much point in redirtying the page here.
-		 */
+	if (PageChecked(page)) {
+		/* dirty pages in data=journal mode */
 		ClearPageChecked(page);
 		return __ext4_journalled_writepage(page, wbc);
 	} else {
@@ -3520,6 +3561,96 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	return err;
 }
 
+static int __ext4_journalled_allocpage(struct page *page,
+				struct writeback_control *wbc)
+{
+	int ret = 0, err;
+	handle_t *handle = NULL;
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct buffer_head *page_bufs;
+
+	/* if alloc we are called after statring a journal */
+	handle = ext4_journal_current_handle();
+	BUG_ON(!handle);
+
+	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
+	if (ret != 0)
+		goto out_unlock;
+
+	/* FIXME!! should we do a bget_one */
+	page_bufs = page_buffers(page);
+	ret = walk_page_buffers(handle, page_bufs, 0,
+			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+
+	err = walk_page_buffers(handle, page_bufs, 0,
+				PAGE_CACHE_SIZE, NULL, write_end_fn);
+	if (ret == 0)
+		ret = err;
+	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+
+out_unlock:
+	unlock_page(page);
+	return ret;
+}
+
+static int __ext4_ordered_allocpage(struct page *page,
+				struct writeback_control *wbc)
+{
+	int ret = 0;
+	handle_t *handle = NULL;
+	struct buffer_head *page_bufs;
+	struct inode *inode = page->mapping->host;
+
+	/* if alloc we are called after statring a journal */
+	handle = ext4_journal_current_handle();
+	BUG_ON(!handle);
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, inode->i_sb->s_blocksize,
+				(1 << BH_Dirty)|(1 << BH_Uptodate));
+	}
+	page_bufs = page_buffers(page);
+	walk_page_buffers(handle, page_bufs, 0,
+			PAGE_CACHE_SIZE, NULL, bget_one);
+
+	ret = block_write_full_page(page, ext4_get_block, wbc);
+
+	/*
+	 * The page can become unlocked at any point now, and
+	 * truncate can then come in and change things.  So we
+	 * can't touch *page from now on.  But *page_bufs is
+	 * safe due to elevated refcount.
+	 */
+
+	/*
+	 * And attach them to the current transaction.  But only if
+	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
+	 * and generally junk.
+	 */
+	if (ret == 0) {
+		ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+					NULL, jbd2_journal_dirty_data_fn);
+	}
+	walk_page_buffers(handle, page_bufs, 0,
+				PAGE_CACHE_SIZE, NULL, bput_one);
+	return ret;
+}
+
+static int __ext4_writeback_allocpage(struct page *page,
+				struct writeback_control *wbc)
+{
+	handle_t *handle = NULL;
+	struct inode *inode = page->mapping->host;
+	/* if alloc we are called after statring a journal */
+	handle = ext4_journal_current_handle();
+	BUG_ON(!handle);
+
+	if (test_opt(inode->i_sb, NOBH))
+		return nobh_writepage(page, ext4_get_block, wbc);
+	else
+		return block_write_full_page(page, ext4_get_block, wbc);
+}
+
 static int ext4_bh_prepare_fill(handle_t *handle, struct buffer_head *bh)
 {
 	if (!buffer_mapped(bh)) {
@@ -3596,11 +3727,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	wbc.range_start = page_offset(page);
 	wbc.range_end = page_offset(page) + len;
 	if (ext4_should_writeback_data(inode))
-		ret = __ext4_writeback_writepage(page, &wbc);
+		ret = __ext4_writeback_allocpage(page, &wbc);
 	else if (ext4_should_order_data(inode))
-		ret = __ext4_ordered_writepage(page, &wbc);
+		ret = __ext4_ordered_allocpage(page, &wbc);
 	else
-		ret = __ext4_journalled_writepage(page, &wbc);
+		ret = __ext4_journalled_allocpage(page, &wbc);
 	/* Page got unlocked in writepage */
 	err = ext4_journal_stop(handle);
 	if (!ret)
-- 
1.5.5.1.357.g1af8b.dirty


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage
  2008-05-30 13:39       ` [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage Aneesh Kumar K.V
@ 2008-06-02  9:31         ` Jan Kara
  2008-06-02  9:52           ` Aneesh Kumar K.V
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Kara @ 2008-06-02  9:31 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: cmm, linux-ext4

  Hi Aneesh,

  Thanks for the patch but I though we decided to do this a bit differently -
see below.

On Fri 30-05-08 19:09:27, Aneesh Kumar K.V wrote:
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  fs/ext4/inode.c |  181 +++++++++++++++++++++++++++++++++++++++++++++++--------
>  1 files changed, 156 insertions(+), 25 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index a96c325..b122425 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1479,6 +1479,11 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
>  	return 0;
>  }
>  
> +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
> +{
> +	return (!buffer_mapped(bh) || buffer_delay(bh));
> +}
> +
>  /*
>   * Note that we don't need to start a transaction unless we're journaling
>   * data because we should have holes filled from ext4_page_mkwrite(). If
> @@ -1531,18 +1536,26 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
>  static int __ext4_ordered_writepage(struct page *page,
>  				struct writeback_control *wbc)
>  {
> -	struct inode *inode = page->mapping->host;
> -	struct buffer_head *page_bufs;
> +	int ret = 0, err;
> +	unsigned long len;
>  	handle_t *handle = NULL;
> -	int ret = 0;
> -	int err;
> +	struct buffer_head *page_bufs;
> +	struct inode *inode = page->mapping->host;
> +	loff_t size = i_size_read(inode);
>  
> -	if (!page_has_buffers(page)) {
> -		create_empty_buffers(page, inode->i_sb->s_blocksize,
> -				(1 << BH_Dirty)|(1 << BH_Uptodate));
> -	}
  This is OK.

>  	page_bufs = page_buffers(page);
> -	walk_page_buffers(handle, page_bufs, 0,
> +	if (page->index == size >> PAGE_CACHE_SHIFT)
> +		len = size & ~PAGE_CACHE_MASK;
> +	else
> +		len = PAGE_CACHE_SIZE;
> +
> +	if (walk_page_buffers(NULL, page_bufs, 0,
> +				len, NULL, ext4_bh_unmapped_or_delay)) {
> +		printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
> +								__func__);
> +		BUG();
> +	}
  But I'd move this check to ext4_ordered_writepage().

> +	walk_page_buffers(NULL, page_bufs, 0,
>  			PAGE_CACHE_SIZE, NULL, bget_one);
>  
>  	ret = block_write_full_page(page, ext4_get_block, wbc);
> @@ -1574,8 +1587,8 @@ static int __ext4_ordered_writepage(struct page *page,
>  			ret = err;
>  	}
>  out_put:
> -	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
> -			  bput_one);
> +	walk_page_buffers(handle, page_bufs, 0,
> +				PAGE_CACHE_SIZE, NULL, bput_one);
>  	return ret;
>  }
>  
> @@ -1583,7 +1596,7 @@ static int ext4_ordered_writepage(struct page *page,
>  				struct writeback_control *wbc)
>  {
>  	J_ASSERT(PageLocked(page));
> -
> +	BUG_ON(!page_has_buffers(page));
>  	/*
>  	 * We give up here if we're reentered, because it might be for a
>  	 * different filesystem.
> @@ -1599,18 +1612,34 @@ static int ext4_ordered_writepage(struct page *page,
>  static int __ext4_writeback_writepage(struct page *page,
>  				struct writeback_control *wbc)
>  {
> +	unsigned long len;
> +	struct buffer_head *page_bufs;
>  	struct inode *inode = page->mapping->host;
> +	loff_t size = i_size_read(inode);
> +
> +	page_bufs = page_buffers(page);
> +	if (page->index == size >> PAGE_CACHE_SHIFT)
> +		len = size & ~PAGE_CACHE_MASK;
> +	else
> +		len = PAGE_CACHE_SIZE;
>  
> +	if (walk_page_buffers(NULL, page_bufs, 0,
> +				len, NULL, ext4_bh_unmapped_or_delay)) {
> +		printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
> +								__func__);
> +		BUG();
> +	}
  And similarly move this check to ext4_writeback_writepage().

>  	if (test_opt(inode->i_sb, NOBH))
>  		return nobh_writepage(page, ext4_get_block, wbc);
>  	else
>  		return block_write_full_page(page, ext4_get_block, wbc);
>  }
>  
> -
>  static int ext4_writeback_writepage(struct page *page,
>  				struct writeback_control *wbc)
>  {
> +	BUG_ON(!page_has_buffers(page));
> +
>  	if (!ext4_journal_current_handle())
>  		return __ext4_writeback_writepage(page, wbc);
>  
> @@ -1622,18 +1651,31 @@ static int ext4_writeback_writepage(struct page *page,
>  static int __ext4_journalled_writepage(struct page *page,
>  				struct writeback_control *wbc)
>  {
> +	int ret = 0, err;
> +	unsigned long len;
> +	handle_t *handle = NULL;
>  	struct address_space *mapping = page->mapping;
>  	struct inode *inode = mapping->host;
>  	struct buffer_head *page_bufs;
> -	handle_t *handle = NULL;
> -	int ret = 0;
> -	int err;
> +	loff_t size = i_size_read(inode);
> +
> +	page_bufs = page_buffers(page);
> +	if (page->index == size >> PAGE_CACHE_SHIFT)
> +		len = size & ~PAGE_CACHE_MASK;
> +	else
> +		len = PAGE_CACHE_SIZE;
>  
> +	if (walk_page_buffers(NULL, page_bufs, 0,
> +				len, NULL, ext4_bh_unmapped_or_delay)) {
> +		printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
> +								__func__);
> +		BUG();
> +	}
> +	/* FIXME!! do we need to call prepare_write for a mapped buffer */
  This can go to ext4_journalled_writepage(). What is actually this FIXME
about? I'm not sure I get it...

>  	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
>  	if (ret != 0)
>  		goto out_unlock;
>  
> -	page_bufs = page_buffers(page);
>  	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
>  								bget_one);
>  	/* As soon as we unlock the page, it can go away, but we have
> @@ -1671,14 +1713,13 @@ static int __ext4_journalled_writepage(struct page *page,
>  static int ext4_journalled_writepage(struct page *page,
>  				struct writeback_control *wbc)
>  {
> +	BUG_ON(!page_has_buffers(page));
> +
>  	if (ext4_journal_current_handle())
>  		goto no_write;
>  
> -	if (!page_has_buffers(page) || PageChecked(page)) {
> -		/*
> -		 * It's mmapped pagecache.  Add buffers and journal it.  There
> -		 * doesn't seem much point in redirtying the page here.
> -		 */
> +	if (PageChecked(page)) {
> +		/* dirty pages in data=journal mode */
>  		ClearPageChecked(page);
>  		return __ext4_journalled_writepage(page, wbc);
>  	} else {
> @@ -3520,6 +3561,96 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
>  	return err;
>  }
>  
> +static int __ext4_journalled_allocpage(struct page *page,
> +				struct writeback_control *wbc)
> +{
> +	int ret = 0, err;
> +	handle_t *handle = NULL;
> +	struct address_space *mapping = page->mapping;
> +	struct inode *inode = mapping->host;
> +	struct buffer_head *page_bufs;
> +
> +	/* if alloc we are called after statring a journal */
> +	handle = ext4_journal_current_handle();
> +	BUG_ON(!handle);
> +
> +	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
> +	if (ret != 0)
> +		goto out_unlock;
> +
> +	/* FIXME!! should we do a bget_one */
> +	page_bufs = page_buffers(page);
> +	ret = walk_page_buffers(handle, page_bufs, 0,
> +			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
> +
> +	err = walk_page_buffers(handle, page_bufs, 0,
> +				PAGE_CACHE_SIZE, NULL, write_end_fn);
> +	if (ret == 0)
> +		ret = err;
> +	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
> +
> +out_unlock:
> +	unlock_page(page);
> +	return ret;
> +}
> +
> +static int __ext4_ordered_allocpage(struct page *page,
> +				struct writeback_control *wbc)
> +{
> +	int ret = 0;
> +	handle_t *handle = NULL;
> +	struct buffer_head *page_bufs;
> +	struct inode *inode = page->mapping->host;
> +
> +	/* if alloc we are called after statring a journal */
> +	handle = ext4_journal_current_handle();
> +	BUG_ON(!handle);
> +	if (!page_has_buffers(page)) {
> +		create_empty_buffers(page, inode->i_sb->s_blocksize,
> +				(1 << BH_Dirty)|(1 << BH_Uptodate));
> +	}
> +	page_bufs = page_buffers(page);
> +	walk_page_buffers(handle, page_bufs, 0,
> +			PAGE_CACHE_SIZE, NULL, bget_one);
> +
> +	ret = block_write_full_page(page, ext4_get_block, wbc);
> +
> +	/*
> +	 * The page can become unlocked at any point now, and
> +	 * truncate can then come in and change things.  So we
> +	 * can't touch *page from now on.  But *page_bufs is
> +	 * safe due to elevated refcount.
> +	 */
> +
> +	/*
> +	 * And attach them to the current transaction.  But only if
> +	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
> +	 * and generally junk.
> +	 */
> +	if (ret == 0) {
> +		ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
> +					NULL, jbd2_journal_dirty_data_fn);
> +	}
> +	walk_page_buffers(handle, page_bufs, 0,
> +				PAGE_CACHE_SIZE, NULL, bput_one);
> +	return ret;
> +}
> +
> +static int __ext4_writeback_allocpage(struct page *page,
> +				struct writeback_control *wbc)
> +{
> +	handle_t *handle = NULL;
> +	struct inode *inode = page->mapping->host;
> +	/* if alloc we are called after statring a journal */
> +	handle = ext4_journal_current_handle();
> +	BUG_ON(!handle);
> +
> +	if (test_opt(inode->i_sb, NOBH))
> +		return nobh_writepage(page, ext4_get_block, wbc);
> +	else
> +		return block_write_full_page(page, ext4_get_block, wbc);
> +}
> +
  And then you don't need these __ext4_..._allocpage() calls because that
is what's left in __ext4_..._writepage(), isn't it? It seems also logically
more consistent - you do checks in ext4_..._writepage() and then you do the
real work in __ext4_..._writepage().
  For data=journaled mode, it may be better to really have
ext4_journaled_allocpage() because we don't have to do nasty locking
tricks. But for writeback and ordered mode I really see no need for these
special functions. So at least for these two, I'd change the code as I
suggest.

>  static int ext4_bh_prepare_fill(handle_t *handle, struct buffer_head *bh)
>  {
>  	if (!buffer_mapped(bh)) {
> @@ -3596,11 +3727,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
>  	wbc.range_start = page_offset(page);
>  	wbc.range_end = page_offset(page) + len;
>  	if (ext4_should_writeback_data(inode))
> -		ret = __ext4_writeback_writepage(page, &wbc);
> +		ret = __ext4_writeback_allocpage(page, &wbc);
>  	else if (ext4_should_order_data(inode))
> -		ret = __ext4_ordered_writepage(page, &wbc);
> +		ret = __ext4_ordered_allocpage(page, &wbc);
>  	else
> -		ret = __ext4_journalled_writepage(page, &wbc);
> +		ret = __ext4_journalled_allocpage(page, &wbc);
>  	/* Page got unlocked in writepage */
>  	err = ext4_journal_stop(handle);
>  	if (!ret)
> -- 
> 1.5.5.1.357.g1af8b.dirty

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage
  2008-06-02  9:31         ` Jan Kara
@ 2008-06-02  9:52           ` Aneesh Kumar K.V
  2008-06-02 10:40             ` Jan Kara
  0 siblings, 1 reply; 17+ messages in thread
From: Aneesh Kumar K.V @ 2008-06-02  9:52 UTC (permalink / raw)
  To: Jan Kara; +Cc: cmm, linux-ext4

Hi Jan,


On Mon, Jun 02, 2008 at 11:31:06AM +0200, Jan Kara wrote:
>   Hi Aneesh,
> 
>   Thanks for the patch but I though we decided to do this a bit differently -
> see below.

Please feel free to merge the changes back to the lock inversion
patches. I sent it as a separate the patches to make it easier for review.

> 
> On Fri 30-05-08 19:09:27, Aneesh Kumar K.V wrote:
> > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> > ---
> >  fs/ext4/inode.c |  181 +++++++++++++++++++++++++++++++++++++++++++++++--------
> >  1 files changed, 156 insertions(+), 25 deletions(-)
> > 
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index a96c325..b122425 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -1479,6 +1479,11 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
> >  	return 0;
> >  }
> >  
> > +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
> > +{
> > +	return (!buffer_mapped(bh) || buffer_delay(bh));
> > +}
> > +
> >  /*
> >   * Note that we don't need to start a transaction unless we're journaling
> >   * data because we should have holes filled from ext4_page_mkwrite(). If
> > @@ -1531,18 +1536,26 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
> >  static int __ext4_ordered_writepage(struct page *page,
> >  				struct writeback_control *wbc)
> >  {
> > -	struct inode *inode = page->mapping->host;
> > -	struct buffer_head *page_bufs;
> > +	int ret = 0, err;
> > +	unsigned long len;
> >  	handle_t *handle = NULL;
> > -	int ret = 0;
> > -	int err;
> > +	struct buffer_head *page_bufs;
> > +	struct inode *inode = page->mapping->host;
> > +	loff_t size = i_size_read(inode);
> >  
> > -	if (!page_has_buffers(page)) {
> > -		create_empty_buffers(page, inode->i_sb->s_blocksize,
> > -				(1 << BH_Dirty)|(1 << BH_Uptodate));
> > -	}
>   This is OK.
> 
> >  	page_bufs = page_buffers(page);
> > -	walk_page_buffers(handle, page_bufs, 0,
> > +	if (page->index == size >> PAGE_CACHE_SHIFT)
> > +		len = size & ~PAGE_CACHE_MASK;
> > +	else
> > +		len = PAGE_CACHE_SIZE;
> > +
> > +	if (walk_page_buffers(NULL, page_bufs, 0,
> > +				len, NULL, ext4_bh_unmapped_or_delay)) {
> > +		printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
> > +								__func__);
> > +		BUG();
> > +	}
>   But I'd move this check to ext4_ordered_writepage().
> 

Please feel free to do so. The only reason for me to do it as a separate
function is to clarify that with the changes writepage doesn't do any
block allocation at all. And calling writepage via page_mkwrite goes
against that. So i renamed the page_mkwrite call out to *_allocpage
and made sure writepage doesn't do any block allocation.



> > +	walk_page_buffers(NULL, page_bufs, 0,
> >  			PAGE_CACHE_SIZE, NULL, bget_one);
> >  

[.... snip ..... ]

> >  
> > +	if (walk_page_buffers(NULL, page_bufs, 0,
> > +				len, NULL, ext4_bh_unmapped_or_delay)) {
> > +		printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
> > +								__func__);
> > +		BUG();
> > +	}
> > +	/* FIXME!! do we need to call prepare_write for a mapped buffer */
>   This can go to ext4_journalled_writepage(). What is actually this FIXME
> about? I'm not sure I get it...
> 


I was wondering whether we need to call prepare_write in writepage ?. We
are not allocating any new blocks in writepage with these changes.


> >  	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
> >  	if (ret != 0)
> >  		goto out_unlock;
> >  
> > -	page_bufs = page_buffers(page);
> >  	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
> >  								bget_one);
> >  	/* As soon as we unlock the page, it can go away, but we have
> > @@ -1671,14 +1713,13 @@ static int __ext4_journalled_writepage(struct page *page,
> >  static int ext4_journalled_writepage(struct page *page,
> >  				struct writeback_control *wbc)
> >  {
> > +	BUG_ON(!page_has_buffers(page));
> > +
> >  	if (ext4_journal_current_handle())
> >  		goto no_write;
> >  
> > -	if (!page_has_buffers(page) || PageChecked(page)) {
> > -		/*
> > -		 * It's mmapped pagecache.  Add buffers and journal it.  There
> > -		 * doesn't seem much point in redirtying the page here.
> > -		 */
> > +	if (PageChecked(page)) {
> > +		/* dirty pages in data=journal mode */
> >  		ClearPageChecked(page);
> >  		return __ext4_journalled_writepage(page, wbc);
> >  	} else {
> > @@ -3520,6 +3561,96 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
> >  	return err;
> >  }
> >  
> > +static int __ext4_journalled_allocpage(struct page *page,
> > +				struct writeback_control *wbc)
> > +{
> > +	int ret = 0, err;
> > +	handle_t *handle = NULL;
> > +	struct address_space *mapping = page->mapping;
> > +	struct inode *inode = mapping->host;
> > +	struct buffer_head *page_bufs;
> > +
> > +	/* if alloc we are called after statring a journal */
> > +	handle = ext4_journal_current_handle();
> > +	BUG_ON(!handle);
> > +
> > +	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
> > +	if (ret != 0)
> > +		goto out_unlock;
> > +
> > +	/* FIXME!! should we do a bget_one */
> > +	page_bufs = page_buffers(page);
> > +	ret = walk_page_buffers(handle, page_bufs, 0,
> > +			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
> > +


I also have a FIXME here. I am not sure whether unlocking the page have
some effect. Can you verify this ?



> > +	err = walk_page_buffers(handle, page_bufs, 0,
> > +				PAGE_CACHE_SIZE, NULL, write_end_fn);
> > +	if (ret == 0)
> > +		ret = err;
> > +	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
> > +
> > +out_unlock:
> > +	unlock_page(page);
> > +	return ret;
> > +}
> > +


-aneesh

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage
  2008-06-02  9:52           ` Aneesh Kumar K.V
@ 2008-06-02 10:40             ` Jan Kara
  0 siblings, 0 replies; 17+ messages in thread
From: Jan Kara @ 2008-06-02 10:40 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: cmm, linux-ext4

On Mon 02-06-08 15:22:22, Aneesh Kumar K.V wrote:
> Hi Jan,
> 
> 
> On Mon, Jun 02, 2008 at 11:31:06AM +0200, Jan Kara wrote:
> >   Hi Aneesh,
> > 
> >   Thanks for the patch but I though we decided to do this a bit differently -
> > see below.
> 
> Please feel free to merge the changes back to the lock inversion
> patches. I sent it as a separate the patches to make it easier for review.
  OK, thanks. I'll look into this.

> > >  
> > > +	if (walk_page_buffers(NULL, page_bufs, 0,
> > > +				len, NULL, ext4_bh_unmapped_or_delay)) {
> > > +		printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
> > > +								__func__);
> > > +		BUG();
> > > +	}
> > > +	/* FIXME!! do we need to call prepare_write for a mapped buffer */
> >   This can go to ext4_journalled_writepage(). What is actually this FIXME
> > about? I'm not sure I get it...
> > 
> I was wondering whether we need to call prepare_write in writepage ?. We
> are not allocating any new blocks in writepage with these changes.
  Ah, I see. I'm only not sure whether we can rely on all buffers in the
page being uptodate... Otherwise I think block_prepare_write() is not
needed.

> > >  	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
> > >  	if (ret != 0)
> > >  		goto out_unlock;
> > >  
> > > -	page_bufs = page_buffers(page);
> > >  	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
> > >  								bget_one);
> > >  	/* As soon as we unlock the page, it can go away, but we have
> > > @@ -1671,14 +1713,13 @@ static int __ext4_journalled_writepage(struct page *page,
> > >  static int ext4_journalled_writepage(struct page *page,
> > >  				struct writeback_control *wbc)
> > >  {
> > > +	BUG_ON(!page_has_buffers(page));
> > > +
> > >  	if (ext4_journal_current_handle())
> > >  		goto no_write;
> > >  
> > > -	if (!page_has_buffers(page) || PageChecked(page)) {
> > > -		/*
> > > -		 * It's mmapped pagecache.  Add buffers and journal it.  There
> > > -		 * doesn't seem much point in redirtying the page here.
> > > -		 */
> > > +	if (PageChecked(page)) {
> > > +		/* dirty pages in data=journal mode */
> > >  		ClearPageChecked(page);
> > >  		return __ext4_journalled_writepage(page, wbc);
> > >  	} else {
> > > @@ -3520,6 +3561,96 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
> > >  	return err;
> > >  }
> > >  
> > > +static int __ext4_journalled_allocpage(struct page *page,
> > > +				struct writeback_control *wbc)
> > > +{
> > > +	int ret = 0, err;
> > > +	handle_t *handle = NULL;
> > > +	struct address_space *mapping = page->mapping;
> > > +	struct inode *inode = mapping->host;
> > > +	struct buffer_head *page_bufs;
> > > +
> > > +	/* if alloc we are called after statring a journal */
> > > +	handle = ext4_journal_current_handle();
> > > +	BUG_ON(!handle);
> > > +
> > > +	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
> > > +	if (ret != 0)
> > > +		goto out_unlock;
> > > +
> > > +	/* FIXME!! should we do a bget_one */
> > > +	page_bufs = page_buffers(page);
> > > +	ret = walk_page_buffers(handle, page_bufs, 0,
> > > +			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
> > > +
> I also have a FIXME here. I am not sure whether unlocking the page have
> some effect. Can you verify this ?
  Well, you unlock the page only after you're completely done with it as
far as I read the code. So that is correct. You only need to get references
to buffers when you need to access them after you unlock the page.

> > > +	err = walk_page_buffers(handle, page_bufs, 0,
> > > +				PAGE_CACHE_SIZE, NULL, write_end_fn);
> > > +	if (ret == 0)
> > > +		ret = err;
> > > +	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
> > > +
> > > +out_unlock:
> > > +	unlock_page(page);
> > > +	return ret;
> > > +}
> > > +

									Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2008-06-02 10:40 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-05-21 17:44 delalloc and journal locking order inversion fixes Aneesh Kumar K.V
2008-05-21 17:44 ` [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage Aneesh Kumar K.V
2008-05-21 17:44   ` [PATCH] ext4: inverse locking ordering of page_lock and transaction start in delalloc Aneesh Kumar K.V
2008-05-21 17:44     ` [PATCH] ext4: Fix delalloc sync hang with journal lock inversion Aneesh Kumar K.V
2008-05-21 17:44       ` [PATCH] ext4: Inverse locking order of page_lock and transaction start Aneesh Kumar K.V
2008-05-21 17:44         ` [PATCH] ext4: Use page_mkwrite vma_operations to get mmap write notification Aneesh Kumar K.V
2008-05-22 10:25       ` [PATCH] ext4: Fix delalloc sync hang with journal lock inversion Aneesh Kumar K.V
2008-05-22 17:58         ` Mingming
2008-05-22 18:23           ` Aneesh Kumar K.V
2008-05-22 19:45             ` Mingming
2008-05-22 18:10       ` Mingming
2008-05-22 18:26         ` Aneesh Kumar K.V
2008-05-22 19:26           ` Mingming
  -- strict thread matches above, loose matches on Subject: below --
2008-05-30 13:39 [PATCH -v2] delalloc and journal locking order inversion fixes Aneesh Kumar K.V
2008-05-30 13:39 ` [PATCH] ext4: Use page_mkwrite vma_operations to get mmap write notification Aneesh Kumar K.V
2008-05-30 13:39   ` [PATCH] ext4: Inverse locking order of page_lock and transaction start Aneesh Kumar K.V
2008-05-30 13:39     ` [PATCH] vfs: Move mark_inode_dirty() from under page lock in generic_write_end() Aneesh Kumar K.V
2008-05-30 13:39       ` [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage Aneesh Kumar K.V
2008-06-02  9:31         ` Jan Kara
2008-06-02  9:52           ` Aneesh Kumar K.V
2008-06-02 10:40             ` Jan Kara

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).