[RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full

public inbox for linux-ext4@vger.kernel.org
 help / color / mirror / Atom feed

* [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
@ 2008-02-21 19:17 Aneesh Kumar K.V
  2008-02-21 21:07 ` Mingming Cao
  0 siblings, 1 reply; 12+ messages in thread
From: Aneesh Kumar K.V @ 2008-02-21 19:17 UTC (permalink / raw)
  To: linux-ext4, Mingming Cao

This patch has had very minimal testing. I am sending it to get
feedback on the approach. The skip_index section in the patch below
is ugly. Any suggestions for improving it?

NOTE: the ext4_ext_convert_to_initialized error path has some bugs. It
doesn't reset the extent information in case of error. But that is
for another patch.


>From 6a73edd4dbb32344e6a83ebdc07edd0e96d376bd Mon Sep 17 00:00:00 2001
From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 21 Feb 2008 23:57:38 +0530
Subject: [PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full

A write to a preallocated area causes the uninitialized extent to be split into an
initialized and an uninitialized extent. If we don't have space to add the new extent
information, then instead of returning an error, convert the existing uninitialized
extent to an initialized one. We need to zero out the blocks corresponding to the
extent to prevent stale data from reaching userspace.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/extents.c |  135 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 133 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b179b03..d37c14e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2137,6 +2137,103 @@ void ext4_ext_release(struct super_block *sb)
 #endif
 }
 
+static int ext4_ext_zero_out(handle_t *handle, struct inode *inode,
+				ext4_lblk_t iblock, struct ext4_extent *ex)
+{
+	ext4_lblk_t ee_block;
+	unsigned int ee_len, blkcount, blocksize;
+	loff_t pos;
+	pgoff_t index, skip_index;
+	unsigned long offset;
+	struct page *page;
+	struct address_space *mapping = inode->i_mapping;
+	struct buffer_head *head, *bh;
+	int err = 0;
+
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = blkcount = ext4_ext_get_actual_len(ex);
+	blocksize = inode->i_sb->s_blocksize;
+
+	/*
+	 * find the skip index. We can't call __grab_cache_page for this
+	 * because we are in the writeout of this page and we already have
+	 * taken the lock on this page
+	 */
+	pos = iblock <<  inode->i_blkbits;
+	skip_index = pos >> PAGE_CACHE_SHIFT;
+
+	while (blkcount) {
+		pos = (ee_block  + ee_len - blkcount) << inode->i_blkbits;
+		index = pos >> PAGE_CACHE_SHIFT;
+		offset = (pos & (PAGE_CACHE_SIZE - 1));
+		if (index == skip_index) {
+			/* Page will already be locked in the writepage */
+			read_lock_irq(&mapping->tree_lock);
+			page = radix_tree_lookup(&mapping->page_tree, index);
+			read_unlock_irq(&mapping->tree_lock);
+			if (page)
+				page_cache_get(page);
+			else
+				return -ENOMEM;
+		} else {
+			page = __grab_cache_page(mapping, index);
+			if (!page)
+				return -ENOMEM;
+		}
+
+		if (!page_has_buffers(page))
+			create_empty_buffers(page, blocksize, 0);
+
+		head = page_buffers(page);
+		/* Look for the buffer_head which map the block */
+		bh = head;
+		while (offset > 0) {
+			bh = bh->b_this_page;
+			offset -= blocksize;
+		}
+		offset = (pos & (PAGE_CACHE_SIZE - 1));
+
+		/* Now write all the buffer_heads in the page */
+		do {
+			set_buffer_uptodate(bh);
+			if (ext4_should_journal_data(inode)) {
+				err = ext4_journal_get_write_access(handle, bh);
+				/* do we have that many credits ??*/
+				if (err)
+					goto err_out;
+			}
+			zero_user(page, offset, blocksize);
+			offset += blocksize;
+			if (ext4_should_journal_data(inode)) {
+				err = ext4_journal_dirty_metadata(handle, bh);
+				if (err)
+					goto err_out;
+			} else {
+				if (ext4_should_order_data(inode)) {
+					err = ext4_journal_dirty_data(handle,
+									bh);
+					if (err)
+						goto err_out;
+				}
+				mark_buffer_dirty(bh);
+			}
+
+			bh = bh->b_this_page;
+			blkcount--;
+		} while ((bh != head) && (blkcount > 0));
+		/* only unlock if we have locked */
+		if (index != skip_index)
+			unlock_page(page);
+		page_cache_release(page);
+	}
+
+	return 0;
+err_out:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
 /*
  * This function is called by ext4_ext_get_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
@@ -2153,7 +2250,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 						ext4_lblk_t iblock,
 						unsigned long max_blocks)
 {
-	struct ext4_extent *ex, newex;
+	struct ext4_extent *ex, newex, zeroout_ex;
 	struct ext4_extent *ex1 = NULL;
 	struct ext4_extent *ex2 = NULL;
 	struct ext4_extent *ex3 = NULL;
@@ -2172,6 +2269,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	allocated = ee_len - (iblock - ee_block);
 	newblock = iblock - ee_block + ext_pblock(ex);
 	ex2 = ex;
+	zeroout_ex.ee_block = ex->ee_block;
+	zeroout_ex.ee_len   = cpu_to_le16(ee_len);
+	ext4_ext_store_pblock(&zeroout_ex, ext_pblock(ex));
 
 	err = ext4_ext_get_access(handle, inode, path + depth);
 	if (err)
@@ -2200,13 +2300,32 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
 		ext4_ext_mark_uninitialized(ex3);
 		err = ext4_ext_insert_extent(handle, inode, path, ex3);
-		if (err)
+		if (err == -ENOSPC) {
+			err =  ext4_ext_zero_out(handle, inode,
+							iblock, &zeroout_ex);
+			if (err)
+				goto out;
+			/* update the extent length and mark as initialized */
+			ex->ee_block = zeroout_ex.ee_block;
+			ex->ee_len   = zeroout_ex.ee_len;
+			ext4_ext_store_pblock(ex, ext_pblock(&zeroout_ex));
+			ext4_ext_dirty(handle, inode, path + depth);
+			return le16_to_cpu(ex->ee_len);
+
+		} else if (err)
 			goto out;
+
 		/*
 		 * The depth, and hence eh & ex might change
 		 * as part of the insert above.
 		 */
 		newdepth = ext_depth(inode);
+		/*
+		 * update the extent length after successfull insert of the
+		 * split extent
+		 */
+		zeroout_ex.ee_len = cpu_to_le16(ee_len -
+						ext4_ext_get_actual_len(ex3));
 		if (newdepth != depth) {
 			depth = newdepth;
 			ext4_ext_drop_refs(path);
@@ -2281,6 +2400,18 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	goto out;
 insert:
 	err = ext4_ext_insert_extent(handle, inode, path, &newex);
+	if (err == -ENOSPC) {
+		err =  ext4_ext_zero_out(handle, inode, iblock, &zeroout_ex);
+		if (err)
+			goto out;
+		/* update the extent length and mark as initialized */
+		ex->ee_block = zeroout_ex.ee_block;
+		ex->ee_len   = zeroout_ex.ee_len;
+		ext4_ext_store_pblock(ex, ext_pblock(&zeroout_ex));
+		ext4_ext_dirty(handle, inode, path + depth);
+		return le16_to_cpu(ex->ee_len);
+	}
+
 out:
 	return err ? err : allocated;
 }
-- 
1.5.4.1.97.g40aab-dirty

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
  2008-02-21 19:17 [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full Aneesh Kumar K.V
@ 2008-02-21 21:07 ` Mingming Cao
  2008-02-22 14:31   ` Aneesh Kumar K.V
  0 siblings, 1 reply; 12+ messages in thread
From: Mingming Cao @ 2008-02-21 21:07 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linux-ext4

Hi Aneesh,

It's a good start, a few comments below..

On Fri, 2008-02-22 at 00:47 +0530, Aneesh Kumar K.V wrote:
> From 6a73edd4dbb32344e6a83ebdc07edd0e96d376bd Mon Sep 17 00:00:00 2001
> From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> Date: Thu, 21 Feb 2008 23:57:38 +0530
> Subject: [PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
> 
> A write to prealloc area cause the split of unititalized extent into a initialized
> and uninitialized extent. If we don't have space to add new extent information instead
> of returning error convert the existing uninitialized extent to initialized one. We
> need to zero out the blocks corresponding to the extent to prevent wrong data reaching
> userspace.
> 

> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  fs/ext4/extents.c |  135 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 133 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index b179b03..d37c14e 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2137,6 +2137,103 @@ void ext4_ext_release(struct super_block *sb)
>  #endif
>  }
> 
> +static int ext4_ext_zero_out(handle_t *handle, struct inode *inode,
> +				ext4_lblk_t iblock, struct ext4_extent *ex)
> +{
> +	ext4_lblk_t ee_block;
> +	unsigned int ee_len, blkcount, blocksize;
> +	loff_t pos;
> +	pgoff_t index, skip_index;
> +	unsigned long offset;
> +	struct page *page;
> +	struct address_space *mapping = inode->i_mapping;
> +	struct buffer_head *head, *bh;
> +	int err = 0;
> +
> +	ee_block = le32_to_cpu(ex->ee_block);
> +	ee_len = blkcount = ext4_ext_get_actual_len(ex);
> +	blocksize = inode->i_sb->s_blocksize;
> +
> +	/*
> +	 * find the skip index. We can't call __grab_cache_page for this
> +	 * because we are in the writeout of this page and we already have
> +	 * taken the lock on this page
> +	 */
> +	pos = iblock <<  inode->i_blkbits;
> +	skip_index = pos >> PAGE_CACHE_SHIFT;
> +

We should not need to look up the page cache to do the zero-out. The
approach I had in mind is to just zero the blocks out on disk.

> +	while (blkcount) {
> +		pos = (ee_block  + ee_len - blkcount) << inode->i_blkbits;
> +		index = pos >> PAGE_CACHE_SHIFT;
> +		offset = (pos & (PAGE_CACHE_SIZE - 1));
> +		if (index == skip_index) {
> +			/* Page will already be locked in the writepage */
> +			read_lock_irq(&mapping->tree_lock);
> +			page = radix_tree_lookup(&mapping->page_tree, index);
> +			read_unlock_irq(&mapping->tree_lock);
> +			if (page)
> +				page_cache_get(page);
> +			else
> +				return -ENOMEM;
> +		} else {
> +			page = __grab_cache_page(mapping, index);
> +			if (!page)
> +				return -ENOMEM;
> +		}
> +

The page is already locked before get_block() is called via writepage(),
isn't it? And the journal transaction has already been started...


> +		if (!page_has_buffers(page))
> +			create_empty_buffers(page, blocksize, 0);
> +
> +		head = page_buffers(page);
> +		/* Look for the buffer_head which map the block */
> +		bh = head;
> +		while (offset > 0) {
> +			bh = bh->b_this_page;
> +			offset -= blocksize;
> +		}
> +		offset = (pos & (PAGE_CACHE_SIZE - 1));
> +
> +		/* Now write all the buffer_heads in the page */
> +		do {
> +			set_buffer_uptodate(bh);
> +			if (ext4_should_journal_data(inode)) {
> +				err = ext4_journal_get_write_access(handle, bh);
> +				/* do we have that many credits ??*/
> +				if (err)
> +					goto err_out;
> +			}
> +			zero_user(page, offset, blocksize);

Uh-oh, you are trying to zero out the pages in the page cache; that
seems wrong to me. By the time get_block() is called from writepages(),
the pages should have meaningful content that needs to be flushed to disk;
zeroing the pages out will lose that data.

> +			offset += blocksize;
> +			if (ext4_should_journal_data(inode)) {
> +				err = ext4_journal_dirty_metadata(handle, bh);
> +				if (err)
> +					goto err_out;
> +			} else {
> +				if (ext4_should_order_data(inode)) {
> +					err = ext4_journal_dirty_data(handle,
> +									bh);
> +					if (err)
> +						goto err_out;
> +				}
> +				mark_buffer_dirty(bh);
> +			}
> +
> +			bh = bh->b_this_page;
> +			blkcount--;
> +		} while ((bh != head) && (blkcount > 0));
> +		/* only unlock if we have locked */
> +		if (index != skip_index)
> +			unlock_page(page);
> +		page_cache_release(page);
> +	}
> +
> +	return 0;
> +err_out:
> +	unlock_page(page);
> +	page_cache_release(page);
> +	return err;
> +}
> +

I was thinking of simply creating a new bh, zeroing out the bh, mapping
the bh to the block number to be zeroed out, and lastly submitting the I/O
via ll_rw_block. It may be more efficient to do this via bio (perhaps
cooking up a bio with zeroed-out pages and calling submit_bio), but I have
not looked at it very closely. Just throwing out my thoughts.

Mingming
>  /* 
>   * This function is called by ext4_ext_get_blocks() if someone tries to write
>   * to an uninitialized extent. It may result in splitting the uninitialized
> @@ -2153,7 +2250,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  						ext4_lblk_t iblock,
>  						unsigned long max_blocks)
>  {
> -	struct ext4_extent *ex, newex;
> +	struct ext4_extent *ex, newex, zeroout_ex;
>  	struct ext4_extent *ex1 = NULL;
>  	struct ext4_extent *ex2 = NULL;
>  	struct ext4_extent *ex3 = NULL;
> @@ -2172,6 +2269,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  	allocated = ee_len - (iblock - ee_block);
>  	newblock = iblock - ee_block + ext_pblock(ex);
>  	ex2 = ex;
> +	zeroout_ex.ee_block = ex->ee_block;
> +	zeroout_ex.ee_len   = cpu_to_le16(ee_len);
> +	ext4_ext_store_pblock(&zeroout_ex, ext_pblock(ex));
> 
>  	err = ext4_ext_get_access(handle, inode, path + depth);
>  	if (err)
> @@ -2200,13 +2300,32 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
>  		ext4_ext_mark_uninitialized(ex3);
>  		err = ext4_ext_insert_extent(handle, inode, path, ex3);
> -		if (err)
> +		if (err == -ENOSPC) {
> +			err =  ext4_ext_zero_out(handle, inode,
> +							iblock, &zeroout_ex);
> +			if (err)
> +				goto out;
> +			/* update the extent length and mark as initialized */
> +			ex->ee_block = zeroout_ex.ee_block;
> +			ex->ee_len   = zeroout_ex.ee_len;
> +			ext4_ext_store_pblock(ex, ext_pblock(&zeroout_ex));
> +			ext4_ext_dirty(handle, inode, path + depth);
> +			return le16_to_cpu(ex->ee_len);
> +
> +		} else if (err)
>  			goto out;
> +
>  		/*
>  		 * The depth, and hence eh & ex might change
>  		 * as part of the insert above.
>  		 */
>  		newdepth = ext_depth(inode);
> +		/*
> +		 * update the extent length after successfull insert of the
> +		 * split extent
> +		 */
> +		zeroout_ex.ee_len = cpu_to_le16(ee_len -
> +						ext4_ext_get_actual_len(ex3));
>  		if (newdepth != depth) {
>  			depth = newdepth;
>  			ext4_ext_drop_refs(path);
> @@ -2281,6 +2400,18 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  	goto out;
>  insert:
>  	err = ext4_ext_insert_extent(handle, inode, path, &newex);
> +	if (err == -ENOSPC) {
> +		err =  ext4_ext_zero_out(handle, inode, iblock, &zeroout_ex);
> +		if (err)
> +			goto out;
> +		/* update the extent length and mark as initialized */
> +		ex->ee_block = zeroout_ex.ee_block;
> +		ex->ee_len   = zeroout_ex.ee_len;
> +		ext4_ext_store_pblock(ex, ext_pblock(&zeroout_ex));
> +		ext4_ext_dirty(handle, inode, path + depth);
> +		return le16_to_cpu(ex->ee_len);
> +	}
> +
>  out:
>  	return err ? err : allocated;
>  }

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
  2008-02-21 21:07 ` Mingming Cao
@ 2008-02-22 14:31   ` Aneesh Kumar K.V
  2008-02-22 15:42     ` Aneesh Kumar K.V
  0 siblings, 1 reply; 12+ messages in thread
From: Aneesh Kumar K.V @ 2008-02-22 14:31 UTC (permalink / raw)
  To: Mingming Cao; +Cc: linux-ext4

On Thu, Feb 21, 2008 at 01:07:17PM -0800, Mingming Cao wrote:
> Hi Aneesh,
> 
> It's a good start, a few comments below..
> 

.....


> > +			page = __grab_cache_page(mapping, index);
> > +			if (!page)
> > +				return -ENOMEM;
> > +		}
> > +
> 
> I the page is already locked before calling get_block() via writepage(),
> isn't it? and the journal transaction already started...
> 

It would be via write_begin or writepage. But both callbacks lock
the page before they call get_block for all the blocks corresponding to
the page.


> 
> > +		if (!page_has_buffers(page))
> > +			create_empty_buffers(page, blocksize, 0);
> > +
> > +		head = page_buffers(page);
> > +		/* Look for the buffer_head which map the block */
> > +		bh = head;
> > +		while (offset > 0) {
> > +			bh = bh->b_this_page;
> > +			offset -= blocksize;
> > +		}
> > +		offset = (pos & (PAGE_CACHE_SIZE - 1));
> > +
> > +		/* Now write all the buffer_heads in the page */
> > +		do {
> > +			set_buffer_uptodate(bh);
> > +			if (ext4_should_journal_data(inode)) {
> > +				err = ext4_journal_get_write_access(handle, bh);
> > +				/* do we have that many credits ??*/
> > +				if (err)
> > +					goto err_out;
> > +			}
> > +			zero_user(page, offset, blocksize);
> 
> Ah oh, you are trying to zero out the pages in the page cache, that's
> seems wrong to me. By the time get_block() is called from writepages(),
> the pages should have meaningful content that needs to flush to disk,
> zero the pages out will lost the data.
> 

It is write_begin. In the write_begin case the pages don't have the content. By the
time we reach write begin the page is supposed to have buffer heads that
are already mapped. So we won't end up calling get_block. Even in the case of
mmap with the page_mkwrite change we would have called the write_begin equivalent
before the writepage.


> > +			offset += blocksize;
> > +			if (ext4_should_journal_data(inode)) {
> > +				err = ext4_journal_dirty_metadata(handle, bh);
> > +				if (err)
> > +					goto err_out;
> > +			} else {
> > +				if (ext4_should_order_data(inode)) {
> > +					err = ext4_journal_dirty_data(handle,
> > +									bh);
> > +					if (err)
> > +						goto err_out;
> > +				}
> > +				mark_buffer_dirty(bh);
> > +			}
> > +
> > +			bh = bh->b_this_page;
> > +			blkcount--;
> > +		} while ((bh != head) && (blkcount > 0));
> > +		/* only unlock if we have locked */
> > +		if (index != skip_index)
> > +			unlock_page(page);
> > +		page_cache_release(page);
> > +	}
> > +
> > +	return 0;
> > +err_out:
> > +	unlock_page(page);
> > +	page_cache_release(page);
> > +	return err;
> > +}
> > +
> 
> I was thinking just simply create a new bh, zero out the bh, then map
> the bh with the block number to zero out, lastly submit a IO via
> ll_rw_block. It maybe more efficient to do this via bio(perhaps cooking
> a bio with zeroed out pages and submit_bio) but I have not look very
> closely to it. Just throw out my thoughts.
> 

But we would still need pages. A buffer head needs to have a mapped page
(b_page). Also, if we don't take the page from the page cache, then we would
have to wait for the I/O to complete using wait_on_buffer before we can
update the extent information. Using the page cache also plugs in nicely
with the different journalling modes.

-aneesh

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
  2008-02-22 14:31   ` Aneesh Kumar K.V
@ 2008-02-22 15:42     ` Aneesh Kumar K.V
  2008-02-22 17:28       ` Mingming Cao
  0 siblings, 1 reply; 12+ messages in thread
From: Aneesh Kumar K.V @ 2008-02-22 15:42 UTC (permalink / raw)
  To: Mingming Cao; +Cc: linux-ext4

On Fri, Feb 22, 2008 at 08:01:28PM +0530, Aneesh Kumar K.V wrote:
> > > +
> > > +		/* Now write all the buffer_heads in the page */
> > > +		do {
> > > +			set_buffer_uptodate(bh);
> > > +			if (ext4_should_journal_data(inode)) {
> > > +				err = ext4_journal_get_write_access(handle, bh);
> > > +				/* do we have that many credits ??*/
> > > +				if (err)
> > > +					goto err_out;
> > > +			}
> > > +			zero_user(page, offset, blocksize);
> > 
> > Ah oh, you are trying to zero out the pages in the page cache, that's
> > seems wrong to me. By the time get_block() is called from writepages(),
> > the pages should have meaningful content that needs to flush to disk,
> > zero the pages out will lost the data.
> > 
> 
> It is writebegin.  In case of writebegin the pages doesn't have the content. By the
> time we reach write begin the page is supposed to have buffer heads that
> are alreayd mapped. So we won't end up calling get_blk. Even in case of
> mmap with page_mkwrite change we would have called writebegin equivalent
> before the writepage.

I guess the above para is confusing.

The callback is actually write_begin. In the write_begin case the page
doesn't have the content. With respect to writepage, by the time we call the
callback the buffer_heads related to the page would already be mapped,
so we won't end up calling get_block.


-aneesh

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
  2008-02-22 15:42     ` Aneesh Kumar K.V
@ 2008-02-22 17:28       ` Mingming Cao
  0 siblings, 0 replies; 12+ messages in thread
From: Mingming Cao @ 2008-02-22 17:28 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linux-ext4

Aneesh Kumar K.V wrote:
> On Fri, Feb 22, 2008 at 08:01:28PM +0530, Aneesh Kumar K.V wrote:
>>>> +
>>>> +		/* Now write all the buffer_heads in the page */
>>>> +		do {
>>>> +			set_buffer_uptodate(bh);
>>>> +			if (ext4_should_journal_data(inode)) {
>>>> +				err = ext4_journal_get_write_access(handle, bh);
>>>> +				/* do we have that many credits ??*/
>>>> +				if (err)
>>>> +					goto err_out;
>>>> +			}
>>>> +			zero_user(page, offset, blocksize);
>>> Ah oh, you are trying to zero out the pages in the page cache, that's
>>> seems wrong to me. By the time get_block() is called from writepages(),
>>> the pages should have meaningful content that needs to flush to disk,
>>> zero the pages out will lost the data.
>>>
>> It is writebegin.  In case of writebegin the pages doesn't have the content. By the
>> time we reach write begin the page is supposed to have buffer heads that
>> are alreayd mapped. So we won't end up calling get_blk. Even in case of
>> mmap with page_mkwrite change we would have called writebegin equivalent
>> before the writepage.
> 
> I guess the above para is confusing.
> 
> The callback is actually writebegin.In case of writebegin the page
> doesn't have the content. With respect to writepage by the time we call the
> callback the buffer_heads related to the page would already be mapped.
> So we won't end up calling get_blk.
> 
> 

Ah, right, the callback at this moment is from write_begin(), as
get_block() with create==1 is called then (with the recent fix :-)).

But I am thinking from the delayed allocation point of view, since I have been
looking at it recently. :-) get_block() with create==1 will be deferred to
writepages time, and then I am afraid this will be broken.

I could be wrong, but the code seems to work only for buffered I/O. What
about DIO writes to the uninit extents? Since there is no mapping in the
page cache, DIO starts calling get_block() with create==1. What
happens in this case? I have a feeling this is also broken, isn't it?

Regards,

Mingming

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
  2008-02-28 18:05 ` [RFC][PATCH] ext4: Fix fallocate error path Aneesh Kumar K.V
@ 2008-02-28 18:05   ` Aneesh Kumar K.V
  2008-02-28 23:14     ` Mingming Cao
  0 siblings, 1 reply; 12+ messages in thread
From: Aneesh Kumar K.V @ 2008-02-28 18:05 UTC (permalink / raw)
  To: cmm; +Cc: linux-ext4, Aneesh Kumar K.V

A write to a preallocated area causes the uninitialized extent to be split into an
initialized and an uninitialized extent. If we don't have space to add the new extent
information, then instead of returning an error, convert the existing uninitialized
extent to an initialized one. We need to zero out the blocks corresponding to the
extent to prevent stale data from reaching userspace.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/extents.c |  164 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 157 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d315cc1..39a8beb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2136,6 +2136,137 @@ void ext4_ext_release(struct super_block *sb)
 #endif
 }
 
+static int extend_credit_for_zeroout(handle_t *handle, struct inode *inode)
+{
+	int retval = 0, needed;
+
+	if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+		return 0;
+
+	/* number of filesytem blocks in one page */
+	needed = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	if (ext4_journal_extend(handle, needed) != 0)
+		retval = ext4_journal_restart(handle, needed);
+
+	return retval;
+}
+
+/* FIXME!! we need to try to merge to left or right after zerout  */
+static int ext4_ext_zeroout(handle_t *handle, struct inode *inode,
+				ext4_lblk_t iblock, struct ext4_extent *ex)
+{
+	ext4_lblk_t ee_block;
+	unsigned int ee_len, blkcount, blocksize;
+	loff_t pos;
+	pgoff_t index, skip_index;
+	unsigned long offset;
+	struct page *page;
+	struct address_space *mapping = inode->i_mapping;
+	struct buffer_head *head, *bh;
+	int err = 0;
+
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = blkcount = ext4_ext_get_actual_len(ex);
+	blocksize = inode->i_sb->s_blocksize;
+
+	/*
+	 * find the skip index. We can't call __grab_cache_page for this
+	 * because we are in the writeout of this page and we already have
+	 * taken the lock on this page
+	 */
+	pos = iblock <<  inode->i_blkbits;
+	skip_index = pos >> PAGE_CACHE_SHIFT;
+
+	while (blkcount) {
+		pos = (ee_block  + ee_len - blkcount) << inode->i_blkbits;
+		index = pos >> PAGE_CACHE_SHIFT;
+		offset = (pos & (PAGE_CACHE_SIZE - 1));
+		if (index == skip_index) {
+			/* Page will already be locked via
+			 * write_begin or writepage
+			 */
+			read_lock_irq(&mapping->tree_lock);
+			page = radix_tree_lookup(&mapping->page_tree, index);
+			read_unlock_irq(&mapping->tree_lock);
+			if (page)
+				page_cache_get(page);
+			else
+				return -ENOMEM;
+		} else {
+			page = __grab_cache_page(mapping, index);
+			if (!page)
+				return -ENOMEM;
+		}
+
+		if (!page_has_buffers(page))
+			create_empty_buffers(page, blocksize, 0);
+
+		/* extent the credit in the journal */
+		extend_credit_for_zeroout(handle, inode);
+
+		head = page_buffers(page);
+		/* Look for the buffer_head which map the block */
+		bh = head;
+		while (offset > 0) {
+			bh = bh->b_this_page;
+			offset -= blocksize;
+		}
+		offset = (pos & (PAGE_CACHE_SIZE - 1));
+
+		/* Now write all the buffer_heads in the page */
+		do {
+			if (ext4_should_journal_data(inode)) {
+				err = ext4_journal_get_write_access(handle, bh);
+				if (err)
+					goto err_out;
+			}
+			if (buffer_new(bh)) {
+				unmap_underlying_metadata(bh->b_bdev,
+								bh->b_blocknr);
+				if (!PageUptodate(page))
+					zero_user(page, offset, blocksize);
+				clear_buffer_new(bh);
+			}
+			/* Now mark the buffer uptodate. since we
+			 * have zero out the buffer
+			 */
+			set_buffer_uptodate(bh);
+			offset += blocksize;
+			if (ext4_should_journal_data(inode)) {
+				err = ext4_journal_dirty_metadata(handle, bh);
+				if (err)
+					goto err_out;
+			} else {
+				if (ext4_should_order_data(inode)) {
+					err = ext4_journal_dirty_data(handle,
+									bh);
+					if (err)
+						goto err_out;
+				}
+				mark_buffer_dirty(bh);
+			}
+
+			bh = bh->b_this_page;
+			blkcount--;
+		} while ((bh != head) && (blkcount > 0));
+		/* Now that we zeroed the non uptodate
+		 * page mark the pge uptodate
+		 */
+		SetPageUptodate(page);
+		/* only unlock if we have locked */
+		if (index != skip_index)
+			unlock_page(page);
+		page_cache_release(page);
+	}
+
+	return 0;
+err_out:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
 /*
  * This function is called by ext4_ext_get_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
@@ -2202,14 +2333,20 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
 		ext4_ext_mark_uninitialized(ex3);
 		err = ext4_ext_insert_extent(handle, inode, path, ex3);
-		if (err) {
+		if (err == -ENOSPC) {
+			err =  ext4_ext_zeroout(handle, inode,
+							iblock, &orig_ex);
+			if (err)
+				goto fix_extent_len;
+			/* update the extent length and mark as initialized */
 			ex->ee_block = orig_ex.ee_block;
 			ex->ee_len   = orig_ex.ee_len;
 			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
-			ext4_ext_mark_uninitialized(ex);
 			ext4_ext_dirty(handle, inode, path + depth);
-			goto out;
-		}
+			return le16_to_cpu(ex->ee_len);
+
+		} else if (err)
+			goto fix_extent_len;
 		/*
 		 * The depth, and hence eh & ex might change
 		 * as part of the insert above.
@@ -2295,15 +2432,28 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	goto out;
 insert:
 	err = ext4_ext_insert_extent(handle, inode, path, &newex);
-	if (err) {
+	if (err == -ENOSPC) {
+		err =  ext4_ext_zeroout(handle, inode, iblock, &orig_ex);
+		if (err)
+			goto fix_extent_len;
+		/* update the extent length and mark as initialized */
 		ex->ee_block = orig_ex.ee_block;
 		ex->ee_len   = orig_ex.ee_len;
 		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
-		ext4_ext_mark_uninitialized(ex);
 		ext4_ext_dirty(handle, inode, path + depth);
-	}
+		return le16_to_cpu(ex->ee_len);
+	} else if (err)
+		goto fix_extent_len;
 out:
 	return err ? err : allocated;
+
+fix_extent_len:
+	ex->ee_block = orig_ex.ee_block;
+	ex->ee_len   = orig_ex.ee_len;
+	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+	ext4_ext_mark_uninitialized(ex);
+	ext4_ext_dirty(handle, inode, path + depth);
+	return err;
 }
 
 /*
-- 
1.5.4.3.325.g6d216.dirty


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
  2008-02-28 18:05   ` [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full Aneesh Kumar K.V
@ 2008-02-28 23:14     ` Mingming Cao
  2008-02-29 11:09       ` Aneesh Kumar K.V
  2008-02-29 18:05       ` Andreas Dilger
  0 siblings, 2 replies; 12+ messages in thread
From: Mingming Cao @ 2008-02-28 23:14 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linux-ext4

On Thu, 2008-02-28 at 23:35 +0530, Aneesh Kumar K.V wrote:
> A write to prealloc area cause the split of unititalized extent into a initialized
> and uninitialized extent. If we don't have space to add new extent information instead
> of returning error convert the existing uninitialized extent to initialized one. We
> need to zero out the blocks corresponding to the extent to prevent wrong data reaching
> userspace.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  fs/ext4/extents.c |  164 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 157 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index d315cc1..39a8beb 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2136,6 +2136,137 @@ void ext4_ext_release(struct super_block *sb)
>  #endif
>  }
> 
> +static int extend_credit_for_zeroout(handle_t *handle, struct inode *inode)
> +{
> +	int retval = 0, needed;
> +
> +	if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
> +		return 0;
> +
> +	/* number of filesytem blocks in one page */
> +	needed = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
> +
> +	if (ext4_journal_extend(handle, needed) != 0)
> +		retval = ext4_journal_restart(handle, needed);
> +
> +	return retval;
> +}
> +
> +/* FIXME!! we need to try to merge to left or right after zerout  */
> +static int ext4_ext_zeroout(handle_t *handle, struct inode *inode,
> +				ext4_lblk_t iblock, struct ext4_extent *ex)
> +{
> +	ext4_lblk_t ee_block;
> +	unsigned int ee_len, blkcount, blocksize;
> +	loff_t pos;
> +	pgoff_t index, skip_index;
> +	unsigned long offset;
> +	struct page *page;
> +	struct address_space *mapping = inode->i_mapping;
> +	struct buffer_head *head, *bh;
> +	int err = 0;
> +
> +	ee_block = le32_to_cpu(ex->ee_block);
> +	ee_len = blkcount = ext4_ext_get_actual_len(ex);
> +	blocksize = inode->i_sb->s_blocksize;
> +
> +	/*
> +	 * find the skip index. We can't call __grab_cache_page for this
> +	 * because we are in the writeout of this page and we already have
> +	 * taken the lock on this page
> +	 */
> +	pos = iblock <<  inode->i_blkbits;
> +	skip_index = pos >> PAGE_CACHE_SHIFT;
> +
> +	while (blkcount) {
> +		pos = (ee_block  + ee_len - blkcount) << inode->i_blkbits;
> +		index = pos >> PAGE_CACHE_SHIFT;
> +		offset = (pos & (PAGE_CACHE_SIZE - 1));
> +		if (index == skip_index) {
> +			/* Page will already be locked via
> +			 * write_begin or writepage
> +			 */
> +			read_lock_irq(&mapping->tree_lock);
> +			page = radix_tree_lookup(&mapping->page_tree, index);
> +			read_unlock_irq(&mapping->tree_lock);
> +			if (page)
> +				page_cache_get(page);
> +			else
> +				return -ENOMEM;
> +		} else {
> +			page = __grab_cache_page(mapping, index);
> +			if (!page)
> +				return -ENOMEM;
> +		}
> +
> +		if (!page_has_buffers(page))
> +			create_empty_buffers(page, blocksize, 0);
> +
> +		/* extent the credit in the journal */
> +		extend_credit_for_zeroout(handle, inode);
> +
> +		head = page_buffers(page);
> +		/* Look for the buffer_head which map the block */
> +		bh = head;
> +		while (offset > 0) {
> +			bh = bh->b_this_page;
> +			offset -= blocksize;
> +		}
> +		offset = (pos & (PAGE_CACHE_SIZE - 1));
> +
> +		/* Now write all the buffer_heads in the page */
> +		do {
> +			if (ext4_should_journal_data(inode)) {
> +				err = ext4_journal_get_write_access(handle, bh);
> +				if (err)
> +					goto err_out;
> +			}
> +			if (buffer_new(bh)) {
> +				unmap_underlying_metadata(bh->b_bdev,
> +								bh->b_blocknr);
> +				if (!PageUptodate(page))
> +					zero_user(page, offset, blocksize);
> +				clear_buffer_new(bh);
> +			}
> +			/* Now mark the buffer uptodate. since we
> +			 * have zero out the buffer
> +			 */
> +			set_buffer_uptodate(bh);
> +			offset += blocksize;
> +			if (ext4_should_journal_data(inode)) {
> +				err = ext4_journal_dirty_metadata(handle, bh);
> +				if (err)
> +					goto err_out;
> +			} else {
> +				if (ext4_should_order_data(inode)) {
> +					err = ext4_journal_dirty_data(handle,
> +									bh);
> +					if (err)
> +						goto err_out;
> +				}
> +				mark_buffer_dirty(bh);
> +			}
> +
> +			bh = bh->b_this_page;
> +			blkcount--;
> +		} while ((bh != head) && (blkcount > 0));
> +		/* Now that we zeroed the non uptodate
> +		 * page mark the pge uptodate
> +		 */
> +		SetPageUptodate(page);
> +		/* only unlock if we have locked */
> +		if (index != skip_index)
> +			unlock_page(page);
> +		page_cache_release(page);
> +	}
> +
> +	return 0;
> +err_out:
> +	unlock_page(page);
> +	page_cache_release(page);
> +	return err;
> +}
> +

The complexity added to the code to handle this corner case does not seem
worth the effort.

One simple solution is to submit a bio directly to zero out the blocks on
disk, and wait for that to finish before clearing the uninitialized bit. In
the 4K block size case, the max size of an uninitialized extent is 128MB,
and since the blocks are all contiguous on disk, a single IO could do
the job, so the latency should not be too big an issue. After all, when a
filesystem is full, it already performs slowly.

>  /*
>   * This function is called by ext4_ext_get_blocks() if someone tries to write
>   * to an uninitialized extent. It may result in splitting the uninitialized
> @@ -2202,14 +2333,20 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
>  		ext4_ext_mark_uninitialized(ex3);
>  		err = ext4_ext_insert_extent(handle, inode, path, ex3);
> -		if (err) {
> +		if (err == -ENOSPC) {
> +			err =  ext4_ext_zeroout(handle, inode,
> +							iblock, &orig_ex);
> +			if (err)
> +				goto fix_extent_len;
> +			/* update the extent length and mark as initialized */
>  			ex->ee_block = orig_ex.ee_block;
>  			ex->ee_len   = orig_ex.ee_len;
>  			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
> -			ext4_ext_mark_uninitialized(ex);
>  			ext4_ext_dirty(handle, inode, path + depth);
> -			goto out;
> -		}
> +			return le16_to_cpu(ex->ee_len);
> +
> +		} else if (err)
> +			goto fix_extent_len;
>  		/*
>  		 * The depth, and hence eh & ex might change
>  		 * as part of the insert above.
> @@ -2295,15 +2432,28 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  	goto out;
>  insert:
>  	err = ext4_ext_insert_extent(handle, inode, path, &newex);
> -	if (err) {
> +	if (err == -ENOSPC) {
> +		err =  ext4_ext_zeroout(handle, inode, iblock, &orig_ex);
> +		if (err)
> +			goto fix_extent_len;
> +		/* update the extent length and mark as initialized */
>  		ex->ee_block = orig_ex.ee_block;
>  		ex->ee_len   = orig_ex.ee_len;
>  		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
> -		ext4_ext_mark_uninitialized(ex);
>  		ext4_ext_dirty(handle, inode, path + depth);
> -	}
> +		return le16_to_cpu(ex->ee_len);
> +	} else if (err)
> +		goto fix_extent_len;
>  out:
>  	return err ? err : allocated;
> +
> +fix_extent_len:
> +	ex->ee_block = orig_ex.ee_block;
> +	ex->ee_len   = orig_ex.ee_len;
> +	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
> +	ext4_ext_mark_uninitialized(ex);
> +	ext4_ext_dirty(handle, inode, path + depth);
> +	return err;
>  }
> 
It would be nice to detect if the fs is full or almost full before converting
the uninitialized extents. If the total number of free blocks left is
not enough for the split (plan for the worst case, 3 extents added), just
go ahead and zero out the one single chunk up front, instead of
possibly zeroing out two chunks later on the error path. I feel it's
much cleaner that way.

Mingming


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
  2008-02-28 23:14     ` Mingming Cao
@ 2008-02-29 11:09       ` Aneesh Kumar K.V
  2008-02-29 19:21         ` Andreas Dilger
  2008-02-29 18:05       ` Andreas Dilger
  1 sibling, 1 reply; 12+ messages in thread
From: Aneesh Kumar K.V @ 2008-02-29 11:09 UTC (permalink / raw)
  To: Mingming Cao; +Cc: linux-ext4

On Thu, Feb 28, 2008 at 03:14:00PM -0800, Mingming Cao wrote:
> On Thu, 2008-02-28 at 23:35 +0530, Aneesh Kumar K.V wrote:
> > A write to prealloc area cause the split of unititalized extent into a initialized
> > and uninitialized extent. If we don't have space to add new extent information instead
> > of returning error convert the existing uninitialized extent to initialized one. We
> > need to zero out the blocks corresponding to the extent to prevent wrong data reaching
> > userspace.
> > 
> > +

....

> 
> The complexity added to the code to handle the corner case seems not
> worth the effort. 
> 
> One simple solution is submit bio directly to zero out the blocks on
> disk, and wait for that to finish before clear the uninitialized bit. On
> a 4K block size case, the max size of an uninitialized extents is 128MB,
> and since the blocks are all contigous on disk, a single IO could done
> the job, the latency should not be a too big issue. After all when a
> filesystem is full, it's already performs slowly.

This is the change that I have now. I have yet to run the full test on it,
but it seems to be working for simple tests.

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d315cc1..26396e2 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2136,6 +2136,55 @@ void ext4_ext_release(struct super_block *sb)
 #endif
 }
 
+static void bi_complete(struct bio *bio, int error)
+{
+	complete((struct completion*)bio->bi_private);
+}
+
+/* FIXME!! we need to try to merge to left or right after zerout  */
+static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
+{
+	int ret = -EIO;
+	struct bio *bio;
+	int blkbits, blocksize;
+	ext4_fsblk_t ee_pblock;
+	unsigned int ee_len, i;
+	struct completion event;
+
+
+	blkbits   = inode->i_blkbits;
+	blocksize = inode->i_sb->s_blocksize;
+	ee_len    = ext4_ext_get_actual_len(ex);
+	ee_pblock = ext_pblock(ex);
+
+	bio = bio_alloc(GFP_NOIO, ee_len);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_sector = ee_pblock << (blkbits >> 9);
+	bio->bi_bdev   = inode->i_sb->s_bdev;
+
+	for (i = 0; i < ee_len; i++) {
+		ret = bio_add_page(bio, ZERO_PAGE(0), blocksize, 0);
+		if (ret != blocksize) {
+			ret = -EIO;
+			goto err_out;
+		}
+	}
+
+	init_completion(&event);
+	bio->bi_private = &event;
+	bio->bi_end_io = bi_complete;
+	submit_bio(WRITE, bio);
+	wait_for_completion(&event);
+
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+		ret = 0;
+err_out:
+	bio_put(bio);
+	return ret;
+}
+
 /*
  * This function is called by ext4_ext_get_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
@@ -2202,14 +2251,19 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
 		ext4_ext_mark_uninitialized(ex3);
 		err = ext4_ext_insert_extent(handle, inode, path, ex3);
-		if (err) {
+		if (err == -ENOSPC) {
+			err =  ext4_ext_zeroout(inode, &orig_ex);
+			if (err)
+				goto fix_extent_len;
+			/* update the extent length and mark as initialized */
 			ex->ee_block = orig_ex.ee_block;
 			ex->ee_len   = orig_ex.ee_len;
 			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
-			ext4_ext_mark_uninitialized(ex);
 			ext4_ext_dirty(handle, inode, path + depth);
-			goto out;
-		}
+			return le16_to_cpu(ex->ee_len);
+
+		} else if (err)
+			goto fix_extent_len;
 		/*
 		 * The depth, and hence eh & ex might change
 		 * as part of the insert above.
@@ -2295,15 +2349,28 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	goto out;
 insert:
 	err = ext4_ext_insert_extent(handle, inode, path, &newex);
-	if (err) {
+	if (err == -ENOSPC) {
+		err =  ext4_ext_zeroout(inode, &orig_ex);
+		if (err)
+			goto fix_extent_len;
+		/* update the extent length and mark as initialized */
 		ex->ee_block = orig_ex.ee_block;
 		ex->ee_len   = orig_ex.ee_len;
 		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
-		ext4_ext_mark_uninitialized(ex);
 		ext4_ext_dirty(handle, inode, path + depth);
-	}
+		return le16_to_cpu(ex->ee_len);
+	} else if (err)
+		goto fix_extent_len;
 out:
 	return err ? err : allocated;
+
+fix_extent_len:
+	ex->ee_block = orig_ex.ee_block;
+	ex->ee_len   = orig_ex.ee_len;
+	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+	ext4_ext_mark_uninitialized(ex);
+	ext4_ext_dirty(handle, inode, path + depth);
+	return err;
 }
 
 /*


I am not invalidating the inode mapping after zeroing out the blocks. I
guess that is the right thing to do, considering that pages already
mapped in via read or mmap would contain the same value (zero).

> 
> >  /*
> >   * This function is called by ext4_ext_get_blocks() if someone tries to write
> >   * to an uninitialized extent. It may result in splitting the uninitialized
> > @@ -2202,14 +2333,20 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
> >  		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
> >  		ext4_ext_mark_uninitialized(ex3);
> > +	} else if (err)

......


> > +		goto fix_extent_len;
> >  out:
> >  	return err ? err : allocated;
> > +
> > +fix_extent_len:
> > +	ex->ee_block = orig_ex.ee_block;
> > +	ex->ee_len   = orig_ex.ee_len;
> > +	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
> > +	ext4_ext_mark_uninitialized(ex);
> > +	ext4_ext_dirty(handle, inode, path + depth);
> > +	return err;
> >  }
> > 
> It would be nice to detect if fs is full or almost full before convert
> the uninitialized extents. If the total number of free blocks left are
> not enough for the split(plan for the worse case, 3 extents adds), just
> go ahead to do the zero out the one single chunk ahead, in stead of
> possible zeroing out two chucks later on the error path. I feel it's
> much cleaner that way.
> 

We don't zero out two chunks. The uninit extent can possibly get split
into three extents.
[ 1st uninit] [ 2 init ] [ 3rd uninit]


First we attempt to insert 3. If that fails due to ENOSPC, we
zero out the full extent [1 2 3]. If we are successful in inserting 3, then
we attempt to insert 2. If that fails, we zero out [1 2]. That should also
reduce the number of blocks that we are zeroing out. For example, if we have
an uninit extent of 32767 blocks, we try to write the third block within
the extent, and we fail in the second step above, we will zero out only 3
blocks. Zeroing out the full extent would instead mean zeroing out
32767 blocks.

-aneesh



^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
  2008-02-28 23:14     ` Mingming Cao
  2008-02-29 11:09       ` Aneesh Kumar K.V
@ 2008-02-29 18:05       ` Andreas Dilger
  1 sibling, 0 replies; 12+ messages in thread
From: Andreas Dilger @ 2008-02-29 18:05 UTC (permalink / raw)
  To: Mingming Cao; +Cc: Aneesh Kumar K.V, linux-ext4

On Feb 28, 2008  15:14 -0800, Mingming Cao wrote:
> On Thu, 2008-02-28 at 23:35 +0530, Aneesh Kumar K.V wrote:
> A write to prealloc area cause the split of unititalized extent into
> a initialized and uninitialized extent. If we don't have space to
> add new extent information instead of returning error convert the
> existing uninitialized extent to initialized one. We need to zero out
> the blocks corresponding to the extent to prevent wrong data reaching
> userspace.

> > +/* FIXME!! we need to try to merge to left or right after zerout  */
> > +static int ext4_ext_zeroout(handle_t *handle, struct inode *inode,
> > +				ext4_lblk_t iblock, struct ext4_extent *ex)
> > +{
> > +}
> > +
> 
> The complexity added to the code to handle the corner case seems not
> worth the effort. 
> 
> One simple solution is submit bio directly to zero out the blocks on
> disk, and wait for that to finish before clear the uninitialized bit. On
> a 4K block size case, the max size of an uninitialized extents is 128MB,
> and since the blocks are all contigous on disk, a single IO could done
> the job, the latency should not be a too big issue. After all when a
> filesystem is full, it's already performs slowly.

Further to Mingming's comments:
- you can map the ZERO_PAGE to every entry in the bio, which will avoid
  the very significant problem of needing 128MB of pages to zero out the
  extent
- make sure you limit the extent size to BIO_MAX_PAGES
- submitting large bios to the block layer is MUCH more efficient than
  adding pages to the page cache because the block device can do a very
  good job of writing this out
- make sure you wait for bio completion before you allow the block IO
  to begin.  In Lustre we did this by passing a waitq and our own
  completion function to the bio and have the caller go to sleep until
  the bio completion function is called.  Note that the completion
  function may be called multiple times if there are block errors.
- zeroing out pages in the page cache is very dangerous because they
  may already have dirty data in them.
- please make a helper function like "ext4_zero_blocks()" because at
  some point in the future I'd like to add the ability to have the kernel
  zero out inode table blocks for filesystems formatted with
  "-O uninit_groups,lazy_bg"

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
  2008-02-29 11:09       ` Aneesh Kumar K.V
@ 2008-02-29 19:21         ` Andreas Dilger
  2008-03-01 17:30           ` Aneesh Kumar K.V
  0 siblings, 1 reply; 12+ messages in thread
From: Andreas Dilger @ 2008-02-29 19:21 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Mingming Cao, linux-ext4

On Feb 29, 2008  16:39 +0530, Aneesh Kumar K.V wrote:
> > One simple solution is submit bio directly to zero out the blocks on
> > disk, and wait for that to finish before clear the uninitialized bit. On
> > a 4K block size case, the max size of an uninitialized extents is 128MB,
> > and since the blocks are all contigous on disk, a single IO could done
> > the job, the latency should not be a too big issue. After all when a
> > filesystem is full, it's already performs slowly.
> 
> This is the change that i have now. Yet to run the full test on that.
> But seems to be working for simple tests.
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index d315cc1..26396e2 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2136,6 +2136,55 @@ void ext4_ext_release(struct super_block *sb)
>  #endif
>  }
>  
> +static void bi_complete(struct bio *bio, int error)
> +{
> +	complete((struct completion*)bio->bi_private);
> +}

Note that the completion event can be called multiple times if there are
block device errors...  Our similar completion code in Lustre is like:

static int dio_complete_routine(struct bio *bio, unsigned int done, int error)
{

        /* CAVEAT EMPTOR: possibly in IRQ context  */
        if (bio->bi_size)                       /* Not complete */
                return 1;

	bio->bi_private->data.error = error;

	return 0;
}


> +/* FIXME!! we need to try to merge to left or right after zerout  */
> +static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
> +{
> +	bio = bio_alloc(GFP_NOIO, ee_len);
> +	if (!bio)
> +		return -ENOMEM;

I don't think it will be possible to allocate a bio large enough for a
maximum-sized unwritten extent.  BIO_MAX_PAGES is only 256 (1MB on x86),
but an unwritten extent can be up to 128MB.

> +	bio->bi_bdev   = inode->i_sb->s_bdev;
> +
> +	for (i = 0; i < ee_len; i++) {
> +		ret = bio_add_page(bio, ZERO_PAGE(0), blocksize, 0);
> +		if (ret != blocksize) {
> +			ret = -EIO;
> +			goto err_out;

This shouldn't be considered an error.  Rather, it just means that the
bio is full or is crossing some storage boundary so it should be submitted
and a new bio created and the zeroing continues.

Please move most of this function into a generic helper that can be used
elsewhere.  It might even go into the VFS like:

int bio_zero_blocks(struct block_device *bdev, sector_t start, sector_t len,
		    bio_end_io_t completion);

and then have ext4_ext_zeroout() call that routine after decoding the extent.
The error case is only when the bio completion routine is called and the
saved "data.error" value is returned.

> > It would be nice to detect if fs is full or almost full before convert
> > the uninitialized extents. If the total number of free blocks left are
> > not enough for the split(plan for the worse case, 3 extents adds), just
> > go ahead to do the zero out the one single chunk ahead, in stead of
> > possible zeroing out two chucks later on the error path. I feel it's
> > much cleaner that way.
> 
> We don't zero out two chunks. The uninit extent can possibly get split
> into three extent.
> [ 1st uninit] [ 2 init ] [ 3rd uninit]
> 
> 
> Now first we attempt to insert 3. And if we fail due to ENOSPC we
> zero out the full extent [1 2 3]. Now if we are successful in inserting 3 then
> we attempt to insert 2. If we fail, we zero out [1 2]. That should also
> reduce the number blocks that we are zeroing out. For example if we have
> uninit extent len of 32767 blocks and we try to write the third block within
> the extent and failed in the second step above we will zero out only 3
> blocks. If we want to zero out the full extent that would imply zero out
> 32767 blocks.

A related optimization is to determine the size of the remaining split
extents.  I propose that if either of the remaining extents are < 7
blocks long (or whatever, possibly 15 blocks to get a nice 64kB write) we
should just zero out those blocks and create a single initialized extent.
This would avoid the "write every alternate block" problem that could
grow the number of extents dramatically.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
  2008-02-29 19:21         ` Andreas Dilger
@ 2008-03-01 17:30           ` Aneesh Kumar K.V
  2008-03-02 18:51             ` Andreas Dilger
  0 siblings, 1 reply; 12+ messages in thread
From: Aneesh Kumar K.V @ 2008-03-01 17:30 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Mingming Cao, linux-ext4

On Fri, Feb 29, 2008 at 11:21:42AM -0800, Andreas Dilger wrote:
> On Feb 29, 2008  16:39 +0530, Aneesh Kumar K.V wrote:
> > > One simple solution is submit bio directly to zero out the blocks on
> > > disk, and wait for that to finish before clear the uninitialized bit. On
> > > a 4K block size case, the max size of an uninitialized extents is 128MB,
> > > and since the blocks are all contigous on disk, a single IO could done
> > > the job, the latency should not be a too big issue. After all when a
> > > filesystem is full, it's already performs slowly.
> > 
> > This is the change that i have now. Yet to run the full test on that.
> > But seems to be working for simple tests.
> > 
> > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > index d315cc1..26396e2 100644
> > --- a/fs/ext4/extents.c
> > +++ b/fs/ext4/extents.c
> > @@ -2136,6 +2136,55 @@ void ext4_ext_release(struct super_block *sb)
> >  #endif
> >  }
> >  
> > +static void bi_complete(struct bio *bio, int error)
> > +{
> > +	complete((struct completion*)bio->bi_private);
> > +}
> 
> Note that the completion event can be called multiple times if there are
> block device errors...  Our similar completion code in Lustre is like:
> 
> static int dio_complete_routine(struct bio *bio, unsigned int done, int error)
> {
> 
>         /* CAVEAT EMPTOR: possibly in IRQ context  */
>         if (bio->bi_size)                       /* Not complete */
>                 return 1;
> 
> 	bio->bi_private->data.error = error;
> 
> 	return 0;
> }


I looked at the latest kernel, and there the completion callback is called only
once. There could still be an error, but even on error we want to be
woken up; later I test for BIO_UPTODATE and return -EIO if it is not set.

The commits below changed bio_endio:

  6712ecf8f648118c3363c142196418f89a510b90
  5bb23a688b2de23d7765a1dd439d89c038378978
  9cc54d40b8ca01fcefc9151044b6996565061d90



> 
> 
> > +/* FIXME!! we need to try to merge to left or right after zerout  */
> > +static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
> > +{
> > +	bio = bio_alloc(GFP_NOIO, ee_len);
> > +	if (!bio)
> > +		return -ENOMEM;
> 
> I don't think it will be possible to allocate a bio large enough for a
> maximum-sized unwritten extent.  BIO_MAX_PAGES is only 256 (1MB on x86),
> but an unwritten extent can be up to 128MB.
> 
> > +	bio->bi_bdev   = inode->i_sb->s_bdev;
> > +
> > +	for (i = 0; i < ee_len; i++) {
> > +		ret = bio_add_page(bio, ZERO_PAGE(0), blocksize, 0);
> > +		if (ret != blocksize) {
> > +			ret = -EIO;
> > +			goto err_out;
> 
> This shouldn't be considered an error.  Rather, it just means that the
> bio is full or is crossing some storage boundary so it should be submitted
> and a new bio created and the zeroing continues.

+static void bi_complete(struct bio *bio, int error)
+{
+	complete((struct completion*)bio->bi_private);
+}
+
+/* FIXME!! we need to try to merge to left or right after zerout  */
+static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
+{
+	int ret = -EIO;
+	struct bio *bio;
+	int blkbits, blocksize;
+	sector_t ee_pblock;
+	unsigned int ee_len, len, done;
+	struct completion event;
+
+
+	blkbits   = inode->i_blkbits;
+	blocksize = inode->i_sb->s_blocksize;
+	ee_len    = ext4_ext_get_actual_len(ex);
+	ee_pblock = ext_pblock(ex);
+
+	/* convert ee_pblock in 512 byte sector */
+	ee_pblock = ee_pblock << (blkbits >> 9);
+
+
+	while (ee_len > 0 ) {
+
+		if (ee_len > BIO_MAX_PAGES)
+			len = BIO_MAX_PAGES;
+		else
+			len = ee_len;
+
+		bio = bio_alloc(GFP_NOIO, len);
+		if (!bio)
+			return -ENOMEM;
+		bio->bi_sector = ee_pblock;
+		bio->bi_bdev   = inode->i_sb->s_bdev;
+
+		done = 0;
+		while(done < len) {
+			ret = bio_add_page(bio, ZERO_PAGE(0), blocksize, 0);
+			if (ret != blocksize) {
+				/* We can't add any more page because of
+				 * hardware limitation. Start a new bio
+				 */
+				break;
+			}
+			done++;
+		}
+
+		init_completion(&event);
+		bio->bi_private = &event;
+		bio->bi_end_io = bi_complete;
+		submit_bio(WRITE, bio);
+		wait_for_completion(&event);
+
+		if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+			ret = 0;
+		else {
+			ret = -EIO;
+			break;
+		}
+		bio_put(bio);
+		ee_len    -= done;
+		ee_pblock += done  << (blkbits - 9);
+	}
+	return ret;
+}
+

> 
> Please move most of this function into a generic helper that can be used
> elsewhere.  It might even go into the VFS like:
> 
> int bio_zero_blocks(struct block_device *bdev, sector_t start, sector_t len,
> 		    bio_end_io_t completion);
> 
> and then have ext4_ext_zeroout() call that routine after decoding the extent.
> The error case is only when the bio completion routine is called and the
> saved "data.error" value is returned.


Converting it to an API like above doesn't help much. How about 

int bio_zero_blocks(struct block_device *bdev, sector_t start, unsigned
long bytes);

This signature implies that we wait for the zero-out to finish.

Since we don't have another user right now, I didn't add the helper, but that
should be easy.

> 
> > > It would be nice to detect if fs is full or almost full before convert
> > > the uninitialized extents. If the total number of free blocks left are
> > > not enough for the split(plan for the worse case, 3 extents adds), just
> > > go ahead to do the zero out the one single chunk ahead, in stead of
> > > possible zeroing out two chucks later on the error path. I feel it's
> > > much cleaner that way.
> > 
> > We don't zero out two chunks. The uninit extent can possibly get split
> > into three extent.
> > [ 1st uninit] [ 2 init ] [ 3rd uninit]
> > 
> > 
> > Now first we attempt to insert 3. And if we fail due to ENOSPC we
> > zero out the full extent [1 2 3]. Now if we are successful in inserting 3 then
> > we attempt to insert 2. If we fail, we zero out [1 2]. That should also
> > reduce the number blocks that we are zeroing out. For example if we have
> > uninit extent len of 32767 blocks and we try to write the third block within
> > the extent and failed in the second step above we will zero out only 3
> > blocks. If we want to zero out the full extent that would imply zero out
> > 32767 blocks.
> 
> A related optimization is to determine the size of the remaining split
> extents.  I propose that if either of the remaining extents are < 7
> blocks long (or whatever, possibly 15 blocks to get a nice 64kB write) we
> should just zero out those blocks and create a single initialized extent.
> This would avoid the "write every alternate block" problem that could
> grow the number of extents dramatically.

Why 64KB? Also, while inserting the extent we try to merge with the left or
right neighbour, so the problem may not be that bad. But I agree with you that it
would be nice to zero out if the split extent has a very small size.

-aneesh

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
  2008-03-01 17:30           ` Aneesh Kumar K.V
@ 2008-03-02 18:51             ` Andreas Dilger
  0 siblings, 0 replies; 12+ messages in thread
From: Andreas Dilger @ 2008-03-02 18:51 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Mingming Cao, linux-ext4

On Mar 01, 2008  23:00 +0530, Aneesh Kumar K.V wrote:
> +/* FIXME!! we need to try to merge to left or right after zerout  */
> +static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
> +{
> +		done = 0;
> +		while(done < len) {
> +			ret = bio_add_page(bio, ZERO_PAGE(0), blocksize, 0);

Don't we need to set the page offset here?  

> Converting it to an API like above doesn't help much. How about 
> 
> int bio_zero_blocks(struct block_device *bdev, sector_t start, unsigned
> long bytes);
> 
> Here it implies that we would like to wait for zero out to finish.
> 
> Since we don't have another user now, I didn't add the helper. But that
> should be easy.

Yes, this is probably fine too, though at that point you don't need to
have "bio" in the name since it is an internal implementation detail.

> > A related optimization is to determine the size of the remaining split
> > extents.  I propose that if either of the remaining extents are < 7
> > blocks long (or whatever, possibly 15 blocks to get a nice 64kB write) we
> > should just zero out those blocks and create a single initialized extent.
> > This would avoid the "write every alternate block" problem that could
> > grow the number of extents dramatically.
> 
> Why 64KB? Also, while inserting the extent we try to merge with the left or
> right extent, so the problem may not be that bad. But I agree with you that it
> would be nice to zero out if the split extent has a very small size.

I picked 64kB since this is a good size for the underlying IDE disks for track
merging and such.  Smaller IO sizes probably cause internal read-modify-
write, and if we make it too large it may cause extra overhead.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2008-03-02 18:51 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-02-21 19:17 [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full Aneesh Kumar K.V
2008-02-21 21:07 ` Mingming Cao
2008-02-22 14:31   ` Aneesh Kumar K.V
2008-02-22 15:42     ` Aneesh Kumar K.V
2008-02-22 17:28       ` Mingming Cao
  -- strict thread matches above, loose matches on Subject: below --
2008-02-28 18:05 [RFC][PATCH] ext4: Use page_mkwrite vma_operations to get mmap write notification Aneesh Kumar K.V
2008-02-28 18:05 ` [RFC][PATCH] ext4: Fix fallocate error path Aneesh Kumar K.V
2008-02-28 18:05   ` [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full Aneesh Kumar K.V
2008-02-28 23:14     ` Mingming Cao
2008-02-29 11:09       ` Aneesh Kumar K.V
2008-02-29 19:21         ` Andreas Dilger
2008-03-01 17:30           ` Aneesh Kumar K.V
2008-03-02 18:51             ` Andreas Dilger
2008-02-29 18:05       ` Andreas Dilger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox