linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] ext4: dynamical adjust the length of zero-out chunk
@ 2012-07-12  6:48 Zheng Liu
  2012-07-12 14:49 ` Eric Sandeen
  0 siblings, 1 reply; 13+ messages in thread
From: Zheng Liu @ 2012-07-12  6:48 UTC (permalink / raw)
  To: linux-ext4; +Cc: Zach Brown, Andreas Dilger, Zheng Liu

From: Zheng Liu <wenqing.lz@taobao.com>

Currently in ext4 the length of the zero-out chunk is set to 7.  That is
too short, and it causes a lot of extent fragmentation when we use
fallocate to preallocate uninitialized extents and the workload
frequently converts uninitialized extents to initialized ones.  Thus,
we now set it to 256 (a 1MB chunk) and store it in the super block info
so that it can be adjusted dynamically via sysfs.

CC: Zach Brown <zab@zabbo.net>
CC: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
---
v2 <- v1:
 * use an on-stack copy to avoid seeing different values
 * add missing spaces around '*'

 fs/ext4/ext4.h    |    3 +++
 fs/ext4/extents.c |   13 ++++++++-----
 fs/ext4/super.c   |    3 +++
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cfc4e01..0f44577 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1265,6 +1265,9 @@ struct ext4_sb_info {
 	/* locality groups */
 	struct ext4_locality_group __percpu *s_locality_groups;
 
+	/* the size of zero-out chunk */
+	unsigned int s_extent_zeroout_len;
+
 	/* for write statistics */
 	unsigned long s_sectors_written_start;
 	u64 s_kbytes_written;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 91341ec..a114d65 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3029,7 +3029,6 @@ out:
 	return err ? err : map->m_len;
 }
 
-#define EXT4_EXT_ZERO_LEN 7
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
@@ -3055,12 +3054,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct ext4_map_blocks *map,
 					   struct ext4_ext_path *path)
 {
+	struct ext4_sb_info *sbi;
 	struct ext4_extent_header *eh;
 	struct ext4_map_blocks split_map;
 	struct ext4_extent zero_ex;
 	struct ext4_extent *ex;
 	ext4_lblk_t ee_block, eof_block;
 	unsigned int ee_len, depth;
+	unsigned int zeroout_len;
 	int allocated;
 	int err = 0;
 	int split_flag = 0;
@@ -3069,6 +3070,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		"block %llu, max_blocks %u\n", inode->i_ino,
 		(unsigned long long)map->m_lblk, map->m_len);
 
+	sbi = EXT4_SB(inode->i_sb);
+	zeroout_len = sbi->s_extent_zeroout_len;
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
 	if (eof_block < map->m_lblk + map->m_len)
@@ -3168,8 +3171,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	 */
 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
 
-	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
+	/* If extent has less than 2*s_extent_zeroout_len zerout directly */
+	if (ee_len <= (2 * zeroout_len) &&
 	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
 		err = ext4_ext_zeroout(inode, ex);
 		if (err)
@@ -3195,7 +3198,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	split_map.m_len = map->m_len;
 
 	if (allocated > map->m_len) {
-		if (allocated <= EXT4_EXT_ZERO_LEN &&
+		if (allocated <= zeroout_len &&
 		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
 			/* case 3 */
 			zero_ex.ee_block =
@@ -3209,7 +3212,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			split_map.m_lblk = map->m_lblk;
 			split_map.m_len = allocated;
 		} else if ((map->m_lblk - ee_block + map->m_len <
-			   EXT4_EXT_ZERO_LEN) &&
+			   zeroout_len) &&
 			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
 			/* case 2 */
 			if (map->m_lblk != ee_block) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index eb7aa3e..ea7cb6b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2535,6 +2535,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_RW_ATTR_SBI_UI(extent_zeroout_len, s_extent_zeroout_len);
 EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
 
 static struct attribute *ext4_attrs[] = {
@@ -2550,6 +2551,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(mb_stream_req),
 	ATTR_LIST(mb_group_prealloc),
 	ATTR_LIST(max_writeback_mb_bump),
+	ATTR_LIST(extent_zeroout_len),
 	ATTR_LIST(trigger_fs_error),
 	NULL,
 };
@@ -3626,6 +3628,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;
+	sbi->s_extent_zeroout_len = 256;
 
 	/*
 	 * set up enough so that it can read an inode
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] ext4: dynamical adjust the length of zero-out chunk
  2012-07-12  6:48 [PATCH v2] ext4: dynamical adjust the length of zero-out chunk Zheng Liu
@ 2012-07-12 14:49 ` Eric Sandeen
  2012-07-12 16:51   ` Andreas Dilger
  2012-07-17  7:19   ` [PATCH v2] ext4: dynamical adjust the length of zero-out chunk Zheng Liu
  0 siblings, 2 replies; 13+ messages in thread
From: Eric Sandeen @ 2012-07-12 14:49 UTC (permalink / raw)
  To: Zheng Liu; +Cc: linux-ext4, Zach Brown, Andreas Dilger, Zheng Liu

On 7/12/12 1:48 AM, Zheng Liu wrote:
> From: Zheng Liu <wenqing.lz@taobao.com>
> 
> Currently in ext4 the length of zero-out chunk is set to 7.  But it is
> too short so that it will cause a lot of fragmentation of extent when
> we use fallocate to preallocate some uninitialized extents and the
> workload frequently does some uninitialized extent conversions.  Thus,
> now we set it to 256 (1MB chunk), and put it into super block in order
> to adjust it dynamically in sysfs.

Does this in fact help the workload for which you wanted the non-flagged
fallocate interface?

I'm a little wary of adding another user tunable; how will the user have
any idea what value to use here?

At any rate, something should also go into Documentation/filesystems/ext4.txt
to explain the new tunable.

Thanks,
-Eric

> CC: Zach Brown <zab@zabbo.net>
> CC: Andreas Dilger <adilger@dilger.ca>
> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> ---
> v2 <- v1:
>  * use a on-stack copy to avoid seeing differenet values
>  * add missing spaces around '*'
> 
>  fs/ext4/ext4.h    |    3 +++
>  fs/ext4/extents.c |   13 ++++++++-----
>  fs/ext4/super.c   |    3 +++
>  3 files changed, 14 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index cfc4e01..0f44577 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1265,6 +1265,9 @@ struct ext4_sb_info {
>  	/* locality groups */
>  	struct ext4_locality_group __percpu *s_locality_groups;
>  
> +	/* the size of zero-out chunk */
> +	unsigned int s_extent_zeroout_len;
> +
>  	/* for write statistics */
>  	unsigned long s_sectors_written_start;
>  	u64 s_kbytes_written;
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 91341ec..a114d65 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -3029,7 +3029,6 @@ out:
>  	return err ? err : map->m_len;
>  }
>  
> -#define EXT4_EXT_ZERO_LEN 7
>  /*
>   * This function is called by ext4_ext_map_blocks() if someone tries to write
>   * to an uninitialized extent. It may result in splitting the uninitialized
> @@ -3055,12 +3054,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  					   struct ext4_map_blocks *map,
>  					   struct ext4_ext_path *path)
>  {
> +	struct ext4_sb_info *sbi;
>  	struct ext4_extent_header *eh;
>  	struct ext4_map_blocks split_map;
>  	struct ext4_extent zero_ex;
>  	struct ext4_extent *ex;
>  	ext4_lblk_t ee_block, eof_block;
>  	unsigned int ee_len, depth;
> +	unsigned int zeroout_len;
>  	int allocated;
>  	int err = 0;
>  	int split_flag = 0;
> @@ -3069,6 +3070,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  		"block %llu, max_blocks %u\n", inode->i_ino,
>  		(unsigned long long)map->m_lblk, map->m_len);
>  
> +	sbi = EXT4_SB(inode->i_sb);
> +	zeroout_len = sbi->s_extent_zeroout_len;
>  	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
>  		inode->i_sb->s_blocksize_bits;
>  	if (eof_block < map->m_lblk + map->m_len)
> @@ -3168,8 +3171,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  	 */
>  	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
>  
> -	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
> -	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
> +	/* If extent has less than 2*s_extent_zeroout_len zerout directly */
> +	if (ee_len <= (2 * zeroout_len) &&
>  	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
>  		err = ext4_ext_zeroout(inode, ex);
>  		if (err)
> @@ -3195,7 +3198,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  	split_map.m_len = map->m_len;
>  
>  	if (allocated > map->m_len) {
> -		if (allocated <= EXT4_EXT_ZERO_LEN &&
> +		if (allocated <= zeroout_len &&
>  		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
>  			/* case 3 */
>  			zero_ex.ee_block =
> @@ -3209,7 +3212,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  			split_map.m_lblk = map->m_lblk;
>  			split_map.m_len = allocated;
>  		} else if ((map->m_lblk - ee_block + map->m_len <
> -			   EXT4_EXT_ZERO_LEN) &&
> +			   zeroout_len) &&
>  			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
>  			/* case 2 */
>  			if (map->m_lblk != ee_block) {
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index eb7aa3e..ea7cb6b 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -2535,6 +2535,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
>  EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
>  EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
>  EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
> +EXT4_RW_ATTR_SBI_UI(extent_zeroout_len, s_extent_zeroout_len);
>  EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
>  
>  static struct attribute *ext4_attrs[] = {
> @@ -2550,6 +2551,7 @@ static struct attribute *ext4_attrs[] = {
>  	ATTR_LIST(mb_stream_req),
>  	ATTR_LIST(mb_group_prealloc),
>  	ATTR_LIST(max_writeback_mb_bump),
> +	ATTR_LIST(extent_zeroout_len),
>  	ATTR_LIST(trigger_fs_error),
>  	NULL,
>  };
> @@ -3626,6 +3628,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  
>  	sbi->s_stripe = ext4_get_stripe_size(sbi);
>  	sbi->s_max_writeback_mb_bump = 128;
> +	sbi->s_extent_zeroout_len = 256;
>  
>  	/*
>  	 * set up enough so that it can read an inode
> 



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] ext4: dynamical adjust the length of zero-out chunk
  2012-07-12 14:49 ` Eric Sandeen
@ 2012-07-12 16:51   ` Andreas Dilger
  2012-07-17  7:55     ` Zheng Liu
  2012-08-13  3:22     ` Theodore Ts'o
  2012-07-17  7:19   ` [PATCH v2] ext4: dynamical adjust the length of zero-out chunk Zheng Liu
  1 sibling, 2 replies; 13+ messages in thread
From: Andreas Dilger @ 2012-07-12 16:51 UTC (permalink / raw)
  To: Zheng Liu, Eric Sandeen; +Cc: Zheng Liu, ext4 development, Zach Brown

On 2012-07-12, at 8:49 AM, Eric Sandeen wrote:
> On 7/12/12 1:48 AM, Zheng Liu wrote:
>> From: Zheng Liu <wenqing.lz@taobao.com>
>> 
>> Currently in ext4 the length of zero-out chunk is set to 7.  But it is
>> too short so that it will cause a lot of fragmentation of extent when
>> we use fallocate to preallocate some uninitialized extents and the
>> workload frequently does some uninitialized extent conversions.  Thus,
>> now we set it to 256 (1MB chunk), and put it into super block in order
>> to adjust it dynamically in sysfs.
> 
> Does this in fact help the workload for which you wanted the non-flagged
> fallocate interface?
> 
> I'm a little wary of adding another user tunable; how will the user have
> any idea what value to use here?

It would make sense to use the s_raid_stripe_width as the default value for
this parameter.  The other thing we need to pay attention to is that the
growth of the extent zeroing be done on a RAID or erase-block aligned manner.
Otherwise, this might cause extra IO that doesn't benefit the application.

It appears that the current code does not pay attention to alignment, and
that should be fixed before landing this patch with larger zero-out sizes.

> At any rate, something should also go into Documentation/filesystems/ext4.txt
> to explain the new tunable.
> 
> Thanks,
> -Eric
> 
>> CC: Zach Brown <zab@zabbo.net>
>> CC: Andreas Dilger <adilger@dilger.ca>
>> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
>> ---
>> v2 <- v1:
>> * use a on-stack copy to avoid seeing differenet values
>> * add missing spaces around '*'
>> 
>> fs/ext4/ext4.h    |    3 +++
>> fs/ext4/extents.c |   13 ++++++++-----
>> fs/ext4/super.c   |    3 +++
>> 3 files changed, 14 insertions(+), 5 deletions(-)
>> 
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index cfc4e01..0f44577 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1265,6 +1265,9 @@ struct ext4_sb_info {
>> 	/* locality groups */
>> 	struct ext4_locality_group __percpu *s_locality_groups;
>> 
>> +	/* the size of zero-out chunk */
>> +	unsigned int s_extent_zeroout_len;
>> +
>> 	/* for write statistics */
>> 	unsigned long s_sectors_written_start;
>> 	u64 s_kbytes_written;
>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>> index 91341ec..a114d65 100644
>> --- a/fs/ext4/extents.c
>> +++ b/fs/ext4/extents.c
>> @@ -3029,7 +3029,6 @@ out:
>> 	return err ? err : map->m_len;
>> }
>> 
>> -#define EXT4_EXT_ZERO_LEN 7
>> /*
>>  * This function is called by ext4_ext_map_blocks() if someone tries to write
>>  * to an uninitialized extent. It may result in splitting the uninitialized
>> @@ -3055,12 +3054,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>> 					   struct ext4_map_blocks *map,
>> 					   struct ext4_ext_path *path)
>> {
>> +	struct ext4_sb_info *sbi;
>> 	struct ext4_extent_header *eh;
>> 	struct ext4_map_blocks split_map;
>> 	struct ext4_extent zero_ex;
>> 	struct ext4_extent *ex;
>> 	ext4_lblk_t ee_block, eof_block;
>> 	unsigned int ee_len, depth;
>> +	unsigned int zeroout_len;
>> 	int allocated;
>> 	int err = 0;
>> 	int split_flag = 0;
>> @@ -3069,6 +3070,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>> 		"block %llu, max_blocks %u\n", inode->i_ino,
>> 		(unsigned long long)map->m_lblk, map->m_len);
>> 
>> +	sbi = EXT4_SB(inode->i_sb);
>> +	zeroout_len = sbi->s_extent_zeroout_len;
>> 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
>> 		inode->i_sb->s_blocksize_bits;
>> 	if (eof_block < map->m_lblk + map->m_len)
>> @@ -3168,8 +3171,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>> 	 */
>> 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
>> 
>> -	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
>> -	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
>> +	/* If extent has less than 2*s_extent_zeroout_len zerout directly */
>> +	if (ee_len <= (2 * zeroout_len) &&
>> 	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
>> 		err = ext4_ext_zeroout(inode, ex);
>> 		if (err)
>> @@ -3195,7 +3198,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>> 	split_map.m_len = map->m_len;
>> 
>> 	if (allocated > map->m_len) {
>> -		if (allocated <= EXT4_EXT_ZERO_LEN &&
>> +		if (allocated <= zeroout_len &&
>> 		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
>> 			/* case 3 */
>> 			zero_ex.ee_block =
>> @@ -3209,7 +3212,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>> 			split_map.m_lblk = map->m_lblk;
>> 			split_map.m_len = allocated;
>> 		} else if ((map->m_lblk - ee_block + map->m_len <
>> -			   EXT4_EXT_ZERO_LEN) &&
>> +			   zeroout_len) &&
>> 			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
>> 			/* case 2 */
>> 			if (map->m_lblk != ee_block) {
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index eb7aa3e..ea7cb6b 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -2535,6 +2535,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
>> EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
>> EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
>> EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
>> +EXT4_RW_ATTR_SBI_UI(extent_zeroout_len, s_extent_zeroout_len);
>> EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
>> 
>> static struct attribute *ext4_attrs[] = {
>> @@ -2550,6 +2551,7 @@ static struct attribute *ext4_attrs[] = {
>> 	ATTR_LIST(mb_stream_req),
>> 	ATTR_LIST(mb_group_prealloc),
>> 	ATTR_LIST(max_writeback_mb_bump),
>> +	ATTR_LIST(extent_zeroout_len),
>> 	ATTR_LIST(trigger_fs_error),
>> 	NULL,
>> };
>> @@ -3626,6 +3628,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>> 
>> 	sbi->s_stripe = ext4_get_stripe_size(sbi);
>> 	sbi->s_max_writeback_mb_bump = 128;
>> +	sbi->s_extent_zeroout_len = 256;
>> 
>> 	/*
>> 	 * set up enough so that it can read an inode
>> 
> 
> 


Cheers, Andreas






^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] ext4: dynamical adjust the length of zero-out chunk
  2012-07-12 14:49 ` Eric Sandeen
  2012-07-12 16:51   ` Andreas Dilger
@ 2012-07-17  7:19   ` Zheng Liu
  1 sibling, 0 replies; 13+ messages in thread
From: Zheng Liu @ 2012-07-17  7:19 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: linux-ext4, Zach Brown, Andreas Dilger, Zheng Liu

On Thu, Jul 12, 2012 at 09:49:38AM -0500, Eric Sandeen wrote:
> On 7/12/12 1:48 AM, Zheng Liu wrote:
> > From: Zheng Liu <wenqing.lz@taobao.com>
> > 
> > Currently in ext4 the length of zero-out chunk is set to 7.  But it is
> > too short so that it will cause a lot of fragmentation of extent when
> > we use fallocate to preallocate some uninitialized extents and the
> > workload frequently does some uninitialized extent conversions.  Thus,
> > now we set it to 256 (1MB chunk), and put it into super block in order
> > to adjust it dynamically in sysfs.
> 
> Does this in fact help the workload for which you wanted the non-flagged
> fallocate interface?

No, it provides almost no help for that workload, but it can noticeably
reduce the fragmentation of extents in my test.  So IMO it can bring
some benefits for ext4. :-)

> 
> I'm a little wary of adding another user tunable; how will the user have
> any idea what value to use here?
> 
> At any rate, something should also go into Documentation/filesystems/ext4.txt
> to explain the new tunable.

Agree.  I will add this tunable parameter in ext4 doc if this patch can
be applied.

Regards,
Zheng

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] ext4: dynamical adjust the length of zero-out chunk
  2012-07-12 16:51   ` Andreas Dilger
@ 2012-07-17  7:55     ` Zheng Liu
  2012-08-13  3:22     ` Theodore Ts'o
  1 sibling, 0 replies; 13+ messages in thread
From: Zheng Liu @ 2012-07-17  7:55 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Zheng Liu, Eric Sandeen, ext4 development, Zach Brown

On Thu, Jul 12, 2012 at 10:51:12AM -0600, Andreas Dilger wrote:
> On 2012-07-12, at 8:49 AM, Eric Sandeen wrote:
> > On 7/12/12 1:48 AM, Zheng Liu wrote:
> >> From: Zheng Liu <wenqing.lz@taobao.com>
> >> 
> >> Currently in ext4 the length of zero-out chunk is set to 7.  But it is
> >> too short so that it will cause a lot of fragmentation of extent when
> >> we use fallocate to preallocate some uninitialized extents and the
> >> workload frequently does some uninitialized extent conversions.  Thus,
> >> now we set it to 256 (1MB chunk), and put it into super block in order
> >> to adjust it dynamically in sysfs.
> > 
> > Does this in fact help the workload for which you wanted the non-flagged
> > fallocate interface?
> > 
> > I'm a little wary of adding another user tunable; how will the user have
> > any idea what value to use here?
> 
> It would make sense to use the s_raid_stripe_width as the default value for
> this parameter.  The other thing we need to pay attention to is that the
> growth of the extent zeroing be done on a RAID or erase-block aligned manner.
> Otherwise, this might cause extra IO that doesn't benefit the application.

There is a problem with using s_raid_stripe_width as the default
value: it will be 0 when we simply run mkfs.ext4 without
'-E stripe-width=XXX'.  When this value is 0, we still need to
choose a number as the default value.  So I think that we can choose 256
when s_raid_stripe_width is 0.

Regards,
Zheng

> It appears that the current code does not pay attention to alignment, and
> that should be fixed before landing this patch with larger zero-out sizes.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] ext4: dynamical adjust the length of zero-out chunk
  2012-07-12 16:51   ` Andreas Dilger
  2012-07-17  7:55     ` Zheng Liu
@ 2012-08-13  3:22     ` Theodore Ts'o
  2012-08-13  6:55       ` Zheng Liu
  2012-08-13 17:32       ` Zach Brown
  1 sibling, 2 replies; 13+ messages in thread
From: Theodore Ts'o @ 2012-08-13  3:22 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Zheng Liu, Eric Sandeen, Zheng Liu, ext4 development, Zach Brown

On Thu, Jul 12, 2012 at 10:51:12AM -0600, Andreas Dilger wrote:
> 
> It would make sense to use the s_raid_stripe_width as the default value for
> this parameter.  The other thing we need to pay attention to is that the
> growth of the extent zeroing be done on a RAID or erase-block aligned manner.
> Otherwise, this might cause extra IO that doesn't benefit the application.

Well.... it really depends on the workload.  If you have a workload
which is doing random writes into an uninitialized region of memory,
on a RAID device you're going to be doing read/modify/write cycles
anyway.  By using a larger zero-out chunk parameter, it avoids the
excess metadata operations, and it avoids fragmenting the extent tree.

The patch that I sent out, "ext4: collapse a single extent tree block
into the inode if possible" will help out in at least some cases,
hopefully the most common ones, but using a larger zero-out size can
also help address this situation.

My larger concern with this patch is that 1MB writes are not free, and
turning a 4k random write into a 1MB write is going to be noticeable.
I've changed the default from 1MB to 256k, just to be more
conservative, but need to do some benchmarking to make sure we
understand what the best number will be on a variety of common
hardware in use by our users.

I also reworded the commit description slightly, and this is what I
currently have in my tree.  What do people think?

	       	     	    	    	   - Ted

commit 5b9401f6f5afbce4cacdd01cc7c74780cc084aa3
Author: Zheng Liu <wenqing.lz@taobao.com>
Date:   Sun Aug 12 23:08:58 2012 -0400

    ext4: make the zero-out chunk size tunable
    
    Currently in ext4 the length of zero-out chunk is set to 7.  But it is
    too short so that it will cause a lot of fragmentation of extent when
    we use fallocate to preallocate some uninitialized extents and the
    workload frequently does some uninitialized extent conversions.  Thus,
    we allow it to be tunable via sysfs and set an initial default value
    of 32, so instead of creating uninitalized extents smaller than
    256k (assuming a 4k block size), they will be zeroed out instead.
    
    CC: Zach Brown <zab@zabbo.net>
    CC: Andreas Dilger <adilger@dilger.ca>
    Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
    Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7c0841e..f9024a6 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1271,6 +1271,9 @@ struct ext4_sb_info {
 	unsigned long s_sectors_written_start;
 	u64 s_kbytes_written;
 
+	/* the size of zero-out chunk */
+	unsigned int s_extent_zeroout_len;
+
 	unsigned int s_log_groups_per_flex;
 	struct flex_groups *s_flex_groups;
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 92fac2f..10f0afd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3084,7 +3084,6 @@ out:
 	return err ? err : map->m_len;
 }
 
-#define EXT4_EXT_ZERO_LEN 7
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
@@ -3110,6 +3109,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct ext4_map_blocks *map,
 					   struct ext4_ext_path *path)
 {
+	struct ext4_sb_info *sbi;
 	struct ext4_extent_header *eh;
 	struct ext4_map_blocks split_map;
 	struct ext4_extent zero_ex;
@@ -3124,6 +3124,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		"block %llu, max_blocks %u\n", inode->i_ino,
 		(unsigned long long)map->m_lblk, map->m_len);
 
+	sbi = EXT4_SB(inode->i_sb);
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
 	if (eof_block < map->m_lblk + map->m_len)
@@ -3223,8 +3224,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	 */
 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
 
-	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
+	/* If extent has less than 2*s_extent_zeroout_len zerout directly */
+	if (ee_len <= (2 * sbi->s_extent_zeroout_len) &&
 	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
 		err = ext4_ext_zeroout(inode, ex);
 		if (err)
@@ -3250,7 +3251,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	split_map.m_len = map->m_len;
 
 	if (allocated > map->m_len) {
-		if (allocated <= EXT4_EXT_ZERO_LEN &&
+		if (allocated <= sbi->s_extent_zeroout_len &&
 		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
 			/* case 3 */
 			zero_ex.ee_block =
@@ -3264,7 +3265,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			split_map.m_lblk = map->m_lblk;
 			split_map.m_len = allocated;
 		} else if ((map->m_lblk - ee_block + map->m_len <
-			   EXT4_EXT_ZERO_LEN) &&
+			   sbi->s_extent_zeroout_len) &&
 			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
 			/* case 2 */
 			if (map->m_lblk != ee_block) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5896dcb..4a7092b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2541,6 +2541,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_RW_ATTR_SBI_UI(extent_zeroout_len, s_extent_zeroout_len);
 EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
 
 static struct attribute *ext4_attrs[] = {
@@ -2556,6 +2557,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(mb_stream_req),
 	ATTR_LIST(mb_group_prealloc),
 	ATTR_LIST(max_writeback_mb_bump),
+	ATTR_LIST(extent_zeroout_len),
 	ATTR_LIST(trigger_fs_error),
 	NULL,
 };
@@ -3752,6 +3754,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;
+	sbi->s_extent_zeroout_len = 16;
 
 	/*
 	 * set up enough so that it can read an inode

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] ext4: dynamical adjust the length of zero-out chunk
  2012-08-13  3:22     ` Theodore Ts'o
@ 2012-08-13  6:55       ` Zheng Liu
  2012-08-13 17:32       ` Zach Brown
  1 sibling, 0 replies; 13+ messages in thread
From: Zheng Liu @ 2012-08-13  6:55 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Andreas Dilger, Zheng Liu, Eric Sandeen, ext4 development,
	Zach Brown

On Sun, Aug 12, 2012 at 11:22:43PM -0400, Theodore Ts'o wrote:
> On Thu, Jul 12, 2012 at 10:51:12AM -0600, Andreas Dilger wrote:
> > 
> > It would make sense to use the s_raid_stripe_width as the default value for
> > this parameter.  The other thing we need to pay attention to is that the
> > growth of the extent zeroing be done on a RAID or erase-block aligned manner.
> > Otherwise, this might cause extra IO that doesn't benefit the application.
> 
> Well.... it really depends on the workload.  If you have a workload
> which is doing random writes into an uninitialized region of memory,
> on a RAID device you're going to be doing read/modify/write cycles
> anyway.  By using a larger zero-out chunk parameter, it avoids the
> excess metadata operations, and it avoids fragmenting the extent tree.
> 
> The patch that sent out, "ext4: collapse a single extent tree block
> into the inode if possible" will help out in at least some cases,
> hopefully the most common ones, but using a larger zero-out size can
> also help address this situation.
> 
> My larger concern with this patch is that 1MB writes are not free, and
> turning a 4k random write into a 1MB write is going to be noticeable.
> I've changed the default from 1MB to 256k, just to be more
> conservative, but need to do some benchmarking to make sure we
> understand what the best number will be on a variety of common
> hardware in use by our users.
> 
> I also reworded the commit description slightly, and this is what I
> currently have in my tree.  What do people think?

Hi Ted,

Thanks for your review; it looks good to me.  BTW, do we need to
add a description of this tunable parameter to the documentation?

Regards,
Zheng

> commit 5b9401f6f5afbce4cacdd01cc7c74780cc084aa3
> Author: Zheng Liu <wenqing.lz@taobao.com>
> Date:   Sun Aug 12 23:08:58 2012 -0400
> 
>     ext4: make the zero-out chunk size tunable
>     
>     Currently in ext4 the length of zero-out chunk is set to 7.  But it is
>     too short so that it will cause a lot of fragmentation of extent when
>     we use fallocate to preallocate some uninitialized extents and the
>     workload frequently does some uninitialized extent conversions.  Thus,
>     we allow it to be tunable via sysfs and set an initial default value
>     of 32, so instead of creating uninitalized extents smaller than
>     256k (assuming a 4k block size), they will be zeroed out instead.
>     
>     CC: Zach Brown <zab@zabbo.net>
>     CC: Andreas Dilger <adilger@dilger.ca>
>     Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
>     Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 7c0841e..f9024a6 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1271,6 +1271,9 @@ struct ext4_sb_info {
>  	unsigned long s_sectors_written_start;
>  	u64 s_kbytes_written;
>  
> +	/* the size of zero-out chunk */
> +	unsigned int s_extent_zeroout_len;
> +
>  	unsigned int s_log_groups_per_flex;
>  	struct flex_groups *s_flex_groups;
>  
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 92fac2f..10f0afd 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -3084,7 +3084,6 @@ out:
>  	return err ? err : map->m_len;
>  }
>  
> -#define EXT4_EXT_ZERO_LEN 7
>  /*
>   * This function is called by ext4_ext_map_blocks() if someone tries to write
>   * to an uninitialized extent. It may result in splitting the uninitialized
> @@ -3110,6 +3109,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  					   struct ext4_map_blocks *map,
>  					   struct ext4_ext_path *path)
>  {
> +	struct ext4_sb_info *sbi;
>  	struct ext4_extent_header *eh;
>  	struct ext4_map_blocks split_map;
>  	struct ext4_extent zero_ex;
> @@ -3124,6 +3124,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  		"block %llu, max_blocks %u\n", inode->i_ino,
>  		(unsigned long long)map->m_lblk, map->m_len);
>  
> +	sbi = EXT4_SB(inode->i_sb);
>  	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
>  		inode->i_sb->s_blocksize_bits;
>  	if (eof_block < map->m_lblk + map->m_len)
> @@ -3223,8 +3224,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  	 */
>  	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
>  
> -	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
> -	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
> +	/* If extent has less than 2*s_extent_zeroout_len zerout directly */
> +	if (ee_len <= (2 * sbi->s_extent_zeroout_len) &&
>  	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
>  		err = ext4_ext_zeroout(inode, ex);
>  		if (err)
> @@ -3250,7 +3251,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  	split_map.m_len = map->m_len;
>  
>  	if (allocated > map->m_len) {
> -		if (allocated <= EXT4_EXT_ZERO_LEN &&
> +		if (allocated <= sbi->s_extent_zeroout_len &&
>  		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
>  			/* case 3 */
>  			zero_ex.ee_block =
> @@ -3264,7 +3265,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  			split_map.m_lblk = map->m_lblk;
>  			split_map.m_len = allocated;
>  		} else if ((map->m_lblk - ee_block + map->m_len <
> -			   EXT4_EXT_ZERO_LEN) &&
> +			   sbi->s_extent_zeroout_len) &&
>  			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
>  			/* case 2 */
>  			if (map->m_lblk != ee_block) {
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 5896dcb..4a7092b 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -2541,6 +2541,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
>  EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
>  EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
>  EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
> +EXT4_RW_ATTR_SBI_UI(extent_zeroout_len, s_extent_zeroout_len);
>  EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
>  
>  static struct attribute *ext4_attrs[] = {
> @@ -2556,6 +2557,7 @@ static struct attribute *ext4_attrs[] = {
>  	ATTR_LIST(mb_stream_req),
>  	ATTR_LIST(mb_group_prealloc),
>  	ATTR_LIST(max_writeback_mb_bump),
> +	ATTR_LIST(extent_zeroout_len),
>  	ATTR_LIST(trigger_fs_error),
>  	NULL,
>  };
> @@ -3752,6 +3754,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  
>  	sbi->s_stripe = ext4_get_stripe_size(sbi);
>  	sbi->s_max_writeback_mb_bump = 128;
> +	sbi->s_extent_zeroout_len = 16;
>  
>  	/*
>  	 * set up enough so that it can read an inode

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] ext4: dynamical adjust the length of zero-out chunk
  2012-08-13  3:22     ` Theodore Ts'o
  2012-08-13  6:55       ` Zheng Liu
@ 2012-08-13 17:32       ` Zach Brown
  2012-08-13 18:40         ` Theodore Ts'o
  1 sibling, 1 reply; 13+ messages in thread
From: Zach Brown @ 2012-08-13 17:32 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Andreas Dilger, Zheng Liu, Eric Sandeen, Zheng Liu,
	ext4 development

> I also reworded the commit description slightly, and this is what I
> currently have in my tree.  What do people think?

(well, you asked! :))

>     we allow it to be tunable via sysfs and set an initial default value
>     of 32, so instead of creating uninitalized extents smaller than

s/32/16/?

>     256k (assuming a 4k block size), they will be zeroed out instead.

Having to qualify the zeroout len with the block size caught my
attention!

> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -2541,6 +2541,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
>  EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
>  EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
>  EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
> +EXT4_RW_ATTR_SBI_UI(extent_zeroout_len, s_extent_zeroout_len);
>  EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);

"len" seems to stand out amongst all those "mb" names :)

It'd be nice to define the tunable in terms of some fixed unit, kb or
mb, whatever, and then translate to the block size in the code so people
don't have to do that math by hand.

No?

- z

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] ext4: dynamical adjust the length of zero-out chunk
  2012-08-13 17:32       ` Zach Brown
@ 2012-08-13 18:40         ` Theodore Ts'o
  2012-08-13 19:49           ` Zach Brown
  0 siblings, 1 reply; 13+ messages in thread
From: Theodore Ts'o @ 2012-08-13 18:40 UTC (permalink / raw)
  To: Zach Brown
  Cc: Andreas Dilger, Zheng Liu, Eric Sandeen, Zheng Liu,
	ext4 development

On Mon, Aug 13, 2012 at 10:32:24AM -0700, Zach Brown wrote:
> >     we allow it to be tunable via sysfs and set an initial default value
> >     of 32, so instead of creating uninitalized extents smaller than
> 
> s/32/16/?

Oops, nice catch.

> It'd be nice to define the tunable in terms of some fixed unit, kb or
> mb, whatever, and then translate to the block size in the code so people
> don't have to do that math by hand.
> 
> No?

Agreed, thanks for the suggestion.  The next question is whether the
default maximum zero-out size should be 256k (as previously
documented) or 128k (as previously coded).

The previous rule of thumb which I had used was that after doing a
random seek, the difference between the time needed to write 4k and
the time needed to write 32k was pretty much in the noise.  But that
was a number from several years ago.  I suppose I should do some quick
experiments to see what is a good number these days....

						- Ted

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] ext4: dynamical adjust the length of zero-out chunk
  2012-08-13 18:40         ` Theodore Ts'o
@ 2012-08-13 19:49           ` Zach Brown
  2012-08-13 21:35             ` Theodore Ts'o
  0 siblings, 1 reply; 13+ messages in thread
From: Zach Brown @ 2012-08-13 19:49 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Andreas Dilger, Zheng Liu, Eric Sandeen, Zheng Liu,
	ext4 development

> The previous rule of thumb which I had used was that after doing a
> random seek, the difference between the time needed to write 4k and
> the time needed to write 32k was pretty much in the noise.  But that
> was a number from several years ago.  I suppose I should do some quick
> experiments to see what is a good number these days....

Indeed.  fio to the rescue.

I remember Christoph saying something about 64k somewhat recently?  But,
helpfully, I can't recall the details :).

- z

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] ext4: dynamical adjust the length of zero-out chunk
  2012-08-13 19:49           ` Zach Brown
@ 2012-08-13 21:35             ` Theodore Ts'o
  2012-08-14 15:13               ` [PATCH] ext4: make the zero-out chunk size tunable Theodore Ts'o
  0 siblings, 1 reply; 13+ messages in thread
From: Theodore Ts'o @ 2012-08-13 21:35 UTC (permalink / raw)
  To: Zach Brown
  Cc: Andreas Dilger, Zheng Liu, Eric Sandeen, Zheng Liu,
	ext4 development

On Mon, Aug 13, 2012 at 12:49:02PM -0700, Zach Brown wrote:
> 
> Indeed.  fio to the rescue.
> 
> I remember Christoph saying something about 64k somewhat recently?  But,
> helpfully, I can't recall the details :).

So here are some quick fio numbers, using a modern 5400rpm 2.5" disk,
using 8192 samples doing random writes at different sizes:

4k     min=0.099 , max=69.980 , avg=1.95249
8k     min=0.112 , max=71.393 , avg=2.39577
16k    min=0.123 , max=79.951 , avg=3.29693
32k    min=0.190 , max=75.846 , avg=3.57158
64k    min=0.305 , max=71.386 , avg=4.43218
128k   min=0.554 , max=77.925 , avg=6.40304
256k   min=1     , max=68     , avg=10.21

If we take into account that a random write into a fallocate'd file
will need to update the extent tree, this is the time that it would
take to do a 4k random write if we are also using a more aggressive
max zero-out length (plus the extent tree block update):

        zeroout +
        4k metadata update
           (ms)
4k         3.90
8k         4.35
16k        5.25
32k        5.52
64k        6.38
128k       8.36
256k      12.16

So I can see going to 64k, but unless we're really concerned about
extent fragmentation, I don't think larger values make a whole lot of
sense, especially if we are concerned about lowering latency
variability when writing into freshly fallocate'd space.  And if the
concern is extent fragmentation, we may be better off fixing our
extent tree manipulation code so we are better at merging adjacent
extent tree blocks instead.

						- Ted

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] ext4: make the zero-out chunk size tunable
  2012-08-13 21:35             ` Theodore Ts'o
@ 2012-08-14 15:13               ` Theodore Ts'o
  2012-08-14 15:15                 ` [PATCH -v4] " Theodore Ts'o
  0 siblings, 1 reply; 13+ messages in thread
From: Theodore Ts'o @ 2012-08-14 15:13 UTC (permalink / raw)
  To: Ext4 Developers List
  Cc: Zheng Liu, Zach Brown, Andreas Dilger, Theodore Ts'o

From: Zheng Liu <wenqing.lz@taobao.com>

Currently in ext4 the length of the zero-out chunk is set to 7 file
system blocks.  But if an inode has uninitialized extents from using
fallocate to preallocate space, and the workload issues many random
writes, this can fragment the inode's extents and unnecessarily grow
the extent tree.

So create a new sysfs tunable, extent_max_zeroout_kb, which controls
the maximum size where blocks will be zeroed out instead of creating a
new uninitialized extent.  The default of this has been set to 32kb.

CC: Zach Brown <zab@zabbo.net>
CC: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 Documentation/ABI/testing/sysfs-fs-ext4 | 13 +++++++++++++
 fs/ext4/ext4.h                          |  3 +++
 fs/ext4/extents.c                       | 25 +++++++++++++------------
 fs/ext4/super.c                         |  3 +++
 4 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
index f22ac08..c631253 100644
--- a/Documentation/ABI/testing/sysfs-fs-ext4
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -96,3 +96,16 @@ Contact:	"Theodore Ts'o" <tytso@mit.edu>
 Description:
 		The maximum number of megabytes the writeback code will
 		try to write out before move on to another inode.
+
+What:		/sys/fs/ext4/<disk>/extent_max_zeroout_kb
+Date:		August 2012
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The maximum number of kilobytes which will be zeroed
+		out in preference to creating a new uninitialized
+		extent when manipulating an inode's extent tree.  Note
+		that using a larger value will increase the
+		variability of time necessary to complete a random
+		write operation (since a 4k random write might turn
+		into a much larger write due to the zeroout
+		operation).
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7c0841e..0df5ee1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1271,6 +1271,9 @@ struct ext4_sb_info {
 	unsigned long s_sectors_written_start;
 	u64 s_kbytes_written;
 
+	/* the size of zero-out chunk */
+	unsigned int s_extent_max_zeroout_kb;
+
 	unsigned int s_log_groups_per_flex;
 	struct flex_groups *s_flex_groups;
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 92fac2f..769151d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3084,7 +3084,6 @@ out:
 	return err ? err : map->m_len;
 }
 
-#define EXT4_EXT_ZERO_LEN 7
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
@@ -3110,13 +3109,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct ext4_map_blocks *map,
 					   struct ext4_ext_path *path)
 {
+	struct ext4_sb_info *sbi;
 	struct ext4_extent_header *eh;
 	struct ext4_map_blocks split_map;
 	struct ext4_extent zero_ex;
 	struct ext4_extent *ex;
 	ext4_lblk_t ee_block, eof_block;
 	unsigned int ee_len, depth;
-	int allocated;
+	int allocated, max_zeroout = 0;
 	int err = 0;
 	int split_flag = 0;
 
@@ -3124,6 +3124,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		"block %llu, max_blocks %u\n", inode->i_ino,
 		(unsigned long long)map->m_lblk, map->m_len);
 
+	sbi = EXT4_SB(inode->i_sb);
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
 	if (eof_block < map->m_lblk + map->m_len)
@@ -3223,9 +3224,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	 */
 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
 
-	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
-	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+	if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+		max_zeroout = sbi->s_extent_max_zeroout_kb >>
+			inode->i_sb->s_blocksize_bits;
+
+	/* If extent is less than s_max_zeroout_kb, zeroout directly */
+	if (max_zeroout && (ee_len <= max_zeroout)) {
 		err = ext4_ext_zeroout(inode, ex);
 		if (err)
 			goto out;
@@ -3249,9 +3253,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	split_map.m_lblk = map->m_lblk;
 	split_map.m_len = map->m_len;
 
-	if (allocated > map->m_len) {
-		if (allocated <= EXT4_EXT_ZERO_LEN &&
-		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+	if (max_zeroout && (allocated > map->m_len)) {
+		if (allocated <= max_zeroout) {
 			/* case 3 */
 			zero_ex.ee_block =
 					 cpu_to_le32(map->m_lblk);
@@ -3263,9 +3266,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				goto out;
 			split_map.m_lblk = map->m_lblk;
 			split_map.m_len = allocated;
-		} else if ((map->m_lblk - ee_block + map->m_len <
-			   EXT4_EXT_ZERO_LEN) &&
-			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+		} else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
 			/* case 2 */
 			if (map->m_lblk != ee_block) {
 				zero_ex.ee_block = ex->ee_block;
@@ -3285,7 +3286,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	}
 
 	allocated = ext4_split_extent(handle, inode, path,
-				       &split_map, split_flag, 0);
+				      &split_map, split_flag, 0);
 	if (allocated < 0)
 		err = allocated;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5896dcb..7dc4bd7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2541,6 +2541,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
 
 static struct attribute *ext4_attrs[] = {
@@ -2556,6 +2557,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(mb_stream_req),
 	ATTR_LIST(mb_group_prealloc),
 	ATTR_LIST(max_writeback_mb_bump),
+	ATTR_LIST(extent_max_zeroout_kb),
 	ATTR_LIST(trigger_fs_error),
 	NULL,
 };
@@ -3752,6 +3754,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;
+	sbi->s_extent_max_zeroout_kb = 256;
 
 	/*
 	 * set up enough so that it can read an inode
-- 
1.7.12.rc0.22.gcdd159b


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH -v4] ext4: make the zero-out chunk size tunable
  2012-08-14 15:13               ` [PATCH] ext4: make the zero-out chunk size tunable Theodore Ts'o
@ 2012-08-14 15:15                 ` Theodore Ts'o
  0 siblings, 0 replies; 13+ messages in thread
From: Theodore Ts'o @ 2012-08-14 15:15 UTC (permalink / raw)
  To: Ext4 Developers List
  Cc: Zheng Liu, Zach Brown, Andreas Dilger, Theodore Ts'o

From: Zheng Liu <wenqing.lz@taobao.com>

Currently in ext4 the length of the zero-out chunk is set to 7 file
system blocks.  But if an inode has uninitialized extents from using
fallocate to preallocate space, and the workload issues many random
writes, this can fragment the inode's extents and unnecessarily grow
the extent tree.

So create a new sysfs tunable, extent_max_zeroout_kb, which controls
the maximum size where blocks will be zeroed out instead of creating a
new uninitialized extent.  The default of this has been set to 32kb.

CC: Zach Brown <zab@zabbo.net>
CC: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 Documentation/ABI/testing/sysfs-fs-ext4 | 13 +++++++++++++
 fs/ext4/ext4.h                          |  3 +++
 fs/ext4/extents.c                       | 25 +++++++++++++------------
 fs/ext4/super.c                         |  3 +++
 4 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
index f22ac08..c631253 100644
--- a/Documentation/ABI/testing/sysfs-fs-ext4
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -96,3 +96,16 @@ Contact:	"Theodore Ts'o" <tytso@mit.edu>
 Description:
 		The maximum number of megabytes the writeback code will
 		try to write out before move on to another inode.
+
+What:		/sys/fs/ext4/<disk>/extent_max_zeroout_kb
+Date:		August 2012
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The maximum number of kilobytes which will be zeroed
+		out in preference to creating a new uninitialized
+		extent when manipulating an inode's extent tree.  Note
+		that using a larger value will increase the
+		variability of time necessary to complete a random
+		write operation (since a 4k random write might turn
+		into a much larger write due to the zeroout
+		operation).
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7c0841e..0df5ee1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1271,6 +1271,9 @@ struct ext4_sb_info {
 	unsigned long s_sectors_written_start;
 	u64 s_kbytes_written;
 
+	/* the size of zero-out chunk */
+	unsigned int s_extent_max_zeroout_kb;
+
 	unsigned int s_log_groups_per_flex;
 	struct flex_groups *s_flex_groups;
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 92fac2f..769151d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3084,7 +3084,6 @@ out:
 	return err ? err : map->m_len;
 }
 
-#define EXT4_EXT_ZERO_LEN 7
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
@@ -3110,13 +3109,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct ext4_map_blocks *map,
 					   struct ext4_ext_path *path)
 {
+	struct ext4_sb_info *sbi;
 	struct ext4_extent_header *eh;
 	struct ext4_map_blocks split_map;
 	struct ext4_extent zero_ex;
 	struct ext4_extent *ex;
 	ext4_lblk_t ee_block, eof_block;
 	unsigned int ee_len, depth;
-	int allocated;
+	int allocated, max_zeroout = 0;
 	int err = 0;
 	int split_flag = 0;
 
@@ -3124,6 +3124,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		"block %llu, max_blocks %u\n", inode->i_ino,
 		(unsigned long long)map->m_lblk, map->m_len);
 
+	sbi = EXT4_SB(inode->i_sb);
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
 	if (eof_block < map->m_lblk + map->m_len)
@@ -3223,9 +3224,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	 */
 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
 
-	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
-	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+	if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+		max_zeroout = sbi->s_extent_max_zeroout_kb >>
+			inode->i_sb->s_blocksize_bits;
+
+	/* If extent is less than s_max_zeroout_kb, zeroout directly */
+	if (max_zeroout && (ee_len <= max_zeroout)) {
 		err = ext4_ext_zeroout(inode, ex);
 		if (err)
 			goto out;
@@ -3249,9 +3253,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	split_map.m_lblk = map->m_lblk;
 	split_map.m_len = map->m_len;
 
-	if (allocated > map->m_len) {
-		if (allocated <= EXT4_EXT_ZERO_LEN &&
-		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+	if (max_zeroout && (allocated > map->m_len)) {
+		if (allocated <= max_zeroout) {
 			/* case 3 */
 			zero_ex.ee_block =
 					 cpu_to_le32(map->m_lblk);
@@ -3263,9 +3266,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				goto out;
 			split_map.m_lblk = map->m_lblk;
 			split_map.m_len = allocated;
-		} else if ((map->m_lblk - ee_block + map->m_len <
-			   EXT4_EXT_ZERO_LEN) &&
-			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+		} else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
 			/* case 2 */
 			if (map->m_lblk != ee_block) {
 				zero_ex.ee_block = ex->ee_block;
@@ -3285,7 +3286,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	}
 
 	allocated = ext4_split_extent(handle, inode, path,
-				       &split_map, split_flag, 0);
+				      &split_map, split_flag, 0);
 	if (allocated < 0)
 		err = allocated;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5896dcb..e7ccbe5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2541,6 +2541,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
 
 static struct attribute *ext4_attrs[] = {
@@ -2556,6 +2557,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(mb_stream_req),
 	ATTR_LIST(mb_group_prealloc),
 	ATTR_LIST(max_writeback_mb_bump),
+	ATTR_LIST(extent_max_zeroout_kb),
 	ATTR_LIST(trigger_fs_error),
 	NULL,
 };
@@ -3752,6 +3754,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;
+	sbi->s_extent_max_zeroout_kb = 32;
 
 	/*
 	 * set up enough so that it can read an inode
-- 
1.7.12.rc0.22.gcdd159b


^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2012-08-14 15:15 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-07-12  6:48 [PATCH v2] ext4: dynamical adjust the length of zero-out chunk Zheng Liu
2012-07-12 14:49 ` Eric Sandeen
2012-07-12 16:51   ` Andreas Dilger
2012-07-17  7:55     ` Zheng Liu
2012-08-13  3:22     ` Theodore Ts'o
2012-08-13  6:55       ` Zheng Liu
2012-08-13 17:32       ` Zach Brown
2012-08-13 18:40         ` Theodore Ts'o
2012-08-13 19:49           ` Zach Brown
2012-08-13 21:35             ` Theodore Ts'o
2012-08-14 15:13               ` [PATCH] ext4: make the zero-out chunk size tunable Theodore Ts'o
2012-08-14 15:15                 ` [PATCH -v4] " Theodore Ts'o
2012-07-17  7:19   ` [PATCH v2] ext4: dynamical adjust the length of zero-out chunk Zheng Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).