linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG
@ 2008-01-11 17:28 Jose R. Santos
  2008-01-11 21:01 ` Andreas Dilger
  0 siblings, 1 reply; 15+ messages in thread
From: Jose R. Santos @ 2008-01-11 17:28 UTC (permalink / raw)
  To: Theodore Tso, linux-ext4

commit 38a4134f29b06229843bfe838c23e28f8d323b86
Author: Jose R. Santos <jrs@us.ibm.com>
Date:   Fri Jan 11 11:03:03 2008 -0600

    New bitmap and inode table allocation for FLEX_BG
    
    Change the way we allocate bitmaps and inode tables if the FLEX_BG
    feature is used at mke2fs time.  It places calculates a new offset for
    bitmaps and inode table base on the number of groups that the user
    wishes to pack together using the new "-G" option.  Creating a
    filesystem with 64 block groups in a flex group can be done by:
    
    mke2fs -j -I 256 -O flex_bg -G 32 /dev/sdX
    
    Signed-off-by: Jose R. Santos <jrs@us.ibm.com>
    Singed-off-by: Valerie Clement <valerie.clement@bull.net>

diff --git a/lib/ext2fs/alloc_tables.c b/lib/ext2fs/alloc_tables.c
index 4ad2ba9..f85ef97 100644
--- a/lib/ext2fs/alloc_tables.c
+++ b/lib/ext2fs/alloc_tables.c
@@ -27,18 +27,55 @@
 #include "ext2_fs.h"
 #include "ext2fs.h"
 
+blk_t ext2fs_flexbg_offset(ext2_filsys fs, dgrp_t group, int flexbg_size, 
+			   ext2fs_block_bitmap bmap, int offset, int size)
+{
+	int		flexbg;
+	errcode_t	retval;
+	blk_t		start_blk, last_blk, first_free = 0;
+	dgrp_t	       	last_grp;
+	
+	flexbg = group / flexbg_size;
+
+	last_grp = (group + (flexbg_size - (group % flexbg_size) - 1));
+	start_blk = ext2fs_group_first_block(fs, flexbg_size * flexbg);
+	last_blk = ext2fs_group_last_block(fs, last_grp);
+	if (last_grp > fs->group_desc_count)
+		last_grp = fs->group_desc_count;
+
+	if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap, 
+				   &first_free))
+		return first_free;
+	
+	if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size, 
+				   bmap, &first_free))
+		return first_free;
+
+	return first_free;
+}
+
 errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 				      ext2fs_block_bitmap bmap)
 {
 	errcode_t	retval;
 	blk_t		group_blk, start_blk, last_blk, new_blk, blk;
-	int		j;
+	dgrp_t		last_grp;
+	int		j, rem_grps, flexbg_size = 0;
 
 	group_blk = ext2fs_group_first_block(fs, group);
 	last_blk = ext2fs_group_last_block(fs, group);
 
 	if (!bmap)
 		bmap = fs->block_map;
+
+	if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
+				       EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+		flexbg_size = 1 << fs->super->s_log_groups_per_flex;
+		rem_grps = flexbg_size - (group % flexbg_size);
+		last_grp = group + rem_grps - 1;
+		if (last_grp > fs->group_desc_count)
+			last_grp = fs->group_desc_count;
+	}
 	
 	/*
 	 * Allocate the block and inode bitmaps, if necessary
@@ -56,6 +93,12 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 	} else
 		start_blk = group_blk;
 
+	if (flexbg_size) {
+		start_blk = ext2fs_flexbg_offset (fs, group, flexbg_size, bmap,
+						  0, rem_grps);
+		last_blk = ext2fs_group_last_block(fs, last_grp);
+	}
+
 	if (!fs->group_desc[group].bg_block_bitmap) {
 		retval = ext2fs_get_free_blocks(fs, start_blk, last_blk,
 						1, bmap, &new_blk);
@@ -68,6 +111,12 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 		fs->group_desc[group].bg_block_bitmap = new_blk;
 	}
 
+	if (flexbg_size) {
+		start_blk = ext2fs_flexbg_offset (fs, group, flexbg_size, bmap,
+						  flexbg_size, rem_grps);
+		last_blk = ext2fs_group_last_block(fs, last_grp);
+	}
+
 	if (!fs->group_desc[group].bg_inode_bitmap) {
 		retval = ext2fs_get_free_blocks(fs, start_blk, last_blk,
 						1, bmap, &new_blk);
@@ -83,6 +132,13 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 	/*
 	 * Allocate the inode table
 	 */
+	if (flexbg_size) {
+		group_blk = ext2fs_flexbg_offset (fs, group, flexbg_size, bmap,
+						  flexbg_size * 2,
+						  fs->inode_blocks_per_group * rem_grps);
+		last_blk = ext2fs_group_last_block(fs, last_grp);
+	}
+
 	if (!fs->group_desc[group].bg_inode_table) {
 		retval = ext2fs_get_free_blocks(fs, group_blk, last_blk,
 						fs->inode_blocks_per_group,
@@ -112,6 +168,7 @@ errcode_t ext2fs_allocate_tables(ext2_filsys fs)
 		if (retval)
 			return retval;
 	}
+
 	return 0;
 }
 
diff --git a/lib/ext2fs/closefs.c b/lib/ext2fs/closefs.c
index c570256..949c657 100644
--- a/lib/ext2fs/closefs.c
+++ b/lib/ext2fs/closefs.c
@@ -56,6 +56,7 @@ int ext2fs_super_and_bgd_loc(ext2_filsys fs,
 	unsigned int meta_bg, meta_bg_size;
 	int	numblocks, has_super;
 	int	old_desc_blocks;
+	unsigned int flex_bg_size = 1 << fs->super->s_log_groups_per_flex;
 
 	group_block = fs->super->s_first_data_block +
 		(group * fs->super->s_blocks_per_group);
@@ -101,7 +102,11 @@ int ext2fs_super_and_bgd_loc(ext2_filsys fs,
 		}
 	}
 		
-	numblocks -= 2 + fs->inode_blocks_per_group;
+	if (flex_bg_size) {
+		if ((group % flex_bg_size) == 0)
+			numblocks -= 2 + fs->inode_blocks_per_group;
+	} else
+		numblocks -= 2 + fs->inode_blocks_per_group;
 
 	if (ret_super_blk)
 		*ret_super_blk = super_blk;
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index 2394857..bcb88ff 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -577,7 +577,10 @@ struct ext2_super_block {
 	__u16   s_mmp_interval;         /* # seconds to wait in MMP checking */
 	__u64   s_mmp_block;            /* Block for multi-mount protection */
 	__u32   s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
-	__u32   s_reserved[163];        /* Padding to the end of the block */
+	__u8	s_log_groups_per_flex;	/* FLEX_BG group size */
+	__u8    s_reserved_char_pad;
+	__u16	s_reserved_padg;	/* Padding to next 32bits */
+	__u32   s_reserved[162];        /* Padding to the end of the block */
 };
 
 /*
diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index 5c461c9..350e322 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -65,6 +65,7 @@ extern "C" {
 
 typedef __u32		ext2_ino_t;
 typedef __u32		blk_t;
+typedef __u64		blk64_t;
 typedef __u32		dgrp_t;
 typedef __u32		ext2_off_t;
 typedef __s64		e2_blkcnt_t;
diff --git a/lib/ext2fs/initialize.c b/lib/ext2fs/initialize.c
index 16e9eaa..2efdfeb 100644
--- a/lib/ext2fs/initialize.c
+++ b/lib/ext2fs/initialize.c
@@ -156,6 +156,7 @@ errcode_t ext2fs_initialize(const char *name, int flags,
 	set_field(s_feature_incompat, 0);
 	set_field(s_feature_ro_compat, 0);
 	set_field(s_first_meta_bg, 0);
+	set_field(s_log_groups_per_flex, 0);
 	if (super->s_feature_incompat & ~EXT2_LIB_FEATURE_INCOMPAT_SUPP) {
 		retval = EXT2_ET_UNSUPP_FEATURE;
 		goto cleanup;
diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index 9e2d7a8..504f73b 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -96,7 +96,7 @@ static void usage(void)
 {
 	fprintf(stderr, _("Usage: %s [-c|-t|-l filename] [-b block-size] "
 	"[-f fragment-size]\n\t[-i bytes-per-inode] [-I inode-size] "
-	"[-j] [-J journal-options]\n"
+	"[-j] [-J journal-options] [-G meta group size]\n"
 	"\t[-N number-of-inodes] [-m reserved-blocks-percentage] "
 	"[-o creator-os]\n\t[-g blocks-per-group] [-L volume-label] "
 	"[-M last-mounted-directory]\n\t[-O feature[,...]] "
@@ -912,6 +912,7 @@ static void PRS(int argc, char *argv[])
 	int		blocksize = 0;
 	int		inode_ratio = 0;
 	int		inode_size = 0;
+	unsigned long	flex_bg_size = 0;
 	double		reserved_ratio = 5.0;
 	int		sector_size = 0;
 	int		show_version_only = 0;
@@ -994,7 +995,7 @@ static void PRS(int argc, char *argv[])
 	}
 
 	while ((c = getopt (argc, argv,
-		    "b:cf:g:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
+		    "b:cf:g:G:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
 		switch (c) {
 		case 'b':
 			blocksize = strtol(optarg, &tmp, 0);
@@ -1045,6 +1046,20 @@ static void PRS(int argc, char *argv[])
 				exit(1);
 			}
 			break;
+		case 'G':
+			flex_bg_size = strtoul(optarg, &tmp, 0);
+			if (*tmp) {
+				com_err(program_name, 0,
+					_("Illegal number for Flex_BG size"));
+				exit(1);
+			}
+			if (flex_bg_size < 2 || 
+			    (flex_bg_size & (flex_bg_size-1)) != 0) {
+				com_err(program_name, 0,
+					_("Flex_BG size must be a power of 2"));
+				exit(1);
+			}
+			break;
 		case 'i':
 			inode_ratio = strtoul(optarg, &tmp, 0);
 			if (inode_ratio < EXT2_MIN_BLOCK_SIZE ||
@@ -1444,6 +1459,16 @@ static void PRS(int argc, char *argv[])
 		}
 	}
 
+	if(flex_bg_size) {
+		unsigned int tmp, shift;
+		shift = 0;
+		tmp = flex_bg_size;
+		while ((tmp >>= 1UL) != 0UL)
+			shift++;
+
+		fs_param.s_log_groups_per_flex = tmp;
+	}
+
 	if (!force && fs_param.s_blocks_count >= ((unsigned) 1 << 31)) {
 		com_err(program_name, 0,
 			_("Filesystem too large.  No more than 2**31-1 blocks\n"

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG
  2008-01-11 17:28 [PATCH] e2fsprogs: " Jose R. Santos
@ 2008-01-11 21:01 ` Andreas Dilger
  2008-01-11 22:11   ` Jose R. Santos
  0 siblings, 1 reply; 15+ messages in thread
From: Andreas Dilger @ 2008-01-11 21:01 UTC (permalink / raw)
  To: Jose R. Santos; +Cc: Theodore Tso, linux-ext4

On Jan 11, 2008  11:28 -0600, Jose R. Santos wrote:
> +blk_t ext2fs_flexbg_offset(ext2_filsys fs, dgrp_t group, int flexbg_size, 
> +			   ext2fs_block_bitmap bmap, int offset, int size)

Could you please add some comments for what this function is trying to do?

> +	last_grp = (group + (flexbg_size - (group % flexbg_size) - 1));

Is this the same as:

	last_grp = group + (flexbg_size - 1) / flexbg_size * flexbg_size
	
(i.e. trying to round up to the next even multiple of flexbg_size)?

Didn't we decide to have flexbg_size be a power-of-two value, so we could
use shift and mask instead of divide and mod?  It's less of an issue because
group is only a 32-bit value, I guess.

> +	if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap, 
> +				   &first_free))
> +		return first_free;
> +	
> +	if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size, 
> +				   bmap, &first_free))
> +		return first_free;
> +
> +	return first_free;
> +}
> +
>  errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
>  				      ext2fs_block_bitmap bmap)
>  {
>  	errcode_t	retval;
>  	blk_t		group_blk, start_blk, last_blk, new_blk, blk;
> -	int		j;
> +	dgrp_t		last_grp;
> +	int		j, rem_grps, flexbg_size = 0;
>  
>  	group_blk = ext2fs_group_first_block(fs, group);
>  	last_blk = ext2fs_group_last_block(fs, group);
>  
>  	if (!bmap)
>  		bmap = fs->block_map;
> +
> +	if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
> +				       EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
> +		flexbg_size = 1 << fs->super->s_log_groups_per_flex;
> +		rem_grps = flexbg_size - (group % flexbg_size);

Hmm, no point in doing "groups % flexbg_size" if we have
s_log_groups_per_flex.  Could do "groups & (flexbg_size - 1)" instead.

> @@ -101,7 +102,11 @@ int ext2fs_super_and_bgd_loc(ext2_filsys fs,
> +	if (flex_bg_size) {
> +		if ((group % flex_bg_size) == 0)
> +			numblocks -= 2 + fs->inode_blocks_per_group;

Ditto.

> @@ -1045,6 +1046,20 @@ static void PRS(int argc, char *argv[])
>  				exit(1);
>  			}
>  			break;
> +		case 'G':
> +			flex_bg_size = strtoul(optarg, &tmp, 0);
> +			if (*tmp) {
> +				com_err(program_name, 0,
> +					_("Illegal number for Flex_BG size"));
> +				exit(1);
> +			}
> +			if (flex_bg_size < 2 || 
> +			    (flex_bg_size & (flex_bg_size-1)) != 0) {
> +				com_err(program_name, 0,
> +					_("Flex_BG size must be a power of 2"));
> +				exit(1);
> +			}
> +			break;

We've been putting new options under "-E var=value"...  I don't know what
Ted's thoughs are on using new option letters, though this one might qualify.

> @@ -1444,6 +1459,16 @@ static void PRS(int argc, char *argv[])
>  		}
>  	}
>  
> +	if(flex_bg_size) {

Space after "if ".

> +		shift = 0;
> +		tmp = flex_bg_size;
> +		while ((tmp >>= 1UL) != 0UL)
> +			shift++;

There isn't a "log2" function?

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG
  2008-01-11 21:01 ` Andreas Dilger
@ 2008-01-11 22:11   ` Jose R. Santos
  0 siblings, 0 replies; 15+ messages in thread
From: Jose R. Santos @ 2008-01-11 22:11 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Theodore Tso, linux-ext4

On Fri, 11 Jan 2008 14:01:04 -0700
Andreas Dilger <adilger@sun.com> wrote:

> On Jan 11, 2008  11:28 -0600, Jose R. Santos wrote:
> > +blk_t ext2fs_flexbg_offset(ext2_filsys fs, dgrp_t group, int flexbg_size, 
> > +			   ext2fs_block_bitmap bmap, int offset, int size)

OK.
 
> Could you please add some comments for what this function is trying to do?
> 
> > +	last_grp = (group + (flexbg_size - (group % flexbg_size) - 1));
> 
> Is this the same as:
> 
> 	last_grp = group + (flexbg_size - 1) / flexbg_size * flexbg_size
> 	
> (i.e. trying to round up to the next even multiple of flexbg_size)?
> 
> Didn't we decide to have flexbg_size be a power-of-two value, so we could
> use shift and mask instead of divide and mod?  It's less of an issue because
> group is only a 32-bit value, I guess.

Yes, I fixes this in the kernel code but neglected to fix it on the here.
 
> > +	if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap, 
> > +				   &first_free))
> > +		return first_free;
> > +	
> > +	if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size, 
> > +				   bmap, &first_free))
> > +		return first_free;
> > +
> > +	return first_free;
> > +}
> > +
> >  errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
> >  				      ext2fs_block_bitmap bmap)
> >  {
> >  	errcode_t	retval;
> >  	blk_t		group_blk, start_blk, last_blk, new_blk, blk;
> > -	int		j;
> > +	dgrp_t		last_grp;
> > +	int		j, rem_grps, flexbg_size = 0;
> >  
> >  	group_blk = ext2fs_group_first_block(fs, group);
> >  	last_blk = ext2fs_group_last_block(fs, group);
> >  
> >  	if (!bmap)
> >  		bmap = fs->block_map;
> > +
> > +	if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
> > +				       EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
> > +		flexbg_size = 1 << fs->super->s_log_groups_per_flex;
> > +		rem_grps = flexbg_size - (group % flexbg_size);
> 
> Hmm, no point in doing "groups % flexbg_size" if we have
> s_log_groups_per_flex.  Could do "groups & (flexbg_size - 1)" instead.
> 
> > @@ -101,7 +102,11 @@ int ext2fs_super_and_bgd_loc(ext2_filsys fs,
> > +	if (flex_bg_size) {
> > +		if ((group % flex_bg_size) == 0)
> > +			numblocks -= 2 + fs->inode_blocks_per_group;
> 
> Ditto.
> 
> > @@ -1045,6 +1046,20 @@ static void PRS(int argc, char *argv[])
> >  				exit(1);
> >  			}
> >  			break;
> > +		case 'G':
> > +			flex_bg_size = strtoul(optarg, &tmp, 0);
> > +			if (*tmp) {
> > +				com_err(program_name, 0,
> > +					_("Illegal number for Flex_BG size"));
> > +				exit(1);
> > +			}
> > +			if (flex_bg_size < 2 || 
> > +			    (flex_bg_size & (flex_bg_size-1)) != 0) {
> > +				com_err(program_name, 0,
> > +					_("Flex_BG size must be a power of 2"));
> > +				exit(1);
> > +			}
> > +			break;
> 
> We've been putting new options under "-E var=value"...  I don't know what
> Ted's thoughs are on using new option letters, though this one might qualify.

I thought this would qualify as a new option letter.  Waiting on input
from Ted.

> > @@ -1444,6 +1459,16 @@ static void PRS(int argc, char *argv[])
> >  		}
> >  	}
> >  
> > +	if(flex_bg_size) {
> 
> Space after "if ".

Will fix.
 
> > +		shift = 0;
> > +		tmp = flex_bg_size;
> > +		while ((tmp >>= 1UL) != 0UL)
> > +			shift++;
> 
> There isn't a "log2" function?

Couldn't find anything in lib or include.
 
> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
> 



-JRS

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG
@ 2008-02-07 17:09 Jose R. Santos
  2008-02-08  5:11 ` Andreas Dilger
  0 siblings, 1 reply; 15+ messages in thread
From: Jose R. Santos @ 2008-02-07 17:09 UTC (permalink / raw)
  To: Theodore Tso, linux-ext4

New bitmap and inode table allocation for FLEX_BG

From: Jose R. Santos <jrs@us.ibm.com>

Change the way we allocate bitmaps and inode tables if the FLEX_BG
feature is used at mke2fs time.  It places calculates a new offset for
bitmaps and inode table base on the number of groups that the user
wishes to pack together using the new "-G" option.  Creating a
filesystem with 64 block groups in a flex group can be done by:

mke2fs -j -I 256 -O flex_bg -G 32 /dev/sdX

Signed-off-by: Jose R. Santos <jrs@us.ibm.com>
Signed-off-by: Valerie Clement <valerie.clement@bull.net>
---

 lib/ext2fs/alloc_tables.c |  116 ++++++++++++++++++++++++++++++++++++++++++++-
 lib/ext2fs/closefs.c      |    6 ++
 lib/ext2fs/ext2_fs.h      |    6 ++
 lib/ext2fs/initialize.c   |    6 ++
 misc/mke2fs.c             |   25 +++++++++-
 5 files changed, 151 insertions(+), 8 deletions(-)


diff --git a/lib/ext2fs/alloc_tables.c b/lib/ext2fs/alloc_tables.c
index 4ad2ba9..8281858 100644
--- a/lib/ext2fs/alloc_tables.c
+++ b/lib/ext2fs/alloc_tables.c
@@ -27,18 +27,82 @@
 #include "ext2_fs.h"
 #include "ext2fs.h"
 
+void ext2fs_bgd_set_flex_meta_flag(ext2_filsys fs, blk_t block)
+{
+	dgrp_t	group;
+
+	group = ext2fs_group_of_blk(fs, block);
+	if (!(fs->group_desc[group].bg_flags & EXT2_BG_FLEX_METADATA))
+		fs->group_desc[group].bg_flags |= EXT2_BG_FLEX_METADATA;
+}
+
+blk_t ext2fs_flexbg_offset(ext2_filsys fs, dgrp_t group, blk_t start_blk,
+			   ext2fs_block_bitmap bmap, int offset, int size)
+{
+	int		flexbg, flexbg_size, elem_size;
+	blk_t		last_blk, first_free = 0;
+	dgrp_t	       	last_grp;
+
+	flexbg_size = 1 << fs->super->s_log_groups_per_flex;
+	flexbg = group / flexbg_size;
+
+	if (size > fs->super->s_blocks_per_group / 8)
+		size = fs->super->s_blocks_per_group / 8;
+
+	/*
+	 * Dont do a long search if the previous block
+	 * search is still valid.
+	 */
+	if (start_blk && group % flexbg_size) {
+		if (size > flexbg_size)
+			elem_size = fs->inode_blocks_per_group;
+		else
+			elem_size = 1;
+		if (ext2fs_test_block_bitmap_range(bmap, start_blk + elem_size,
+						   size))
+			return start_blk + elem_size;
+	}
+
+	start_blk = ext2fs_group_first_block(fs, flexbg_size * flexbg);
+	last_grp = (group + (flexbg_size - (group % flexbg_size) - 1));
+	last_blk = ext2fs_group_last_block(fs, last_grp);
+	if (last_grp > fs->group_desc_count)
+		last_grp = fs->group_desc_count;
+
+	/* Find the first available block */
+	if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap,
+				   &first_free))
+		return first_free;
+
+	if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size,
+				   bmap, &first_free))
+		return first_free;
+
+	return first_free;
+}
+
 errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 				      ext2fs_block_bitmap bmap)
 {
 	errcode_t	retval;
 	blk_t		group_blk, start_blk, last_blk, new_blk, blk;
-	int		j;
+	dgrp_t		last_grp;
+	int		j, rem_grps, flexbg_size = 0;
 
 	group_blk = ext2fs_group_first_block(fs, group);
 	last_blk = ext2fs_group_last_block(fs, group);
 
 	if (!bmap)
 		bmap = fs->block_map;
+
+	if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
+				       EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+		flexbg_size = 1 << fs->super->s_log_groups_per_flex;
+		rem_grps = flexbg_size - (group % flexbg_size);
+		last_grp = group + rem_grps - 1;
+		if (last_grp > fs->group_desc_count)
+			last_grp = fs->group_desc_count;
+	}
 	
 	/*
 	 * Allocate the block and inode bitmaps, if necessary
@@ -56,6 +120,15 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 	} else
 		start_blk = group_blk;
 
+	if (flexbg_size) {
+		int prev_block = 0;
+		if (group && fs->group_desc[group-1].bg_block_bitmap)
+			prev_block = fs->group_desc[group-1].bg_block_bitmap;
+		start_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
+						 0, rem_grps);
+		last_blk = ext2fs_group_last_block(fs, last_grp);
+	}
+
 	if (!fs->group_desc[group].bg_block_bitmap) {
 		retval = ext2fs_get_free_blocks(fs, start_blk, last_blk,
 						1, bmap, &new_blk);
@@ -66,6 +139,21 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 			return retval;
 		ext2fs_mark_block_bitmap(bmap, new_blk);
 		fs->group_desc[group].bg_block_bitmap = new_blk;
+		if (flexbg_size) {
+			dgrp_t tmp = ext2fs_group_of_blk(fs, new_blk);
+			ext2fs_bgd_set_flex_meta_flag(fs, new_blk);
+			fs->group_desc[tmp].bg_free_blocks_count--;
+			fs->super->s_free_blocks_count--;
+		}
+	}
+
+	if (flexbg_size) {
+		int prev_block = 0;
+		if (group && fs->group_desc[group-1].bg_inode_bitmap)
+			prev_block = fs->group_desc[group-1].bg_inode_bitmap;
+		start_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
+						 flexbg_size, rem_grps);
+		last_blk = ext2fs_group_last_block(fs, last_grp);
 	}
 
 	if (!fs->group_desc[group].bg_inode_bitmap) {
@@ -78,11 +166,28 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 			return retval;
 		ext2fs_mark_block_bitmap(bmap, new_blk);
 		fs->group_desc[group].bg_inode_bitmap = new_blk;
+		if (flexbg_size) {
+			dgrp_t tmp = ext2fs_group_of_blk(fs, new_blk);
+			ext2fs_bgd_set_flex_meta_flag(fs, new_blk);
+			fs->group_desc[tmp].bg_free_blocks_count--;
+			fs->super->s_free_blocks_count--;
+		}
 	}
 
 	/*
 	 * Allocate the inode table
 	 */
+	if (flexbg_size) {
+		int prev_block = 0;
+		if (group && fs->group_desc[group-1].bg_inode_table)
+			prev_block = fs->group_desc[group-1].bg_inode_table;
+		group_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
+						 flexbg_size * 2,
+						 fs->inode_blocks_per_group *
+						 rem_grps);
+		last_blk = ext2fs_group_last_block(fs, last_grp);
+	}
+
 	if (!fs->group_desc[group].bg_inode_table) {
 		retval = ext2fs_get_free_blocks(fs, group_blk, last_blk,
 						fs->inode_blocks_per_group,
@@ -91,8 +196,15 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 			return retval;
 		for (j=0, blk = new_blk;
 		     j < fs->inode_blocks_per_group;
-		     j++, blk++)
+		     j++, blk++) {
 			ext2fs_mark_block_bitmap(bmap, blk);
+			if (flexbg_size) {
+				dgrp_t tmp = ext2fs_group_of_blk(fs, blk);
+				ext2fs_bgd_set_flex_meta_flag(fs, blk);
+				fs->group_desc[tmp].bg_free_blocks_count--;
+				fs->super->s_free_blocks_count--;
+			}
+		}
 		fs->group_desc[group].bg_inode_table = new_blk;
 	}
 
diff --git a/lib/ext2fs/closefs.c b/lib/ext2fs/closefs.c
index c8ef6ef..96b6af1 100644
--- a/lib/ext2fs/closefs.c
+++ b/lib/ext2fs/closefs.c
@@ -56,6 +56,7 @@ int ext2fs_super_and_bgd_loc(ext2_filsys fs,
 	unsigned int meta_bg, meta_bg_size;
 	blk_t	numblocks, old_desc_blocks;
 	int	has_super;
+	unsigned int flex_bg_size = 1 << fs->super->s_log_groups_per_flex;
 
 	group_block = ext2fs_group_first_block(fs, group);
 
@@ -99,8 +100,9 @@ int ext2fs_super_and_bgd_loc(ext2_filsys fs,
 			numblocks--;
 		}
 	}
-		
-	numblocks -= 2 + fs->inode_blocks_per_group;
+
+	if (!fs->super->s_log_groups_per_flex)
+		numblocks -= 2 + fs->inode_blocks_per_group;
 
 	if (ret_super_blk)
 		*ret_super_blk = super_blk;
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index a11e6be..a09bdd8 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -173,6 +173,7 @@ struct ext4_group_desc
 
 #define EXT2_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not initialized */
 #define EXT2_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not initialized */
+#define EXT2_BG_FLEX_METADATA	0x0004 /* FLEX_BG block group contains meta-data */
 
 /*
  * Data structures used by the directory indexing feature
@@ -555,7 +556,10 @@ struct ext2_super_block {
 	__u16   s_mmp_interval;         /* # seconds to wait in MMP checking */
 	__u64   s_mmp_block;            /* Block for multi-mount protection */
 	__u32   s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
-	__u32   s_reserved[163];        /* Padding to the end of the block */
+	__u8	s_log_groups_per_flex;	/* FLEX_BG group size */
+	__u8    s_reserved_char_pad;
+	__u16	s_reserved_pad;		/* Padding to next 32bits */
+	__u32   s_reserved[162];        /* Padding to the end of the block */
 };
 
 /*
diff --git a/lib/ext2fs/initialize.c b/lib/ext2fs/initialize.c
index a7f777b..464737d 100644
--- a/lib/ext2fs/initialize.c
+++ b/lib/ext2fs/initialize.c
@@ -156,6 +156,7 @@ errcode_t ext2fs_initialize(const char *name, int flags,
 	set_field(s_feature_incompat, 0);
 	set_field(s_feature_ro_compat, 0);
 	set_field(s_first_meta_bg, 0);
+	set_field(s_log_groups_per_flex, 0);
 	if (super->s_feature_incompat & ~EXT2_LIB_FEATURE_INCOMPAT_SUPP) {
 		retval = EXT2_ET_UNSUPP_FEATURE;
 		goto cleanup;
@@ -363,7 +364,10 @@ ipg_retry:
 	 * group, and fill in the correct group statistics for group.
 	 * Note that although the block bitmap, inode bitmap, and
 	 * inode table have not been allocated (and in fact won't be
-	 * by this routine), they are accounted for nevertheless.
+	 * by this routine), they are accounted for nevertheless.  If
+	 * FLEX_BG meta-data grouping is used, only account for the
+	 * superblock and group descriptors (the inode tables and
+	 * bitmaps will be accounted for when allocated).
 	 */
 	super->s_free_blocks_count = 0;
 	for (i = 0; i < fs->group_desc_count; i++) {
diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index 44f45aa..c043173 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -96,7 +96,7 @@ static void usage(void)
 {
 	fprintf(stderr, _("Usage: %s [-c|-t|-l filename] [-b block-size] "
 	"[-f fragment-size]\n\t[-i bytes-per-inode] [-I inode-size] "
-	"[-j] [-J journal-options]\n"
+	"[-j] [-J journal-options] [-G meta group size]\n"
 	"\t[-N number-of-inodes] [-m reserved-blocks-percentage] "
 	"[-o creator-os]\n\t[-g blocks-per-group] [-L volume-label] "
 	"[-M last-mounted-directory]\n\t[-O feature[,...]] "
@@ -464,6 +464,8 @@ static void setup_lazy_bg(ext2_filsys fs)
 				sb->s_free_inodes_count -= 
 					sb->s_inodes_per_group;
 			}
+			if ((bg->bg_flags & EXT2_BG_FLEX_METADATA))
+				continue;
 			blks = ext2fs_super_and_bgd_loc(fs, i, 0, 0, 0, 0);
 			if (bg->bg_free_blocks_count == blks) {
 				bg->bg_free_blocks_count = 0;
@@ -909,6 +911,7 @@ static void PRS(int argc, char *argv[])
 	int		blocksize = 0;
 	int		inode_ratio = 0;
 	int		inode_size = 0;
+	unsigned long	flex_bg_size = 0;
 	double		reserved_ratio = 5.0;
 	int		sector_size = 0;
 	int		show_version_only = 0;
@@ -991,7 +994,7 @@ static void PRS(int argc, char *argv[])
 	}
 
 	while ((c = getopt (argc, argv,
-		    "b:cf:g:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
+		    "b:cf:g:G:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
 		switch (c) {
 		case 'b':
 			blocksize = strtol(optarg, &tmp, 0);
@@ -1042,6 +1045,20 @@ static void PRS(int argc, char *argv[])
 				exit(1);
 			}
 			break;
+		case 'G':
+			flex_bg_size = strtoul(optarg, &tmp, 0);
+			if (*tmp) {
+				com_err(program_name, 0,
+					_("Illegal number for Flex_BG size"));
+				exit(1);
+			}
+			if (flex_bg_size < 2 ||
+			    (flex_bg_size & (flex_bg_size-1)) != 0) {
+				com_err(program_name, 0,
+					_("Flex_BG size must be a power of 2"));
+				exit(1);
+			}
+			break;
 		case 'i':
 			inode_ratio = strtoul(optarg, &tmp, 0);
 			if (inode_ratio < EXT2_MIN_BLOCK_SIZE ||
@@ -1437,6 +1454,10 @@ static void PRS(int argc, char *argv[])
 		}
 	}
 
+	if(flex_bg_size) {
+		fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);
+	}
+
 	if (!force && fs_param.s_blocks_count >= ((unsigned) 1 << 31)) {
 		com_err(program_name, 0,
 			_("Filesystem too large.  No more than 2**31-1 blocks\n"



-JRS

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG
  2008-02-07 17:09 [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG Jose R. Santos
@ 2008-02-08  5:11 ` Andreas Dilger
  2008-02-08 17:37   ` Jose R. Santos
  0 siblings, 1 reply; 15+ messages in thread
From: Andreas Dilger @ 2008-02-08  5:11 UTC (permalink / raw)
  To: Jose R. Santos; +Cc: Theodore Tso, linux-ext4

On Feb 07, 2008  11:09 -0600, Jose R. Santos wrote:
> +blk_t ext2fs_flexbg_offset(ext2_filsys fs, dgrp_t group, blk_t start_blk,
> +			   ext2fs_block_bitmap bmap, int offset, int size)
> +{

Can you add a comment on the intent of this function.  By the name it seems
to be calculating some offset relative to the flex group, but that doesn't
convey anything about searching for free blocks???

> +	/*
> +	 * Dont do a long search if the previous block
> +	 * search is still valid.
> +	 */

What condition here makse the previous search still valid?

> +	if (start_blk && group % flexbg_size) {
> +		if (size > flexbg_size)
> +			elem_size = fs->inode_blocks_per_group;
> +		else
> +			elem_size = 1;
> +		if (ext2fs_test_block_bitmap_range(bmap, start_blk + elem_size,
> +						   size))
> +			return start_blk + elem_size;
> +	}
> +
> +	start_blk = ext2fs_group_first_block(fs, flexbg_size * flexbg);
> +	last_grp = (group + (flexbg_size - (group % flexbg_size) - 1));

This is a confusing calculation...  Is it trying to find the last group
in the flex group that "group" is part of?  I.e. round "group" to the
end of the flex group?  Since flexbg_size is a power-of-two value, you
could just do "last_grp = group | (flexbg_size - 1)"?

> +	last_blk = ext2fs_group_last_block(fs, last_grp);
> +	if (last_grp > fs->group_desc_count)
> +		last_grp = fs->group_desc_count;

Doesn't it make more sense to calculate "last_blk" AFTER limiting
"last_grp" to the actual number of groups in the filesystem?

> +	/* Find the first available block */
> +	if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap,
> +				   &first_free))
> +		return first_free;
> +
> +	if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size,
> +				   bmap, &first_free))
> +		return first_free;

I don't quite understand this either?  The first search is looking for a
single free block between "start_blk" and "last_blk", while the second
search is looking for "size" free blocks between "first_free + offset"
and "last_blk".  What is the reason to do the second search after doing
the first one, or alternately just doing the second one directly?

Should both of these calls actually be saving the error return value and
returning that to a caller (returning first_free via a pointer arg like
ext2fs_get_free_blocks() does)?

>  errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
>  				      ext2fs_block_bitmap bmap)
>  {
>  	if (!bmap)
>  		bmap = fs->block_map;
> +
> +	if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
> +				       EXT4_FEATURE_INCOMPAT_FLEX_BG)) {

No space between ..._FEATURE and "(".

> +		flexbg_size = 1 << fs->super->s_log_groups_per_flex;
> +		rem_grps = flexbg_size - (group % flexbg_size);
> +		last_grp = group + rem_grps - 1;

Could this be written as:

		last_grp = group | (flexbg_size - 1);
		rem_grps = last_grp - group;

I'm also trying to see if you have an off-by-one error in your version?
Consider the case of group = 5, flexbg_size = 4.  That gives us with my calc:

		last_grp = 5 | (4 - 1) = 7;
		rem_grps = 7 - 5 = 2;

This makes sense, since group 5 is in the second flexbg [4,7], and there
are 2 groups remaining after group 5 in that flexbg.

> @@ -56,6 +120,15 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs,
>  	} else
>  		start_blk = group_blk;
>  
> +	if (flexbg_size) {
> +		int prev_block = 0;
> +		if (group && fs->group_desc[group-1].bg_block_bitmap)
> +			prev_block = fs->group_desc[group-1].bg_block_bitmap;
> +		start_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
> +						 0, rem_grps);
> +		last_blk = ext2fs_group_last_block(fs, last_grp);
> +	}

This is presumably trying to allocate the block bitmap for "group" to be
after the block bitmap for "group - 1" (allocated in an earlier call).

> +		if (group && fs->group_desc[group-1].bg_inode_bitmap)
> +			prev_block = fs->group_desc[group-1].bg_inode_bitmap;
> +		start_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
> +						 flexbg_size, rem_grps);
> +		last_blk = ext2fs_group_last_block(fs, last_grp);
>  	}

This is allocating the inode bitmap in the same manner.  Would it make more
sense to change the mke2fs code to allocate all of the block bitmaps first
(well, after the sb+gdt backups), then all of the inode bitmaps, and finally
all of the inode tables?

>  #define EXT2_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not initialized */
>  #define EXT2_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not initialized */
> +#define EXT2_BG_FLEX_METADATA	0x0004 /* FLEX_BG block group contains meta-data */

Hrm, I thought I had reserved that value in the uninit_groups patch?
+#define EXT3_BG_INODE_ZEROED   0x0004  /* On-disk itable initialized to zero */

The kernel and e2fsck can use that to determine if the inode table was
zeroed out at mke2fs time (i.e. formatted with LAZY_BG or not).  Then
the kernel can zero out the entire inode table and set INODE_ZEROED lazily
some time after mount, instead of mke2fs doing it at mke2fs time sucking
up all of the node's RAM and making it very slow.

This is still needed at some point to avoid the problem of e2fsck trying
to salvage ancient inodes if the group descriptor is corrupted for some
reason and the inode high watermark is lost.

Not implemented yet... but intended to be done by a helper thread started
at mount time to do GDT/bitmap/itable checksum validation, create the
mballoc buddy buffers, prefetch metadata, etc.

> @@ -96,7 +96,7 @@ static void usage(void)
>  {
>  	fprintf(stderr, _("Usage: %s [-c|-t|-l filename] [-b block-size] "
>  	"[-f fragment-size]\n\t[-i bytes-per-inode] [-I inode-size] "
> -	"[-j] [-J journal-options]\n"
> +	"[-j] [-J journal-options] [-G meta group size]\n"
>  	"\t[-N number-of-inodes] [-m reserved-blocks-percentage] "
>  	"[-o creator-os]\n\t[-g blocks-per-group] [-L volume-label] "
>  	"[-M last-mounted-directory]\n\t[-O feature[,...]] "

This also needs an update to the mke2fs.8.in man page.

> +	if(flex_bg_size) {
> +		fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);
> +	}

Whitespace, braces:

	if (flex_bg_size)
		fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);


Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG
  2008-02-08  5:11 ` Andreas Dilger
@ 2008-02-08 17:37   ` Jose R. Santos
  2008-02-11  4:33     ` Theodore Tso
  0 siblings, 1 reply; 15+ messages in thread
From: Jose R. Santos @ 2008-02-08 17:37 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Theodore Tso, linux-ext4

On Fri, 08 Feb 2008 00:11:16 -0500
Andreas Dilger <adilger@sun.com> wrote:

> On Feb 07, 2008  11:09 -0600, Jose R. Santos wrote:
> > +blk_t ext2fs_flexbg_offset(ext2_filsys fs, dgrp_t group, blk_t start_blk,
> > +			   ext2fs_block_bitmap bmap, int offset, int size)
> > +{
> 
> Can you add a comment on the intent of this function.  By the name it seems
> to be calculating some offset relative to the flex group, but that doesn't
> convey anything about searching for free blocks???

I will add a comment.  The function calculates where the search of
bitmaps/inode tables for a give block group starts by returning a block
number where all of the bitmaps/inode tables can be allocated in a
contiguous fashion.  The search for free blocks is needed determine
where within the flex group we can allocated the meta-data.

> 
> > +	/*
> > +	 * Dont do a long search if the previous block
> > +	 * search is still valid.
> > +	 */
> 
> What condition here makse the previous search still valid?

We pass the previous allocation as an argument to the function.  If the
is enough space to allocate the rest of the inode tables after the
previous allocation, then no need to do a search.  There are two
reasons why this is done.

1) If the size of the of a flexbg is big enough, searching for
inode_blocks_per_group * flexbg becomes very expensive if there happens
to be some blocks in the middle of where we started the search.  This
easy happens if the size of all the inode tables in a flex group are
larger than a single block group.  If the next block group has super
block backups or meta_bg blocks ext2fs_get_free_blocks() becomes very
expensive.  If we have to do such an expensive search, better do it
once.  This is also a problem when resizing since there is no telling
what block usage of those last groups would be.

2) This avoids having inode_tables or bitmaps being allocated out of
order.  The search for very large blocks can leave empty space at the
begining of a flex group.  The search for the last groups in the flex
group could actually be place in these smaller empty blocks which would
put things out of order.

> > +	if (start_blk && group % flexbg_size) {
> > +		if (size > flexbg_size)
> > +			elem_size = fs->inode_blocks_per_group;
> > +		else
> > +			elem_size = 1;
> > +		if (ext2fs_test_block_bitmap_range(bmap, start_blk + elem_size,
> > +						   size))
> > +			return start_blk + elem_size;
> > +	}
> > +
> > +	start_blk = ext2fs_group_first_block(fs, flexbg_size * flexbg);
> > +	last_grp = (group + (flexbg_size - (group % flexbg_size) - 1));
> 
> This is a confusing calculation...  Is it trying to find the last group
> in the flex group that "group" is part of?  I.e. round "group" to the
> end of the flex group?  Since flexbg_size is a power-of-two value, you
> could just do "last_grp = group | (flexbg_size - 1)"?

Yes, I will fix that.
 
> > +	last_blk = ext2fs_group_last_block(fs, last_grp);
> > +	if (last_grp > fs->group_desc_count)
> > +		last_grp = fs->group_desc_count;
> 
> Doesn't it make more sense to calculate "last_blk" AFTER limiting
> "last_grp" to the actual number of groups in the filesystem?

Ops...  Thanks for catching.
 
> > +	/* Find the first available block */
> > +	if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap,
> > +				   &first_free))
> > +		return first_free;
> > +
> > +	if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size,
> > +				   bmap, &first_free))
> > +		return first_free;
> 
> I don't quite understand this either?  The first search is looking for a
> single free block between "start_blk" and "last_blk", while the second
> search is looking for "size" free blocks between "first_free + offset"
> and "last_blk".  What is the reason to do the second search after doing
> the first one, or alternately just doing the second one directly?

Because the second search starts from the first free block + an
offset.  This is used in order to have space to allocate each group of
inode/block bitmaps and inode tables contiguously.  Cant do the second
search directly without knowing where I should start the search.

> Should both of these calls actually be saving the error return value and
> returning that to a caller (returning first_free via a pointer arg like
> ext2fs_get_free_blocks() does)?

Failure to find a contiguous set of blocks for all the groups meta-data
is not fatal since we don't allocate here.  This function just helps
ext2fs_allocate_group_table() find where is should start its search. I
leet ext2fs_allocate_group_table() do the error handling if it truly
can find enough space for the allocation. 

> >  errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
> >  				      ext2fs_block_bitmap bmap)
> >  {
> >  	if (!bmap)
> >  		bmap = fs->block_map;
> > +
> > +	if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
> > +				       EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
> 
> No space between ..._FEATURE and "(".

Will fix.
 
> > +		flexbg_size = 1 << fs->super->s_log_groups_per_flex;
> > +		rem_grps = flexbg_size - (group % flexbg_size);
> > +		last_grp = group + rem_grps - 1;
> 
> Could this be written as:
> 
> 		last_grp = group | (flexbg_size - 1);
> 		rem_grps = last_grp - group;

Will fix.

> I'm also trying to see if you have an off-by-one error in your version?
> Consider the case of group = 5, flexbg_size = 4.  That gives us with my calc:
> 
> 		last_grp = 5 | (4 - 1) = 7;
> 		rem_grps = 7 - 5 = 2;
> 
> This makes sense, since group 5 is in the second flexbg [4,7], and there
> are 2 groups remaining after group 5 in that flexbg.

I think you're right.  Thanks for catching.
 
> > @@ -56,6 +120,15 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs,
> >  	} else
> >  		start_blk = group_blk;
> >  
> > +	if (flexbg_size) {
> > +		int prev_block = 0;
> > +		if (group && fs->group_desc[group-1].bg_block_bitmap)
> > +			prev_block = fs->group_desc[group-1].bg_block_bitmap;
> > +		start_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
> > +						 0, rem_grps);
> > +		last_blk = ext2fs_group_last_block(fs, last_grp);
> > +	}
> 
> This is presumably trying to allocate the block bitmap for "group" to be
> after the block bitmap for "group - 1" (allocated in an earlier call).
> 
> > +		if (group && fs->group_desc[group-1].bg_inode_bitmap)
> > +			prev_block = fs->group_desc[group-1].bg_inode_bitmap;
> > +		start_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
> > +						 flexbg_size, rem_grps);
> > +		last_blk = ext2fs_group_last_block(fs, last_grp);
> >  	}
> 
> This is allocating the inode bitmap in the same manner.  Would it make more
> sense to change the mke2fs code to allocate all of the block bitmaps first
> (well, after the sb+gdt backups), then all of the inode bitmaps, and finally
> all of the inode tables?

This is how I originally created the patch but since it was handle
outside ext2_allocate_table, it would mean that it could not be used
for resize2fs.  I can be done from inside ext2_allocate_table() but it
seems wrong to allocate all of the groups in a flexbg when we expect
that this function only does it for one single group.

> >  #define EXT2_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not initialized */
> >  #define EXT2_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not initialized */
> > +#define EXT2_BG_FLEX_METADATA	0x0004 /* FLEX_BG block group contains meta-data */
> 
> Hrm, I thought I had reserved that value in the uninit_groups patch?
> +#define EXT3_BG_INODE_ZEROED   0x0004  /* On-disk itable initialized to zero */

I may have been, I just based the patch on the next branch as Ted had
ask for new e2fsprog patches.  The uninit group patch was not part of
the next branch when I pulled.

> The kernel and e2fsck can use that to determine if the inode table was
> zeroed out at mke2fs time (i.e. formatted with LAZY_BG or not).  Then
> the kernel can zero out the entire inode table and set INODE_ZEROED lazily
> some time after mount, instead of mke2fs doing it at mke2fs time sucking
> up all of the node's RAM and making it very slow.
> 
> This is still needed at some point to avoid the problem of e2fsck trying
> to salvage ancient inodes if the group descriptor is corrupted for some
> reason and the inode high watermark is lost.
> 
> Not implemented yet... but intended to be done by a helper thread started
> at mount time to do GDT/bitmap/itable checksum validation, create the
> mballoc buddy buffers, prefetch metadata, etc.
> 
> > @@ -96,7 +96,7 @@ static void usage(void)
> >  {
> >  	fprintf(stderr, _("Usage: %s [-c|-t|-l filename] [-b block-size] "
> >  	"[-f fragment-size]\n\t[-i bytes-per-inode] [-I inode-size] "
> > -	"[-j] [-J journal-options]\n"
> > +	"[-j] [-J journal-options] [-G meta group size]\n"
> >  	"\t[-N number-of-inodes] [-m reserved-blocks-percentage] "
> >  	"[-o creator-os]\n\t[-g blocks-per-group] [-L volume-label] "
> >  	"[-M last-mounted-directory]\n\t[-O feature[,...]] "
> 
> This also needs an update to the mke2fs.8.in man page.

Yes it does.
 
> > +	if(flex_bg_size) {
> > +		fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);
> > +	}
> 
> Whitespace, braces:

Will fix.

> 
> 	if (flex_bg_size)
> 		fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);
> 
> 
> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
> 

Thanks for the feedback.

-JRS

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG
  2008-02-08 17:37   ` Jose R. Santos
@ 2008-02-11  4:33     ` Theodore Tso
  2008-02-11 15:05       ` Jose R. Santos
  0 siblings, 1 reply; 15+ messages in thread
From: Theodore Tso @ 2008-02-11  4:33 UTC (permalink / raw)
  To: Jose R. Santos; +Cc: Andreas Dilger, linux-ext4

On Fri, Feb 08, 2008 at 11:37:40AM -0600, Jose R. Santos wrote:
> > >  #define EXT2_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not initialized */
> > >  #define EXT2_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not initialized */
> > > +#define EXT2_BG_FLEX_METADATA	0x0004 /* FLEX_BG block group contains meta-data */
> > 
> > Hrm, I thought I had reserved that value in the uninit_groups patch?
> > +#define EXT3_BG_INODE_ZEROED   0x0004  /* On-disk itable initialized to zero */
> 
> I may have been, I just based the patch on the next branch as Ted had
> ask for new e2fsprog patches.  The uninit group patch was not part of
> the next branch when I pulled.

Yes, but whenever you start reserving code points that impact the
on-disk format, you need to be careful and coordinate.  Exactly is the
purpose of this flag, and why is it here?

And I don't see any patch in the kernel patch queue that uses this
flag.  Is this intended for internal use inside e2fsprogs?  If so,
this might not be the best place for it.....

     	       	      	   	     - Ted

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG
  2008-02-11  4:33     ` Theodore Tso
@ 2008-02-11 15:05       ` Jose R. Santos
  0 siblings, 0 replies; 15+ messages in thread
From: Jose R. Santos @ 2008-02-11 15:05 UTC (permalink / raw)
  To: Theodore Tso; +Cc: Andreas Dilger, linux-ext4

On Sun, 10 Feb 2008 23:33:51 -0500
Theodore Tso <tytso@MIT.EDU> wrote:

> On Fri, Feb 08, 2008 at 11:37:40AM -0600, Jose R. Santos wrote:
> > > >  #define EXT2_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not initialized */
> > > >  #define EXT2_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not initialized */
> > > > +#define EXT2_BG_FLEX_METADATA	0x0004 /* FLEX_BG block group contains meta-data */
> > > 
> > > Hrm, I thought I had reserved that value in the uninit_groups patch?
> > > +#define EXT3_BG_INODE_ZEROED   0x0004  /* On-disk itable initialized to zero */
> > 
> > I may have been, I just based the patch on the next branch as Ted had
> > ask for new e2fsprog patches.  The uninit group patch was not part of
> > the next branch when I pulled.
> 
> Yes, but whenever you start reserving code points that impact the
> on-disk format, you need to be careful and coordinate.  Exactly is the
> purpose of this flag, and why is it here?

Will fix.

> And I don't see any patch in the kernel patch queue that uses this
> flag.  Is this intended for internal use inside e2fsprogs?  If so,
> this might not be the best place for it.....
> 
>      	       	      	   	     - Ted

Currently, this is only used in e2fsprogs to determine which groups to
avoid when setting the EXT2_BG_BLOCK_UNINIT.  It will be use on
ext4_init_block_bitmap() to return the right number of free block when
a block group does not have any meta-data in it.  Eventually, it would
be nice to accurately and efficiently calculate the number of meta data
block used for a flexbg and be able to have these block groups
uninitialized as well.  This flag will be use to determine which groups
need to have their meta data block usage calculated.

-JRS

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH][e2fsprogs] New bitmap and inode table allocation for FLEX_BG
@ 2008-04-01  3:13 Jose R. Santos
  2008-04-03 13:12 ` Theodore Tso
  0 siblings, 1 reply; 15+ messages in thread
From: Jose R. Santos @ 2008-04-01  3:13 UTC (permalink / raw)
  To: Theodore Ts'o, linux-ext4

From: Jose R. Santos <jrs@us.ibm.com>

New bitmap and inode table allocation for FLEX_BG

Change the way we allocate bitmaps and inode tables if the FLEX_BG
feature is used at mke2fs time.  It places calculates a new offset for
bitmaps and inode table base on the number of groups that the user
wishes to pack together using the new "-G" option.  Creating a
filesystem with 64 block groups in a flex group can be done by:

mke2fs -j -I 256 -O flex_bg -G 32 /dev/sdX

Signed-off-by: Jose R. Santos <jrs@us.ibm.com>
Signed-off-by: Valerie Clement <valerie.clement@bull.net>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
--

 lib/ext2fs/alloc_tables.c |  110 ++++++++++++++++++++++++++++++++++++++++++++-
 lib/ext2fs/ext2_fs.h      |    5 ++
 lib/ext2fs/initialize.c   |    9 +++-
 misc/mke2fs.8.in          |   15 ++++++
 misc/mke2fs.c             |   51 +++++++++++++++++++--
 5 files changed, 181 insertions(+), 9 deletions(-)


diff --git a/lib/ext2fs/alloc_tables.c b/lib/ext2fs/alloc_tables.c
index 9b4f0e5..c9f5e59 100644
--- a/lib/ext2fs/alloc_tables.c
+++ b/lib/ext2fs/alloc_tables.c
@@ -27,18 +27,79 @@
 #include "ext2_fs.h"
 #include "ext2fs.h"
 
+/*
+ * This routine searches for free blocks that can allocate a full
+ * group of bitmaps or inode tables for a flexbg group.  Returns the
+ * block number with a correct offset were the bitmaps and inode
+ * tables can be allocated continously and in order.
+ */
+static blk_t flexbg_offset(ext2_filsys fs, dgrp_t group, blk_t start_blk,
+			   ext2fs_block_bitmap bmap, int offset, int size)
+{
+	int		flexbg, flexbg_size, elem_size;
+	blk_t		last_blk, first_free = 0;
+	dgrp_t	       	last_grp;
+
+	flexbg_size = 1 << fs->super->s_log_groups_per_flex;
+	flexbg = group / flexbg_size;
+
+	if (size > fs->super->s_blocks_per_group / 8)
+		size = fs->super->s_blocks_per_group / 8;
+
+	/*
+	 * Dont do a long search if the previous block
+	 * search is still valid.
+	 */
+	if (start_blk && group % flexbg_size) {
+		if (size > flexbg_size)
+			elem_size = fs->inode_blocks_per_group;
+		else
+			elem_size = 1;
+		if (ext2fs_test_block_bitmap_range(bmap, start_blk + elem_size,
+						   size))
+			return start_blk + elem_size;
+	}
+
+	start_blk = ext2fs_group_first_block(fs, flexbg_size * flexbg);
+	last_grp = group | (flexbg_size - 1);
+	if (last_grp > fs->group_desc_count)
+		last_grp = fs->group_desc_count;
+	last_blk = ext2fs_group_last_block(fs, last_grp);
+
+	/* Find the first available block */
+	if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap,
+				   &first_free))
+		return first_free;
+
+	if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size,
+				   bmap, &first_free))
+		return first_free;
+
+	return first_free;
+}
+
 errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 				      ext2fs_block_bitmap bmap)
 {
 	errcode_t	retval;
 	blk_t		group_blk, start_blk, last_blk, new_blk, blk;
-	int		j;
+	dgrp_t		last_grp;
+	int		j, rem_grps, flexbg_size = 0;
 
 	group_blk = ext2fs_group_first_block(fs, group);
 	last_blk = ext2fs_group_last_block(fs, group);
 
 	if (!bmap)
 		bmap = fs->block_map;
+
+	if (EXT2_HAS_INCOMPAT_FEATURE(fs->super,
+				       EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+		flexbg_size = 1 << fs->super->s_log_groups_per_flex;
+		last_grp = group | (flexbg_size - 1);
+		rem_grps = last_grp - group;
+		if (last_grp > fs->group_desc_count)
+			last_grp = fs->group_desc_count;
+	}
 	
 	/*
 	 * Allocate the block and inode bitmaps, if necessary
@@ -56,6 +117,15 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 	} else
 		start_blk = group_blk;
 
+	if (flexbg_size) {
+		int prev_block = 0;
+		if (group && fs->group_desc[group-1].bg_block_bitmap)
+			prev_block = fs->group_desc[group-1].bg_block_bitmap;
+		start_blk = flexbg_offset(fs, group, prev_block, bmap,
+						 0, rem_grps);
+		last_blk = ext2fs_group_last_block(fs, last_grp);
+	}
+
 	if (!fs->group_desc[group].bg_block_bitmap) {
 		retval = ext2fs_get_free_blocks(fs, start_blk, last_blk,
 						1, bmap, &new_blk);
@@ -66,6 +136,20 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 			return retval;
 		ext2fs_mark_block_bitmap(bmap, new_blk);
 		fs->group_desc[group].bg_block_bitmap = new_blk;
+		if (flexbg_size) {
+			dgrp_t tmp = ext2fs_group_of_blk(fs, new_blk);
+			fs->group_desc[tmp].bg_free_blocks_count--;
+			fs->super->s_free_blocks_count--;
+		}
+	}
+
+	if (flexbg_size) {
+		int prev_block = 0;
+		if (group && fs->group_desc[group-1].bg_inode_bitmap)
+			prev_block = fs->group_desc[group-1].bg_inode_bitmap;
+		start_blk = flexbg_offset(fs, group, prev_block, bmap,
+						 flexbg_size, rem_grps);
+		last_blk = ext2fs_group_last_block(fs, last_grp);
 	}
 
 	if (!fs->group_desc[group].bg_inode_bitmap) {
@@ -78,11 +162,27 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 			return retval;
 		ext2fs_mark_block_bitmap(bmap, new_blk);
 		fs->group_desc[group].bg_inode_bitmap = new_blk;
+		if (flexbg_size) {
+			dgrp_t tmp = ext2fs_group_of_blk(fs, new_blk);
+			fs->group_desc[tmp].bg_free_blocks_count--;
+			fs->super->s_free_blocks_count--;
+		}
 	}
 
 	/*
 	 * Allocate the inode table
 	 */
+	if (flexbg_size) {
+		int prev_block = 0;
+		if (group && fs->group_desc[group-1].bg_inode_table)
+			prev_block = fs->group_desc[group-1].bg_inode_table;
+		group_blk = flexbg_offset(fs, group, prev_block, bmap,
+						 flexbg_size * 2,
+						 fs->inode_blocks_per_group *
+						 rem_grps);
+		last_blk = ext2fs_group_last_block(fs, last_grp);
+	}
+
 	if (!fs->group_desc[group].bg_inode_table) {
 		retval = ext2fs_get_free_blocks(fs, group_blk, last_blk,
 						fs->inode_blocks_per_group,
@@ -91,8 +191,14 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
 			return retval;
 		for (j=0, blk = new_blk;
 		     j < fs->inode_blocks_per_group;
-		     j++, blk++)
+		     j++, blk++) {
 			ext2fs_mark_block_bitmap(bmap, blk);
+			if (flexbg_size) {
+				dgrp_t tmp = ext2fs_group_of_blk(fs, blk);
+				fs->group_desc[tmp].bg_free_blocks_count--;
+				fs->super->s_free_blocks_count--;
+			}
+		}
 		fs->group_desc[group].bg_inode_table = new_blk;
 	}
 	ext2fs_group_desc_csum_set(fs, group);
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index 444211d..d4731f8 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -563,7 +563,10 @@ struct ext2_super_block {
 	__u16   s_mmp_interval;         /* # seconds to wait in MMP checking */
 	__u64   s_mmp_block;            /* Block for multi-mount protection */
 	__u32   s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
-	__u32   s_reserved[163];        /* Padding to the end of the block */
+	__u8	s_log_groups_per_flex;	/* FLEX_BG group size */
+	__u8    s_reserved_char_pad;
+	__u16	s_reserved_pad;		/* Padding to next 32bits */
+	__u32   s_reserved[162];        /* Padding to the end of the block */
 };
 
 /*
diff --git a/lib/ext2fs/initialize.c b/lib/ext2fs/initialize.c
index c2e00e8..68cfbd8 100644
--- a/lib/ext2fs/initialize.c
+++ b/lib/ext2fs/initialize.c
@@ -158,6 +158,7 @@ errcode_t ext2fs_initialize(const char *name, int flags,
 	set_field(s_first_meta_bg, 0);
 	set_field(s_raid_stride, 0);		/* default stride size: 0 */
 	set_field(s_raid_stripe_width, 0);	/* default stripe width: 0 */
+	set_field(s_log_groups_per_flex, 0);
 	set_field(s_flags, 0);
 	if (super->s_feature_incompat & ~EXT2_LIB_FEATURE_INCOMPAT_SUPP) {
 		retval = EXT2_ET_UNSUPP_FEATURE;
@@ -366,12 +367,16 @@ ipg_retry:
 	 * group, and fill in the correct group statistics for group.
 	 * Note that although the block bitmap, inode bitmap, and
 	 * inode table have not been allocated (and in fact won't be
-	 * by this routine), they are accounted for nevertheless.
+	 * by this routine), they are accounted for nevertheless.  If
+	 * FLEX_BG meta-data grouping is used, only account for the
+	 * superblock and group descriptors (the inode tables and
+	 * bitmaps will be accounted for when allocated).
 	 */
 	super->s_free_blocks_count = 0;
 	for (i = 0; i < fs->group_desc_count; i++) {
 		numblocks = ext2fs_reserve_super_and_bgd(fs, i, fs->block_map);
-
+		if (fs->super->s_log_groups_per_flex)
+			numblocks += 2 + fs->inode_blocks_per_group;
 		super->s_free_blocks_count += numblocks;
 		fs->group_desc[i].bg_free_blocks_count = numblocks;
 		fs->group_desc[i].bg_free_inodes_count =
diff --git a/misc/mke2fs.8.in b/misc/mke2fs.8.in
index a32c34a..9cc3895 100644
--- a/misc/mke2fs.8.in
+++ b/misc/mke2fs.8.in
@@ -26,6 +26,10 @@ mke2fs \- create an ext2/ext3 filesystem
 .I blocks-per-group
 ]
 [
+.B \-G
+.I number-of-groups
+]
+[
 .B \-i
 .I bytes-per-inode
 ]
@@ -232,6 +236,12 @@ option rather than manipulating the number of blocks per group.)
 This option is generally used by developers who
 are developing test cases.  
 .TP
+.BI \-G " number-of-groups"
+Specify the number of block goups that will be packed together to
+create one large virtual block group on an ext4 filesystem.  This
+improves meta-data locality and performance on meta-data heavy
+workloads.  The number of goups must be a power of 2.
+.TP
 .BI \-i " bytes-per-inode"
 Specify the bytes/inode ratio. 
 .B mke2fs
@@ -425,6 +435,11 @@ Use hashed b-trees to speed up lookups in large directories.
 .B filetype
 Store file type information in directory entries.
 .TP
+.B flex_bg
+Allow bitmaps and inode tables for a block group to be placed anywhere
+on the storage media (use with -G option to group meta-data in order
+to create a large virtual block group).
+.TP
 .B has_journal
 Create an ext3 journal (as if using the
 .TP
diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index 3fd2e5d..7787341 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -98,8 +98,9 @@ static void usage(void)
 	fprintf(stderr, _("Usage: %s [-c|-l filename] [-b block-size] "
 	"[-f fragment-size]\n\t[-i bytes-per-inode] [-I inode-size] "
 	"[-J journal-options]\n"
-	"\t[-N number-of-inodes] [-m reserved-blocks-percentage] "
-	"[-o creator-os]\n\t[-g blocks-per-group] [-L volume-label] "
+	"\t[-G meta group size] [-N number-of-inodes]\n"
+	"\t[-m reserved-blocks-percentage] [-o creator-os]\n"
+	"\t[-g blocks-per-group] [-L volume-label] "
 	"[-M last-mounted-directory]\n\t[-O feature[,...]] "
 	"[-r fs-revision] [-E extended-option[,...]]\n"
 	"\t[-T fs-type] [-jnqvFSV] device [blocks-count]\n"),
@@ -448,6 +449,30 @@ static void write_inode_tables(ext2_filsys fs)
 	progress_close(&progress);
 }
 
+static blk64_t expected_free_block_count(ext2_filsys fs, dgrp_t group)
+{
+	blk64_t tmp;
+	blk64_t blks;
+
+	blks = ext2fs_super_and_bgd_loc(fs, group, 0, 0, 0, 0);
+
+	if (EXT2_HAS_INCOMPAT_FEATURE(fs->super,
+				      EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+		tmp = fs->group_desc[group].bg_block_bitmap;
+		if (group != ext2fs_group_of_blk(fs, tmp))
+			blks++;
+
+		tmp = fs->group_desc[group].bg_inode_bitmap;
+		if (group != ext2fs_group_of_blk(fs, tmp))
+			blks++;
+
+		tmp = fs->group_desc[group].bg_inode_table;
+		if (group != ext2fs_group_of_blk(fs, tmp))
+			blks +=  fs->inode_blocks_per_group;
+	}
+	return blks;
+}
+
 static void setup_lazy_bg(ext2_filsys fs)
 {
 	dgrp_t i;
@@ -481,7 +506,7 @@ static void setup_lazy_bg(ext2_filsys fs)
 			    i == fs->group_desc_count - 1)
 				continue;
 
-			blks = ext2fs_super_and_bgd_loc(fs, i, 0, 0, 0, 0);
+			blks = expected_free_block_count(fs, i);
 			if (bg->bg_free_blocks_count == blks &&
 			    bg->bg_flags & EXT2_BG_INODE_UNINIT) {
 				bg->bg_flags |= EXT2_BG_BLOCK_UNINIT;
@@ -1151,6 +1176,7 @@ static void PRS(int argc, char *argv[])
 	int		blocksize = 0;
 	int		inode_ratio = 0;
 	int		inode_size = 0;
+	unsigned long	flex_bg_size = 0;
 	double		reserved_ratio = 5.0;
 	int		sector_size = 0;
 	int		show_version_only = 0;
@@ -1234,7 +1260,7 @@ static void PRS(int argc, char *argv[])
 	}
 
 	while ((c = getopt (argc, argv,
-		    "b:cf:g:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
+		    "b:cf:g:G:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
 		switch (c) {
 		case 'b':
 			blocksize = strtol(optarg, &tmp, 0);
@@ -1285,6 +1311,20 @@ static void PRS(int argc, char *argv[])
 				exit(1);
 			}
 			break;
+		case 'G':
+			flex_bg_size = strtoul(optarg, &tmp, 0);
+			if (*tmp) {
+				com_err(program_name, 0,
+					_("Illegal number for Flex_BG size"));
+				exit(1);
+			}
+			if (flex_bg_size < 2 ||
+			    (flex_bg_size & (flex_bg_size-1)) != 0) {
+				com_err(program_name, 0,
+					_("Flex_BG size must be a power of 2"));
+				exit(1);
+			}
+			break;
 		case 'i':
 			inode_ratio = strtoul(optarg, &tmp, 0);
 			if (inode_ratio < EXT2_MIN_BLOCK_SIZE ||
@@ -1685,6 +1725,9 @@ static void PRS(int argc, char *argv[])
 		}
 	}
 
+	if (flex_bg_size)
+		fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);
+
 	if (inode_size == 0) {
 		inode_size = get_int_from_profile(fs_types, "inode_size", 0);
 


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH][e2fsprogs] New bitmap and inode table allocation for FLEX_BG
  2008-04-01  3:13 [PATCH][e2fsprogs] " Jose R. Santos
@ 2008-04-03 13:12 ` Theodore Tso
  2008-04-03 14:28   ` Jose R. Santos
  0 siblings, 1 reply; 15+ messages in thread
From: Theodore Tso @ 2008-04-03 13:12 UTC (permalink / raw)
  To: Jose R. Santos; +Cc: linux-ext4

On Mon, Mar 31, 2008 at 10:13:11PM -0500, Jose R. Santos wrote:
> From: Jose R. Santos <jrs@us.ibm.com>
> 
> New bitmap and inode table allocation for FLEX_BG
> 
> Change the way we allocate bitmaps and inode tables if the FLEX_BG
> feature is used at mke2fs time.  It places calculates a new offset for
> bitmaps and inode table base on the number of groups that the user
> wishes to pack together using the new "-G" option.  Creating a
> filesystem with 64 block groups in a flex group can be done by:

I was just testing out this patch, and it looks like it creates a
filesystem with an incorrect free blocks count when creating a
filesystem with both uninit_groups and flex_bg.  Can you look at this,
please?

						- Ted

<tytso.root@closure> {/usr/projects/e2fsprogs/e2fsprogs/build/misc}, level 2  [js/flex-bg]
550# ./mke2fs -O uninit_groups,flex_bg /dev/closure/testext4 
mke2fs 1.40.8 (13-Mar-2008)
Filesystem label=
OS type: Linux
Block size=4096 (log=2)
Fragment size=4096 (log=2)
65536 inodes, 262144 blocks
13107 blocks (5.00%) reserved for the super user
First data block=0
Maximum filesystem blocks=268435456
8 block groups
32768 blocks per group, 32768 fragments per group
8192 inodes per group
Superblock backups stored on blocks: 
        32768, 98304, 163840, 229376

Writing inode tables: done                            
Writing superblocks and filesystem accounting information: done

This filesystem will be automatically checked every 27 mounts or
180 days, whichever comes first.  Use tune2fs -c or -i to override.
<tytso.root@closure> {/usr/projects/e2fsprogs/e2fsprogs/build/misc}, level 2  [js/flex-bg]
551# ../e2fsck/e2fsck -fy /dev/closure/testext4
e2fsck 1.40.8 (13-Mar-2008)
Pass 1: Checking inodes, blocks, and sizes
Pass 2: Checking directory structure
Pass 3: Checking directory connectivity
Pass 4: Checking reference counts
Pass 5: Checking group summary information
Free blocks count wrong for group #0 (31669, counted=32183).
Fix? yes

Free blocks count wrong for group #1 (31675, counted=32189).
Fix? yes

Free blocks count wrong for group #2 (31740, counted=32254).
Fix? yes

Free blocks count wrong for group #3 (31675, counted=32189).
Fix? yes

Free blocks count wrong for group #4 (31740, counted=32254).
Fix? yes

Free blocks count wrong for group #5 (31675, counted=32189).
Fix? yes

Free blocks count wrong for group #6 (31740, counted=32254).
Fix? yes

Free blocks count wrong for group #7 (31675, counted=32189).
Fix? yes

Free blocks count wrong (253589, counted=257701).
Fix? yes


/dev/closure/testext4: ***** FILE SYSTEM WAS MODIFIED *****
/dev/closure/testext4: 11/65536 files (9.1% non-contiguous), 4443/262144 blocks


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH][e2fsprogs] New bitmap and inode table allocation for FLEX_BG
  2008-04-03 13:12 ` Theodore Tso
@ 2008-04-03 14:28   ` Jose R. Santos
  2008-04-04  3:24     ` Theodore Tso
  0 siblings, 1 reply; 15+ messages in thread
From: Jose R. Santos @ 2008-04-03 14:28 UTC (permalink / raw)
  To: Theodore Tso; +Cc: linux-ext4

On Thu, 3 Apr 2008 09:12:40 -0400
Theodore Tso <tytso@mit.edu> wrote:

> On Mon, Mar 31, 2008 at 10:13:11PM -0500, Jose R. Santos wrote:
> > From: Jose R. Santos <jrs@us.ibm.com>
> > 
> > New bitmap and inode table allocation for FLEX_BG
> > 
> > Change the way we allocate bitmaps and inode tables if the FLEX_BG
> > feature is used at mke2fs time.  It places calculates a new offset for
> > bitmaps and inode table base on the number of groups that the user
> > wishes to pack together using the new "-G" option.  Creating a
> > filesystem with 64 block groups in a flex group can be done by:
> 
> I was just testing out this patch, and it looks like it creates a
> filesystem with an incorrect free blocks count when creating a
> filesystem with both uninit_groups and flex_bg.  Can you look at this,
> please?
> 
> 						- Ted

I blame Undo Manager for being so slow that cause me to skip some of
the testing needed to be done.  I was incorrectly checking the feature
flag instead of checking the value of fs->super->s_log_groups_per_flex.

Fixed patch is on its way.

 -JRS

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH][e2fsprogs] New bitmap and inode table allocation for FLEX_BG
  2008-04-03 14:28   ` Jose R. Santos
@ 2008-04-04  3:24     ` Theodore Tso
  2008-04-04  5:37       ` Jose R. Santos
  0 siblings, 1 reply; 15+ messages in thread
From: Theodore Tso @ 2008-04-04  3:24 UTC (permalink / raw)
  To: Jose R. Santos; +Cc: linux-ext4

On Thu, Apr 03, 2008 at 09:28:58AM -0500, Jose R. Santos wrote:
> I blame Undo Manager for being so slow that cause me to skip some of
> the testing needed to be done.  

If that means we need a patch to disable the undo manager, via a
command-line option, feel free.  :-)

> I was incorrectly checking the feature
> flag instead of checking the value of fs->super->s_log_groups_per_flex.

Actually, you should check both, and we need to make mke2fs have an
intelligent default, which can be overridden via mke2fs.conf.

Also, it looks like this patch doesn't create a valid filesystem in
combination with meta_bg:

mke2fs G 32 -O meta_bg,flex_bg,uninit_groups,^resize_inode /tmp/foo.img

Then try running "e2fsck -f /tmp/foo.img" with the patch applied.

One obvious question is why is this patch so fragile....?  Is there
some way we can make it more likely not to break given other changes
to e2fsprogs in the future.

						- Ted

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH][e2fsprogs] New bitmap and inode table allocation for FLEX_BG
  2008-04-04  3:24     ` Theodore Tso
@ 2008-04-04  5:37       ` Jose R. Santos
  2008-04-04 12:43         ` Theodore Tso
  0 siblings, 1 reply; 15+ messages in thread
From: Jose R. Santos @ 2008-04-04  5:37 UTC (permalink / raw)
  To: Theodore Tso; +Cc: linux-ext4

On Thu, 3 Apr 2008 23:24:43 -0400
Theodore Tso <tytso@MIT.EDU> wrote:

> On Thu, Apr 03, 2008 at 09:28:58AM -0500, Jose R. Santos wrote:
> > I blame Undo Manager for being so slow that cause me to skip some of
> > the testing needed to be done.  
> 
> If that means we need a patch to disable the undo manager, via a
> command-line option, feel free.  :-)

Im sufficiently annoyed that I may just do that.
 
> > I was incorrectly checking the feature
> > flag instead of checking the value of fs->super->s_log_groups_per_flex.
> 
> Actually, you should check both, and we need to make mke2fs have an
> intelligent default, which can be overridden via mke2fs.conf.

Yes it should check both(will fix).  I was expecting more people to
give flexbg a test before trying to determined an intelligent default
though.

> Also, it looks like this patch doesn't create a valid filesystem in
> combination with meta_bg:
> 
> mke2fs G 32 -O meta_bg,flex_bg,uninit_groups,^resize_inode /tmp/foo.img
> 
> Then try running "e2fsck -f /tmp/foo.img" with the patch applied.

Wow, it really breaks.  Throws e2fsck into an infinite loop.

> One obvious question is why is this patch so fragile....?  Is there
> some way we can make it more likely not to break given other changes
> to e2fsprogs in the future.

Getting the right free block count for every group descriptor seems to
be the tricky part since libe2fs make all sort of assumptions about
number of used blocks that break when meta-data is no longer on the
same block group.  Seems like I forgot a check for the adjusted flexbg
block group size in meta_bg since the first place it barfs is in group
127 which is the last group of the meta_bg with a group descriptor
block being used.  This should be easy to find and fix.

> 						- Ted

-JRS

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH][e2fsprogs] New bitmap and inode table allocation for FLEX_BG
  2008-04-04  5:37       ` Jose R. Santos
@ 2008-04-04 12:43         ` Theodore Tso
  2008-04-04 15:20           ` Jose R. Santos
  0 siblings, 1 reply; 15+ messages in thread
From: Theodore Tso @ 2008-04-04 12:43 UTC (permalink / raw)
  To: Jose R. Santos; +Cc: linux-ext4

On Fri, Apr 04, 2008 at 12:37:57AM -0500, Jose R. Santos wrote:
> Getting the right free block count for every group descriptor seems to
> be the tricky part since libe2fs make all sort of assumptions about
> number of used blocks that break when meta-data is no longer on the
> same block group.  

Originally the overlap you're referring to was very simple.
ext2fs_initialize() set the free block counts, and then
ext2fs_alloc_tables() actually did the allocation of the bitmaps and
inode tables.

Flex_bg made things more complex, but it transferred the
responsibility of setting the block number to the routines that
allocate the inode and bitmaps.

> Seems like I forgot a check for the adjusted flexbg
> block group size in meta_bg since the first place it barfs is in group
> 127 which is the last group of the meta_bg with a group descriptor
> block being used.  This should be easy to find and fix.

It can't be just that since e2fsck is also reporting a block bitmap
discrepancy.  And there's the question of why e2fsck is doing that in
an infinite loop.

Hmm....  So at least part of the problem is that BLOCK_UNINIT isn't
getting set when it should.  That seems to be bug #1, and that was the
proximate cause of the failure, that was triggered by the flex-bg
allocation patch.

In e2fsck pass5, the uninit_groups patch changed it to compare the
actual bitmap against a virtual bitmap if the bitmap block hadn't been
initialized.  (Around line 180, in e2fsck/pass5.c, in the function
check_block_bitmaps()).  The problem is that it is using
ext2fs_bg_has_super(), and assuming that if it is set, that the
superblock and block group descriptors are present using the old-style
non-meta_bg layout.  What it *should* have used is
ext2fs_super_and_bgd_loc(), and used it to set the pseudo-bitmap
correctly.  That's bug #2.

Since it is wrong, it is attempting to fix it, but code to clear
BLOCK_UNINIT is only when the block is used but marked so in the
bitmap --- and not in the other case, when the block isn't used, but
is apparently marked as such.  That's bug #3.

Bugs #2 and #3 is my fault, and reflects the fact that LAZY_BG was a
quick hack that I had coded up only for testing purposes for people
who wanted to play with really big filesystems.  I'll take care of
that, which is why e2fsck was looping.  In retrospect, lazy_bg was a
mistake, or at least should have been implemented *much* more
carefully, and I'm beginning to think it should be removed entirely in
favor of uninit_groups, per my other e-mail.

		    	     		    	   - Ted

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH][e2fsprogs] New bitmap and inode table allocation for FLEX_BG
  2008-04-04 12:43         ` Theodore Tso
@ 2008-04-04 15:20           ` Jose R. Santos
  0 siblings, 0 replies; 15+ messages in thread
From: Jose R. Santos @ 2008-04-04 15:20 UTC (permalink / raw)
  To: Theodore Tso; +Cc: linux-ext4

On Fri, 4 Apr 2008 08:43:58 -0400
Theodore Tso <tytso@MIT.EDU> wrote:

> On Fri, Apr 04, 2008 at 12:37:57AM -0500, Jose R. Santos wrote:
> > Getting the right free block count for every group descriptor seems to
> > be the tricky part since libe2fs make all sort of assumptions about
> > number of used blocks that break when meta-data is no longer on the
> > same block group.  
> 
> Originally the overlap you're referring to was very simple.
> ext2fs_initialize() set the free block counts, and then
> ext2fs_alloc_tables() actually did the allocation of the bitmaps and
> inode tables.
> 
> Flex_bg made things more complex, but it transferred the
> responsibility of setting the block number to the routines that
> allocate the inode and bitmaps.
> 
> > Seems like I forgot a check for the adjusted flexbg
> > block group size in meta_bg since the first place it barfs is in group
> > 127 which is the last group of the meta_bg with a group descriptor
> > block being used.  This should be easy to find and fix.
> 
> It can't be just that since e2fsck is also reporting a block bitmap
> discrepancy.  And there's the question of why e2fsck is doing that in
> an infinite loop.
> 
> Hmm....  So at least part of the problem is that BLOCK_UNINIT isn't
> getting set when it should.  That seems to be bug #1, and that was the
> proximate cause of the failure, that was triggered by the flex-bg
> allocation patch.

The problem is that ext2fs_super_and_bgd_loc() does not correctly
predict the size of a block group with no bitmaps and not inode
tables.  So the BLOCK_UNINIT is being set correctly, but e2fsck just
gets confused when the predicted number of used blocks is different
than the actual.  In mke2fs I go around this in
expected_free_block_count() but maybe the right way to fix it is in
ext2fs_super_and_bgd_loc().  The only problem is that predicting the
size of groups with all the meta-data is tricky especially when the
inode tables are too big to fit in a single block group.  This is why I
currently do not mark BLOCK_UNINIT on block group with meta-data in
them. Does ext2fs_super_and_bgd_loc() need to return an exact number of
free blocks or is a guesstimate good enough?

> 
> In e2fsck pass5, the uninit_groups patch changed it to compare the
> actual bitmap against a virtual bitmap if the bitmap block hadn't been
> initialized.  (Around line 180, in e2fsck/pass5.c, in the function
> check_block_bitmaps()).  The problem is that it is using
> ext2fs_bg_has_super(), and assuming that if it is set, that the
> superblock and block group descriptors are present using the old-style
> non-meta_bg layout.  What it *should* have used is
> ext2fs_super_and_bgd_loc(), and used it to set the pseudo-bitmap
> correctly.  That's bug #2.

It would still break since ext2fs_super_and_bgd_loc() does not know of
block groups with not meta-data in them.

> Since it is wrong, it is attempting to fix it, but code to clear
> BLOCK_UNINIT is only when the block is used but marked so in the
> bitmap --- and not in the other case, when the block isn't used, but
> is apparently marked as such.  That's bug #3.
> 
> Bugs #2 and #3 is my fault, and reflects the fact that LAZY_BG was a
> quick hack that I had coded up only for testing purposes for people
> who wanted to play with really big filesystems.  I'll take care of
> that, which is why e2fsck was looping.  In retrospect, lazy_bg was a
> mistake, or at least should have been implemented *much* more
> carefully, and I'm beginning to think it should be removed entirely in
> favor of uninit_groups, per my other e-mail.
> 
> 		    	     		    	   - Ted

Agree.

-JRS

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2008-04-04 15:20 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-02-07 17:09 [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG Jose R. Santos
2008-02-08  5:11 ` Andreas Dilger
2008-02-08 17:37   ` Jose R. Santos
2008-02-11  4:33     ` Theodore Tso
2008-02-11 15:05       ` Jose R. Santos
  -- strict thread matches above, loose matches on Subject: below --
2008-04-01  3:13 [PATCH][e2fsprogs] " Jose R. Santos
2008-04-03 13:12 ` Theodore Tso
2008-04-03 14:28   ` Jose R. Santos
2008-04-04  3:24     ` Theodore Tso
2008-04-04  5:37       ` Jose R. Santos
2008-04-04 12:43         ` Theodore Tso
2008-04-04 15:20           ` Jose R. Santos
2008-01-11 17:28 [PATCH] e2fsprogs: " Jose R. Santos
2008-01-11 21:01 ` Andreas Dilger
2008-01-11 22:11   ` Jose R. Santos

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).