* [RFC] [PATCH] Flex_BG ialloc awareness V2.
@ 2007-12-06 22:10 Jose R. Santos
2007-12-07 10:14 ` Andreas Dilger
0 siblings, 1 reply; 11+ messages in thread
From: Jose R. Santos @ 2007-12-06 22:10 UTC (permalink / raw)
To: linux-ext4
[-- Attachment #1: Type: text/plain, Size: 11612 bytes --]
Hi folks,
New version of the Flex_BG ialloc allocation patch.
Changes from the last version:
- Size of the FLEX_BG in now written to the super block at mke2fs time
instead of calculating at mount time (testing patch for e2fsprog's
next branch attached).
- Rename a lots of the confusing "meta" terms as suggested by Andreas.
- Use the Orlov if the FLEX_BG size is 0.
- Use shift instead of divide in ext4_meta_group() as suggested by
Andreas.
- Use sb_bgl_lock() instead of one spinlock per FLEX_BG as suggested by
Andreas. (Needs more perf testing)
- Remove some dead/prototype code.
Signed-off-by: Jose R. Santos <jrs@us.ibm.com>
--
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index b102b0e..7ef9787 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -600,6 +600,7 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
struct ext4_sb_info *sbi;
int err = 0, ret;
ext4_grpblk_t group_freed;
+ ext4_group_t flex_group;
*pdquot_freed_blocks = 0;
sbi = EXT4_SB(sb);
@@ -745,6 +746,14 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_groups_per_flex_shift) {
+ flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1610,6 +1619,7 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
unsigned short windowsz = 0;
ext4_group_t ngroups;
unsigned long num = *count;
+ ext4_group_t flex_group;
*errp = -ENOSPC;
sb = inode->i_sb;
@@ -1815,6 +1825,14 @@ allocated:
spin_unlock(sb_bgl_lock(sbi, group_no));
percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_groups_per_flex_shift) {
+ flex_group = ext4_flex_group(sbi, group_no);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= num;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
err = ext4_journal_dirty_metadata(handle, gdp_bh);
if (!fatal)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 17b5df1..35ab8ff 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -158,6 +158,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
struct ext4_super_block * es;
struct ext4_sb_info *sbi;
int fatal = 0, err;
+ ext4_group_t flex_group;
if (atomic_read(&inode->i_count) > 1) {
printk ("ext4_free_inode: inode has count=%d\n",
@@ -235,6 +236,13 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
if (is_directory)
percpu_counter_dec(&sbi->s_dirs_counter);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_groups_per_flex_shift) {
+ flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes++;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
}
BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
err = ext4_journal_dirty_metadata(handle, bh2);
@@ -289,6 +297,71 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
return ret;
}
+#define free_block_ratio 10
+
+int find_group_flex(struct super_block *sb, struct inode *parent)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh;
+ struct flex_groups *flex_group = sbi->s_flex_groups;
+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+ ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
+ ext4_group_t ngroups = sbi->s_groups_count;
+ int flex_size = ext4_flex_bg_size(sbi);
+ ext4_group_t best_flex = -1;
+ ext4_group_t best_group = -1;
+ int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
+ int flex_freeb_ratio;
+ ext4_group_t n_fbg_groups;
+ ext4_group_t i;
+
+ n_fbg_groups = (sbi->s_groups_count + flex_size - 1) / flex_size;
+ best_flex = parent_fbg_group;
+
+find_close_to_parent:
+ flex_freeb_ratio = flex_group[best_flex].free_blocks*100/blocks_per_flex;
+ if (flex_group[best_flex].free_inodes &&
+ flex_freeb_ratio > free_block_ratio)
+ goto found_flexbg;
+
+ if (best_flex && best_flex == parent_fbg_group) {
+ best_flex--;
+ goto find_close_to_parent;
+ }
+
+ for (i = 0; i < n_fbg_groups; i++) {
+ if (i == parent_fbg_group || i == parent_fbg_group - 1)
+ continue;
+
+ flex_freeb_ratio = flex_group[i].free_blocks*100/blocks_per_flex;
+
+ if (flex_freeb_ratio > free_block_ratio &&
+ flex_group[i].free_inodes) {
+ best_flex = i;
+ break;
+ }
+
+ if (best_flex < 0 ||
+ (flex_group[i].free_blocks >
+ flex_group[best_flex].free_blocks &&
+ flex_group[i].free_inodes))
+ best_flex = i;
+ }
+
+found_flexbg:
+ for (i = best_flex * flex_size; i < ngroups &&
+ i < (best_flex + 1) * flex_size; i++) {
+ desc = ext4_get_group_desc(sb, i, &bh);
+ if (le16_to_cpu(desc->bg_free_inodes_count)) {
+ best_group = i;
+ goto out;
+ }
+ }
+out:
+ return best_group;
+}
+
/*
* Orlov's allocator for directories.
*
@@ -504,6 +577,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
struct inode *ret;
ext4_group_t i;
int free = 0;
+ ext4_group_t flex_group;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -517,6 +591,13 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_groups_per_flex_shift) {
+ group = find_group_flex(sb, dir);
+ goto got_group;
+ }
+
if (S_ISDIR(mode)) {
if (test_opt (sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
@@ -525,6 +606,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
} else
ret2 = find_group_other(sb, dir, &group);
+got_group:
err = -ENOSPC;
if (ret2 == -1)
goto out;
@@ -681,6 +763,14 @@ got:
percpu_counter_inc(&sbi->s_dirs_counter);
sb->s_dirt = 1;
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_groups_per_flex_shift) {
+ flex_group = ext4_flex_group(sbi, group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes--;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
inode->i_uid = current->fsuid;
if (test_opt (sb, GRPID))
inode->i_gid = dir->i_gid;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b626339..81ad9b1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -518,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
+ kfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -1423,6 +1424,61 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
return res;
}
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = NULL;
+ struct buffer_head *bh;
+ ext4_group_t flex_group_count;
+ ext4_group_t flex_group;
+ unsigned int shift;
+ int groups_per_flex = 0;
+ int tmp = 0;
+ __u64 block_bitmap = 0, cur_block_bitmap;
+ int i;
+
+ groups_per_flex = le16_to_cpu(sbi->s_es->s_flex_bg_size);
+
+ if (!groups_per_flex) {
+ sbi->s_groups_per_flex_shift = 0;
+ return 1;
+ }
+
+ shift = 0;
+ tmp = groups_per_flex;
+ while ((tmp >>= 1UL) != 0UL)
+ shift++;
+
+ sbi->s_groups_per_flex_shift = shift;
+ flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
+ groups_per_flex;
+ sbi->s_flex_groups = kmalloc(flex_group_count *
+ sizeof(struct flex_groups), GFP_KERNEL);
+ if (sbi->s_flex_groups == NULL) {
+ printk(KERN_ERR "EXT4-fs: not enough memory\n");
+ goto failed;
+ }
+ memset(sbi->s_flex_groups, 0, flex_group_count *
+ sizeof(struct flex_groups));
+
+ gdp = ext4_get_group_desc(sb, 1, &bh);
+ block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
+
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ gdp = ext4_get_group_desc(sb, i, &bh);
+
+ flex_group = ext4_flex_group(sbi, i);
+ sbi->s_flex_groups[flex_group].free_inodes +=
+ le16_to_cpu(gdp->bg_free_inodes_count);
+ sbi->s_flex_groups[flex_group].free_blocks +=
+ le16_to_cpu(gdp->bg_free_blocks_count);
+ }
+
+ return 1;
+failed:
+ return 0;
+}
+
__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
struct ext4_group_desc *gdp)
{
@@ -2037,6 +2093,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
goto failed_mount2;
}
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (!ext4_fill_flex_info(sb)) {
+ printk(KERN_ERR
+ "EXT4-fs: unable to initialize flex_bg meta info!\n");
+ goto failed_mount2;
+ }
+
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index bcdb59d..3b94dbf 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -152,6 +152,15 @@ struct ext4_group_desc
__u32 bg_reserved2[3];
};
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+ __u32 free_inodes;
+ __u32 free_blocks;
+};
+
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
@@ -622,7 +631,9 @@ struct ext4_super_block {
__le16 s_mmp_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __le16 s_flex_bg_size; /* FLEX_BG group size */
+ __le16 padding; /* Padding to next 32bits */
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};
#ifdef __KERNEL__
@@ -1120,6 +1131,17 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
}
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+ ext4_group_t block_group)
+{
+ return block_group >> sbi->s_groups_per_flex_shift;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+ return 1 << sbi->s_groups_per_flex_shift;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h
index 744e746..ac7af1b 100644
--- a/include/linux/ext4_fs_sb.h
+++ b/include/linux/ext4_fs_sb.h
@@ -147,6 +147,9 @@ struct ext4_sb_info {
/* locality groups */
struct ext4_locality_group *s_locality_groups;
+
+ unsigned int s_groups_per_flex_shift;
+ struct flex_groups *s_flex_groups;
};
#define EXT4_GROUP_INFO(sb, group) \
EXT4_SB(sb)->s_group_info[(group) >> EXT4_DESC_PER_BLOCK_BITS(sb)] \
[-- Attachment #2: e2fsprogs-flexbg-grouping.patch --]
[-- Type: text/x-patch, Size: 7926 bytes --]
commit 6aea7c4b5bf1c4da4630823a926e025fb7dbddd5
Author: Jose R. Santos <jrs@us.ibm.com>
Date: Thu Dec 6 14:00:48 2007 -0600
New bitmap and inode table allocation for FLEX_BG
Change the way we allocate bitmaps and inode tables if the FLEX_BG
feature is used at mke2fs time. The block and inode bitmaps are
allocated as a one contiguous set for each flex block group. Due to
the size of the inode tables, the inode table for each block group is
allocate individually but packed close together at the beginning of a
flex group. For now, this allow for the inode table to be packed
close to the inode bitmaps in cases where we try to allocate a large
group of inode tables right after the bitmaps and fail.
Signed-off-by: Jose R. Santos <jrs@us.ibm.com>
diff --git a/lib/ext2fs/alloc_tables.c b/lib/ext2fs/alloc_tables.c
index 4ad2ba9..ce81eaa 100644
--- a/lib/ext2fs/alloc_tables.c
+++ b/lib/ext2fs/alloc_tables.c
@@ -27,6 +27,124 @@
#include "ext2_fs.h"
#include "ext2fs.h"
+#define ALLOC_BLOCK_BITMAPS 1
+#define ALLOC_INODE_BITMAPS 2
+#define ALLOC_INODE_TABLES 3
+
+errcode_t ext2fs_allocate_contiguous(ext2_filsys fs, dgrp_t group,
+ int type, blk_t start_blk, blk_t last_blk,
+ int count, ext2fs_block_bitmap bmap)
+{
+ errcode_t retval;
+ blk_t new_blk, blk;
+ int i, j;
+
+ if (!bmap)
+ bmap = fs->block_map;
+
+ switch (type) {
+ case ALLOC_BLOCK_BITMAPS:
+ retval = ext2fs_get_free_blocks(fs, start_blk, last_blk,
+ 1 * count, bmap, &new_blk);
+ if (retval)
+ return retval;
+ for (i=0, blk=new_blk; i < count; i++, blk++) {
+ ext2fs_mark_block_bitmap(bmap, blk);
+ fs->group_desc[group+i].bg_block_bitmap = blk;
+ }
+ break;
+
+ case ALLOC_INODE_BITMAPS:
+ retval = ext2fs_get_free_blocks(fs, start_blk, last_blk,
+ 1 * count, bmap, &new_blk);
+ if (retval)
+ return retval;
+ for (i=0, blk=new_blk; i < count; i++, blk++) {
+ ext2fs_mark_block_bitmap(bmap, blk);
+ fs->group_desc[group+i].bg_inode_bitmap = blk;
+ }
+ break;
+
+ case ALLOC_INODE_TABLES:
+ for (i=0; i < count; i++) {
+ retval = ext2fs_get_free_blocks(fs, start_blk, last_blk,
+ fs->inode_blocks_per_group,
+ bmap, &new_blk);
+ if (retval)
+ return retval;
+ blk = new_blk;
+ for (j=0; j < fs->inode_blocks_per_group; j++, blk++)
+ ext2fs_mark_block_bitmap(bmap, blk);
+ fs->group_desc[group+i].bg_inode_table = new_blk;
+ }
+ break;
+
+ }
+ return 0;
+}
+
+
+
+errcode_t ext2fs_allocate_flex_groups(ext2_filsys fs)
+{
+ errcode_t retval;
+ blk_t start, last, j, blocks;
+ dgrp_t i, k;
+ int meta_bg_size;
+
+ meta_bg_size = ext2fs_swab16(fs->super->s_flex_bg_size);
+ blocks = 0;
+
+ for (i = 0; i < fs->group_desc_count; i=i+meta_bg_size) {
+
+ start = ext2fs_group_first_block(fs, i);
+
+ if (i+meta_bg_size >= fs->group_desc_count) {
+ last = ext2fs_group_last_block(fs, fs->group_desc_count);
+ meta_bg_size = fs->group_desc_count - i;
+ }
+ else
+ last = ext2fs_group_last_block(fs, i+meta_bg_size-1);
+
+ retval = ext2fs_allocate_contiguous(fs, i, ALLOC_BLOCK_BITMAPS,
+ start, last, meta_bg_size,
+ fs->block_map);
+ if (retval)
+ return retval;
+ retval = ext2fs_allocate_contiguous(fs, i, ALLOC_INODE_BITMAPS,
+ start, last, meta_bg_size,
+ fs->block_map);
+ if (retval)
+ return retval;
+ retval = ext2fs_allocate_contiguous(fs, i, ALLOC_INODE_TABLES,
+ start, last, meta_bg_size,
+ fs->block_map);
+ if (retval)
+ return retval;
+
+ /*
+ * The content of bg_free_blocks_count is previously
+ * assigned with out knowledge of the new allocation
+ * scheme. Need to update the number of free blocks
+ * per group descriptor or fsck will complain.
+ */
+
+ for (k=i; k<i+meta_bg_size; k++){
+ if (k > fs->group_desc_count)
+ break;
+ start = ext2fs_group_first_block(fs, k);
+ last = ext2fs_group_last_block(fs, k);
+ for (j=start; j<=last; j++) {
+ if( !ext2fs_fast_test_block_bitmap(fs->block_map, j))
+ blocks++;
+ }
+ fs->group_desc[k].bg_free_blocks_count = blocks;
+ blocks = 0;
+ }
+ }
+ return 0;
+}
+
errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
ext2fs_block_bitmap bmap)
{
@@ -107,10 +225,16 @@ errcode_t ext2fs_allocate_tables(ext2_filsys fs)
errcode_t retval;
dgrp_t i;
- for (i = 0; i < fs->group_desc_count; i++) {
- retval = ext2fs_allocate_group_table(fs, i, fs->block_map);
- if (retval)
- return retval;
+ if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
+ EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ ext2fs_allocate_flex_groups(fs);
+
+ else {
+ for (i = 0; i < fs->group_desc_count; i++) {
+ retval = ext2fs_allocate_group_table(fs, i, fs->block_map);
+ if (retval)
+ return retval;
+ }
}
return 0;
}
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index 2394857..7528707 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -577,7 +577,9 @@ struct ext2_super_block {
__u16 s_mmp_interval; /* # seconds to wait in MMP checking */
__u64 s_mmp_block; /* Block for multi-mount protection */
__u32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __u16 s_flex_bg_size; /* FLEX_BG group size */
+ __u16 padding; /* Padding to next 32bits */
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};
/*
diff --git a/lib/ext2fs/initialize.c b/lib/ext2fs/initialize.c
index 16e9eaa..935a013 100644
--- a/lib/ext2fs/initialize.c
+++ b/lib/ext2fs/initialize.c
@@ -156,6 +156,7 @@ errcode_t ext2fs_initialize(const char *name, int flags,
set_field(s_feature_incompat, 0);
set_field(s_feature_ro_compat, 0);
set_field(s_first_meta_bg, 0);
+ set_field(s_flex_bg_size, 0);
if (super->s_feature_incompat & ~EXT2_LIB_FEATURE_INCOMPAT_SUPP) {
retval = EXT2_ET_UNSUPP_FEATURE;
goto cleanup;
diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index 9e2d7a8..7319832 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -96,7 +96,7 @@ static void usage(void)
{
fprintf(stderr, _("Usage: %s [-c|-t|-l filename] [-b block-size] "
"[-f fragment-size]\n\t[-i bytes-per-inode] [-I inode-size] "
- "[-j] [-J journal-options]\n"
+ "[-j] [-J journal-options] [-G meta group size]\n"
"\t[-N number-of-inodes] [-m reserved-blocks-percentage] "
"[-o creator-os]\n\t[-g blocks-per-group] [-L volume-label] "
"[-M last-mounted-directory]\n\t[-O feature[,...]] "
@@ -912,6 +912,7 @@ static void PRS(int argc, char *argv[])
int blocksize = 0;
int inode_ratio = 0;
int inode_size = 0;
+ unsigned long flex_bg_size = 0;
double reserved_ratio = 5.0;
int sector_size = 0;
int show_version_only = 0;
@@ -994,7 +995,7 @@ static void PRS(int argc, char *argv[])
}
while ((c = getopt (argc, argv,
- "b:cf:g:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
+ "b:cf:g:G:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
switch (c) {
case 'b':
blocksize = strtol(optarg, &tmp, 0);
@@ -1045,6 +1046,19 @@ static void PRS(int argc, char *argv[])
exit(1);
}
break;
+ case 'G':
+ flex_bg_size = strtoul(optarg, &tmp, 0);
+ if (*tmp) {
+ com_err(program_name, 0,
+ _("Illegal number for Flex_BG size"));
+ exit(1);
+ }
+ if ((flex_bg_size & (flex_bg_size-1)) != 0) {
+ com_err(program_name, 0,
+ _("Flex_BG size must be a power of 2"));
+ exit(1);
+ }
+ break;
case 'i':
inode_ratio = strtoul(optarg, &tmp, 0);
if (inode_ratio < EXT2_MIN_BLOCK_SIZE ||
@@ -1444,6 +1458,10 @@ static void PRS(int argc, char *argv[])
}
}
+ if(flex_bg_size) {
+ fs_param.s_flex_bg_size = ext2fs_swab16(flex_bg_size);
+ }
+
if (!force && fs_param.s_blocks_count >= ((unsigned) 1 << 31)) {
com_err(program_name, 0,
_("Filesystem too large. No more than 2**31-1 blocks\n"
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.
2007-12-06 22:10 [RFC] [PATCH] Flex_BG ialloc awareness V2 Jose R. Santos
@ 2007-12-07 10:14 ` Andreas Dilger
2007-12-07 15:52 ` Jose R. Santos
0 siblings, 1 reply; 11+ messages in thread
From: Andreas Dilger @ 2007-12-07 10:14 UTC (permalink / raw)
To: Jose R. Santos; +Cc: linux-ext4
On Dec 06, 2007 16:10 -0600, Jose R. Santos wrote:
> @@ -600,6 +600,7 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
> struct ext4_sb_info *sbi;
> int err = 0, ret;
> ext4_grpblk_t group_freed;
> + ext4_group_t flex_group;
>
> *pdquot_freed_blocks = 0;
> sbi = EXT4_SB(sb);
> @@ -745,6 +746,14 @@ do_more:
> spin_unlock(sb_bgl_lock(sbi, block_group));
> percpu_counter_add(&sbi->s_freeblocks_counter, count);
>
> + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
> + sbi->s_groups_per_flex_shift) {
> + flex_group = ext4_flex_group(sbi, block_group);
> + spin_lock(sb_bgl_lock(sbi, flex_group));
> + sbi->s_flex_groups[flex_group].free_blocks += count;
> + spin_unlock(sb_bgl_lock(sbi, flex_group));
> + }
In general, I prefer to keep variables in as local a scope as possible.
In this case, flex_group could be declared inside the "if (EXT4_HAS_INCOMPAT"
check.
> @@ -1610,6 +1619,7 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
> unsigned short windowsz = 0;
> ext4_group_t ngroups;
> unsigned long num = *count;
> + ext4_group_t flex_group;
>
> *errp = -ENOSPC;
> sb = inode->i_sb;
> @@ -1815,6 +1825,14 @@ allocated:
> spin_unlock(sb_bgl_lock(sbi, group_no));
> percpu_counter_sub(&sbi->s_freeblocks_counter, num);
>
> + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
> + sbi->s_groups_per_flex_shift) {
> + flex_group = ext4_flex_group(sbi, group_no);
> + spin_lock(sb_bgl_lock(sbi, flex_group));
> + sbi->s_flex_groups[flex_group].free_blocks -= num;
> + spin_unlock(sb_bgl_lock(sbi, flex_group));
> + }
Same as above.
> @@ -158,6 +158,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
> struct ext4_super_block * es;
> struct ext4_sb_info *sbi;
> int fatal = 0, err;
> + ext4_group_t flex_group;
>
> if (atomic_read(&inode->i_count) > 1) {
> printk ("ext4_free_inode: inode has count=%d\n",
> @@ -235,6 +236,13 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
> if (is_directory)
> percpu_counter_dec(&sbi->s_dirs_counter);
>
> + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
> + sbi->s_groups_per_flex_shift) {
> + flex_group = ext4_flex_group(sbi, block_group);
> + spin_lock(sb_bgl_lock(sbi, flex_group));
> + sbi->s_flex_groups[flex_group].free_inodes++;
> + spin_unlock(sb_bgl_lock(sbi, flex_group));
> + }
Same as above...
> +#define free_block_ratio 10
> +
> +int find_group_flex(struct super_block *sb, struct inode *parent)
> +{
> + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) / flex_size;
> + best_flex = parent_fbg_group;
> +
> +find_close_to_parent:
> + flex_freeb_ratio = flex_group[best_flex].free_blocks*100/blocks_per_flex;
There is no particular reason that this ratio needs to be "*100", it could
just as easily be a fraction of 256 and make the multiply into a shift.
The free_block_ratio would be 26 in that case.
> + for (i = 0; i < n_fbg_groups; i++) {
> + if (i == parent_fbg_group || i == parent_fbg_group - 1)
> + continue;
It seems this scans flex groups the way we used to scan groups?
> +found_flexbg:
> + for (i = best_flex * flex_size; i < ngroups &&
> + i < (best_flex + 1) * flex_size; i++) {
And now that we've found a suitable flex group, we need to find which
block group therein has some free inodes...
> +static int ext4_fill_flex_info(struct super_block *sb)
> +{
It still seems desirable to have a single per-group array instead of
> @@ -622,7 +631,9 @@ struct ext4_super_block {
> __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
> __le64 s_mmp_block; /* Block for multi-mount protection */
> __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
> - __u32 s_reserved[163]; /* Padding to the end of the block */
> + __le16 s_flex_bg_size; /* FLEX_BG group size */
Shouldn't this be "s_flex_bg_bits"?
> +{
> + return block_group >> sbi->s_groups_per_flex_shift;
> +}
> +
> +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
> +{
> + return 1 << sbi->s_groups_per_flex_shift;
> +}
> +
> #define ext4_std_error(sb, errno) \
> do { \
> if ((errno)) \
> diff --git a/lib/ext2fs/alloc_tables.c b/lib/ext2fs/alloc_tables.c
> --- a/lib/ext2fs/alloc_tables.c
> +++ b/lib/ext2fs/alloc_tables.c
> + if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
> + EXT4_FEATURE_INCOMPAT_FLEX_BG))
> + ext2fs_allocate_flex_groups(fs);
> +
> + else {
> + for (i = 0; i < fs->group_desc_count; i++) {
> + retval = ext2fs_allocate_group_table(fs, i, fs->block_map);
> + if (retval)
> + return retval;
> + }
> }
My preference would be to have "if (EXT2_HAS_INCOMPAT...) { ... } else {"
(i.e. add { } for the first part) since there are { } on the second part,
and it is just easier to read.
> @@ -1045,6 +1046,19 @@ static void PRS(int argc, char *argv[])
> + if ((flex_bg_size & (flex_bg_size-1)) != 0) {
> + com_err(program_name, 0,
> + _("Flex_BG size must be a power of 2"));
> + exit(1);
If flex_bg_size is a power of two then there isn't any need to store anything
except __u8 s_flex_bg_bits in the superblock.
> @@ -1444,6 +1458,10 @@ static void PRS(int argc, char *argv[])
> + if(flex_bg_size) {
> + fs_param.s_flex_bg_size = ext2fs_swab16(flex_bg_size);
> + }
Space between if and (, and no need for braces for a single line body.
It would also be nice to get a m_flexbg test case along with this patch
that (at minimum) creates a filesystem with flexbg enabled, and then
runs e2fsck on it. This was broken for the lazy_bg feature for a long
time, so it makes sense to add a test to verify each new feature has
some basic functionality. If the f_random_corruption test is in the
git tree, it would be good to add the flex_bg option to the list of
possible feature combinations to test.
Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.
2007-12-07 10:14 ` Andreas Dilger
@ 2007-12-07 15:52 ` Jose R. Santos
2007-12-11 11:00 ` Andreas Dilger
0 siblings, 1 reply; 11+ messages in thread
From: Jose R. Santos @ 2007-12-07 15:52 UTC (permalink / raw)
To: Andreas Dilger; +Cc: linux-ext4
On Fri, 7 Dec 2007 03:14:28 -0700
Andreas Dilger <adilger@sun.com> wrote:
> On Dec 06, 2007 16:10 -0600, Jose R. Santos wrote:
> > @@ -600,6 +600,7 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
> > struct ext4_sb_info *sbi;
> > int err = 0, ret;
> > ext4_grpblk_t group_freed;
> > + ext4_group_t flex_group;
> >
> > *pdquot_freed_blocks = 0;
> > sbi = EXT4_SB(sb);
> > @@ -745,6 +746,14 @@ do_more:
> > spin_unlock(sb_bgl_lock(sbi, block_group));
> > percpu_counter_add(&sbi->s_freeblocks_counter, count);
> >
> > + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
> > + sbi->s_groups_per_flex_shift) {
> > + flex_group = ext4_flex_group(sbi, block_group);
> > + spin_lock(sb_bgl_lock(sbi, flex_group));
> > + sbi->s_flex_groups[flex_group].free_blocks += count;
> > + spin_unlock(sb_bgl_lock(sbi, flex_group));
> > + }
>
> In general, I prefer to keep variables in as local a scope as possible.
> In this case, flex_group could be declared inside the "if (EXT4_HAS_INCOMPAT"
> check.
Ok.
> > +#define free_block_ratio 10
> > +
> > +int find_group_flex(struct super_block *sb, struct inode *parent)
> > +{
> > + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) / flex_size;
> > + best_flex = parent_fbg_group;
> > +
> > +find_close_to_parent:
> > + flex_freeb_ratio = flex_group[best_flex].free_blocks*100/blocks_per_flex;
>
> There is no particular reason that this ratio needs to be "*100", it could
> just as easily be a fraction of 256 and make the multiply into a shift.
> The free_block_ratio would be 26 in that case.
The idea here is to reserve 10% (free_block_ratio) of free blocks in a
flexbg for allocation of new files and expansion of existing one. The
"*100" make the math here easy but this still something that need to be
tune further. I'm sure we can do this in a series of shifts, just
haven't spent the time thinking of a clever way to do this.
Although, given all the multiplies, divides, endian changes that occur
while using Orlov, I'm not so concern about this right now.
> > + for (i = 0; i < n_fbg_groups; i++) {
> > + if (i == parent_fbg_group || i == parent_fbg_group - 1)
> > + continue;
>
> It seems this scans flex groups the way we used to scan groups?
No. It does something slightly different, the scan does not start from
the parent group forward. This help compress data as much as possible
in the disk and helps avoid large seeks. Reclaiming as much used
groups as possible will also help uninitialized block groups by avoiding
using groups when there is perfectly good unused space at the beginning
of the disk. Currently the search starts at the first flexbg but for
very large filesystems, this should be tune to start at a location
closer to the parents flex group. This is another area where the patch
needs more tuning, but I was hopping people would give this patch a
try to see what deficiencies they find before going into lengthy disk
testing/tuning cycle.
> > +found_flexbg:
> > + for (i = best_flex * flex_size; i < ngroups &&
> > + i < (best_flex + 1) * flex_size; i++) {
>
> And now that we've found a suitable flex group, we need to find which
> block group therein has some free inodes...
Yes, but we treat all inode tables in a flex group as one giant table
to improve locality and reduce initialization of inode tables to
improve fsck time.
>
> > +static int ext4_fill_flex_info(struct super_block *sb)
> > +{
>
> It still seems desirable to have a single per-group array instead of
?
> > @@ -622,7 +631,9 @@ struct ext4_super_block {
> > __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
> > __le64 s_mmp_block; /* Block for multi-mount protection */
> > __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
> > - __u32 s_reserved[163]; /* Padding to the end of the block */
> > + __le16 s_flex_bg_size; /* FLEX_BG group size */
>
> Shouldn't this be "s_flex_bg_bits"?
I debated whether to store this as the s_flex_bg_size and calculate the
bits during the filesystem mount time or just stored the bit in the
super block to begging with. The reason I stored the size is that it
seemed more in line with the other fields in the super block. I don't
mind either way since this is more of a style issue, although saving an
extra 8bits in the super block may be good enough reason to change it.
> > +{
> > + return block_group >> sbi->s_groups_per_flex_shift;
> > +}
> > +
> > +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
> > +{
> > + return 1 << sbi->s_groups_per_flex_shift;
> > +}
> > +
> > #define ext4_std_error(sb, errno) \
> > do { \
> > if ((errno)) \
>
> > diff --git a/lib/ext2fs/alloc_tables.c b/lib/ext2fs/alloc_tables.c
> > --- a/lib/ext2fs/alloc_tables.c
> > +++ b/lib/ext2fs/alloc_tables.c
> > + if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
> > + EXT4_FEATURE_INCOMPAT_FLEX_BG))
> > + ext2fs_allocate_flex_groups(fs);
> > +
> > + else {
> > + for (i = 0; i < fs->group_desc_count; i++) {
> > + retval = ext2fs_allocate_group_table(fs, i, fs->block_map);
> > + if (retval)
> > + return retval;
> > + }
> > }
>
> My preference would be to have "if (EXT2_HAS_INCOMPAT...) { ... } else {"
> (i.e. add { } for the first part) since there are { } on the second part,
> and it is just easier to read.
Mine too, but checkpatch complained about this. :)
>
> > @@ -1045,6 +1046,19 @@ static void PRS(int argc, char *argv[])
> > + if ((flex_bg_size & (flex_bg_size-1)) != 0) {
> > + com_err(program_name, 0,
> > + _("Flex_BG size must be a power of 2"));
> > + exit(1);
>
> If flex_bg_size is a power of two then there isn't any need to store anything
> except __u8 s_flex_bg_bits in the superblock.
Agree.
> > @@ -1444,6 +1458,10 @@ static void PRS(int argc, char *argv[])
> > + if(flex_bg_size) {
> > + fs_param.s_flex_bg_size = ext2fs_swab16(flex_bg_size);
> > + }
>
> Space between if and (, and no need for braces for a single line body.
This patch is intended for testing and need a lot of work. I still
haven't looked at any of the styling issues, but I'm surprise this is
the only one you found. :)
> It would also be nice to get a m_flexbg test case along with this patch
> that (at minimum) creates a filesystem with flexbg enabled, and then
> runs e2fsck on it. This was broken for the lazy_bg feature for a long
> time, so it makes sense to add a test to verify each new feature has
> some basic functionality. If the f_random_corruption test is in the
> git tree, it would be good to add the flex_bg option to the list of
> possible feature combinations to test.
Yes, it's on my todo list. First I need to get a patch that meta-data
allocation patch that Ted is willing to put into the e2fsprog's next
branch. After that, I will add test case for the feature.
> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
>
Thanks
-JRS
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.
2007-12-07 15:52 ` Jose R. Santos
@ 2007-12-11 11:00 ` Andreas Dilger
2007-12-11 16:08 ` Jose R. Santos
0 siblings, 1 reply; 11+ messages in thread
From: Andreas Dilger @ 2007-12-11 11:00 UTC (permalink / raw)
To: Jose R. Santos; +Cc: linux-ext4
On Dec 07, 2007 09:52 -0600, Jose R. Santos wrote:
> Andreas Dilger <adilger@sun.com> wrote:
> > There is no particular reason that this ratio needs to be "*100", it could
> > just as easily be a fraction of 256 and make the multiply into a shift.
> > The free_block_ratio would be 26 in that case.
>
> The idea here is to reserve 10% (free_block_ratio) of free blocks in a
> flexbg for allocation of new files and expansion of existing one. The
> "*100" make the math here easy but this still something that need to be
> tune further. I'm sure we can do this in a series of shifts, just
> haven't spent the time thinking of a clever way to do this.
This is a common misconception for code to have 10% mean 10 / 100. It
is just as good to have 26/256
> > > @@ -622,7 +631,9 @@ struct ext4_super_block {
> > > __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
> > > __le64 s_mmp_block; /* Block for multi-mount protection */
> > > __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
> > > - __u32 s_reserved[163]; /* Padding to the end of the block */
> > > + __le16 s_flex_bg_size; /* FLEX_BG group size */
> >
> > Shouldn't this be "s_flex_bg_bits"?
>
> I debated whether to store this as the s_flex_bg_size and calculate the
> bits during the filesystem mount time or just stored the bit in the
> super block to begging with. The reason I stored the size is that it
> seemed more in line with the other fields in the super block. I don't
> mind either way since this is more of a style issue, although saving an
> extra 8bits in the super block may be good enough reason to change it.
I'd think being able to avoid the divide for every inode allocation is more
important than 8 bits in the superblock.
> > My preference would be to have "if (EXT2_HAS_INCOMPAT...) { ... } else {"
> > (i.e. add { } for the first part) since there are { } on the second part,
> > and it is just easier to read.
>
> Mine too, but checkpatch complained about this. :)
Time to fix checkpatch it would seem.
Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.
2007-12-11 11:00 ` Andreas Dilger
@ 2007-12-11 16:08 ` Jose R. Santos
2007-12-11 23:15 ` Andreas Dilger
0 siblings, 1 reply; 11+ messages in thread
From: Jose R. Santos @ 2007-12-11 16:08 UTC (permalink / raw)
To: Andreas Dilger; +Cc: linux-ext4
On Tue, 11 Dec 2007 04:00:33 -0700
Andreas Dilger <adilger@sun.com> wrote:
> On Dec 07, 2007 09:52 -0600, Jose R. Santos wrote:
> > Andreas Dilger <adilger@sun.com> wrote:
> > > There is no particular reason that this ratio needs to be "*100", it could
> > > just as easily be a fraction of 256 and make the multiply into a shift.
> > > The free_block_ratio would be 26 in that case.
> >
> > The idea here is to reserve 10% (free_block_ratio) of free blocks in a
> > flexbg for allocation of new files and expansion of existing one. The
> > "*100" make the math here easy but this still something that need to be
> > tune further. I'm sure we can do this in a series of shifts, just
> > haven't spent the time thinking of a clever way to do this.
>
> This is a common misconception for code to have 10% mean 10 / 100. It
> is just as good to have 26/256
I understand that part, but my point is that changing the multiply to
256 doesn't do anything to eliminate the divide by blocks_per_flex.
Give that is more common to have an arch with no divide instruction
than one with no multiply, it seems more important to take care of the
divide by blocks_per_flex rather than the multiply by 100.
We could store the blocks_per_flex_bits in the sbi to do this but the
last flexbg is not guarantied to be the same size as the other flexbg
so it needs to be treated differently.
Hum... Now that I think of it, the last flexbg is not treated
differently on the current patch either. Looks like I found a bug. :)
> > > > @@ -622,7 +631,9 @@ struct ext4_super_block {
> > > > __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
> > > > __le64 s_mmp_block; /* Block for multi-mount protection */
> > > > __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
> > > > - __u32 s_reserved[163]; /* Padding to the end of the block */
> > > > + __le16 s_flex_bg_size; /* FLEX_BG group size */
> > >
> > > Shouldn't this be "s_flex_bg_bits"?
> >
> > I debated whether to store this as the s_flex_bg_size and calculate the
> > bits during the filesystem mount time or just stored the bit in the
> > super block to begging with. The reason I stored the size is that it
> > seemed more in line with the other fields in the super block. I don't
> > mind either way since this is more of a style issue, although saving an
> > extra 8bits in the super block may be good enough reason to change it.
>
> I'd think being able to avoid the divide for every inode allocation is more
> important than 8 bits in the superblock.
We already avoid the divide since what we store in the sbi IS the bits
which are calculated at mount time for each fs. Base on the other
fields in the super block struct, I decided to put explicit size of the
flexbg in the super block. The kernel code can decide how best to use
that number which in this case its used to calculate the number of bits
in order to avoid doing divides.
So this is really a styling issue in how to record data in the super
block. The only technical issue with this is whether it's important to
save those extra 8 bits in the super block struct.
> > > My preference would be to have "if (EXT2_HAS_INCOMPAT...) { ... } else {"
> > > (i.e. add { } for the first part) since there are { } on the second part,
> > > and it is just easier to read.
> >
> > Mine too, but checkpatch complained about this. :)
>
> Time to fix checkpatch it would seem.
>
> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
>
-JRS
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.
2007-12-11 16:08 ` Jose R. Santos
@ 2007-12-11 23:15 ` Andreas Dilger
2007-12-13 15:51 ` Jose R. Santos
0 siblings, 1 reply; 11+ messages in thread
From: Andreas Dilger @ 2007-12-11 23:15 UTC (permalink / raw)
To: Jose R. Santos; +Cc: linux-ext4
On Dec 11, 2007 10:08 -0600, Jose R. Santos wrote:
> > I'd think being able to avoid the divide for every inode allocation is more
> > important than 8 bits in the superblock.
>
> We already avoid the divide since what we store in the sbi IS the bits
> which are calculated at mount time for each fs. Base on the other
> fields in the super block struct, I decided to put explicit size of the
> flexbg in the super block. The kernel code can decide how best to use
> that number which in this case its used to calculate the number of bits
> in order to avoid doing divides.
>
> So this is really a styling issue in how to record data in the super
> block. The only technical issue with this is whether it's important to
> save those extra 8 bits in the super block struct.
Well, if it is stored in the superblock as a non-power-of-two value, then
there always exists the possibility that it is set incorrectly (maybe by
a version of mke2fs that doesn't verify this) and the code will not do the
right thing. Storing it in bits (as is done with e.g. s_log_block_size and
s_log_frag_size) ensures there is no possibility of a value that isn't a
power-of-two.
Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.
2007-12-11 23:15 ` Andreas Dilger
@ 2007-12-13 15:51 ` Jose R. Santos
2007-12-13 22:58 ` Andreas Dilger
0 siblings, 1 reply; 11+ messages in thread
From: Jose R. Santos @ 2007-12-13 15:51 UTC (permalink / raw)
To: Andreas Dilger; +Cc: linux-ext4
On Tue, 11 Dec 2007 16:15:28 -0700
Andreas Dilger <adilger@sun.com> wrote:
> On Dec 11, 2007 10:08 -0600, Jose R. Santos wrote:
> > > I'd think being able to avoid the divide for every inode allocation is more
> > > important than 8 bits in the superblock.
> >
> > We already avoid the divide since what we store in the sbi IS the bits
> > which are calculated at mount time for each fs. Base on the other
> > fields in the super block struct, I decided to put explicit size of the
> > flexbg in the super block. The kernel code can decide how best to use
> > that number which in this case its used to calculate the number of bits
> > in order to avoid doing divides.
> >
> > So this is really a styling issue in how to record data in the super
> > block. The only technical issue with this is whether it's important to
> > save those extra 8 bits in the super block struct.
>
> Well, if it is stored in the superblock as a non-power-of-two value, then
> there always exists the possibility that it is set incorrectly (maybe by
> a version of mke2fs that doesn't verify this) and the code will not do the
> right thing. Storing it in bits (as is done with e.g. s_log_block_size and
> s_log_frag_size) ensures there is no possibility of a value that isn't a
> power-of-two.
While I don't necessary buy the mke2fs example (the only patch that
set this already checks for power-of-two), you are right about the
possibility of being set incorrectly. I will change it to store the
bits in the next release which I'll do after I fix the resize2fs issues
since this will require changes to the e2fsprogs as well.
Now, storing the bits only guaranties that the flexbg size is always a
power-of-two and does not guarantee that the super block flexbg size
represents the actual meta-data grouping on disk. For this we need to
verify that the bitmap offsets match what the super block reports. It
may be an unlikely scenario, but it may be worth it to check this as
well at mount time.
> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
>
-JRS
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.
2007-12-13 15:51 ` Jose R. Santos
@ 2007-12-13 22:58 ` Andreas Dilger
2007-12-14 2:36 ` Jose R. Santos
0 siblings, 1 reply; 11+ messages in thread
From: Andreas Dilger @ 2007-12-13 22:58 UTC (permalink / raw)
To: Jose R. Santos; +Cc: linux-ext4
On Dec 13, 2007 09:51 -0600, Jose R. Santos wrote:
> Now, storing the bits only guaranties that the flexbg size is always a
> power-of-two and does not guarantee that the super block flexbg size
> represents the actual meta-data grouping on disk. For this we need to
> verify that the bitmap offsets match what the super block reports. It
> may be an unlikely scenario, but it may be worth it to check this as
> well at mount time.
I'm not sure what you mean... Isn't the flexbg size just a count of
the number of block groups? If it is always a power of two, and the
groups per metabg is always a power of two (it is) then they will
always be even multiples.
Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.
2007-12-13 22:58 ` Andreas Dilger
@ 2007-12-14 2:36 ` Jose R. Santos
2007-12-14 17:01 ` Andreas Dilger
0 siblings, 1 reply; 11+ messages in thread
From: Jose R. Santos @ 2007-12-14 2:36 UTC (permalink / raw)
To: Andreas Dilger; +Cc: linux-ext4
On Thu, 13 Dec 2007 15:58:57 -0700
Andreas Dilger <adilger@sun.com> wrote:
> On Dec 13, 2007 09:51 -0600, Jose R. Santos wrote:
> > Now, storing the bits only guaranties that the flexbg size is always a
> > power-of-two and does not guarantee that the super block flexbg size
> > represents the actual meta-data grouping on disk. For this we need to
> > verify that the bitmap offsets match what the super block reports. It
> > may be an unlikely scenario, but it may be worth it to check this as
> > well at mount time.
>
> I'm not sure what you mean... Isn't the flexbg size just a count of
> the number of block groups? If it is always a power of two, and the
> groups per metabg is always a power of two (it is) then they will
> always be even multiples.
What I mean is that if the value in the super block is corrupted and
does not represent the actual flexbg size, the inode allocation will
behave in weird unexpected ways. Just as we check that the bitmaps are
within the block group range (when not using flexbg), we should
probably sanity check the size of the flexbg as reported in the super
block.
Or do you believe the check is unnecessary?
> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
>
-JRS
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.
2007-12-14 2:36 ` Jose R. Santos
@ 2007-12-14 17:01 ` Andreas Dilger
2007-12-14 18:07 ` Jose R. Santos
0 siblings, 1 reply; 11+ messages in thread
From: Andreas Dilger @ 2007-12-14 17:01 UTC (permalink / raw)
To: Jose R. Santos; +Cc: linux-ext4
On Dec 13, 2007 20:36 -0600, Jose R. Santos wrote:
> ... if the value in the super block is corrupted and
> does not represent the actual flexbg size, the inode allocation will
> behave in weird unexpected ways. Just as we check that the bitmaps are
> within the block group range (when not using flexbg), we should
> probably sanity check the size of the flexbg as reported in the super
> block.
>
> Or do you believe the check is unnecessary?
Well, I can imagine in some cases that the flexbg will not be completely
contiguous on disk (e.g. after a filesystem resize, if there are bad
blocks, etc). As long as the group descriptors themselves are correct
(i.e. referencing valid bitmaps/itable) then it shouldn't cause a mount
failure if the per-group data isn't strictly aligned according to the
superblock flexbg count.
We would need to validate the group descriptor separately though (e.g.
group checksums).
Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.
2007-12-14 17:01 ` Andreas Dilger
@ 2007-12-14 18:07 ` Jose R. Santos
0 siblings, 0 replies; 11+ messages in thread
From: Jose R. Santos @ 2007-12-14 18:07 UTC (permalink / raw)
To: Andreas Dilger; +Cc: linux-ext4
On Fri, 14 Dec 2007 10:01:06 -0700
Andreas Dilger <adilger@sun.com> wrote:
> Well, I can imagine in some cases that the flexbg will not be completely
> contiguous on disk (e.g. after a filesystem resize, if there are bad
> blocks, etc). As long as the group descriptors themselves are correct
> (i.e. referencing valid bitmaps/itable) then it shouldn't cause a mount
> failure if the per-group data isn't strictly aligned according to the
> superblock flexbg count.
Yes, the meta-data may not be completely contiguous on the disk as per
the definition of flexbg. What I was planing on doing was to check the
first, second and last-1 flexbg to see if how the meta-data is
arranged. If none of those flexbg matches the size of the flexbg size
in the super block the we can set sbi->s_groups_per_flex_shift to zero
which would make the fs fallback to Orlov.
> We would need to validate the group descriptor separately though (e.g.
> group checksums).
Agree
> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
>
-JRS
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2007-12-14 18:07 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-12-06 22:10 [RFC] [PATCH] Flex_BG ialloc awareness V2 Jose R. Santos
2007-12-07 10:14 ` Andreas Dilger
2007-12-07 15:52 ` Jose R. Santos
2007-12-11 11:00 ` Andreas Dilger
2007-12-11 16:08 ` Jose R. Santos
2007-12-11 23:15 ` Andreas Dilger
2007-12-13 15:51 ` Jose R. Santos
2007-12-13 22:58 ` Andreas Dilger
2007-12-14 2:36 ` Jose R. Santos
2007-12-14 17:01 ` Andreas Dilger
2007-12-14 18:07 ` Jose R. Santos
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).