From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
To: "Theodore Ts'o" <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH 29/42] ext4: Use readahead when reading an inode from the inode table
Date: Thu, 9 Oct 2008 13:48:34 +0530 [thread overview]
Message-ID: <20081009081834.GD9918@skywalker> (raw)
In-Reply-To: <1223525160-9887-30-git-send-email-tytso@mit.edu>
On Thu, Oct 09, 2008 at 12:05:47AM -0400, Theodore Ts'o wrote:
> With modern hard drives, reading 64k takes roughly the same time as
> reading a 4k block. So request readahead for adjacent inode table
> blocks to reduce the time it takes when iterating over directories
> (especially when doing this in htree sort order) in a cold cache case.
> With this patch, the time it takes to run "git status" on a kernel
> tree after flushing the caches via "echo 3 > /proc/sys/vm/drop_caches"
> is reduced by 21%.
>
> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> ---
> fs/ext4/ext4.h | 2 +
> fs/ext4/ext4_sb.h | 1 +
> fs/ext4/inode.c | 134 +++++++++++++++++++++++++---------------------------
> fs/ext4/super.c | 27 ++++++++++-
> 4 files changed, 92 insertions(+), 72 deletions(-)
This patch also needs documentation for the new inode_readahead_blks mount option and the matching /proc tunable.
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 163c445..922d187 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -790,6 +790,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
> #define EXT4_DEF_RESUID 0
> #define EXT4_DEF_RESGID 0
>
> +#define EXT4_DEF_INODE_READAHEAD_BLKS 32
> +
> /*
> * Default mount options
> */
> diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
> index f92af01..94e0757 100644
> --- a/fs/ext4/ext4_sb.h
> +++ b/fs/ext4/ext4_sb.h
> @@ -52,6 +52,7 @@ struct ext4_sb_info {
> int s_desc_per_block_bits;
> int s_inode_size;
> int s_first_ino;
> + unsigned int s_inode_readahead_blks;
> spinlock_t s_next_gen_lock;
> u32 s_next_generation;
> u32 s_hash_seed[4];
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 22fcbb6..ef4ca3d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3833,41 +3833,6 @@ out_stop:
> ext4_journal_stop(handle);
> }
>
> -static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
> - unsigned long ino, struct ext4_iloc *iloc)
> -{
> - ext4_group_t block_group;
> - unsigned long offset;
> - ext4_fsblk_t block;
> - struct ext4_group_desc *gdp;
> -
> - if (!ext4_valid_inum(sb, ino)) {
> - /*
> - * This error is already checked for in namei.c unless we are
> - * looking at an NFS filehandle, in which case no error
> - * report is needed
> - */
> - return 0;
> - }
> -
> - block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
> - gdp = ext4_get_group_desc(sb, block_group, NULL);
> - if (!gdp)
> - return 0;
> -
> - /*
> - * Figure out the offset within the block group inode table
> - */
> - offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
> - EXT4_INODE_SIZE(sb);
> - block = ext4_inode_table(sb, gdp) +
> - (offset >> EXT4_BLOCK_SIZE_BITS(sb));
> -
> - iloc->block_group = block_group;
> - iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
> - return block;
> -}
> -
> /*
> * ext4_get_inode_loc returns with an extra refcount against the inode's
> * underlying buffer_head on success. If 'in_mem' is true, we have all
> @@ -3877,19 +3842,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
> static int __ext4_get_inode_loc(struct inode *inode,
> struct ext4_iloc *iloc, int in_mem)
> {
> - ext4_fsblk_t block;
> - struct buffer_head *bh;
> + struct ext4_group_desc *gdp;
> + struct buffer_head *bh;
> + struct super_block *sb = inode->i_sb;
> + ext4_fsblk_t block;
> + int inodes_per_block, inode_offset;
> +
> + iloc->bh = 0;
> + if (!ext4_valid_inum(sb, inode->i_ino))
> + return -EIO;
>
> - block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
> - if (!block)
> + iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
> + gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
> + if (!gdp)
> return -EIO;
>
> - bh = sb_getblk(inode->i_sb, block);
> + /*
> + * Figure out the offset within the block group inode table
> + */
> + inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
> + inode_offset = ((inode->i_ino - 1) %
> + EXT4_INODES_PER_GROUP(sb));
> + block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
> + iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
> +
> + bh = sb_getblk(sb, block);
> if (!bh) {
> - ext4_error (inode->i_sb, "ext4_get_inode_loc",
> - "unable to read inode block - "
> - "inode=%lu, block=%llu",
> - inode->i_ino, block);
> + ext4_error(sb, "ext4_get_inode_loc", "unable to read "
> + "inode block - inode=%lu, block=%llu",
> + inode->i_ino, block);
> return -EIO;
> }
> if (!buffer_uptodate(bh)) {
> @@ -3917,28 +3898,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
> */
> if (in_mem) {
> struct buffer_head *bitmap_bh;
> - struct ext4_group_desc *desc;
> - int inodes_per_buffer;
> - int inode_offset, i;
> - ext4_group_t block_group;
> - int start;
> -
> - block_group = (inode->i_ino - 1) /
> - EXT4_INODES_PER_GROUP(inode->i_sb);
> - inodes_per_buffer = bh->b_size /
> - EXT4_INODE_SIZE(inode->i_sb);
> - inode_offset = ((inode->i_ino - 1) %
> - EXT4_INODES_PER_GROUP(inode->i_sb));
> - start = inode_offset & ~(inodes_per_buffer - 1);
> + int i, start;
>
> - /* Is the inode bitmap in cache? */
> - desc = ext4_get_group_desc(inode->i_sb,
> - block_group, NULL);
> - if (!desc)
> - goto make_io;
> + start = inode_offset & ~(inodes_per_block - 1);
>
> - bitmap_bh = sb_getblk(inode->i_sb,
> - ext4_inode_bitmap(inode->i_sb, desc));
> + /* Is the inode bitmap in cache? */
> + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
> if (!bitmap_bh)
> goto make_io;
>
> @@ -3951,14 +3916,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
> brelse(bitmap_bh);
> goto make_io;
> }
> - for (i = start; i < start + inodes_per_buffer; i++) {
> + for (i = start; i < start + inodes_per_block; i++) {
> if (i == inode_offset)
> continue;
> if (ext4_test_bit(i, bitmap_bh->b_data))
> break;
> }
> brelse(bitmap_bh);
> - if (i == start + inodes_per_buffer) {
> + if (i == start + inodes_per_block) {
> /* all other inodes are free, so skip I/O */
> memset(bh->b_data, 0, bh->b_size);
> set_buffer_uptodate(bh);
> @@ -3969,6 +3934,36 @@ static int __ext4_get_inode_loc(struct inode *inode,
>
> make_io:
> /*
> + * If we need to do any I/O, try to pre-readahead extra
> + * blocks from the inode table.
> + */
> + if (EXT4_SB(sb)->s_inode_readahead_blks) {
> + ext4_fsblk_t b, end, table;
> + unsigned num;
> +
> + table = ext4_inode_table(sb, gdp);
> + /* Make sure s_inode_readahead_blks is a power of 2 */
> + while (EXT4_SB(sb)->s_inode_readahead_blks &
> + (EXT4_SB(sb)->s_inode_readahead_blks-1))
> + EXT4_SB(sb)->s_inode_readahead_blks =
> + (EXT4_SB(sb)->s_inode_readahead_blks &
> + (EXT4_SB(sb)->s_inode_readahead_blks-1));
> + b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
> + if (table > b)
> + b = table;
> + end = b + EXT4_SB(sb)->s_inode_readahead_blks;
> + num = EXT4_INODES_PER_GROUP(sb);
> + if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
> + EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
> + num -= le16_to_cpu(gdp->bg_itable_unused);
> + table += num / inodes_per_block;
> + if (end > table)
> + end = table;
> + while (b <= end)
> + sb_breadahead(sb, b++);
> + }
> +
> + /*
> * There are other valid inodes in the buffer, this inode
> * has in-inode xattrs, or we don't have this inode in memory.
> * Read the block from disk.
> @@ -3978,10 +3973,9 @@ make_io:
> submit_bh(READ_META, bh);
> wait_on_buffer(bh);
> if (!buffer_uptodate(bh)) {
> - ext4_error(inode->i_sb, "ext4_get_inode_loc",
> - "unable to read inode block - "
> - "inode=%lu, block=%llu",
> - inode->i_ino, block);
> + ext4_error(sb, __func__,
> + "unable to read inode block - inode=%lu, "
> + "block=%llu", inode->i_ino, block);
> brelse(bh);
> return -EIO;
> }
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 9f5468f..6583aee 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -515,8 +515,10 @@ static void ext4_put_super(struct super_block *sb)
> mark_buffer_dirty(sbi->s_sbh);
> ext4_commit_super(sb, es, 1);
> }
> - if (sbi->s_proc)
> + if (sbi->s_proc) {
> + remove_proc_entry("inode_readahead_blks", sbi->s_proc);
> remove_proc_entry(sb->s_id, ext4_proc_root);
> + }
>
> for (i = 0; i < sbi->s_gdb_count; i++)
> brelse(sbi->s_group_desc[i]);
> @@ -779,6 +781,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
> else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
> seq_puts(seq, ",data=writeback");
>
> + if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
> + seq_printf(seq, ",inode_readahead_blks=%u",
> + sbi->s_inode_readahead_blks);
> +
> ext4_show_quota_options(seq, sb);
> return 0;
> }
> @@ -913,6 +919,7 @@ enum {
> Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
> Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
> Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
> + Opt_inode_readahead_blks
> };
>
> static match_table_t tokens = {
> @@ -973,6 +980,7 @@ static match_table_t tokens = {
> {Opt_resize, "resize"},
> {Opt_delalloc, "delalloc"},
> {Opt_nodelalloc, "nodelalloc"},
> + {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
> {Opt_err, NULL},
> };
>
> @@ -1381,6 +1389,13 @@ set_qf_format:
> case Opt_delalloc:
> set_opt(sbi->s_mount_opt, DELALLOC);
> break;
> + case Opt_inode_readahead_blks:
> + if (match_int(&args[0], &option))
> + return 0;
> + if (option < 0 || option > (1 << 30))
> + return 0;
> + sbi->s_inode_readahead_blks = option;
> + break;
> default:
> printk(KERN_ERR
> "EXT4-fs: Unrecognized mount option \"%s\" "
> @@ -1938,6 +1953,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> sbi->s_mount_opt = 0;
> sbi->s_resuid = EXT4_DEF_RESUID;
> sbi->s_resgid = EXT4_DEF_RESGID;
> + sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
> sbi->s_sb_block = sb_block;
>
> unlock_kernel();
> @@ -2234,6 +2250,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> if (ext4_proc_root)
> sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
>
> + if (sbi->s_proc)
> + proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
> + &ext4_ui_proc_fops,
> + &sbi->s_inode_readahead_blks);
> +
> bgl_lock_init(&sbi->s_blockgroup_lock);
>
> for (i = 0; i < db_count; i++) {
> @@ -2513,8 +2534,10 @@ failed_mount2:
> brelse(sbi->s_group_desc[i]);
> kfree(sbi->s_group_desc);
> failed_mount:
> - if (sbi->s_proc)
> + if (sbi->s_proc) {
> + remove_proc_entry("inode_readahead_blks", sbi->s_proc);
> remove_proc_entry(sb->s_id, ext4_proc_root);
> + }
> #ifdef CONFIG_QUOTA
> for (i = 0; i < MAXQUOTAS; i++)
> kfree(sbi->s_qf_names[i]);
> --
> 1.5.6.1.205.ge2c7.dirty
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
next prev parent reply other threads:[~2008-10-09 8:18 UTC|newest]
Thread overview: 61+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-10-09 4:05 [PATCH 0/42] Ext4 patches queued up for the 2.6.28 merge window Theodore Ts'o
2008-10-09 4:05 ` [PATCH 01/42] percpu counter: clean up percpu_counter_sum_and_set() Theodore Ts'o
2008-10-09 4:05 ` [PATCH 02/42] ext4: Add printk priority levels to clean up checkpatch warnings Theodore Ts'o
2008-10-09 4:05 ` [PATCH 03/42] ext4: Fix long long " Theodore Ts'o
2008-10-09 4:05 ` [PATCH 04/42] ext4: Fix whitespace checkpatch warnings/errors Theodore Ts'o
2008-10-09 4:05 ` [PATCH 05/42] ext4: invalidate pages if delalloc block allocation fails Theodore Ts'o
2008-10-09 4:05 ` [PATCH 06/42] ext4: Make sure all the block allocation paths reserve blocks Theodore Ts'o
2008-10-09 4:05 ` Theodore Ts'o
2008-10-09 4:05 ` [PATCH 07/42] ext4: Retry block reservation Theodore Ts'o
2008-10-09 4:05 ` [PATCH 08/42] ext4: Add percpu dirty block accounting Theodore Ts'o
2008-10-09 4:05 ` [PATCH 09/42] ext4: Switch to non delalloc mode when we are low on free blocks count Theodore Ts'o
2008-10-09 4:05 ` [PATCH 10/42] ext4: Signed arithmetic fix Theodore Ts'o
2008-10-09 4:05 ` [PATCH 11/42] ext4: Fix ext4 nomballoc allocator for ENOSPC Theodore Ts'o
2008-10-09 4:05 ` [PATCH 12/42] ext4: Don't add the inode to journal handle until after the block is allocated Theodore Ts'o
2008-10-09 4:05 ` [PATCH 13/42] ext4: Retry block allocation if we have free blocks left Theodore Ts'o
2008-10-09 4:05 ` Theodore Ts'o
2008-10-09 4:05 ` [PATCH 14/42] ext4: truncate block allocated on a failed ext4_write_begin Theodore Ts'o
2008-10-09 4:05 ` [PATCH 15/42] ext4: Properly update i_disksize Theodore Ts'o
2008-10-09 4:05 ` [PATCH 16/42] ext4: Avoid printk floods in the face of directory corruption Theodore Ts'o
2008-10-09 4:05 ` [PATCH 17/42] Update flex_bg free blocks and free inodes counters when resizing Theodore Ts'o
2008-10-09 4:05 ` [PATCH 18/42] ext4: fix #11321: create /proc/ext4/*/stats more carefully Theodore Ts'o
2008-10-09 4:05 ` [PATCH 19/42] jbd2: clean up how the journal device name is printed Theodore Ts'o
2008-10-09 4:05 ` [PATCH 20/42] ext4: add missing unlock in ext4_check_descriptors() on error path Theodore Ts'o
2008-10-09 4:05 ` [PATCH 21/42] ext4: elevate write count for migrate ioctl Theodore Ts'o
2008-10-09 4:05 ` [PATCH 22/42] ext4: hook the ext3 migration interface to the EXT4_IOC_SETFLAGS ioctl Theodore Ts'o
2008-10-09 4:05 ` [PATCH 23/42] ext4: Renumber EXT4_IOC_MIGRATE Theodore Ts'o
2008-10-09 4:05 ` [PATCH 24/42] ext4: use percpu data structures for lg_prealloc_list Theodore Ts'o
2008-10-09 4:05 ` Theodore Ts'o
2008-10-09 4:05 ` [PATCH 25/42] ext4/jbd2: Avoid WARN() messages when failing to write to the superblock Theodore Ts'o
2008-10-09 4:05 ` [PATCH 26/42] ext4: Don't use 'struct dentry' for internal lookups Theodore Ts'o
2008-10-09 4:05 ` [PATCH 27/42] ext4: move /proc setup and teardown out of mballoc.c Theodore Ts'o
2008-10-09 4:05 ` [PATCH 28/42] ext4: Combine proc file handling into a single set of functions Theodore Ts'o
2008-10-09 4:05 ` [PATCH 29/42] ext4: Use readahead when reading an inode from the inode table Theodore Ts'o
2008-10-09 4:05 ` [PATCH 30/42] ext4: Remove old legacy block allocator Theodore Ts'o
2008-10-09 4:05 ` Theodore Ts'o
2008-10-09 4:05 ` [PATCH 31/42] ext4: fix initialization of UNINIT bitmap blocks Theodore Ts'o
2008-10-09 4:05 ` [PATCH 32/42] jbd2: abort instead of waiting for nonexistent transaction Theodore Ts'o
2008-10-09 4:05 ` [PATCH 33/42] ext4: Add debugging markers that can be used by systemtap Theodore Ts'o
2008-10-09 4:05 ` [PATCH 34/42] jbd2: Fix buffer head leak when writing the commit block Theodore Ts'o
2008-10-09 4:05 ` [PATCH 35/42] ext4: fix xattr deadlock Theodore Ts'o
[not found] ` <1223525160-9887-36-git-send-email-tytso-3s7WtUTddSA@public.gmane.org>
2008-10-09 4:05 ` [PATCH 36/42] vfs: vfs-level fiemap interface Theodore Ts'o
2008-10-09 4:05 ` Theodore Ts'o
2008-10-09 4:05 ` [PATCH 37/42] ocfs2: fiemap support Theodore Ts'o
2008-10-09 4:05 ` [Ocfs2-devel] " Theodore Ts'o
2008-10-09 4:05 ` [PATCH 38/42] generic block based fiemap implementation Theodore Ts'o
2008-10-09 4:05 ` [PATCH 39/42] Hook ext4 to the vfs fiemap interface Theodore Ts'o
2008-10-09 4:05 ` [PATCH 40/42] Update ext4 MAINTAINERS file Theodore Ts'o
2008-10-09 4:05 ` [PATCH 41/42] ext4: Avoid double dirtying of super block in ext4_put_super() Theodore Ts'o
2008-10-09 4:06 ` [PATCH 42/42] ext4: Rename ext4dev to ext4 Theodore Ts'o
2008-10-11 22:04 ` Jeremy Fitzhardinge
2008-10-11 22:04 ` Jeremy Fitzhardinge
2008-10-11 22:09 ` Eric Sandeen
2008-10-11 22:09 ` Eric Sandeen
2008-10-11 22:54 ` Jeremy Fitzhardinge
2008-10-11 22:54 ` Jeremy Fitzhardinge
2008-10-11 22:58 ` Theodore Tso
2008-10-11 23:08 ` Grant Coady
2008-10-12 1:06 ` Eric Sandeen
2008-10-09 8:18 ` Aneesh Kumar K.V [this message]
2008-10-09 8:52 ` [PATCH 01/42] percpu counter: clean up percpu_counter_sum_and_set() Peter Zijlstra
2008-10-09 16:52 ` Theodore Tso
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20081009081834.GD9918@skywalker \
--to=aneesh.kumar@linux.vnet.ibm.com \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.