All of lore.kernel.org
 help / color / mirror / Atom feed
From: Coly Li <coyli@suse.de>
To: linux-ext4@vger.kernel.org
Subject: [PATCH] ext4: dir inode reservation V2
Date: Tue, 30 Oct 2007 18:10:27 +0800	[thread overview]
Message-ID: <47270313.6060301@suse.de> (raw)

From Mingming's feedback, there was a typo in the V1 patch that could break compilation.
This patch fixes that typo and also fixes the errors reported by scripts/checkpatch.pl.

BTW, it seems some duplicated functions were introduced into fs/ext4/inode.c in commit
92ae2b932ed127edff4354929c477f24112341b0 (maybe my mistake during git-pull). If any error is
reported from fs/ext4/inode.c, please fix it before applying this patch.

Thanks to Mingming for the feedback.

Signed-off-by: Coly Li <coyli@suse.de>
Cc: Andreas Dilger <adilger@sun.com>
Cc: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/ialloc.c           |  201 ++++++++++++++++++++++++++++++++++++++++++--
 fs/ext4/super.c            |   16 ++++
 include/linux/ext4_fs.h    |    8 ++
 include/linux/ext4_fs_sb.h |    2 +
 4 files changed, 218 insertions(+), 9 deletions(-)

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index d775170..1c79ca4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -7,9 +7,11 @@
  * Universite Pierre et Marie Curie (Paris VI)
  *
  *  BSD ufs-inspired inode and directory allocation by
- *  Stephen Tweedie (sct@redhat.com), 1993
+ *        Stephen Tweedie (sct@redhat.com), 1993
  *  Big-endian to little-endian byte-swapping/bitmaps by
  *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *  Dir inode reservation support by
+ *        Coly Li (coyli@suse.de), 2007
  */

 #include <linux/time.h>
@@ -130,6 +132,41 @@ error_out:
 }

 /*
+ * When calling this function, the spin_lock of gdp is already held.
+ */
+static void ext4_update_itable_unused(handle_t *handle, struct inode *inode,
+		struct ext4_group_desc *gdp, struct buffer_head *bitmap_bh)
+{
+	struct super_block *sb;
+	int bit, offset;
+	int free, group, ires;
+
+	sb = inode->i_sb;
+	ires =  EXT4_SB(sb)->s_dir_ireserve_nr;
+	bit = (inode->i_ino - 1) % EXT4_INODES_PER_GROUP(sb);
+	if (bit & (ires - 1))
+		return;
+	free = EXT4_INODES_PER_GROUP(sb) - le16_to_cpu(gdp->bg_itable_unused);
+	if (free < ires)
+		return;
+	group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	do {
+		offset = ext4_find_next_bit(
+			bitmap_bh->b_data, free, free - ires);
+		if (offset >= free)
+			free -= ires;
+		else
+			break;
+	} while (free > 0);
+	if (free < 0)
+		free = 0;
+	if (group == 0 && (free < EXT4_DIR_IRESERVE_NORMAL))
+		free = EXT4_DIR_IRESERVE_NORMAL;
+	gdp->bg_itable_unused = cpu_to_le16(
+		EXT4_INODES_PER_GROUP(sb) - free);
+}
+
+/*
  * NOTE! When we get the inode, we're the only people
  * that have access to it, and as such there are no
  * race conditions we have to worry about. The inode
@@ -225,9 +262,13 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
 			spin_lock(sb_bgl_lock(sbi, block_group));
 			gdp->bg_free_inodes_count = cpu_to_le16(
 				le16_to_cpu(gdp->bg_free_inodes_count) + 1);
-			if (is_directory)
+			if (is_directory) {
 				gdp->bg_used_dirs_count = cpu_to_le16(
 				  le16_to_cpu(gdp->bg_used_dirs_count) - 1);
+				if (test_opt(sb, DIR_IRESERVE))
+					ext4_update_itable_unused(
+						handle, inode, gdp, bitmap_bh);
+			}
 			gdp->bg_checksum = ext4_group_desc_csum(sbi,
 							block_group, gdp);
 			spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -264,9 +305,10 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
 			  ext4_grpnum_t *best_group)
 {
 	ext4_grpnum_t ngroups = EXT4_SB(sb)->s_groups_count;
+	int ires = EXT4_SB(sb)->s_dir_ireserve_nr;
 	unsigned int freei, avefreei;
-	struct ext4_group_desc *desc, *best_desc = NULL;
-	ext4_grpnum_t group;
+	struct ext4_group_desc *desc, *best_desc = NULL, *best_ires_desc = NULL;
+	ext4_grpnum_t group, best_ires_group = -1;
 	int ret = -1;

 	freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
@@ -285,7 +327,21 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
 			best_desc = desc;
 			ret = 0;
 		}
+		if (test_opt(sb, DIR_IRESERVE)) {
+			if ((best_ires_desc &&
+			   (le16_to_cpu(desc->bg_itable_unused) >
+			   le16_to_cpu(best_ires_desc->bg_itable_unused))) ||
+			   ((!best_ires_desc) &&
+			   (le16_to_cpu(desc->bg_itable_unused) >= ires))) {
+				best_ires_group = group;
+				best_ires_desc = desc;
+				ret = 0;
+			}
+		}
 	}
+	if (test_opt(sb, DIR_IRESERVE) && best_ires_desc)
+		*best_group = best_ires_group;
+
 	return ret;
 }

@@ -354,6 +410,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 			desc = ext4_get_group_desc(sb, grp, NULL);
 			if (!desc || !desc->bg_free_inodes_count)
 				continue;
+			if (test_opt(sb, DIR_IRESERVE) &&
+			    (le16_to_cpu(desc->bg_itable_unused)
+					< EXT4_SB(sb)->s_dir_ireserve_nr))
+				continue;
 			if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
 				continue;
 			if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
@@ -390,6 +450,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 		desc = ext4_get_group_desc(sb, *group, NULL);
 		if (!desc || !desc->bg_free_inodes_count)
 			continue;
+		if (test_opt(sb, DIR_IRESERVE) &&
+		    (le16_to_cpu(desc->bg_itable_unused)
+					< EXT4_SB(sb)->s_dir_ireserve_nr))
+			continue;
 		if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
 			continue;
 		if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
@@ -478,6 +542,105 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 	return -1;
 }

+static int ext4_ino_from_ireserve(handle_t *handle, struct inode *dir,
+			  int mode, ext4_grpnum_t *group, unsigned long *ino)
+{
+	struct ext4_group_desc *gdp = NULL;
+	struct super_block *sb;
+	struct ext4_sb_info *sbi;
+	struct buffer_head *gdp_bh = NULL, *bitmap_bh = NULL;
+	int free;
+	int i;
+	int retries;
+	unsigned long ires_ino;
+	int ires_group = *group;
+
+	sb = dir->i_sb;
+	sbi = EXT4_SB(sb);
+
+	/* if the new inode is not a directory,
+	 * only try to allocate after the parent directory's inode
+	 */
+	if (!S_ISDIR(mode)) {
+		ires_ino = dir->i_ino % EXT4_INODES_PER_GROUP(sb);
+		goto find;
+	}
+
+	/* reserve inodes for new directory */
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		gdp = ext4_get_group_desc(sb, ires_group, &gdp_bh);
+		if (!gdp)
+			goto fail;
+		retries = 2;
+still_reserve_in_this_group:
+		if (le16_to_cpu(gdp->bg_itable_unused) >=
+		    sbi->s_dir_ireserve_nr) {
+
+			brelse(bitmap_bh);
+			bitmap_bh = read_inode_bitmap(sb, ires_group);
+			if (!bitmap_bh)
+				goto fail;
+
+			BUFFER_TRACE(bitmap_bh, "get_write_access");
+			if (ext4_journal_get_write_access(
+				handle, bitmap_bh) != 0)
+				goto fail;
+			free = EXT4_INODES_PER_GROUP(sb) -
+				le16_to_cpu(gdp->bg_itable_unused);
+			if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, ires_group),
+					free, bitmap_bh->b_data)) {
+				/* we won it */
+				BUFFER_TRACE(bitmap_bh,
+					"call ext4_journal_dirty_metadata");
+				if (ext4_journal_dirty_metadata(handle,
+							bitmap_bh) != 0)
+					goto fail;
+				ires_ino = free;
+				goto find;
+			}
+			/* we lost it */
+			jbd2_journal_release_buffer(handle, bitmap_bh);
+			if (-- retries > 0)
+				goto still_reserve_in_this_group;
+		}
+		if (++ires_group == sbi->s_groups_count)
+			ires_group = 0;
+	}
+	goto fail;
+find:
+	if (S_ISDIR(mode)) {
+		free = ires_ino + sbi->s_dir_ireserve_nr;
+		if (free > EXT4_INODES_PER_GROUP(sb))
+			free = EXT4_INODES_PER_GROUP(sb);
+
+		spin_lock(sb_bgl_lock(sbi, ires_group));
+		if ((EXT4_INODES_PER_GROUP(sb) - free) <
+		     le16_to_cpu(gdp->bg_itable_unused)) {
+			BUFFER_TRACE(gdp_bh,
+				      "call ext4_journal_get_write_access");
+			if (ext4_journal_get_write_access(handle, gdp_bh)) {
+				spin_unlock(sb_bgl_lock(sbi, ires_group));
+				goto fail;
+			}
+			gdp->bg_itable_unused =
+				EXT4_INODES_PER_GROUP(sb) - free;
+			spin_unlock(sb_bgl_lock(sbi, ires_group));
+			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+			if (ext4_journal_dirty_metadata(handle, gdp_bh) != 0)
+				goto fail;
+		} else {
+			spin_unlock(sb_bgl_lock(sbi, ires_group));
+		}
+		brelse(bitmap_bh);
+		*group = ires_group;
+	}
+	*ino = ires_ino;
+	return 0;
+fail:
+	brelse(bitmap_bh);
+	return -ENOSPC;
+}
+
 /*
  * There are two policies for allocating an inode.  If the new inode is
  * a directory, then a forward search is made for a block group with both
@@ -541,7 +704,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
 			goto fail;

 		ino = 0;
-
+		if (test_opt(sb, DIR_IRESERVE)) {
+			err = ext4_ino_from_ireserve(handle, dir,
+						     mode, &group, &ino);
+			if ((!err) && S_ISDIR(mode))
+				goto got;
+		}
 repeat_in_this_group:
 		ino = ext4_find_next_zero_bit((unsigned long *)
 				bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
@@ -633,6 +801,20 @@ got:
 	}

 	spin_lock(sb_bgl_lock(sbi, group));
+
+	if (test_opt(sb, DIR_IRESERVE)) {
+		free = EXT4_INODES_PER_GROUP(sb) -
+			le16_to_cpu(gdp->bg_itable_unused);
+		if (ino > free) {
+			free += sbi->s_dir_ireserve_nr;
+			free = (free + sbi->s_dir_ireserve_nr - 1) &
+				~(sbi->s_dir_ireserve_nr - 1);
+			if (free > EXT4_INODES_PER_GROUP(sb))
+				free = EXT4_INODES_PER_GROUP(sb);
+			gdp->bg_itable_unused = cpu_to_le16(
+				EXT4_INODES_PER_GROUP(sb) - free);
+		}
+	}
 	/* If we didn't allocate from within the initialized part of the inode
 	 * table then we need to initialize up to this inode. */
 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
@@ -655,12 +837,13 @@ got:
 		/*
 		 * Check the relative inode number against the last used
 		 * relative inode number in this group. if it is greater
-		 * we need to  update the bg_itable_unused count
-		 *
+		 * we need to  update the bg_itable_unused count. If
+		 * directory inode reservation is enabled, try to make it
+		 * align on a s_dir_ireserve_nr boundary.
 		 */
 		if (ino > free)
-			gdp->bg_itable_unused =
-				cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
+			gdp->bg_itable_unused = cpu_to_le16(
+				EXT4_INODES_PER_GROUP(sb) - ino);
 	}

 	gdp->bg_free_inodes_count =
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 37afc41..a9b87c3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -874,6 +874,7 @@ enum {
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_dir_ireserve_low, Opt_dir_ireserve_normal, Opt_dir_ireserve_high,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
@@ -919,6 +920,9 @@ static match_table_t tokens = {
 	{Opt_data_journal, "data=journal"},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_data_writeback, "data=writeback"},
+	{Opt_dir_ireserve_low, "dir_ireserve=low"},
+	{Opt_dir_ireserve_normal, "dir_ireserve=normal"},
+	{Opt_dir_ireserve_high, "dir_ireserve=high"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -1297,6 +1301,18 @@ clear_qf_name:
 				return 0;
 			sbi->s_stripe = option;
 			break;
+		case Opt_dir_ireserve_low:
+			set_opt(sbi->s_mount_opt, DIR_IRESERVE);
+			sbi->s_dir_ireserve_nr = EXT4_DIR_IRESERVE_LOW;
+			break;
+		case Opt_dir_ireserve_normal:
+			set_opt(sbi->s_mount_opt, DIR_IRESERVE);
+			sbi->s_dir_ireserve_nr = EXT4_DIR_IRESERVE_NORMAL;
+			break;
+		case Opt_dir_ireserve_high:
+			set_opt(sbi->s_mount_opt, DIR_IRESERVE);
+			sbi->s_dir_ireserve_nr = EXT4_DIR_IRESERVE_HIGH;
+			break;
 		default:
 			printk (KERN_ERR
 				"EXT4-fs: Unrecognized mount option \"%s\" "
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index 8d56b86..d9493e3 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -92,6 +92,13 @@ struct ext4_allocation_request {
 #define EXT4_GOOD_OLD_FIRST_INO	11

 /*
+ * Constants used to reserve inodes for directories
+ */
+#define EXT4_DIR_IRESERVE_LOW		16
+#define EXT4_DIR_IRESERVE_NORMAL	64
+#define EXT4_DIR_IRESERVE_HIGH		128
+
+/*
  * Maximal count of links to a file
  */
 #define EXT4_LINK_MAX		65000
@@ -502,6 +509,7 @@ do {									       \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_DELALLOC		0x2000000 /* Delalloc support */
 #define EXT4_MOUNT_MBALLOC		0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DIR_IRESERVE		0x10000000/* dir inode reservation support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h
index 4098d4f..fa5e866 100644
--- a/include/linux/ext4_fs_sb.h
+++ b/include/linux/ext4_fs_sb.h
@@ -147,6 +147,8 @@ struct ext4_sb_info {

 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
+	/* directory inodes reservation number */
+	int s_dir_ireserve_nr;
 };
 #define EXT4_GROUP_INFO(sb, group)					   \
 	EXT4_SB(sb)->s_group_info[(group) >> EXT4_DESC_PER_BLOCK_BITS(sb)] \

-- 
Coly Li
SuSE PRC Labs

             reply	other threads:[~2007-10-30 10:09 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-10-30 10:10 Coly Li [this message]
2007-10-31 21:19 ` [PATCH] ext4: dir inode reservation V2 Jan Kara

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=47270313.6060301@suse.de \
    --to=coyli@suse.de \
    --cc=linux-ext4@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.