linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Theodore Ts'o <tytso@mit.edu>
To: stable@vger.kernel.org
Cc: Ext4 Developers List <linux-ext4@vger.kernel.org>,
	Curt Wohlgemuth <curtw@google.com>,
	"Theodore Ts'o" <tytso@mit.edu>
Subject: [PATCH 2.6.33.y 32/40] ext4: check for a good block group before loading buddy pages
Date: Tue,  1 Jun 2010 08:03:19 -0400	[thread overview]
Message-ID: <1275393807-14369-32-git-send-email-tytso@mit.edu> (raw)
In-Reply-To: <1275393807-14369-1-git-send-email-tytso@mit.edu>

From: Curt Wohlgemuth <curtw@google.com>

commit 8a57d9d61a6e361c7bb159dda797672c1df1a691 upstream (as of v2.6.34-git13)

This adds a new field in ext4_group_info to cache the largest available
block range in a block group, and doesn't load the buddy pages until *after*
we've done a sanity check on the block group.

With large allocation requests (e.g., fallocate(), 8MiB) and relatively full
partitions, it's easy to have no block groups with a block extent large
enough to satisfy the input request length.  This currently causes the loop
during cr == 0 in ext4_mb_regular_allocator() to load the buddy bitmap pages
for EVERY block group.  That can be a lot of pages.  The patch below allows
us to call ext4_mb_good_group() BEFORE we load the buddy pages (although we
have to check again after we lock the block group).

Addresses-Google-Bug: #2578108
Addresses-Google-Bug: #2704453

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |    1 +
 fs/ext4/mballoc.c |   70 +++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 81b11cf..bc4013b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1660,6 +1660,7 @@ struct ext4_group_info {
 	ext4_grpblk_t	bb_first_free;	/* first free block */
 	ext4_grpblk_t	bb_free;	/* total free blocks */
 	ext4_grpblk_t	bb_fragments;	/* nr of freespace fragments */
+	ext4_grpblk_t	bb_largest_free_order;/* order of largest frag in BG */
 	struct          list_head bb_prealloc_list;
 #ifdef DOUBLE_CHECK
 	void            *bb_bitmap;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d0ddea0..665e1c9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
 	}
 }
 
+/*
+ * Cache the order of the largest free extent we have available in this block
+ * group.
+ */
+static void
+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+{
+	int i;
+	int bits;
+
+	grp->bb_largest_free_order = -1; /* uninit */
+
+	bits = sb->s_blocksize_bits + 1;
+	for (i = bits; i >= 0; i--) {
+		if (grp->bb_counters[i] > 0) {
+			grp->bb_largest_free_order = i;
+			break;
+		}
+	}
+}
+
 static noinline_for_stack
 void ext4_mb_generate_buddy(struct super_block *sb,
 				void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 		 */
 		grp->bb_free = free;
 	}
+	mb_set_largest_free_order(sb, grp);
 
 	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
 
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
  * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
  * So it can have information regarding groups_per_page which
  * is blocks_per_page/2
+ *
+ * Locking note:  This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
  */
 
 static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -910,6 +935,11 @@ out:
 	return err;
 }
 
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
@@ -1004,6 +1034,11 @@ err:
 	return ret;
 }
 
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 					struct ext4_buddy *e4b)
@@ -1300,6 +1335,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			buddy = buddy2;
 		} while (1);
 	}
+	mb_set_largest_free_order(sb, e4b->bd_info);
 	mb_check_buddy(e4b);
 }
 
@@ -1428,6 +1464,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 		e4b->bd_info->bb_counters[ord]++;
 		e4b->bd_info->bb_counters[ord]++;
 	}
+	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
 
 	mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
 	mb_check_buddy(e4b);
@@ -1823,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
 	}
 }
 
+/* This is now called BEFORE we load the buddy bitmap. */
 static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 				ext4_group_t group, int cr)
 {
 	unsigned free, fragments;
-	unsigned i, bits;
 	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 
 	BUG_ON(cr < 0 || cr >= 4);
-	BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+
+	/* We only do this if the grp has never been initialized */
+	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+		int ret = ext4_mb_init_group(ac->ac_sb, group);
+		if (ret)
+			return 0;
+	}
 
 	free = grp->bb_free;
 	fragments = grp->bb_fragments;
@@ -1845,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	case 0:
 		BUG_ON(ac->ac_2order == 0);
 
+		if (grp->bb_largest_free_order < ac->ac_2order)
+			return 0;
+
 		/* Avoid using the first bg of a flexgroup for data files */
 		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
 		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
 		    ((group % flex_size) == 0))
 			return 0;
 
-		bits = ac->ac_sb->s_blocksize_bits + 1;
-		for (i = ac->ac_2order; i <= bits; i++)
-			if (grp->bb_counters[i] > 0)
-				return 1;
-		break;
+		return 1;
 	case 1:
 		if ((free / fragments) >= ac->ac_g_ex.fe_len)
 			return 1;
@@ -2026,14 +2068,11 @@ repeat:
 		group = ac->ac_g_ex.fe_group;
 
 		for (i = 0; i < ngroups; group++, i++) {
-			struct ext4_group_info *grp;
-
 			if (group == ngroups)
 				group = 0;
 
-			/* quick check to skip empty groups */
-			grp = ext4_get_group_info(sb, group);
-			if (grp->bb_free == 0)
+			/* This now checks without needing the buddy page */
+			if (!ext4_mb_good_group(ac, group, cr))
 				continue;
 
 			err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2041,8 +2080,12 @@ repeat:
 				goto out;
 
 			ext4_lock_group(sb, group);
+
+			/*
+			 * We need to check again after locking the
+			 * block group
+			 */
 			if (!ext4_mb_good_group(ac, group, cr)) {
-				/* someone did allocation from this group */
 				ext4_unlock_group(sb, group);
 				ext4_mb_unload_buddy(&e4b);
 				continue;
@@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
 	meta_group_info[i]->bb_free_root.rb_node = NULL;
+	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
 
 #ifdef DOUBLE_CHECK
 	{
-- 
1.6.6.1.1.g974db.dirty


  parent reply	other threads:[~2010-06-01 12:03 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-06-01 12:02 [PATCH 2.6.33.y 01/40] ext4: Use bitops to read/modify EXT4_I(inode)->i_state Theodore Ts'o
2010-06-01 12:02 ` [PATCH 2.6.33.y 02/40] ext4: Fix BUG_ON at fs/buffer.c:652 in no journal mode Theodore Ts'o
2010-06-01 12:02 ` [PATCH 2.6.33.y 03/40] ext4: Add flag to files with blocks intentionally past EOF Theodore Ts'o
2010-06-01 12:02 ` [PATCH 2.6.33.y 04/40] ext4: Fix fencepost error in chosing choosing group vs file preallocation Theodore Ts'o
2010-06-01 12:02 ` [PATCH 2.6.33.y 05/40] ext4: fix error handling in migrate Theodore Ts'o
2010-06-01 12:02 ` [PATCH 2.6.33.y 06/40] ext4: explicitly remove inode from orphan list after failed direct io Theodore Ts'o
2010-06-01 12:02 ` [PATCH 2.6.33.y 07/40] ext4: Handle non empty on-disk orphan link Theodore Ts'o
2010-06-01 12:02 ` [PATCH 2.6.33.y 08/40] ext4: make "offset" consistent in ext4_check_dir_entry() Theodore Ts'o
2010-06-01 12:02 ` [PATCH 2.6.33.y 09/40] ext4: Fix insertion point of extent in mext_insert_across_blocks() Theodore Ts'o
2010-06-01 12:02 ` [PATCH 2.6.33.y 10/40] ext4: Fix the NULL reference in double_down_write_data_sem() Theodore Ts'o
2010-06-01 12:02 ` [PATCH 2.6.33.y 11/40] ext4: Code cleanup for EXT4_IOC_MOVE_EXT ioctl Theodore Ts'o
2010-06-01 12:02 ` [PATCH 2.6.33.y 12/40] ext4: Fix estimate of # of blocks needed to write indirect-mapped files Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 13/40] ext4: Fixed inode allocator to correctly track a flex_bg's used_dirs Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 14/40] ext4: Fix possible lost inode write in no journal mode Theodore Ts'o
2012-09-28  3:08   ` Yongqiang Yang
2012-09-28  3:21     ` Yongqiang Yang
2010-06-01 12:03 ` [PATCH 2.6.33.y 15/40] ext4: Fix buffer head leaks after calls to ext4_get_inode_loc() Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 16/40] ext4: Issue the discard operation *before* releasing the blocks to be reused Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 17/40] ext4: check missed return value in ext4_sync_file() Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 18/40] ext4: fix memory leaks in error path handling of ext4_ext_zeroout() Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 19/40] ext4: Remove unnecessary call to ext4_get_group_desc() in mballoc Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 20/40] ext4: rename ext4_mb_release_desc() to ext4_mb_unload_buddy() Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 21/40] ext4: allow defrag (EXT4_IOC_MOVE_EXT) in 32bit compat mode Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 22/40] ext4: fix quota accounting in case of fallocate Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 23/40] ext4: check s_log_groups_per_flex in online resize code Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 24/40] ext4: don't return to userspace after freezing the fs with a mutex held Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 25/40] ext4: stop issuing discards if not supported by device Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 26/40] ext4: don't scan/accumulate more pages than mballoc will allocate Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 27/40] ext4: Do not zero out uninitialized extents beyond i_size Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 28/40] ext4: clean up inode bitmaps manipulation in ext4_free_inode Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 29/40] ext4: init statistics after journal recovery Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 30/40] ext4: Remove extraneous newlines in ext4_msg() calls Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 31/40] ext4: Prevent creation of files larger than RLIMIT_FSIZE using fallocate Theodore Ts'o
2010-06-01 12:03 ` Theodore Ts'o [this message]
2010-06-01 12:03 ` [PATCH 2.6.33.y 33/40] ext4: Show journal_checksum option Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 34/40] ext4: Use bitops to read/modify i_flags in struct ext4_inode_info Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 35/40] ext4: Avoid crashing on NULL ptr dereference on a filesystem error Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 36/40] ext4: Clear the EXT4_EOFBLOCKS_FL flag only when warranted Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 37/40] ext4: restart ext4_ext_remove_space() after transaction restart Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 38/40] ext4: Conditionally define compat ioctl numbers Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 39/40] ext4: Fix compat EXT4_IOC_ADD_GROUP Theodore Ts'o
2010-06-01 12:03 ` [PATCH 2.6.33.y 40/40] ext4: Make fsync sync new parent directories in no-journal mode Theodore Ts'o
2010-07-28 23:29 ` [stable] [PATCH 2.6.33.y 01/40] ext4: Use bitops to read/modify EXT4_I(inode)->i_state Greg KH

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1275393807-14369-32-git-send-email-tytso@mit.edu \
    --to=tytso@mit.edu \
    --cc=curtw@google.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).