From: Andreas Dilger <adilger@whamcloud.com>
To: tytso@mit.edu
Cc: linux-ext4@vger.kernel.org,
Andreas Dilger <adilger@whamcloud.com>,
Yu Jian <yujian@whamcloud.com>
Subject: [PATCH] ext4: fall back to vmalloc() for large allocations
Date: Tue, 12 Jul 2011 15:40:46 -0600 [thread overview]
Message-ID: <1310506846-25843-1-git-send-email-adilger@whamcloud.com> (raw)
For very large ext4 filesystems (128TB and larger) kmalloc() of
some per-group structures can fail at mount time due to memory
fragmentation. If kmalloc() fails, fall back to vmalloc() for
the s_group_info and s_group_desc arrays.
Signed-off-by: Yu Jian <yujian@whamcloud.com>
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
---
fs/ext4/mballoc.c | 49 +++++++++++++++++++++++++++++++++----------------
fs/ext4/super.c | 29 +++++++++++++++++++++++------
2 files changed, 56 insertions(+), 22 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 6ed859d..72c5796 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2325,25 +2325,37 @@ static int ext4_mb_init_backend(struct super_block *sb)
while (array_size < sizeof(*sbi->s_group_info) *
num_meta_group_infos_max)
array_size = array_size << 1;
- /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
- * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
- * So a two level scheme suffices for now. */
+ /* A 16TB filesystem with 64-bit pointers requires an 8192 byte
+ * kmalloc(). Filesystems larger than 2^32 blocks (16TB normally)
+ * have group descriptors at least twice as large (64 bytes or
+ * more vs. 32 bytes for traditional ext3 filesystems), so a 128TB
+ * filesystem needs a 128kB allocation, which may need vmalloc(). */
sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
if (sbi->s_group_info == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
- return -ENOMEM;
+ sbi->s_group_info = vmalloc(array_size);
+ if (sbi->s_group_info != NULL) {
+ memset(sbi->s_group_info, 0, array_size);
+ } else {
+ ext4_msg(sb, KERN_ERR, "no memory for groupinfo (%u)\n",
+ array_size);
+ return -ENOMEM;
+ }
}
sbi->s_buddy_cache = new_inode(sb);
if (sbi->s_buddy_cache == NULL) {
- printk(KERN_ERR "EXT4-fs: can't get new inode\n");
+ ext4_msg(sb, KERN_ERR, "can't get new inode\n");
goto err_freesgi;
}
- sbi->s_buddy_cache->i_ino = get_next_ino();
+ /* To avoid potentially colliding with a valid on-disk inode number,
+ * use EXT4_BAD_INO for the buddy cache inode number. This inode is
+ * not in the inode hash, so it should never be found by iget(), but
+ * this will avoid confusion if it ever shows up during debugging. */
+ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
for (i = 0; i < ngroups; i++) {
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
- printk(KERN_ERR
+ ext4_msg(sb, KERN_ERR,
"EXT4-fs: can't read descriptor %u\n", i);
goto err_freebuddy;
}
@@ -2362,7 +2374,10 @@ err_freebuddy:
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
err_freesgi:
- kfree(sbi->s_group_info);
+ if (is_vmalloc_addr(sbi->s_group_info))
+ vfree(sbi->s_group_info);
+ else
+ kfree(sbi->s_group_info);
return -ENOMEM;
}
@@ -2457,12 +2472,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
i++;
} while (i <= sb->s_blocksize_bits + 1);
- /* init file for buddy data */
- ret = ext4_mb_init_backend(sb);
- if (ret != 0) {
- goto out;
- }
-
spin_lock_init(&sbi->s_md_lock);
spin_lock_init(&sbi->s_bal_lock);
@@ -2487,6 +2496,11 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
spin_lock_init(&lg->lg_prealloc_lock);
}
+ /* init file for buddy data */
+ ret = ext4_mb_init_backend(sb);
+ if (ret != 0)
+ goto out;
+
if (sbi->s_proc)
proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_fops, sb);
@@ -2544,7 +2558,10 @@ int ext4_mb_release(struct super_block *sb)
EXT4_DESC_PER_BLOCK_BITS(sb);
for (i = 0; i < num_meta_group_infos; i++)
kfree(sbi->s_group_info[i]);
- kfree(sbi->s_group_info);
+ if (is_vmalloc_addr(sbi->s_group_info))
+ vfree(sbi->s_group_info);
+ else
+ kfree(sbi->s_group_info);
}
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9ea71aa..556084b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -789,7 +789,12 @@ static void ext4_put_super(struct super_block *sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
- kfree(sbi->s_group_desc);
+
+ if (is_vmalloc_addr(sbi->s_group_desc))
+ vfree(sbi->s_group_desc);
+ else
+ kfree(sbi->s_group_desc);
+
if (is_vmalloc_addr(sbi->s_flex_groups))
vfree(sbi->s_flex_groups);
else
@@ -3059,6 +3064,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
int ret = -ENOMEM;
int blocksize;
unsigned int db_count;
+ size_t size;
unsigned int i;
int needs_recovery, has_huge_files;
__u64 blocks_count;
@@ -3408,11 +3414,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
- sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
- GFP_KERNEL);
+ size = (size_t)db_count * sizeof(struct buffer_head *);
+ sbi->s_group_desc = kzalloc(size, GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
- ext4_msg(sb, KERN_ERR, "not enough memory");
- goto failed_mount;
+ sbi->s_group_desc = vmalloc(size);
+ if (sbi->s_group_desc != NULL) {
+ memset(sbi->s_group_desc, 0, size);
+ } else {
+ ext4_msg(sb, KERN_ERR, "no memory for %u groups (%u)\n",
+ sbi->s_groups_count, (unsigned int)size);
+ ret = -ENOMEM;
+ goto failed_mount;
+ }
}
#ifdef CONFIG_PROC_FS
@@ -3756,7 +3769,11 @@ failed_mount3:
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
- kfree(sbi->s_group_desc);
+
+ if (is_vmalloc_addr(sbi->s_group_desc))
+ vfree(sbi->s_group_desc);
+ else
+ kfree(sbi->s_group_desc);
failed_mount:
if (sbi->s_proc) {
remove_proc_entry(sb->s_id, ext4_proc_root);
--
1.7.3.4
next reply other threads:[~2011-07-12 21:40 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-07-12 21:40 Andreas Dilger [this message]
2011-07-18 1:25 ` [PATCH] ext4: fall back to vmalloc() for large allocations Ted Ts'o
2011-07-18 5:12 ` Andreas Dilger
2011-07-18 11:36 ` Ted Ts'o
2011-08-01 13:13 ` [PATCH 0/5] Break up "fall back to vmalloc() for large allocations" Theodore Ts'o
2011-08-01 13:13 ` [PATCH 1/5] ext4: introduce ext4_kvmalloc(), ext4_kzalloc(), and ext4_kvfree() Theodore Ts'o
2011-08-02 6:07 ` [PATCH] ext4: use kzalloc in ext4_kzalloc() Mathias Krause
2011-08-03 18:57 ` Ted Ts'o
2011-08-01 13:13 ` [PATCH 2/5] ext4: use ext4_kvzalloc()/ext4_kvmalloc() for s_group_desc and s_group_info Theodore Ts'o
2011-08-01 13:13 ` [PATCH 3/5] ext4: use ext4_msg() instead of printk in mballoc Theodore Ts'o
2011-08-02 3:12 ` Andreas Dilger
2011-08-01 13:13 ` [PATCH 4/5] ext4: use EXT4_BAD_INO for buddy cache to avoid colliding with valid inode # Theodore Ts'o
2011-08-01 13:13 ` [PATCH 5/5] ext4: prevent memory leaks from ext4_mb_init_backend() on error path Theodore Ts'o
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1310506846-25843-1-git-send-email-adilger@whamcloud.com \
--to=adilger@whamcloud.com \
--cc=linux-ext4@vger.kernel.org \
--cc=tytso@mit.edu \
--cc=yujian@whamcloud.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).