Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Andrew Morton <akpm@linux-foundation.org>
To: "Theodore Ts'o" <tytso@MIT.EDU>
Cc: linux-kernel@vger.kernel.org, alex@clusterfs.com,
	adilger@clusterfs.com, aneesh.kumar@linux.vnet.ibm.com,
	sandeen@redhat.com, tytso@MIT.EDU,
	"linux-ext4@vger.kernel.org" <linux-ext4@vger.kernel.org>
Subject: Re: [PATCH 41/49] ext4: Add multi block allocator for ext4
Date: Wed, 23 Jan 2008 14:07:27 -0800	[thread overview]
Message-ID: <20080123140727.f47e9c9d.akpm@linux-foundation.org> (raw)
In-Reply-To: <1200970948-17903-42-git-send-email-tytso@mit.edu>

> On Mon, 21 Jan 2008 22:02:20 -0500 "Theodore Ts'o" <tytso@MIT.EDU> wrote:
> From: Alex Tomas <alex@clusterfs.com>
> 
> Signed-off-by: Alex Tomas <alex@clusterfs.com>
> Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> Signed-off-by: Eric Sandeen <sandeen@redhat.com>
> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
>
> ...
>
> +#if BITS_PER_LONG == 64
> +#define mb_correct_addr_and_bit(bit, addr)		\
> +{							\
> +	bit += ((unsigned long) addr & 7UL) << 3;	\
> +	addr = (void *) ((unsigned long) addr & ~7UL);	\
> +}
> +#elif BITS_PER_LONG == 32
> +#define mb_correct_addr_and_bit(bit, addr)		\
> +{							\
> +	bit += ((unsigned long) addr & 3UL) << 3;	\
> +	addr = (void *) ((unsigned long) addr & ~3UL);	\
> +}
> +#else
> +#error "how many bits you are?!"
> +#endif

Why do these exist?

> +static inline int mb_test_bit(int bit, void *addr)
> +{
> +	mb_correct_addr_and_bit(bit, addr);
> +	return ext4_test_bit(bit, addr);
> +}

ext2_test_bit() already handles bitnum > wordsize.

If mb_correct_addr_and_bit() is actually needed then some suitable comment
would help.

> +static inline void mb_set_bit(int bit, void *addr)
> +{
> +	mb_correct_addr_and_bit(bit, addr);
> +	ext4_set_bit(bit, addr);
> +}
> +
> +static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
> +{
> +	mb_correct_addr_and_bit(bit, addr);
> +	ext4_set_bit_atomic(lock, bit, addr);
> +}
> +
> +static inline void mb_clear_bit(int bit, void *addr)
> +{
> +	mb_correct_addr_and_bit(bit, addr);
> +	ext4_clear_bit(bit, addr);
> +}
> +
> +static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
> +{
> +	mb_correct_addr_and_bit(bit, addr);
> +	ext4_clear_bit_atomic(lock, bit, addr);
> +}
> +
> +static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)

uninlining this will save about eighty squigabytes of text.

Please review all of ext4/jbd2 with a view to removig unnecessary and wrong
inlings.

> +{
> +	char *bb;
> +
> +	/* FIXME!! is this needed */
> +	BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
> +	BUG_ON(max == NULL);
> +
> +	if (order > e4b->bd_blkbits + 1) {
> +		*max = 0;
> +		return NULL;
> +	}
> +
> +	/* at order 0 we see each particular block */
> +	*max = 1 << (e4b->bd_blkbits + 3);
> +	if (order == 0)
> +		return EXT4_MB_BITMAP(e4b);
> +
> +	bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
> +	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
> +
> +	return bb;
> +}
> +
>
> ...
>
> +#else
> +#define mb_free_blocks_double(a, b, c, d)
> +#define mb_mark_used_double(a, b, c)
> +#define mb_cmp_bitmaps(a, b)
> +#endif

Please use the do{}while(0) thing.  Or, better, proper C functions which
have typechecking (unless this will cause undefined-var compile errors,
which happens sometimes)

> +/* find most significant bit */
> +static int fmsb(unsigned short word)
> +{
> +	int order;
> +
> +	if (word > 255) {
> +		order = 7;
> +		word >>= 8;
> +	} else {
> +		order = -1;
> +	}
> +
> +	do {
> +		order++;
> +		word >>= 1;
> +	} while (word != 0);
> +
> +	return order;
> +}

Did we just reinvent fls()?

> +/* FIXME!! need more doc */
> +static void ext4_mb_mark_free_simple(struct super_block *sb,
> +				void *buddy, unsigned first, int len,
> +					struct ext4_group_info *grp)
> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	unsigned short min;
> +	unsigned short max;
> +	unsigned short chunk;
> +	unsigned short border;
> +
> +	BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb));
> +
> +	border = 2 << sb->s_blocksize_bits;

Won't this explode with >= 32k blocksize?

> +	while (len > 0) {
> +		/* find how many blocks can be covered since this position */
> +		max = ffs(first | border) - 1;
> +
> +		/* find how many blocks of power 2 we need to mark */
> +		min = fmsb(len);
> +
> +		if (max < min)
> +			min = max;
> +		chunk = 1 << min;
> +
> +		/* mark multiblock chunks only */
> +		grp->bb_counters[min]++;
> +		if (min > 0)
> +			mb_clear_bit(first >> min,
> +				     buddy + sbi->s_mb_offsets[min]);
> +
> +		len -= chunk;
> +		first += chunk;
> +	}
> +}
> +
> 
> ...
>
> +static int ext4_mb_init_cache(struct page *page, char *incore)
> +{
> +	int blocksize;
> +	int blocks_per_page;
> +	int groups_per_page;
> +	int err = 0;
> +	int i;
> +	ext4_group_t first_group;
> +	int first_block;
> +	struct super_block *sb;
> +	struct buffer_head *bhs;
> +	struct buffer_head **bh;
> +	struct inode *inode;
> +	char *data;
> +	char *bitmap;
> +
> +	mb_debug("init page %lu\n", page->index);
> +
> +	inode = page->mapping->host;
> +	sb = inode->i_sb;
> +	blocksize = 1 << inode->i_blkbits;
> +	blocks_per_page = PAGE_CACHE_SIZE / blocksize;
> +
> +	groups_per_page = blocks_per_page >> 1;
> +	if (groups_per_page == 0)
> +		groups_per_page = 1;
> +
> +	/* allocate buffer_heads to read bitmaps */
> +	if (groups_per_page > 1) {
> +		err = -ENOMEM;
> +		i = sizeof(struct buffer_head *) * groups_per_page;
> +		bh = kmalloc(i, GFP_NOFS);
> +		if (bh == NULL)
> +			goto out;
> +		memset(bh, 0, i);

kzalloc()

> +	} else
> +		bh = &bhs;
> +
> +	first_group = page->index * blocks_per_page / 2;
> +
> +	/* read all groups the page covers into the cache */
> +	for (i = 0; i < groups_per_page; i++) {
> +		struct ext4_group_desc *desc;
> +
> +		if (first_group + i >= EXT4_SB(sb)->s_groups_count)
> +			break;
> +
> +		err = -EIO;
> +		desc = ext4_get_group_desc(sb, first_group + i, NULL);
> +		if (desc == NULL)
> +			goto out;
> +
> +		err = -ENOMEM;
> +		bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
> +		if (bh[i] == NULL)
> +			goto out;
> +
> +		if (buffer_uptodate(bh[i]))
> +			continue;
> +
> +		lock_buffer(bh[i]);
> +		if (buffer_uptodate(bh[i])) {
> +			unlock_buffer(bh[i]);
> +			continue;
> +		}

Didn't we just add a helper in fs/buffer.c to do this?

> +		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
> +			ext4_init_block_bitmap(sb, bh[i],
> +						first_group + i, desc);
> +			set_buffer_uptodate(bh[i]);
> +			unlock_buffer(bh[i]);
> +			continue;
> +		}
> +		get_bh(bh[i]);
> +		bh[i]->b_end_io = end_buffer_read_sync;
> +		submit_bh(READ, bh[i]);
> +		mb_debug("read bitmap for group %lu\n", first_group + i);
> +	}
> +
> +	/* wait for I/O completion */
> +	for (i = 0; i < groups_per_page && bh[i]; i++)
> +		wait_on_buffer(bh[i]);
> +
> +	err = -EIO;
> +	for (i = 0; i < groups_per_page && bh[i]; i++)
> +		if (!buffer_uptodate(bh[i]))
> +			goto out;
> +
> +	first_block = page->index * blocks_per_page;
> +	for (i = 0; i < blocks_per_page; i++) {
> +		int group;
> +		struct ext4_group_info *grinfo;
> +
> +		group = (first_block + i) >> 1;
> +		if (group >= EXT4_SB(sb)->s_groups_count)
> +			break;
> +
> +		/*
> +		 * data carry information regarding this
> +		 * particular group in the format specified
> +		 * above
> +		 *
> +		 */
> +		data = page_address(page) + (i * blocksize);
> +		bitmap = bh[group - first_group]->b_data;
> +
> +		/*
> +		 * We place the buddy block and bitmap block
> +		 * close together
> +		 */
> +		if ((first_block + i) & 1) {
> +			/* this is block of buddy */
> +			BUG_ON(incore == NULL);
> +			mb_debug("put buddy for group %u in page %lu/%x\n",
> +				group, page->index, i * blocksize);
> +			memset(data, 0xff, blocksize);
> +			grinfo = ext4_get_group_info(sb, group);
> +			grinfo->bb_fragments = 0;
> +			memset(grinfo->bb_counters, 0,
> +			       sizeof(unsigned short)*(sb->s_blocksize_bits+2));
> +			/*
> +			 * incore got set to the group block bitmap below
> +			 */
> +			ext4_mb_generate_buddy(sb, data, incore, group);
> +			incore = NULL;
> +		} else {
> +			/* this is block of bitmap */
> +			BUG_ON(incore != NULL);
> +			mb_debug("put bitmap for group %u in page %lu/%x\n",
> +				group, page->index, i * blocksize);
> +
> +			/* see comments in ext4_mb_put_pa() */
> +			ext4_lock_group(sb, group);
> +			memcpy(data, bitmap, blocksize);
> +
> +			/* mark all preallocated blks used in in-core bitmap */
> +			ext4_mb_generate_from_pa(sb, data, group);
> +			ext4_unlock_group(sb, group);
> +
> +			/* set incore so that the buddy information can be
> +			 * generated using this
> +			 */
> +			incore = data;
> +		}
> +	}
> +	SetPageUptodate(page);

Is the page locked here?

> +out:
> +	if (bh) {
> +		for (i = 0; i < groups_per_page && bh[i]; i++)
> +			brelse(bh[i]);

put_bh()

> +		if (bh != &bhs)
> +			kfree(bh);
> +	}
> +	return err;
> +}
> +
>
> ...
>
> +static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
> +{
> +	__u32 *addr;
> +
> +	len = cur + len;
> +	while (cur < len) {
> +		if ((cur & 31) == 0 && (len - cur) >= 32) {
> +			/* fast path: clear whole word at once */

s/clear/set/

> +			addr = bm + (cur >> 3);
> +			*addr = 0xffffffff;
> +			cur += 32;
> +			continue;
> +		}
> +		mb_set_bit_atomic(lock, cur, bm);
> +		cur++;
> +	}
> +}
> +
>
> ...
>
> +static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
> +					struct ext4_buddy *e4b)
> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
> +	int ret;
> +
> +	BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
> +	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
> +
> +	ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
> +	ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
> +	ret = mb_mark_used(e4b, &ac->ac_b_ex);
> +
> +	/* preallocation can change ac_b_ex, thus we store actually
> +	 * allocated blocks for history */
> +	ac->ac_f_ex = ac->ac_b_ex;
> +
> +	ac->ac_status = AC_STATUS_FOUND;
> +	ac->ac_tail = ret & 0xffff;
> +	ac->ac_buddy = ret >> 16;
> +
> +	/* XXXXXXX: SUCH A HORRIBLE **CK */
> +	/*FIXME!! Why ? */

?

> +	ac->ac_bitmap_page = e4b->bd_bitmap_page;
> +	get_page(ac->ac_bitmap_page);
> +	ac->ac_buddy_page = e4b->bd_buddy_page;
> +	get_page(ac->ac_buddy_page);
> +
> +	/* store last allocated for subsequent stream allocation */
> +	if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
> +		spin_lock(&sbi->s_md_lock);
> +		sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
> +		sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
> +		spin_unlock(&sbi->s_md_lock);
> +	}
> +}
>
> ...
>
> +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
> +					ext4_group_t group)
> +{
> +	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
> +	struct ext4_prealloc_space *pa;
> +	struct list_head *cur;
> +	ext4_group_t groupnr;
> +	ext4_grpblk_t start;
> +	int preallocated = 0;
> +	int count = 0;
> +	int len;
> +
> +	/* all form of preallocation discards first load group,
> +	 * so the only competing code is preallocation use.
> +	 * we don't need any locking here
> +	 * notice we do NOT ignore preallocations with pa_deleted
> +	 * otherwise we could leave used blocks available for
> +	 * allocation in buddy when concurrent ext4_mb_put_pa()
> +	 * is dropping preallocation
> +	 */
> +	list_for_each_rcu(cur, &grp->bb_prealloc_list) {
> +		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
> +		spin_lock(&pa->pa_lock);
> +		ext4_get_group_no_and_offset(sb, pa->pa_pstart,
> +					     &groupnr, &start);
> +		len = pa->pa_len;
> +		spin_unlock(&pa->pa_lock);
> +		if (unlikely(len == 0))
> +			continue;
> +		BUG_ON(groupnr != group);
> +		mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
> +						bitmap, start, len);
> +		preallocated += len;
> +		count++;
> +	}

Seems to be missing rcu_read_lock()

> +	mb_debug("prellocated %u for group %lu\n", preallocated, group);
> +}
> +
> +static void ext4_mb_pa_callback(struct rcu_head *head)
> +{
> +	struct ext4_prealloc_space *pa;
> +	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
> +	kmem_cache_free(ext4_pspace_cachep, pa);
> +}
> +#define mb_call_rcu(__pa)	call_rcu(&(__pa)->u.pa_rcu, ext4_mb_pa_callback)

Is there any reason why this had to be implemented as a macro?

>
> ...
>
> +static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
> +{
> +	struct super_block *sb = ac->ac_sb;
> +	struct ext4_prealloc_space *pa;
> +	struct ext4_group_info *grp;
> +	struct ext4_inode_info *ei;
> +
> +	/* preallocate only when found space is larger then requested */
> +	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
> +	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
> +	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
> +
> +	pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);

Do all the GFP_NOFS's in this code really need to be GFP_NOFS?

> +	if (pa == NULL)
> +		return -ENOMEM;
> +
> +	if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
> +		int winl;
> +		int wins;
> +		int win;
> +		int offs;
> +
> +		/* we can't allocate as much as normalizer wants.
> +		 * so, found space must get proper lstart
> +		 * to cover original request */
> +		BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
> +		BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
> +
> +		/* we're limited by original request in that
> +		 * logical block must be covered any way
> +		 * winl is window we can move our chunk within */
> +		winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
> +
> +		/* also, we should cover whole original request */
> +		wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
> +
> +		/* the smallest one defines real window */
> +		win = min(winl, wins);
> +
> +		offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
> +		if (offs && offs < win)
> +			win = offs;
> +
> +		ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
> +		BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
> +		BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
> +	}
> +
> +	/* preallocation can change ac_b_ex, thus we store actually
> +	 * allocated blocks for history */
> +	ac->ac_f_ex = ac->ac_b_ex;
> +
> +	pa->pa_lstart = ac->ac_b_ex.fe_logical;
> +	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
> +	pa->pa_len = ac->ac_b_ex.fe_len;
> +	pa->pa_free = pa->pa_len;
> +	atomic_set(&pa->pa_count, 1);
> +	spin_lock_init(&pa->pa_lock);
> +	pa->pa_deleted = 0;
> +	pa->pa_linear = 0;
> +
> +	mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
> +			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
> +
> +	ext4_mb_use_inode_pa(ac, pa);
> +	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
> +
> +	ei = EXT4_I(ac->ac_inode);
> +	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
> +
> +	pa->pa_obj_lock = &ei->i_prealloc_lock;
> +	pa->pa_inode = ac->ac_inode;
> +
> +	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
> +	list_add_rcu(&pa->pa_group_list, &grp->bb_prealloc_list);
> +	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
> +
> +	spin_lock(pa->pa_obj_lock);
> +	list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
> +	spin_unlock(pa->pa_obj_lock);

hm.  Strange to see list_add_rcu() inside spinlock like this.

> +	return 0;
> +}
> +
>
> ...
>
> +static int ext4_mb_discard_group_preallocations(struct super_block *sb,
> +					ext4_group_t group, int needed)
> +{
> +	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
> +	struct buffer_head *bitmap_bh = NULL;
> +	struct ext4_prealloc_space *pa, *tmp;
> +	struct list_head list;
> +	struct ext4_buddy e4b;
> +	int err;
> +	int busy = 0;
> +	int free = 0;
> +
> +	mb_debug("discard preallocation for group %lu\n", group);
> +
> +	if (list_empty(&grp->bb_prealloc_list))
> +		return 0;
> +
> +	bitmap_bh = read_block_bitmap(sb, group);
> +	if (bitmap_bh == NULL) {
> +		/* error handling here */
> +		ext4_mb_release_desc(&e4b);
> +		BUG_ON(bitmap_bh == NULL);
> +	}
> +
> +	err = ext4_mb_load_buddy(sb, group, &e4b);
> +	BUG_ON(err != 0); /* error handling here */
> +
> +	if (needed == 0)
> +		needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
> +
> +	grp = ext4_get_group_info(sb, group);
> +	INIT_LIST_HEAD(&list);
> +
> +repeat:
> +	ext4_lock_group(sb, group);
> +	list_for_each_entry_safe(pa, tmp,
> +				&grp->bb_prealloc_list, pa_group_list) {
> +		spin_lock(&pa->pa_lock);
> +		if (atomic_read(&pa->pa_count)) {
> +			spin_unlock(&pa->pa_lock);
> +			busy = 1;
> +			continue;
> +		}
> +		if (pa->pa_deleted) {
> +			spin_unlock(&pa->pa_lock);
> +			continue;
> +		}
> +
> +		/* seems this one can be freed ... */
> +		pa->pa_deleted = 1;
> +
> +		/* we can trust pa_free ... */
> +		free += pa->pa_free;
> +
> +		spin_unlock(&pa->pa_lock);
> +
> +		list_del_rcu(&pa->pa_group_list);
> +		list_add(&pa->u.pa_tmp_list, &list);
> +	}

Strange to see rcu operations outside rcu_read_lock().

> +	/* if we still need more blocks and some PAs were used, try again */
> +	if (free < needed && busy) {
> +		busy = 0;
> +		ext4_unlock_group(sb, group);
> +		/*
> +		 * Yield the CPU here so that we don't get soft lockup
> +		 * in non preempt case.
> +		 */
> +		yield();

argh, no, yield() is basically unusable.  schedule_timeout(1) is preferable.

Please test this code whe there are lots of cpu-intensive tasks running.

> +		goto repeat;
> +	}
> +
> +	/* found anything to free? */
> +	if (list_empty(&list)) {
> +		BUG_ON(free != 0);
> +		goto out;
> +	}
> +
> +	/* now free all selected PAs */
> +	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
> +
> +		/* remove from object (inode or locality group) */
> +		spin_lock(pa->pa_obj_lock);
> +		list_del_rcu(&pa->pa_inode_list);
> +		spin_unlock(pa->pa_obj_lock);
> +
> +		if (pa->pa_linear)
> +			ext4_mb_release_group_pa(&e4b, pa);
> +		else
> +			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
> +
> +		list_del(&pa->u.pa_tmp_list);
> +		mb_call_rcu(pa);
> +	}
> +
> +out:
> +	ext4_unlock_group(sb, group);
> +	ext4_mb_release_desc(&e4b);
> +	brelse(bitmap_bh);

put_bh()

> +	return free;
> +}
> +
> +/*
> + * releases all non-used preallocated blocks for given inode
> + *
> + * It's important to discard preallocations under i_data_sem
> + * We don't want another block to be served from the prealloc
> + * space when we are discarding the inode prealloc space.
> + *
> + * FIXME!! Make sure it is valid at all the call sites
> + */
> +void ext4_mb_discard_inode_preallocations(struct inode *inode)
> +{
> +	struct ext4_inode_info *ei = EXT4_I(inode);
> +	struct super_block *sb = inode->i_sb;
> +	struct buffer_head *bitmap_bh = NULL;
> +	struct ext4_prealloc_space *pa, *tmp;
> +	ext4_group_t group = 0;
> +	struct list_head list;
> +	struct ext4_buddy e4b;
> +	int err;
> +
> +	if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
> +		/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
> +		return;
> +	}
> +
> +	mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
> +
> +	INIT_LIST_HEAD(&list);
> +
> +repeat:
> +	/* first, collect all pa's in the inode */
> +	spin_lock(&ei->i_prealloc_lock);
> +	while (!list_empty(&ei->i_prealloc_list)) {
> +		pa = list_entry(ei->i_prealloc_list.next,
> +				struct ext4_prealloc_space, pa_inode_list);
> +		BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
> +		spin_lock(&pa->pa_lock);
> +		if (atomic_read(&pa->pa_count)) {
> +			/* this shouldn't happen often - nobody should
> +			 * use preallocation while we're discarding it */
> +			spin_unlock(&pa->pa_lock);
> +			spin_unlock(&ei->i_prealloc_lock);
> +			printk(KERN_ERR "uh-oh! used pa while discarding\n");
> +			dump_stack();

WARN_ON(1) would be more conventional.

> +			current->state = TASK_UNINTERRUPTIBLE;
> +			schedule_timeout(HZ);

schedule_timeout_uninterruptible()

> +			goto repeat;
> +
> +		}
> +		if (pa->pa_deleted == 0) {
> +			pa->pa_deleted = 1;
> +			spin_unlock(&pa->pa_lock);
> +			list_del_rcu(&pa->pa_inode_list);
> +			list_add(&pa->u.pa_tmp_list, &list);
> +			continue;
> +		}
> +
> +		/* someone is deleting pa right now */
> +		spin_unlock(&pa->pa_lock);
> +		spin_unlock(&ei->i_prealloc_lock);
> +
> +		/* we have to wait here because pa_deleted
> +		 * doesn't mean pa is already unlinked from
> +		 * the list. as we might be called from
> +		 * ->clear_inode() the inode will get freed
> +		 * and concurrent thread which is unlinking
> +		 * pa from inode's list may access already
> +		 * freed memory, bad-bad-bad */
> +
> +		/* XXX: if this happens too often, we can
> +		 * add a flag to force wait only in case
> +		 * of ->clear_inode(), but not in case of
> +		 * regular truncate */
> +		current->state = TASK_UNINTERRUPTIBLE;
> +		schedule_timeout(HZ);

ditto

> +		goto repeat;
> +	}
> +	spin_unlock(&ei->i_prealloc_lock);
> +
> +	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
> +		BUG_ON(pa->pa_linear != 0);
> +		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
> +
> +		err = ext4_mb_load_buddy(sb, group, &e4b);
> +		BUG_ON(err != 0); /* error handling here */
> +
> +		bitmap_bh = read_block_bitmap(sb, group);
> +		if (bitmap_bh == NULL) {
> +			/* error handling here */
> +			ext4_mb_release_desc(&e4b);
> +			BUG_ON(bitmap_bh == NULL);
> +		}
> +
> +		ext4_lock_group(sb, group);
> +		list_del_rcu(&pa->pa_group_list);
> +		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
> +		ext4_unlock_group(sb, group);
> +
> +		ext4_mb_release_desc(&e4b);
> +		brelse(bitmap_bh);
> +
> +		list_del(&pa->u.pa_tmp_list);
> +		mb_call_rcu(pa);
> +	}
> +}

Would be nice to ask Paul to review all the rcu usage in here.  It looks odd.

>
> ...
>
> +#else
> +#define ext4_mb_show_ac(x)
> +#endif

static inlined C functions are preferred (+1e6 dittoes)

> +/*
> + * We use locality group preallocation for small size file. The size of the
> + * file is determined by the current size or the resulting size after
> + * allocation which ever is larger
> + *
> + * One can tune this size via /proc/fs/ext4/<partition>/stream_req
> + */
> +static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
> +	int bsbits = ac->ac_sb->s_blocksize_bits;
> +	loff_t size, isize;
> +
> +	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
> +		return;
> +
> +	size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
> +	isize = i_size_read(ac->ac_inode) >> bsbits;
> +	if (size < isize)
> +		size = isize;

min()?

> +	/* don't use group allocation for large files */
> +	if (size >= sbi->s_mb_stream_request)
> +		return;
> +
> +	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
> +		return;
> +
> +	BUG_ON(ac->ac_lg != NULL);
> +	ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
> +	put_cpu();

Strange-looking code.  I'd be interested in a description of the per-cou
design here.

> +	/* we're going to use group allocation */
> +	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
> +
> +	/* serialize all allocations in the group */
> +	down(&ac->ac_lg->lg_sem);

This should be a mutex, shouldn't it?

> +}
> +
>
> ...
>
> +static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
> +			  ext4_group_t group, ext4_grpblk_t block, int count)
> +{
> +	struct ext4_group_info *db = e4b->bd_info;
> +	struct super_block *sb = e4b->bd_sb;
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	struct ext4_free_metadata *md;
> +	int i;
> +
> +	BUG_ON(e4b->bd_bitmap_page == NULL);
> +	BUG_ON(e4b->bd_buddy_page == NULL);
> +
> +	ext4_lock_group(sb, group);
> +	for (i = 0; i < count; i++) {
> +		md = db->bb_md_cur;
> +		if (md && db->bb_tid != handle->h_transaction->t_tid) {
> +			db->bb_md_cur = NULL;
> +			md = NULL;
> +		}
> +
> +		if (md == NULL) {
> +			ext4_unlock_group(sb, group);
> +			md = kmalloc(sizeof(*md), GFP_KERNEL);

Why was this one not GFP_NOFS?

> +			if (md == NULL)
> +				return -ENOMEM;

Did we just leak some memory?

> +			md->num = 0;
> +			md->group = group;
> +
> +			ext4_lock_group(sb, group);
> +			if (db->bb_md_cur == NULL) {
> +				spin_lock(&sbi->s_md_lock);
> +				list_add(&md->list, &sbi->s_active_transaction);
> +				spin_unlock(&sbi->s_md_lock);
> +				/* protect buddy cache from being freed,
> +				 * otherwise we'll refresh it from
> +				 * on-disk bitmap and lose not-yet-available
> +				 * blocks */
> +				page_cache_get(e4b->bd_buddy_page);
> +				page_cache_get(e4b->bd_bitmap_page);
> +				db->bb_md_cur = md;
> +				db->bb_tid = handle->h_transaction->t_tid;
> +				mb_debug("new md 0x%p for group %lu\n",
> +						md, md->group);
> +			} else {
> +				kfree(md);
> +				md = db->bb_md_cur;
> +			}
> +		}
> +
> +		BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
> +		md->blocks[md->num] = block + i;
> +		md->num++;
> +		if (md->num == EXT4_BB_MAX_BLOCKS) {
> +			/* no more space, put full container on a sb's list */
> +			db->bb_md_cur = NULL;
> +		}
> +	}
> +	ext4_unlock_group(sb, group);
> +	return 0;
> +}
> +
>
> ...
>
> +		case Opt_mballoc:
> +			set_opt(sbi->s_mount_opt, MBALLOC);
> +			break;
> +		case Opt_nomballoc:
> +			clear_opt(sbi->s_mount_opt, MBALLOC);
> +			break;
> +		case Opt_stripe:
> +			if (match_int(&args[0], &option))
> +				return 0;
> +			if (option < 0)
> +				return 0;
> +			sbi->s_stripe = option;
> +			break;

These appear to be undocumented.

>  		default:
>  			printk (KERN_ERR
>  				"EXT4-fs: Unrecognized mount option \"%s\" "
> @@ -1742,6 +1762,33 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
>  	return (has_super + ext4_group_first_block_no(sb, bg));
>  }
>  
> +/**
> + * ext4_get_stripe_size: Get the stripe size.
> + * @sbi: In memory super block info
> + *
> + * If we have specified it via mount option, then
> + * use the mount option value. If the value specified at mount time is
> + * greater than the blocks per group use the super block value.
> + * If the super block value is greater than blocks per group return 0.
> + * Allocator needs it be less than blocks per group.
> + *
> + */
> +static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
> +{
> +	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
> +	unsigned long stripe_width =
> +			le32_to_cpu(sbi->s_es->s_raid_stripe_width);
> +
> +	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) {
> +		return sbi->s_stripe;
> +	} else if (stripe_width <= sbi->s_blocks_per_group) {
> +		return stripe_width;
> +	} else if (stride <= sbi->s_blocks_per_group) {
> +		return stride;
> +	}

unneeded braces.

> +	return 0;
> +}
>  
>  ...
>
> +static inline
> +struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
> +							ext4_group_t group)
> +{
> +	 struct ext4_group_info ***grp_info;
> +	 long indexv, indexh;
> +	 grp_info = EXT4_SB(sb)->s_group_info;
> +	 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
> +	 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
> +	 return grp_info[indexv][indexh];
> +}

This should be uninlined.



Gosh what a lot of code.  Is it faster?

next prev parent reply	other threads:[~2008-01-23 22:08 UTC|newest]

Thread overview: 73+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-01-22  3:01 ext4 merge plans for 2.6.25 Theodore Ts'o
2008-01-22  3:01 ` [PATCH 01/49] ext4: Support large blocksize up to PAGESIZE Theodore Ts'o
2008-01-22  3:01   ` [PATCH 02/49] ext4: Avoid rec_len overflow with 64KB block size Theodore Ts'o
2008-01-22  3:01     ` [PATCH 03/49] ext4: Introduce ext4_lblk_t Theodore Ts'o
2008-01-22  3:01       ` [PATCH 04/49] ext4 extents: remove unneeded casts Theodore Ts'o
2008-01-22  3:01         ` [PATCH 05/49] ext4: add ext4_group_t, and change all group variables to this type Theodore Ts'o
2008-01-22  3:01           ` [PATCH 06/49] ext4: fixes block group number being set to a negative value Theodore Ts'o
2008-01-22  3:01             ` [PATCH 07/49] ext4: Introduce ext4_update_*_feature Theodore Ts'o
2008-01-22  3:01               ` [PATCH 08/49] ext4: Fix sparse warnings Theodore Ts'o
2008-01-22  3:01                 ` [PATCH 09/49] ext4: Rename i_file_acl to i_file_acl_lo Theodore Ts'o
2008-01-22  3:01                   ` [PATCH 10/49] ext4: Rename i_dir_acl to i_size_high Theodore Ts'o
2008-01-22  3:01                     ` [PATCH 11/49] ext4: Add support for 48 bit inode i_blocks Theodore Ts'o
2008-01-22  3:01                       ` [PATCH 12/49] ext4: Support large files Theodore Ts'o
2008-01-22  3:01                         ` [PATCH 13/49] ext4: different maxbytes functions for bitmap & extent files Theodore Ts'o
2008-01-22  3:01                           ` [PATCH 14/49] ext4: export iov_shorten from kernel for ext4's use Theodore Ts'o
2008-01-22  3:01                             ` [PATCH 15/49] ext4: store maxbytes for bitmapped files and return EFBIG as appropriate Theodore Ts'o
2008-01-22  3:01                               ` [PATCH 16/49] ext2: Fix the max file size for ext2 file system Theodore Ts'o
2008-01-22  3:01                                 ` [PATCH 17/49] ext3: Fix the max file size for ext3 " Theodore Ts'o
2008-01-22  3:01                                   ` [PATCH 18/49] ext4: sync up block group descriptor with e2fsprogs Theodore Ts'o
2008-01-22  3:01                                     ` [PATCH 19/49] ext4: Return after ext4_error in case of failures Theodore Ts'o
2008-01-22  3:01                                       ` [PATCH 20/49] ext4/super.c: fix #ifdef's (CONFIG_EXT4_* -> CONFIG_EXT4DEV_*) Theodore Ts'o
2008-01-22  3:02                                         ` [PATCH 21/49] ext4: fix oops on corrupted ext4 mount Theodore Ts'o
2008-01-22  3:02                                           ` [PATCH 22/49] ext4: Change the default behaviour on error Theodore Ts'o
2008-01-22  3:02                                             ` [PATCH 23/49] Add buffer head related helper functions Theodore Ts'o
2008-01-22  3:02                                               ` [PATCH 24/49] ext4: add block bitmap validation Theodore Ts'o
2008-01-22  3:02                                                 ` [PATCH 25/49] jbd2: Remove printk from J_ASSERT to preserve registers during BUG Theodore Ts'o
2008-01-22  3:02                                                   ` [PATCH 26/49] jbd2: Fix assertion failure in fs/jbd2/checkpoint.c Theodore Ts'o
2008-01-22  3:02                                                     ` [PATCH 27/49] ext4: Check for the correct error return from Theodore Ts'o
2008-01-22  3:02                                                       ` [PATCH 28/49] ext4: remove unused code from ext4_find_entry() Theodore Ts'o
2008-01-22  3:02                                                         ` [PATCH 29/49] ext4: Make ext4_get_blocks_wrap take the truncate_mutex early Theodore Ts'o
2008-01-22  3:02                                                           ` [PATCH 30/49] ext4: Convert truncate_mutex to read write semaphore Theodore Ts'o
2008-01-22  3:02                                                             ` [PATCH 31/49] ext4: Take read lock during overwrite case Theodore Ts'o
2008-01-22  3:02                                                               ` [PATCH 32/49] jbd2: jbd2 stats through procfs Theodore Ts'o
2008-01-22  3:02                                                                 ` [PATCH 33/49] ext4: Add the journal checksum feature Theodore Ts'o
2008-01-22  3:02                                                                   ` [PATCH 34/49] vfs: Add 64 bit i_version support Theodore Ts'o
2008-01-22  3:02                                                                     ` [PATCH 35/49] ext4: Add inode version support in ext4 Theodore Ts'o
2008-01-22  3:02                                                                       ` [PATCH 36/49] ext4: Add EXT4_IOC_MIGRATE ioctl Theodore Ts'o
2008-01-22  3:02                                                                         ` [PATCH 37/49] ext4: Fix ext4_show_options to show the correct mount options Theodore Ts'o
2008-01-22  3:02                                                                           ` [PATCH 38/49] ext4: fix up EXT4FS_DEBUG builds Theodore Ts'o
2008-01-22  3:02                                                                             ` [PATCH 39/49] ext4: Add ext4_find_next_bit() Theodore Ts'o
2008-01-22  3:02                                                                               ` [PATCH 40/49] ext4: Add new functions for searching extent tree Theodore Ts'o
2008-01-22  3:02                                                                                 ` [PATCH 41/49] ext4: Add multi block allocator for ext4 Theodore Ts'o
2008-01-22  3:02                                                                                   ` [PATCH 42/49] ext4: Enable the multiblock allocator by default Theodore Ts'o
2008-01-22  3:02                                                                                     ` [PATCH 43/49] ext4: Check for return value from sb_set_blocksize Theodore Ts'o
2008-01-22  3:02                                                                                       ` [PATCH 44/49] ext4: fix uniniatilized extent splitting error Theodore Ts'o
2008-01-22  3:02                                                                                         ` [PATCH 45/49] ext4: Use the ext4_ext_actual_len() helper function Theodore Ts'o
2008-01-22  3:02                                                                                           ` [PATCH 46/49] jbd2: add lockdep support Theodore Ts'o
2008-01-22  3:02                                                                                             ` [PATCH 47/49] jbd2: Mark jbd2 slabs as SLAB_TEMPORARY Theodore Ts'o
2008-01-22  3:02                                                                                               ` [PATCH 48/49] jbd2: Use round-jiffies() function for the "5 second" ext4/jbd2 wakeup Theodore Ts'o
2008-01-22  3:02                                                                                                 ` [PATCH 49/49] jbd2: sparse pointer use of zero as null Theodore Ts'o
2008-01-23 22:07                                                                                   ` Andrew Morton [this message]
2008-01-23 23:20                                                                                     ` [PATCH 41/49] ext4: Add multi block allocator for ext4 Andreas Dilger
2008-01-24  7:56                                                                                     ` Aneesh Kumar K.V
2008-01-24  9:04                                                                                       ` Aneesh Kumar K.V
2008-01-24 14:53                                                                                       ` Aneesh Kumar K.V
2008-01-28 18:45                                                                                   ` Eric Sandeen
2008-01-23 22:07                                                                         ` [PATCH 36/49] ext4: Add EXT4_IOC_MIGRATE ioctl Andrew Morton
2008-01-24  5:55                                                                           ` Aneesh Kumar K.V
2008-01-26  4:15                                                                             ` Theodore Tso
2008-01-26  8:42                                                                               ` Aneesh Kumar K.V
2008-01-23 22:07                                                                   ` [PATCH 33/49] ext4: Add the journal checksum feature Andrew Morton
2008-01-23 22:40                                                                     ` Andreas Dilger
2008-01-24 21:24                                                                     ` Mingming Cao
2008-02-01 20:50                                                                       ` Girish Shilamkar
2008-01-23 22:06                                                             ` [PATCH 30/49] ext4: Convert truncate_mutex to read write semaphore Andrew Morton
2008-01-24  5:29                                                               ` Aneesh Kumar K.V
2008-01-24 13:00                                                               ` Andy Whitcroft
2008-01-23 22:06                                                 ` [PATCH 24/49] ext4: add block bitmap validation Andrew Morton
2008-01-26 13:26                                                   ` Theodore Tso
2008-01-23 22:06                                               ` [PATCH 23/49] Add buffer head related helper functions Andrew Morton
2008-01-24  5:22                                                 ` Aneesh Kumar K.V
2008-01-24  8:53                                                   ` Andrew Morton
2008-01-23 12:43 ` ext4 merge plans for 2.6.25 Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20080123140727.f47e9c9d.akpm@linux-foundation.org \
    --to=akpm@linux-foundation.org \
    --cc=adilger@clusterfs.com \
    --cc=alex@clusterfs.com \
    --cc=aneesh.kumar@linux.vnet.ibm.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=sandeen@redhat.com \
    --cc=tytso@MIT.EDU \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.