linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Theodore Ts'o <tytso@mit.edu>
To: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: linux-ext4@vger.kernel.org
Subject: Re: [PATCH 50/74] libext2fs: support allocating uninit blocks in bmap2()
Date: Wed, 15 Jan 2014 17:19:51 -0500	[thread overview]
Message-ID: <20140115221951.GA12226@thunk.org> (raw)
In-Reply-To: <20140115211122.GJ9229@birch.djwong.org>

On Wed, Jan 15, 2014 at 01:11:22PM -0800, Darrick J. Wong wrote:
> > The other thing to note about this patch is that if you want to
> > implement fallocate, ext2fs_bmap2() is really the wrong tool to use.
> > I've been working on a program for work which pre-creates a bunch of
> 
> I think that ext2fs_fallocate would be a good addition to the library.  Is your
> program far enough along to share?  fuse2fs would benefit greatly.

An ext2fs_fallocate() is way more difficult than what I've done, since
you have to deal with all sorts of corner cases where the file has
pre-existing sparse extents, which may or may not be initialized, and
making sure that it works in that case.  Allocating blocks to a file
which you know started as a zero length file is in fact much easier.
Here are the key bits from my program:

/*
 * This should eventually be cleaned up and put into libext2fs.
 *
 * Mark (inuse > 0) or release (inuse < 0) the num blocks starting at
 * blk in fs->block_map, and adjust the per-group and superblock free
 * counts to match.  inuse == 0 is a no-op.
 *
 * This is much faster than calling ext2fs_block_alloc_stats() for each
 * block, since that requires recalculating the bg descriptor checksum
 * for every single block that you allocate.  Here each block group
 * touched by the range is updated once.
 */
static void ext2fs_block_alloc_stats_range(ext2_filsys fs, blk64_t blk,
					   blk_t num, int inuse)
{
#ifndef OMIT_COM_ERR
	/*
	 * The range covers blk .. blk + num - 1, so it is still legal
	 * when blk + num equals the block count (the range ends on the
	 * last block of the file system).
	 */
	if (blk + num > ext2fs_blocks_count(fs->super)) {
		com_err("ext2fs_block_alloc_stats_range", 0,
			"Illegal block range: %llu (%u) ",
			(unsigned long long) blk, num);
		return;
	}
#endif
	if (inuse == 0)
		return;
	/* Normalize inuse to +1 / -1 so it can be used as a sign below. */
	if (inuse > 0) {
		ext2fs_mark_block_bitmap_range2(fs->block_map, blk, num);
		inuse = 1;
	} else {
		ext2fs_unmark_block_bitmap_range2(fs->block_map, blk, num);
		inuse = -1;
	}
	while (num) {
		int	group = ext2fs_group_of_blk2(fs, blk);
		blk64_t	last_blk = ext2fs_group_last_block2(fs, group);
		blk_t	n = num;
		blk_t	clusters, free_cnt;

		/* Clamp this step to the end of the current block group. */
		if (blk + num > last_blk)
			n = last_blk - blk + 1;

		/*
		 * Apply the sign after the division.  Multiplying the
		 * unsigned n by a negative inuse first would wrap to a
		 * huge value and give a wrong quotient whenever the
		 * cluster ratio is greater than one.
		 */
		clusters = n / EXT2FS_CLUSTER_RATIO(fs);
		free_cnt = ext2fs_bg_free_blocks_count(fs, group);
		if (inuse > 0)
			free_cnt -= clusters;
		else
			free_cnt += clusters;
		ext2fs_bg_free_blocks_count_set(fs, group, free_cnt);
		ext2fs_bg_flags_clear(fs, group, EXT2_BG_BLOCK_UNINIT);
		ext2fs_group_desc_csum_set(fs, group);

		/*
		 * Widen n to 64 bits before applying the sign; in 32-bit
		 * unsigned arithmetic "-1 * n" becomes 2^32 - n, which
		 * would be added to the superblock count as a large
		 * positive number instead of subtracting n.
		 */
		ext2fs_free_blocks_count_add(fs->super,
					     -inuse * (blk64_t) n);
		ext2fs_mark_super_dirty(fs);
		ext2fs_mark_bb_dirty(fs);
		blk += n;
		num -= n;
	}
}

/*
 * Simple and stupid stand-in for ext2fs_allocate_tables(), which is
 * not optimally allocating blocks in all situations and needs a look
 * at some point.
 *
 * Makes three passes over the block groups: one block per group for
 * the block bitmap, then one block per group for the inode bitmap,
 * then fs->inode_blocks_per_group blocks per group for the inode
 * table.  Every search starts at `goal` and stores its result back
 * into `goal` (declared outside this excerpt).
 *
 * Returns 0 on success, or the first error reported by the allocator.
 */
errcode_t my_allocate_tables(ext2_filsys fs)
{
	errcode_t	err;
	dgrp_t		grp;

	/* Pass 1: block bitmaps. */
	grp = 0;
	while (grp < fs->group_desc_count) {
		err = ext2fs_new_block2(fs, goal, NULL, &goal);
		if (err)
			return err;
		ext2fs_block_alloc_stats2(fs, goal, +1);
		ext2fs_block_bitmap_loc_set(fs, grp, goal);
		grp++;
	}

	/* Pass 2: inode bitmaps. */
	grp = 0;
	while (grp < fs->group_desc_count) {
		err = ext2fs_new_block2(fs, goal, NULL, &goal);
		if (err)
			return err;
		ext2fs_block_alloc_stats2(fs, goal, +1);
		ext2fs_inode_bitmap_loc_set(fs, grp, goal);
		grp++;
	}

	/* Pass 3: inode tables, one contiguous run per group. */
	grp = 0;
	while (grp < fs->group_desc_count) {
		blk64_t	last = ext2fs_blocks_count(fs->super) - 1;

		err = ext2fs_get_free_blocks2(fs, goal, last,
					      fs->inode_blocks_per_group,
					      fs->block_map, &goal);
		if (err)
			return err;
		ext2fs_block_alloc_stats_range(fs, goal,
					       fs->inode_blocks_per_group, +1);
		ext2fs_inode_table_loc_set(fs, grp, goal);
		grp++;
	}

	return 0;
}

/*
 * Debugging aid for mk_hugefile(): re-read inode `ino` and print its
 * i_size_high and full size, prefixed by `tag`.  Prints nothing if the
 * inode cannot be read.
 */
static void dump_inode_size(ext2_filsys fs, ext2_ino_t ino, const char *tag)
{
	struct ext2_inode t;

	if (ext2fs_read_inode(fs, ino, &t))
		return;
	printf("%si_size_high: %lu size: %llu\n", tag,
	       (unsigned long) t.i_size_high,
	       (unsigned long long) EXT2_I_SIZE(&t));
}

/*
 * Some of this could eventually get turned into fallocate, but that's
 * actually a much more difficult and tricky thing to implement.
 *
 * Create a regular file of num blocks and link it into directory dir
 * as "hugefileNNNNN", where NNNNN is idx.  The new inode number is
 * stored in *ino.
 *
 * Blocks are taken from the free runs in fs->block_map, starting at
 * `goal` (declared outside this excerpt), and are mapped with extents
 * of at most EXT_INIT_MAX_LEN blocks each.
 *
 * Returns 0 on success or an error code.  On failure the inode and
 * any blocks already allocated are not rolled back.
 */
static errcode_t mk_hugefile(ext2_filsys fs, unsigned int num,
			     ext2_ino_t dir, int idx, ext2_ino_t *ino)
{
	errcode_t		retval;
	blk64_t			lblk, bend;
	__u64			size;
	struct ext2_inode	inode;
	ext2_extent_handle_t	handle = NULL;
	char			fn[32];

	retval = ext2fs_new_inode(fs, 0, LINUX_S_IFREG, NULL, ino);
	if (retval)
		return retval;

	memset(&inode, 0, sizeof(struct ext2_inode));
	inode.i_mode = LINUX_S_IFREG | 0600;
	ext2fs_iblk_set(fs, &inode, num / EXT2FS_CLUSTER_RATIO(fs));
	size = (__u64) num * fs->blocksize;
	inode.i_size = size & 0xffffffff;
	inode.i_size_high = (size >> 32);
	inode.i_links_count = 1;

	retval = ext2fs_write_new_inode(fs, *ino, &inode);
	if (retval)
		return retval;

	ext2fs_inode_alloc_stats2(fs, *ino, +1, 0);

	retval = ext2fs_extent_open2(fs, *ino, &inode, &handle);
	if (retval)
		return retval;

	/* From here on every exit goes through errout to free handle. */
	dump_inode_size(fs, *ino, "eo: ");

	lblk = 0;
	while (num) {
		blk64_t pblk, end;
		blk_t n = num;

		/* Start of the next free run at or after goal. */
		retval = ext2fs_find_first_zero_block_bitmap2(fs->block_map,
			goal, ext2fs_blocks_count(fs->super) - 1, &end);
		if (retval) {
			retval = ENOSPC;
			goto errout;
		}
		goal = end;

		/*
		 * End of that run: the first in-use block after goal.
		 * ENOENT means no in-use block was found, so the run
		 * extends to the end of the file system.
		 */
		retval = ext2fs_find_first_set_block_bitmap2(fs->block_map, goal,
			       ext2fs_blocks_count(fs->super) - 1, &bend);
		if (retval == ENOENT)
			bend = ext2fs_blocks_count(fs->super);
		else if (retval)
			goto errout;

		if (bend - goal < num)
			n = bend - goal;
		printf("goal %llu bend %llu num %u n %u\n",
		       (unsigned long long) goal, (unsigned long long) bend,
		       num, n);
		pblk = goal;
		num -= n;
		goal += n;
		ext2fs_block_alloc_stats_range(fs, pblk, n, +1);

		/* Map the run, splitting it into maximum-length extents. */
		while (n) {
			blk_t l = n;
			struct ext2fs_extent newextent;

			dump_inode_size(fs, *ino, "");

			if (l > EXT_INIT_MAX_LEN)
				l = EXT_INIT_MAX_LEN;

			newextent.e_len = l;
			newextent.e_pblk = pblk;
			newextent.e_lblk = lblk;
			newextent.e_flags = 0;

			printf("inserting extent: %llu %llu %u\n",
			       (unsigned long long) lblk,
			       (unsigned long long) pblk, l);
			retval = ext2fs_extent_insert(handle,
					EXT2_EXTENT_INSERT_AFTER, &newextent);
			if (retval)
				goto errout;
			pblk += l;
			lblk += l;
			n -= l;
		}
	}

	dump_inode_size(fs, *ino, "");

	snprintf(fn, sizeof(fn), "hugefile%05d", idx);

	/* Link the file, growing the directory for as long as it is full. */
	for (;;) {
		retval = ext2fs_link(fs, dir, fn, *ino, EXT2_FT_REG_FILE);
		if (retval != EXT2_ET_DIR_NO_SPACE)
			break;
		retval = ext2fs_expand_dir(fs, dir);
		if (retval)
			break;
	}

errout:
	if (handle)
		ext2fs_extent_free(handle);

	return retval;
}

Note that this requires some of the test patches I've been sending
out, since it uses ext2fs_find_first_{set,zero}_block_bitmap2().

There are also some bugs in the versions which I sent out; I'm working
on fixing them....

> That said, I've also found a couple of bugs in the extent code by implementing
> fallocate in such a stupid way. :)  It turns out that if (a) we need to split
> an extent into three pieces (say we write to a block in the middle of an
> unwritten extent and don't want to convert the whole extent) and (b) either of
> the extent_insert calls requires us to split the extent block and (c) we ENOSPC
> while trying to allocate a new extent block, we don't put the extent tree back
> the way it was before the split, and all the blocks after that point are lost.

Well, I found a bug in ext2fs_extent_insert() which showed up when I
tried to implement the block allocation in an intelligent way.  :-)
I'll send out that bug fix in a bit.


> I will send patches to avoid this corruption by checking for enough space soon.
> I think your local git tree has patches in it that aren't on kernel.org yet, so
> I'll hold off until I see them show up.

Yeah, some of those patches still need some clean up, so I haven't
pushed my maint branch to kernel.org yet.

But anyway, the above code will give you an idea where I'm going ---
this is **way** faster than trying to allocate blocks using the
set_bmap() function.  :-)

						- Ted

  reply	other threads:[~2014-01-15 22:19 UTC|newest]

Thread overview: 150+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-12-11  1:18 [PATCH v3 00/74] e2fsprogs patchbomb 12/2013 Darrick J. Wong
2013-12-11  1:18 ` [PATCH 01/74] libext2fs: don't overflow when punching indirect blocks with large blocks Darrick J. Wong
2013-12-12 17:06   ` Theodore Ts'o
2013-12-11  1:18 ` [PATCH 02/74] libext2fs: fix tests that set LARGE_FILE Darrick J. Wong
2013-12-12 17:09   ` Theodore Ts'o
2013-12-11  1:18 ` [PATCH 03/74] mke2fs: load configfile blocksize setting before 64bit checks Darrick J. Wong
2013-12-12 17:27   ` Theodore Ts'o
2013-12-12 22:28   ` Andreas Dilger
2013-12-12 23:13     ` Darrick J. Wong
2013-12-12 23:14     ` [PATCH] mke2fs: clean up kernel version tests Darrick J. Wong
2013-12-11  1:18 ` [PATCH 04/74] libext2fs: use ext2fs_punch() to truncate quota file Darrick J. Wong
2013-12-12 17:28   ` Theodore Ts'o
2013-12-12 17:36     ` Theodore Ts'o
2013-12-12 20:07       ` Darrick J. Wong
2013-12-12 20:56         ` Theodore Ts'o
2013-12-12 21:10         ` Darrick J. Wong
2013-12-11  1:18 ` [PATCH 05/74] debugfs: fix init_filesys help text Darrick J. Wong
2013-12-12 17:37   ` Theodore Ts'o
2013-12-11  1:18 ` [PATCH 06/74] tune2fs: forbid changing uuid on an uninit_bg filesystem Darrick J. Wong
2013-12-15  2:02   ` Theodore Ts'o
2013-12-11  1:19 ` [PATCH 07/74] libext2fs: tweak inline data error wording Darrick J. Wong
2013-12-13  4:33   ` Theodore Ts'o
2013-12-11  1:19 ` [PATCH 08/74] libext2fs: don't allow ridiculously large logical block numbers Darrick J. Wong
2013-12-12 17:41   ` Theodore Ts'o
2013-12-11  1:19 ` [PATCH 09/74] libext2fs: fix another minor grammatical error in the error catalog Darrick J. Wong
2013-12-12 17:42   ` Theodore Ts'o
2013-12-11  1:19 ` [PATCH 10/74] debugfs: fix various minor bogosity Darrick J. Wong
2013-12-12 17:44   ` Theodore Ts'o
2013-12-11  1:19 ` [PATCH 11/74] misc: use the checksum predicate function, not raw flag tests Darrick J. Wong
2013-12-13  4:34   ` Theodore Ts'o
2013-12-11  1:19 ` [PATCH 12/74] libext2fs: make symlinks safe for 64bit blocks and extents Darrick J. Wong
2013-12-12 17:48   ` Theodore Ts'o
2013-12-11  1:19 ` [PATCH 13/74] debugfs: handle 64bit block numbers Darrick J. Wong
2013-12-12 17:49   ` Theodore Ts'o
2013-12-17 17:01   ` Eric Sandeen
2013-12-11  1:19 ` [PATCH 14/74] libext2fs: fileio should use 64bit io routines Darrick J. Wong
2013-12-12 17:50   ` Theodore Ts'o
2013-12-11  1:20 ` [PATCH 15/74] resize2fs: rewrite extent/dir/ea block checksums when migrating Darrick J. Wong
2013-12-13  4:35   ` Theodore Ts'o
2013-12-11  1:20 ` [PATCH 16/74] debugfs: don't leak fd when calling dump_file Darrick J. Wong
2013-12-12 17:51   ` Theodore Ts'o
2013-12-11  1:20 ` [PATCH 17/74] debugfs: don't leak mmp_s memory Darrick J. Wong
2013-12-12 17:52   ` Theodore Ts'o
2013-12-12 22:33   ` Andreas Dilger
2013-12-12 22:44     ` Darrick J. Wong
2013-12-11  1:20 ` [PATCH 18/74] e2fsck: fix memory leaks Darrick J. Wong
2013-12-12 17:58   ` Theodore Ts'o
2013-12-17 16:12   ` Eric Sandeen
2013-12-11  1:20 ` [PATCH 19/74] misc: don't leak file descriptors Darrick J. Wong
2013-12-12 18:06   ` Theodore Ts'o
2013-12-11  1:20 ` [PATCH 20/74] mke2fs: don't leak memory Darrick J. Wong
2013-12-12 18:07   ` Theodore Ts'o
2013-12-11  1:20 ` [PATCH 21/74] e4defrag: don't crash if umounts the filesystem races with us Darrick J. Wong
2013-12-12 18:08   ` Theodore Ts'o
2013-12-11  1:20 ` [PATCH 22/74] e4defrag: defensively check results of sysconf(_SC_PAGESIZE) Darrick J. Wong
2013-12-12 18:09   ` Theodore Ts'o
2013-12-11  1:20 ` [PATCH 23/74] e2image: check return value from check_if_mounted Darrick J. Wong
2013-12-12 18:09   ` Theodore Ts'o
2013-12-11  1:20 ` [PATCH 24/74] dumpe2fs: check return values Darrick J. Wong
2013-12-12 18:10   ` Theodore Ts'o
2013-12-11  1:21 ` [PATCH 25/74] libss: fix fd error handling Darrick J. Wong
2013-12-12 18:11   ` Theodore Ts'o
2013-12-11  1:21 ` [PATCH 26/74] libss: fix memory handling errors Darrick J. Wong
2013-12-12 18:13   ` Theodore Ts'o
2013-12-17 17:04   ` Eric Sandeen
2013-12-18 22:23     ` Darrick J. Wong
2013-12-11  1:21 ` [PATCH 27/74] libquota: fix memory leak Darrick J. Wong
2013-12-12 18:14   ` Theodore Ts'o
2013-12-11  1:21 ` [PATCH 28/74] libext2fs: check return values Darrick J. Wong
2013-12-12 18:15   ` Theodore Ts'o
2013-12-17 16:57   ` Eric Sandeen
2013-12-17 16:59     ` Eric Sandeen
2013-12-11  1:21 ` [PATCH 29/74] libext2fs: fix memory leaks Darrick J. Wong
2013-12-12 18:17   ` Theodore Ts'o
2013-12-11  1:21 ` [PATCH 30/74] libext2fs: fix a broken close() test Darrick J. Wong
2013-12-12 18:18   ` Theodore Ts'o
2013-12-11  1:21 ` [PATCH 31/74] libext2fs: fail fileio write if we can't allocate a block Darrick J. Wong
2013-12-12 18:23   ` Theodore Ts'o
2013-12-11  1:21 ` [PATCH 32/74] libext2fs: fix punching extents when there are no left extents Darrick J. Wong
2013-12-12 18:25   ` Theodore Ts'o
2013-12-11  1:22 ` [PATCH 33/74] libext2fs: don't error out when punching a totally sparse file Darrick J. Wong
2013-12-12 18:26   ` Theodore Ts'o
2013-12-11  1:22 ` [PATCH 34/74] e2fsck: in rehash, mark newly allocated extent blocks as found Darrick J. Wong
2013-12-12 18:27   ` Theodore Ts'o
2013-12-11  1:22 ` [PATCH 35/74] libext2fs: zero block contents past EOF when setting size Darrick J. Wong
2013-12-12 18:40   ` Theodore Ts'o
2013-12-11  1:22 ` [PATCH 36/74] libext2fs: detect Darrick J. Wong
2013-12-13  4:39   ` Theodore Ts'o
2014-01-15 21:00     ` Darrick J. Wong
2013-12-11  1:22 ` [PATCH 37/74] libext2fs: don't always read backup group descriptors on a 1k-block meta_bg fs Darrick J. Wong
2014-01-11 18:59   ` Theodore Ts'o
2013-12-11  1:22 ` [PATCH 38/74] libext2fs: mark group data blocks when loading block bitmap Darrick J. Wong
2014-01-11 19:08   ` Theodore Ts'o
2013-12-11  1:22 ` [PATCH 39/74] e2fsck: remove uninit block bitmap calculation Darrick J. Wong
2014-01-11 19:08   ` Theodore Ts'o
2013-12-11  1:22 ` [PATCH 40/74] libext2fs: no need to clear BLOCK_UNINIT during ext2fs_reserve_super_and_bgd Darrick J. Wong
2014-01-10  8:17   ` Akira Fujita
2014-01-11 19:18   ` Theodore Ts'o
2013-12-11  1:22 ` [PATCH 41/74] tests: adjust test output to reflect block_uninit calculated block bitmaps Darrick J. Wong
2014-01-11 19:19   ` Theodore Ts'o
2013-12-11  1:22 ` [PATCH 42/74] libext2fs: only punch complete clusters Darrick J. Wong
2013-12-16  4:52   ` Theodore Ts'o
2013-12-11  1:23 ` [PATCH 43/74] libext2fs: don't update the summary counts when doing implied cluster allocation Darrick J. Wong
2013-12-16  4:53   ` Theodore Ts'o
2013-12-11  1:23 ` [PATCH 44/74] e2fsck: only release clusters when shortening a directory during a rehash Darrick J. Wong
2013-12-16  4:55   ` Theodore Ts'o
2013-12-11  1:23 ` [PATCH 45/74] e2fsck: print cluster ranges when encountering bitmap errors Darrick J. Wong
2013-12-16  4:55   ` Theodore Ts'o
2013-12-11  1:23 ` [PATCH 46/74] e2fsck: try implied cluster allocation when expanding a dir Darrick J. Wong
2013-12-16  4:56   ` Theodore Ts'o
2013-12-11  1:23 ` [PATCH 47/74] resize2fs: during shrink, don't free in-use bg data clusters Darrick J. Wong
2013-12-16  5:01   ` Theodore Ts'o
2013-12-16 20:10     ` Darrick J. Wong
2014-02-24  1:39       ` Theodore Ts'o
2013-12-11  1:23 ` [PATCH 48/74] resize2fs: don't free in-use clusters when moving blocks Darrick J. Wong
2014-02-24  1:56   ` Theodore Ts'o
2013-12-11  1:23 ` [PATCH 49/74] mke2fs: set block_validity as a default mount option Darrick J. Wong
2013-12-11  1:23 ` [PATCH 50/74] libext2fs: support allocating uninit blocks in bmap2() Darrick J. Wong
2014-01-11 22:57   ` Theodore Ts'o
2014-01-15 21:11     ` Darrick J. Wong
2014-01-15 22:19       ` Theodore Ts'o [this message]
2014-01-15 22:23         ` Theodore Ts'o
2013-12-11  1:23 ` [PATCH 51/74] libext2fs: file IO routines should handle uninit blocks Darrick J. Wong
2013-12-11  1:24 ` [PATCH 52/74] resize2fs: convert fs to and from 64bit mode Darrick J. Wong
2013-12-11  1:24 ` [PATCH 53/74] resize2fs: when toggling 64bit, don't free in-use bg data clusters Darrick J. Wong
2013-12-11  1:24 ` [PATCH 54/74] resize2fs: adjust reserved_gdt_blocks when changing group descriptor size Darrick J. Wong
2013-12-11  1:24 ` [PATCH 55/74] libext2fs: support modifying arbitrary extended attributes Darrick J. Wong
2014-02-24  4:09   ` Theodore Ts'o
2013-12-11  1:24 ` [PATCH 56/74] libext2fs: various tweaks to the xattr editor APIs Darrick J. Wong
2014-02-24  4:10   ` Theodore Ts'o
2013-12-11  1:24 ` [PATCH 57/74] libext2fs: extend xattr api to query number of attrs Darrick J. Wong
2014-02-24  4:10   ` Theodore Ts'o
2013-12-11  1:24 ` [PATCH 58/74] libext2fs: free key/value pairs before reading Darrick J. Wong
2014-02-24  4:10   ` Theodore Ts'o
2013-12-11  1:24 ` [PATCH 59/74] debugfs: dump all extended attributes Darrick J. Wong
2014-02-24  4:10   ` Theodore Ts'o
2013-12-11  1:24 ` [PATCH 60/74] libext2fs: ensure that inline data is always written to ibody Darrick J. Wong
2013-12-11  1:25 ` [PATCH 61/74] libext2fs: fix ext2fs_open2() truncation of the superblock parameter Darrick J. Wong
2013-12-11  1:25 ` [PATCH 62/74] misc: add fuse2fs, a FUSE server for e2fsprogs Darrick J. Wong
2013-12-11  1:25 ` [PATCH 63/74] fuse2fs: translate ACL structures Darrick J. Wong
2013-12-11  1:25 ` [PATCH 64/74] Subject: [PATCH] fuse2fs: support allocating uninit blocks in fallocate Darrick J. Wong
2013-12-11  1:25 ` [PATCH 65/74] fuse2fs: handle 64-bit dates correctly Darrick J. Wong
2013-12-11  1:25 ` [PATCH 67/74] tests: check correct handling of reading and writing uninit extents Darrick J. Wong
2013-12-11  1:25 ` [PATCH 68/74] tests: Add block_validity speed test Darrick J. Wong
2013-12-11  1:26 ` [PATCH 69/74] Subject: [PATCH] tests: test what happens if we run out of space Darrick J. Wong
2013-12-11  1:26 ` [PATCH 70/74] tests: add stale data after truncate test Darrick J. Wong
2013-12-11  1:26 ` [PATCH 71/74] tests: check mapping of really high logical block offsets Darrick J. Wong
2013-12-11  1:26 ` [PATCH 72/74] Subject: [PATCH] tests: enable using fuse2fs with metadata checksum test Darrick J. Wong
2013-12-11  1:26 ` [PATCH 73/74] tests: add large symlink test Darrick J. Wong
2013-12-11  1:26 ` [PATCH 74/74] tests: test date handling Darrick J. Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140115221951.GA12226@thunk.org \
    --to=tytso@mit.edu \
    --cc=darrick.wong@oracle.com \
    --cc=linux-ext4@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).