linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] basic delayed allocation in ext4
@ 2007-07-26  9:00 Alex Tomas
  2007-07-26 11:35 ` Aneesh Kumar K.V
  0 siblings, 1 reply; 3+ messages in thread
From: Alex Tomas @ 2007-07-26  9:00 UTC (permalink / raw)
  To: ext4 development, linux-fsdevel

Good day,

please review ...

thanks, Alex

Basic delayed allocation in ext4

Two special ->get_block() methods are introduced:

  * ext4_da_get_block_prep()
    to be used with ->prepare_write(), defers allocation till flush
  * ext4_da_get_block_write()
    to be used with mpage_da_writepages(), allocates blocks and corrects the on-disk size

The current implementation works with data=writeback only; mount the
filesystem with the delalloc,data=writeback options.

TODO:
  * reservation
  * data=ordered
  * quota
  * bmap

Signed-off-by: Alex Tomas <alex@clusterfs.com>


Index: linux-2.6.22/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.22.orig/include/linux/ext4_fs.h	2007-07-26 12:30:25.000000000 +0400
+++ linux-2.6.22/include/linux/ext4_fs.h	2007-07-26 12:32:04.000000000 +0400
@@ -488,6 +488,7 @@ do {									       \
  #define EXT4_MOUNT_EXTENTS		0x400000 /* Extents support */
  #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
  #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
+#define EXT4_MOUNT_DELALLOC		0x2000000 /* Delalloc support */
  /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
  #ifndef _LINUX_EXT2_FS_H
  #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
Index: linux-2.6.22/fs/ext4/super.c
===================================================================
--- linux-2.6.22.orig/fs/ext4/super.c	2007-07-26 12:30:25.000000000 +0400
+++ linux-2.6.22/fs/ext4/super.c	2007-07-26 12:32:04.000000000 +0400
@@ -728,7 +728,7 @@ enum {
  	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
  	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
  	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-	Opt_grpquota, Opt_extents, Opt_noextents,
+	Opt_grpquota, Opt_extents, Opt_noextents, Opt_delalloc,
  };

  static match_table_t tokens = {
@@ -782,6 +782,7 @@ static match_table_t tokens = {
  	{Opt_barrier, "barrier=%u"},
  	{Opt_extents, "extents"},
  	{Opt_noextents, "noextents"},
+	{Opt_delalloc, "delalloc"},
  	{Opt_err, NULL},
  	{Opt_resize, "resize"},
  };
@@ -1127,6 +1128,9 @@ clear_qf_name:
  		case Opt_noextents:
  			clear_opt (sbi->s_mount_opt, EXTENTS);
  			break;
+		case Opt_delalloc:
+			set_opt (sbi->s_mount_opt, DELALLOC);
+			break;
  		default:
  			printk (KERN_ERR
  				"EXT4-fs: Unrecognized mount option \"%s\" "
Index: linux-2.6.22/fs/ext4/inode.c
===================================================================
--- linux-2.6.22.orig/fs/ext4/inode.c	2007-07-26 12:30:22.000000000 +0400
+++ linux-2.6.22/fs/ext4/inode.c	2007-07-26 12:32:04.000000000 +0400
@@ -39,6 +39,8 @@
  #include "xattr.h"
  #include "acl.h"

+static void ext4_invalidatepage(struct page *page, unsigned long offset);
+
  /*
   * Test whether an inode is a fast symlink.
   */
@@ -1291,6 +1293,142 @@ static int ext4_journalled_commit_write(
  }

  /*
+ * this is a special callback for ->prepare_write() only
+ * its intention is to return a mapped block or reserve space
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+				struct buffer_head *bh_result, int create)
+{
+	int ret = 0;
+
+	BUG_ON(create == 0);
+	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+	/* first, we need to know whether the block is allocated already
+	 * XXX: when the filesystem has a lot of free blocks, we could
+	 * reserve even allocated blocks to save this lookup */
+	ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0);
+	if (ret >= 0) {
+		if (buffer_mapped(bh_result)) {
+			bh_result->b_size = (ret << inode->i_blkbits);
+		} else {
+			/* OK, the block isn't allocated yet, let's reserve space */
+			/* XXX: call reservation here */
+			/* XXX: __block_prepare_write() unmaps passed block, is it OK? */
+			map_bh(bh_result, inode->i_sb, 0);
+			set_buffer_new(bh_result);
+			set_buffer_delay(bh_result);
+		}
+		ret = 0;
+	}
+
+	return ret;
+}
+
+
+static int ext4_da_prepare_write(struct file *file, struct page *page,
+			      		unsigned from, unsigned to)
+{
+	return block_prepare_write(page, from, to, ext4_da_get_block_prep);
+}
+
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+				struct buffer_head *bh_result, int create)
+{
+	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	loff_t disksize = EXT4_I(inode)->i_disksize;
+	handle_t *handle = NULL;
+
+	if (create) {
+		handle = ext4_journal_start(inode, needed_blocks);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+	}
+
+	ret = ext4_get_blocks_wrap(handle, inode, iblock,
+				max_blocks, bh_result, create, 0);
+	if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+
+		/*
+		 * Update on-disk size along with block allocation
+		 * we don't use 'extend_disksize' as size may change
+		 * within already allocated block -bzzz
+		 */
+		disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+		if (disksize > i_size_read(inode))
+			disksize = i_size_read(inode);
+		if (disksize > EXT4_I(inode)->i_disksize) {
+			/*
+			 * XXX: replace with spinlock if seen contended -bzzz
+			 */
+			mutex_lock(&EXT4_I(inode)->truncate_mutex);
+			if (disksize > EXT4_I(inode)->i_disksize)
+				EXT4_I(inode)->i_disksize = disksize;
+			mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+
+			if (EXT4_I(inode)->i_disksize == disksize) {
+				if (handle == NULL)
+					handle = ext4_journal_start(inode, 1);
+				if (!IS_ERR(handle))
+					ext4_mark_inode_dirty(handle, inode);
+			}
+		}
+
+		ret = 0;
+	}
+
+out:
+	if (handle && !IS_ERR(handle))
+		ext4_journal_stop(handle);
+
+	return ret;
+}
+
+static int ext4_da_writepages(struct address_space *mapping,
+				struct writeback_control *wbc)
+{
+	return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
+}
+
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct buffer_head *head, *bh;
+	unsigned int curr_off = 0;
+
+	/*
+	 * Drop reserved blocks
+	 */
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		goto out;
+
+	bh = head = page_buffers(page);
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+
+		/*
+		 * is this block fully invalidated?
+		 */
+		if (offset <= curr_off && buffer_delay(bh)) {
+			clear_buffer_delay(bh);
+			/* XXX: add real stuff here */
+		}
+		curr_off = next_off;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+out:
+	ext4_invalidatepage(page, offset);
+
+	return;
+}
+
+
+/*
   * bmap() is special.  It gets used by applications such as lilo and by
   * the swapper to find the on-disk block of a specific piece of data.
   *
@@ -1741,10 +1879,28 @@ static const struct address_space_operat
  	.releasepage	= ext4_releasepage,
  };

+static const struct address_space_operations ext4_da_aops = {
+	.readpage	= ext4_readpage,
+	.readpages	= ext4_readpages,
+	.writepage	= ext4_writeback_writepage,
+	.writepages	= ext4_da_writepages,
+	.sync_page	= block_sync_page,
+	.prepare_write	= ext4_da_prepare_write,
+	.commit_write	= generic_commit_write,
+	.bmap		= ext4_bmap,
+	.invalidatepage	= ext4_da_invalidatepage,
+	.releasepage	= ext4_releasepage,
+	.direct_IO	= ext4_direct_IO,
+	.migratepage	= buffer_migrate_page,
+};
+
  void ext4_set_aops(struct inode *inode)
  {
  	if (ext4_should_order_data(inode))
  		inode->i_mapping->a_ops = &ext4_ordered_aops;
+	else if (ext4_should_writeback_data(inode) &&
+			test_opt(inode->i_sb, DELALLOC))
+		inode->i_mapping->a_ops = &ext4_da_aops;
  	else if (ext4_should_writeback_data(inode))
  		inode->i_mapping->a_ops = &ext4_writeback_aops;
  	else

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [RFC] basic delayed allocation in ext4
  2007-07-26  9:00 [RFC] basic delayed allocation in ext4 Alex Tomas
@ 2007-07-26 11:35 ` Aneesh Kumar K.V
  2007-07-26 11:50   ` Alex Tomas
  0 siblings, 1 reply; 3+ messages in thread
From: Aneesh Kumar K.V @ 2007-07-26 11:35 UTC (permalink / raw)
  To: Alex Tomas; +Cc: ext4 development, linux-fsdevel



Alex Tomas wrote:
> Good day,
> 
> please review ...
> 
> thanks, Alex
> 
> Basic delayed allocation in ext4
> 
> Two special ->get_block() methods are introduced:
> 
>  * ext4_da_get_block_prep()
>    to be used with ->prepare_write(), defers allocation till flush
>  * ext4_da_get_block_write()
>    to be used with mpage_da_writepages(), allocate blocks and correct 
> on-disk size
> 
> Current implementation works with data=writeback only, you should
> mount filesystem with delalloc,data=writeback options.
> 
> TODO:
>  * reservation
>  * data=ordered
>  * quota
>  * bmap
> 
> Signed-off-by: Alex Tomas <alex@clusterfs.com>
> 


[.. snip...]

>  /*
>   * Test whether an inode is a fast symlink.
>   */
> @@ -1291,6 +1293,142 @@ static int ext4_journalled_commit_write(
>  }
> 
>  /*
> + * this is a special callback for ->prepare_write() only
> + * its intention is to return a mapped block or reserve space
> + */
> +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
> +                struct buffer_head *bh_result, int create)
> +{
> +    int ret = 0;
> +
> +    BUG_ON(create == 0);
> +    BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
> +
> +    /* first, we need to know whether the block is allocated already
> +     * XXX: when the filesystem has a lot of free blocks, we could
> +     * reserve even allocated blocks to save this lookup */
> +    ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0);
> +    if (ret >= 0) {


I guess this should be (ret > 0)


> +        if (buffer_mapped(bh_result)) {
> +            bh_result->b_size = (ret << inode->i_blkbits);
> +        } else {
> +            /* OK, the block isn't allocated yet, let's reserve space */
> +            /* XXX: call reservation here */



[...snip..]

> +static int ext4_da_prepare_write(struct file *file, struct page *page,
> +                          unsigned from, unsigned to)
> +{
> +    return block_prepare_write(page, from, to, ext4_da_get_block_prep);
> +}
> +
> +    return ret;
> +}
> +
> +static int ext4_da_writepages(struct address_space *mapping,
> +                struct writeback_control *wbc)
> +{
> +    return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
> +}


I was not able to find mpage_da_writepages()..



-aneesh

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [RFC] basic delayed allocation in ext4
  2007-07-26 11:35 ` Aneesh Kumar K.V
@ 2007-07-26 11:50   ` Alex Tomas
  0 siblings, 0 replies; 3+ messages in thread
From: Alex Tomas @ 2007-07-26 11:50 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: ext4 development, linux-fsdevel

Aneesh Kumar K.V wrote:
>> +    /* first, we need to know whether the block is allocated already
>> +     * XXX: when the filesystem has a lot of free blocks, we could
>> +     * reserve even allocated blocks to save this lookup */
>> +    ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 
>> 0);
>> +    if (ret >= 0) {
> 
> 
> I guess this should be (ret > 0)

well, no. it has to catch allocated and non-allocated blocks
(though it can probably be written more clearly)

> I was not able to find mpage_da_writepages()..

sorry, it's part of vfs delayed allocation patch, should be on the list now.

thanks, Alex

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2007-07-26 11:50 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-07-26  9:00 [RFC] basic delayed allocation in ext4 Alex Tomas
2007-07-26 11:35 ` Aneesh Kumar K.V
2007-07-26 11:50   ` Alex Tomas

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).