From: Alex Tomas <alex@clusterfs.com>
To: ext4 development <linux-ext4@vger.kernel.org>,
linux-fsdevel@vger.kernel.org
Subject: [RFC] basic delayed allocation in ext4
Date: Thu, 26 Jul 2007 13:00:04 +0400 [thread overview]
Message-ID: <46A86294.6050608@clusterfs.com> (raw)
Good day,
please review ...
thanks, Alex
Basic delayed allocation in ext4
Two special ->get_block() methods are introduced:
* ext4_da_get_block_prep()
to be used with ->prepare_write(), defers allocation till flush
* ext4_da_get_block_write()
to be used with mpage_da_writepages(), allocates blocks and corrects the on-disk size
The current implementation works with data=writeback only; you should
mount the filesystem with the delalloc,data=writeback options.
TODO:
* reservation
* data=ordered
* quota
* bmap
Signed-off-by: Alex Tomas <alex@clusterfs.com>
Index: linux-2.6.22/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.22.orig/include/linux/ext4_fs.h 2007-07-26 12:30:25.000000000 +0400
+++ linux-2.6.22/include/linux/ext4_fs.h 2007-07-26 12:32:04.000000000 +0400
@@ -488,6 +488,7 @@ do { \
#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
+#define EXT4_MOUNT_DELALLOC 0x2000000 /* Delalloc support */
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
Index: linux-2.6.22/fs/ext4/super.c
===================================================================
--- linux-2.6.22.orig/fs/ext4/super.c 2007-07-26 12:30:25.000000000 +0400
+++ linux-2.6.22/fs/ext4/super.c 2007-07-26 12:32:04.000000000 +0400
@@ -728,7 +728,7 @@ enum {
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
- Opt_grpquota, Opt_extents, Opt_noextents,
+ Opt_grpquota, Opt_extents, Opt_noextents, Opt_delalloc,
};
static match_table_t tokens = {
@@ -782,6 +782,7 @@ static match_table_t tokens = {
{Opt_barrier, "barrier=%u"},
{Opt_extents, "extents"},
{Opt_noextents, "noextents"},
+ {Opt_delalloc, "delalloc"},
{Opt_err, NULL},
{Opt_resize, "resize"},
};
@@ -1127,6 +1128,9 @@ clear_qf_name:
case Opt_noextents:
clear_opt (sbi->s_mount_opt, EXTENTS);
break;
+ case Opt_delalloc:
+ set_opt (sbi->s_mount_opt, DELALLOC);
+ break;
default:
printk (KERN_ERR
"EXT4-fs: Unrecognized mount option \"%s\" "
Index: linux-2.6.22/fs/ext4/inode.c
===================================================================
--- linux-2.6.22.orig/fs/ext4/inode.c 2007-07-26 12:30:22.000000000 +0400
+++ linux-2.6.22/fs/ext4/inode.c 2007-07-26 12:32:04.000000000 +0400
@@ -39,6 +39,8 @@
#include "xattr.h"
#include "acl.h"
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
+
/*
* Test whether an inode is a fast symlink.
*/
@@ -1291,6 +1293,142 @@ static int ext4_journalled_commit_write(
}
/*
+ * ext4_da_get_block_prep - get_block callback for ->prepare_write() only
+ *
+ * Its intention is to return the mapped block or to reserve space for a
+ * block that will be allocated later.
+ *
+ * The block is first looked up with no journal handle and create == 0,
+ * so nothing is allocated here.  If the block is already mapped,
+ * bh_result->b_size is set to the length of the mapping that was found.
+ * Otherwise bh_result is mapped to block 0 and flagged both "new" and
+ * "delay"; no space is actually reserved yet (see the XXX notes below).
+ *
+ * Callers must pass create != 0 and a bh_result whose b_size equals the
+ * filesystem block size; both conditions are enforced with BUG_ON().
+ *
+ * Returns 0 on success, or the negative value returned by the lookup.
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+
+ BUG_ON(create == 0);
+ BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+ /* first, we need to know whether the block is allocated already;
+ * this is a pure lookup of a single block (create == 0, no handle)
+ * XXX: when the filesystem has a lot of free blocks, we could
+ * reserve even allocated blocks to save this lookup */
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0);
+ if (ret >= 0) {
+ if (buffer_mapped(bh_result)) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ } else {
+ /* OK, the block isn't allocated yet, let's reserve space */
+ /* XXX: call reservation here */
+ /* XXX: __block_prepare_write() unmaps passed block, is it OK? */
+ map_bh(bh_result, inode->i_sb, 0);
+ set_buffer_new(bh_result);
+ set_buffer_delay(bh_result);
+ }
+ ret = 0; /* the block count from the lookup is not passed on */
+ }
+
+ return ret;
+}
+
+
+/*
+ * ->prepare_write() for delayed allocation: hands the page range to
+ * block_prepare_write() with ext4_da_get_block_prep() as the get_block
+ * callback, so no blocks are allocated at this point.  The file argument
+ * is not used.
+ */
+static int ext4_da_prepare_write(struct file *file, struct page *page,
+				  unsigned from, unsigned to)
+{
+	int err;
+
+	err = block_prepare_write(page, from, to, ext4_da_get_block_prep);
+	return err;
+}
+
+/*
+ * ext4_da_get_block_write - get_block callback for mpage_da_writepages()
+ *
+ * Maps up to bh_result->b_size bytes starting at iblock.  When create is
+ * non-zero a journal handle sized by ext4_writepage_trans_blocks() is
+ * started first, so the mapping call may allocate blocks.
+ *
+ * If blocks were mapped, bh_result->b_size is set to the mapped length
+ * and the on-disk size is advanced to the end of the mapped range,
+ * capped at i_size.  When this call is the one that raised i_disksize,
+ * the inode is marked dirty so the new size reaches the journal.
+ *
+ * Returns 0 on success (also when nothing was mapped), or a negative
+ * error from starting the journal handle, from the mapping call, or
+ * from marking the inode dirty.
+ */
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+				    struct buffer_head *bh_result, int create)
+{
+	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	loff_t disksize;
+	handle_t *handle = NULL;
+
+	if (create) {
+		handle = ext4_journal_start(inode, needed_blocks);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+	}
+
+	ret = ext4_get_blocks_wrap(handle, inode, iblock,
+				   max_blocks, bh_result, create, 0);
+	if (ret <= 0)
+		goto out;
+
+	bh_result->b_size = (ret << inode->i_blkbits);
+
+	/*
+	 * Update on-disk size along with block allocation
+	 * we don't use 'extend_disksize' as size may change
+	 * within already allocated block -bzzz
+	 */
+	disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+	if (disksize > i_size_read(inode))
+		disksize = i_size_read(inode);
+
+	ret = 0;
+	if (disksize <= EXT4_I(inode)->i_disksize)
+		goto out;
+
+	/*
+	 * XXX: replace with spinlock if seen contended -bzzz
+	 */
+	mutex_lock(&EXT4_I(inode)->truncate_mutex);
+	if (disksize > EXT4_I(inode)->i_disksize)
+		EXT4_I(inode)->i_disksize = disksize;
+	mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+
+	/* somebody else moved i_disksize further; leave the update to them */
+	if (EXT4_I(inode)->i_disksize != disksize)
+		goto out;
+
+	/* lookup-only callers (create == 0) have no handle yet */
+	if (handle == NULL) {
+		handle = ext4_journal_start(inode, 1);
+		if (IS_ERR(handle)) {
+			/* do not report success if the size cannot be journalled */
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+	}
+	ret = ext4_mark_inode_dirty(handle, inode);
+
+out:
+	if (handle && !IS_ERR(handle))
+		ext4_journal_stop(handle);
+
+	return ret;
+}
+
+/*
+ * ->writepages() for delayed allocation: delegates to
+ * mpage_da_writepages() with ext4_da_get_block_write() as the get_block
+ * callback and returns its result unchanged.
+ */
+static int ext4_da_writepages(struct address_space *mapping,
+			      struct writeback_control *wbc)
+{
+	int err;
+
+	err = mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
+	return err;
+}
+
+/*
+ * ->invalidatepage() for delayed allocation.
+ *
+ * Drops the "delay" flag from every buffer of the (locked) page that
+ * starts at or after 'offset', then passes the page on to
+ * ext4_invalidatepage().  Releasing the reserved blocks themselves is
+ * still to be done (see the XXX below).
+ */
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (page_has_buffers(page)) {
+		struct buffer_head *first = page_buffers(page);
+		struct buffer_head *cur = first;
+		unsigned int pos = 0;
+
+		do {
+			/*
+			 * a buffer that begins at or beyond 'offset' is
+			 * fully invalidated
+			 */
+			if (pos >= offset && buffer_delay(cur)) {
+				clear_buffer_delay(cur);
+				/* XXX: add real stuff here */
+			}
+			pos += cur->b_size;
+			cur = cur->b_this_page;
+		} while (cur != first);
+	}
+
+	ext4_invalidatepage(page, offset);
+}
+
+
+/*
* bmap() is special. It gets used by applications such as lilo and by
* the swapper to find the on-disk block of a specific piece of data.
*
@@ -1741,10 +1879,28 @@ static const struct address_space_operat
.releasepage = ext4_releasepage,
};
+static const struct address_space_operations ext4_da_aops = { /* installed by ext4_set_aops() for writeback-mode inodes when DELALLOC is set */
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_writeback_writepage,
+ .writepages = ext4_da_writepages, /* goes through mpage_da_writepages() */
+ .sync_page = block_sync_page,
+ .prepare_write = ext4_da_prepare_write, /* uses ext4_da_get_block_prep() as its get_block callback */
+ .commit_write = generic_commit_write,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_da_invalidatepage, /* clears buffer delay flags, then calls ext4_invalidatepage() */
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+};
+
void ext4_set_aops(struct inode *inode)
{
if (ext4_should_order_data(inode))
inode->i_mapping->a_ops = &ext4_ordered_aops;
+ else if (ext4_should_writeback_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
else if (ext4_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext4_writeback_aops;
else
next reply other threads:[~2007-07-26 9:00 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-07-26 9:00 Alex Tomas [this message]
2007-07-26 11:35 ` [RFC] basic delayed allocation in ext4 Aneesh Kumar K.V
2007-07-26 11:50 ` Alex Tomas
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=46A86294.6050608@clusterfs.com \
--to=alex@clusterfs.com \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.