* [RFC PATCH] ext4: Add ordered mode support for delalloc
@ 2008-06-12 15:25 Aneesh Kumar K.V
2008-06-13 20:53 ` Aneesh Kumar K.V
` (2 more replies)
0 siblings, 3 replies; 15+ messages in thread
From: Aneesh Kumar K.V @ 2008-06-12 15:25 UTC (permalink / raw)
To: cmm, tytso, sandeen; +Cc: linux-ext4, Aneesh Kumar K.V
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++--
fs/jbd2/commit.c | 41 ++++++++++++--
2 files changed, 198 insertions(+), 12 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 63355ab..7d87641 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
return !buffer_mapped(bh) || buffer_delay(bh);
}
-/* FIXME!! only support data=writeback mode */
/*
* get called vi ext4_da_writepages after taking page lock
* We may end up doing block allocation here in case
* mpage_da_map_blocks failed to allocate blocks.
*/
-static int ext4_da_writepage(struct page *page,
+static int ext4_da_writeback_writepage(struct page *page,
struct writeback_control *wbc)
{
int ret = 0;
@@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page,
return ret;
}
+/*
+ * get called vi ext4_da_writepages after taking page lock
+ * We may end up doing block allocation here in case
+ * mpage_da_map_blocks failed to allocate blocks.
+ *
+ * We also get called via journal_submit_inode_data_buffers
+ */
+static int ext4_da_ordered_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ loff_t size;
+ unsigned long len;
+ handle_t *handle = NULL;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;
+
+ handle = ext4_journal_current_handle();
+ if (!handle) {
+ /*
+ * This can happen when we aren't called via
+ * ext4_da_writepages() but directly (shrink_page_list).
+ * We cannot easily start a transaction here so we just skip
+ * writing the page in case we would have to do so.
+ */
+ size = i_size_read(inode);
+
+ page_bufs = page_buffers(page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (walk_page_buffers(NULL, page_bufs, 0,
+ len, NULL, ext4_bh_unmapped_or_delay)) {
+ /*
+ * We can't do block allocation under
+ * page lock without a handle . So redirty
+ * the page and return.
+ * We may reach here when we do a journal commit
+ * via journal_submit_inode_data_buffers.
+ * If we don't have mapping block we just ignore
+ * them
+ *
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+ ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
+
+ return ret;
+}
/*
* For now just follow the DIO way to estimate the max credits
@@ -1745,19 +1799,99 @@ static int ext4_da_writepages(struct address_space *mapping,
return ret;
}
+static int ext4_da_ordered_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ handle_t *handle = NULL;
+ int needed_blocks;
+ int ret = 0;
+ long to_write;
+ loff_t range_start = 0;
+
+
+ /*
+ * No pages to write? This is mainly a kludge to avoid starting
+ * a transaction for special inodes like journal inode on last iput()
+ * because that could violate lock ordering on umount
+ */
+ if (!mapping->nrpages)
+ return 0;
+
+ /*
+ * Estimate the worse case needed credits to write out
+ * EXT4_MAX_BUF_BLOCKS pages
+ */
+ needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+
+ to_write = wbc->nr_to_write;
+ if (!wbc->range_cyclic) {
+ /*
+ * If range_cyclic is not set force range_cont
+ * and save the old writeback_index
+ */
+ wbc->range_cont = 1;
+ range_start = wbc->range_start;
+ }
+
+ while (!ret && to_write) {
+ /* start a new transaction*/
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out_writepages;
+ }
+ /*
+ * set the max dirty pages could be write at a time
+ * to fit into the reserved transaction credits
+ */
+ if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+ wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+
+ to_write -= wbc->nr_to_write;
+ ret = mpage_da_writepages(mapping, wbc,
+ ext4_da_get_block_write);
+ ext4_journal_stop(handle);
+ if (wbc->nr_to_write) {
+ /*
+ * There is no more writeout needed
+ * or we requested for a noblocking writeout
+ * and we found the device congested
+ */
+ to_write += wbc->nr_to_write;
+ break;
+ }
+ wbc->nr_to_write = to_write;
+ }
+
+out_writepages:
+ wbc->nr_to_write = to_write;
+ if (range_start)
+ wbc->range_start = range_start;
+ return ret;
+}
+
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- int ret;
+ int ret, retries = 0;
struct page *page;
pgoff_t index;
unsigned from, to;
+ struct inode *inode = mapping->host;
index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
+retry:
page = __grab_cache_page(mapping, index);
if (!page)
return -ENOMEM;
@@ -1770,6 +1904,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
page_cache_release(page);
}
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
return ret;
}
@@ -2224,10 +2361,10 @@ static int ext4_journalled_set_page_dirty(struct page *page)
.releasepage = ext4_releasepage,
};
-static const struct address_space_operations ext4_da_aops = {
+static const struct address_space_operations ext4_da_writeback_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_da_writepage,
+ .writepage = ext4_da_writeback_writepage,
.writepages = ext4_da_writepages,
.sync_page = block_sync_page,
.write_begin = ext4_da_write_begin,
@@ -2239,13 +2376,31 @@ static int ext4_journalled_set_page_dirty(struct page *page)
.migratepage = buffer_migrate_page,
};
+static const struct address_space_operations ext4_da_ordered_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_da_ordered_writepage,
+ .writepages = ext4_da_ordered_writepages,
+ .sync_page = block_sync_page,
+ .write_begin = ext4_da_write_begin,
+ .write_end = generic_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_da_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+};
+
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode))
+ if (ext4_should_order_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_ordered_aops;
+ else if (ext4_should_order_data(inode))
inode->i_mapping->a_ops = &ext4_ordered_aops;
else if (ext4_should_writeback_data(inode) &&
test_opt(inode->i_sb, DELALLOC))
- inode->i_mapping->a_ops = &ext4_da_aops;
+ inode->i_mapping->a_ops = &ext4_da_writeback_aops;
else if (ext4_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext4_writeback_aops;
else
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 483183d..32ca3c3 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -185,6 +187,30 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
}
/*
+ * write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with dealyed allocation we may be doing
+ * block allocation in writepages().
+ */
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
+{
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = mapping->nrpages * 2,
+ .range_start = 0,
+ .range_end = i_size_read(mapping->host),
+ .for_writepages = 1,
+ };
+
+ if (!mapping_cap_writeback_dirty(mapping))
+ return 0;
+
+ ret = generic_writepages(mapping, &wbc);
+ return ret;
+}
+
+/*
* Submit all the data buffers of inode associated with the transaction to
* disk.
*
@@ -192,7 +218,7 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
* our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
* operate on from being released while we write out pages.
*/
-static int journal_submit_inode_data_buffers(journal_t *journal,
+static int journal_submit_data_buffers(journal_t *journal,
transaction_t *commit_transaction)
{
struct jbd2_inode *jinode;
@@ -204,8 +230,13 @@ static int journal_submit_inode_data_buffers(journal_t *journal,
mapping = jinode->i_vfs_inode->i_mapping;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
- err = filemap_fdatawrite_range(mapping, 0,
- i_size_read(jinode->i_vfs_inode));
+ /*
+ * submit the inode data buffers. We use writepage
+ * instead of writepages. Because writepages can do
+ * block allocation with delalloc. We need to write
+ * only allocated blocks here.
+ */
+ err = journal_submit_inode_data_buffers(mapping);
if (!ret)
ret = err;
spin_lock(&journal->j_list_lock);
@@ -228,7 +259,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
struct jbd2_inode *jinode, *next_i;
int err, ret = 0;
- /* For locking, see the comment in journal_submit_inode_data_buffers() */
+ /* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
jinode->i_flags |= JI_COMMIT_RUNNING;
@@ -431,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
- err = journal_submit_inode_data_buffers(journal, commit_transaction);
+ err = journal_submit_data_buffers(journal, commit_transaction);
if (err)
jbd2_journal_abort(journal, err);
--
1.5.6.rc2.15.g457bb.dirty
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [RFC PATCH] ext4: Add ordered mode support for delalloc 2008-06-12 15:25 [RFC PATCH] ext4: Add ordered mode support for delalloc Aneesh Kumar K.V @ 2008-06-13 20:53 ` Aneesh Kumar K.V 2008-06-13 23:01 ` Mingming 2008-06-16 15:05 ` [RFC] ext4: Semantics of delalloc,data=ordered Jan Kara 2 siblings, 0 replies; 15+ messages in thread From: Aneesh Kumar K.V @ 2008-06-13 20:53 UTC (permalink / raw) To: cmm, tytso, sandeen; +Cc: linux-ext4 On Thu, Jun 12, 2008 at 08:55:16PM +0530, Aneesh Kumar K.V wrote: > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> > --- > fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > fs/jbd2/commit.c | 41 ++++++++++++-- > 2 files changed, 198 insertions(+), 12 deletions(-) > I tested this with fsstress with fallocate, fsx-linux, fs_inode, ffsb, and generic cp test of / to the new filesystem. I will now do a crash test and will verify whether ordered behavior is working correctly. -aneesh ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] ext4: Add ordered mode support for delalloc 2008-06-12 15:25 [RFC PATCH] ext4: Add ordered mode support for delalloc Aneesh Kumar K.V 2008-06-13 20:53 ` Aneesh Kumar K.V @ 2008-06-13 23:01 ` Mingming 2008-06-14 6:34 ` Aneesh Kumar K.V 2008-06-16 15:05 ` [RFC] ext4: Semantics of delalloc,data=ordered Jan Kara 2 siblings, 1 reply; 15+ messages in thread From: Mingming @ 2008-06-13 23:01 UTC (permalink / raw) To: Aneesh Kumar K.V; +Cc: tytso, sandeen, linux-ext4 Thanks, some comments below... On Thu, 2008-06-12 at 20:55 +0530, Aneesh Kumar K.V wrote: > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> > --- > fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > fs/jbd2/commit.c | 41 ++++++++++++-- > 2 files changed, 198 insertions(+), 12 deletions(-) > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 63355ab..7d87641 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) > return !buffer_mapped(bh) || buffer_delay(bh); > } > > -/* FIXME!! only support data=writeback mode */ > /* > * get called vi ext4_da_writepages after taking page lock > * We may end up doing block allocation here in case > * mpage_da_map_blocks failed to allocate blocks. > */ > -static int ext4_da_writepage(struct page *page, > +static int ext4_da_writeback_writepage(struct page *page, > struct writeback_control *wbc) > { > int ret = 0; > @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page, > return ret; > } > > +/* > + * get called vi ext4_da_writepages after taking page lock > + * We may end up doing block allocation here in case > + * mpage_da_map_blocks failed to allocate blocks. > + * > + * We also get called via journal_submit_inode_data_buffers > + */ > +static int ext4_da_ordered_writepage(struct page *page, > + struct writeback_control *wbc) > +{ > + int ret = 0; > + loff_t size; > + unsigned long len; > + handle_t *handle = NULL; > + struct buffer_head *page_bufs; > + struct inode *inode = page->mapping->host; > + > + handle = ext4_journal_current_handle(); > + if (!handle) { > + /* > + * This can happen when we aren't called via > + * ext4_da_writepages() but directly (shrink_page_list). > + * We cannot easily start a transaction here so we just skip > + * writing the page in case we would have to do so. > + */ > + size = i_size_read(inode); > + > + page_bufs = page_buffers(page); > + if (page->index == size >> PAGE_CACHE_SHIFT) > + len = size & ~PAGE_CACHE_MASK; > + else > + len = PAGE_CACHE_SIZE; > + > + if (walk_page_buffers(NULL, page_bufs, 0, > + len, NULL, ext4_bh_unmapped_or_delay)) { > + /* > + * We can't do block allocation under > + * page lock without a handle . So redirty > + * the page and return. > + * We may reach here when we do a journal commit > + * via journal_submit_inode_data_buffers. > + * If we don't have mapping block we just ignore > + * them > + * > + */ > + redirty_page_for_writepage(wbc, page); > + unlock_page(page); > + return 0; > + } > + } > + It seems we missed to file the inode to the journal list before calling block_write_full_page(), since it's possible block_write_full_page() could do block allocation. something like this? + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) { + ext4_journal_stop(handle); + return ret; + } > + ret = block_write_full_page(page, ext4_da_get_block_write, wbc); > + > + return ret; > +} > It seems this code is duplicated from ext4_da_writeback_writepage()(except for the file inode to keep the ordering), is there any reason not to making it one function for both ordered mode and writeback mode? > /* > * For now just follow the DIO way to estimate the max credits > @@ -1745,19 +1799,99 @@ static int ext4_da_writepages(struct address_space *mapping, > return ret; > } > > +static int ext4_da_ordered_writepages(struct address_space *mapping, > + struct writeback_control *wbc) > +{ > + struct inode *inode = mapping->host; > + handle_t *handle = NULL; > + int needed_blocks; > + int ret = 0; > + long to_write; > + loff_t range_start = 0; > + > + > + /* > + * No pages to write? This is mainly a kludge to avoid starting > + * a transaction for special inodes like journal inode on last iput() > + * because that could violate lock ordering on umount > + */ > + if (!mapping->nrpages) > + return 0; > + > + /* > + * Estimate the worse case needed credits to write out > + * EXT4_MAX_BUF_BLOCKS pages > + */ > + needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; > + > + to_write = wbc->nr_to_write; > + if (!wbc->range_cyclic) { > + /* > + * If range_cyclic is not set force range_cont > + * and save the old writeback_index > + */ > + wbc->range_cont = 1; > + range_start = wbc->range_start; > + } > + > + while (!ret && to_write) { > + /* start a new transaction*/ > + handle = ext4_journal_start(inode, needed_blocks); > + if (IS_ERR(handle)) { > + ret = PTR_ERR(handle); > + goto out_writepages; > + } > + > + ret = ext4_jbd2_file_inode(handle, inode); > + if (ret) { > + ext4_journal_stop(handle); > + goto out_writepages; > + } > + /* > + * set the max dirty pages could be write at a time > + * to fit into the reserved transaction credits > + */ > + if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) > + wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; > + > + to_write -= wbc->nr_to_write; > + ret = mpage_da_writepages(mapping, wbc, > + ext4_da_get_block_write); > + ext4_journal_stop(handle); > + if (wbc->nr_to_write) { > + /* > + * There is no more writeout needed > + * or we requested for a noblocking writeout > + * and we found the device congested > + */ > + to_write += wbc->nr_to_write; > + break; > + } > + wbc->nr_to_write = to_write; > + } > + > +out_writepages: > + wbc->nr_to_write = to_write; > + if (range_start) > + wbc->range_start = range_start; > + return ret; > +} > + It seems this code is duplicated from ext4_da_writeback_writepages()also. The only part different is in ordered mode, we need to file the inode to the journal list to keep the ordering. I think we could use existing da_writepages() function for both ordered mode and writeback mode as well. > static int ext4_da_write_begin(struct file *file, struct address_space *mapping, > loff_t pos, unsigned len, unsigned flags, > struct page **pagep, void **fsdata) > { > - int ret; > + int ret, retries = 0; > struct page *page; > pgoff_t index; > unsigned from, to; > + struct inode *inode = mapping->host; > > index = pos >> PAGE_CACHE_SHIFT; > from = pos & (PAGE_CACHE_SIZE - 1); > to = from + len; > > +retry: > page = __grab_cache_page(mapping, index); > if (!page) > return -ENOMEM; > @@ -1770,6 +1904,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, > page_cache_release(page); > } > > + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) > + goto retry; > + > return ret; > } > In case of ENOSPC, instead of go back and try to do reservation (which may overestimate the total number of metablocks to reserve) again, I think we should not doing delayed allocation, instead call the real get_block() function to try the real block allocation. Just to clarify, this is not part of the ordered mode support, I think we should make a separate patch for this kind of improvement. > @@ -2224,10 +2361,10 @@ static int ext4_journalled_set_page_dirty(struct page *page) > .releasepage = ext4_releasepage, > }; > > -static const struct address_space_operations ext4_da_aops = { > +static const struct address_space_operations ext4_da_writeback_aops = { > .readpage = ext4_readpage, > .readpages = ext4_readpages, > - .writepage = ext4_da_writepage, > + .writepage = ext4_da_writeback_writepage, > .writepages = ext4_da_writepages, > .sync_page = block_sync_page, > .write_begin = ext4_da_write_begin, > @@ -2239,13 +2376,31 @@ static int ext4_journalled_set_page_dirty(struct page *page) > .migratepage = buffer_migrate_page, > }; > > +static const struct address_space_operations ext4_da_ordered_aops = { > + .readpage = ext4_readpage, > + .readpages = ext4_readpages, > + .writepage = ext4_da_ordered_writepage, > + .writepages = ext4_da_ordered_writepages, > + .sync_page = block_sync_page, > + .write_begin = ext4_da_write_begin, > + .write_end = generic_write_end, > + .bmap = ext4_bmap, > + .invalidatepage = ext4_da_invalidatepage, > + .releasepage = ext4_releasepage, > + .direct_IO = ext4_direct_IO, > + .migratepage = buffer_migrate_page, > +}; > + With the new ordered mode, we could share the same address space operations for delayed allocation over writeback and ordered mode. > void ext4_set_aops(struct inode *inode) > { > - if (ext4_should_order_data(inode)) > + if (ext4_should_order_data(inode) && > + test_opt(inode->i_sb, DELALLOC)) > + inode->i_mapping->a_ops = &ext4_da_ordered_aops; > + else if (ext4_should_order_data(inode)) > inode->i_mapping->a_ops = &ext4_ordered_aops; > else if (ext4_should_writeback_data(inode) && > test_opt(inode->i_sb, DELALLOC)) > - inode->i_mapping->a_ops = &ext4_da_aops; > + inode->i_mapping->a_ops = &ext4_da_writeback_aops; > else if (ext4_should_writeback_data(inode)) > inode->i_mapping->a_ops = &ext4_writeback_aops; > else > diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c > index 483183d..32ca3c3 100644 > --- a/fs/jbd2/commit.c > +++ b/fs/jbd2/commit.c > @@ -22,6 +22,8 @@ > #include <linux/pagemap.h> > #include <linux/jiffies.h> > #include <linux/crc32.h> > +#include <linux/writeback.h> > +#include <linux/backing-dev.h> > > /* > * Default IO end handler for temporary BJ_IO buffer_heads. > @@ -185,6 +187,30 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) > } > > /* > + * write the filemap data using writepage() address_space_operations. > + * We don't do block allocation here even for delalloc. We don't > + * use writepages() because with dealyed allocation we may be doing > + * block allocation in writepages(). > + */ > +static int journal_submit_inode_data_buffers(struct address_space *mapping) > +{ > + int ret; > + struct writeback_control wbc = { > + .sync_mode = WB_SYNC_ALL, > + .nr_to_write = mapping->nrpages * 2, > + .range_start = 0, > + .range_end = i_size_read(mapping->host), > + .for_writepages = 1, > + }; > + > + if (!mapping_cap_writeback_dirty(mapping)) > + return 0; > + > + ret = generic_writepages(mapping, &wbc); > + return ret; > +} > + > +/* > * Submit all the data buffers of inode associated with the transaction to > * disk. > * > @@ -192,7 +218,7 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) > * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently > * operate on from being released while we write out pages. > */ > -static int journal_submit_inode_data_buffers(journal_t *journal, > +static int journal_submit_data_buffers(journal_t *journal, > transaction_t *commit_transaction) > { > struct jbd2_inode *jinode; > @@ -204,8 +230,13 @@ static int journal_submit_inode_data_buffers(journal_t *journal, > mapping = jinode->i_vfs_inode->i_mapping; > jinode->i_flags |= JI_COMMIT_RUNNING; > spin_unlock(&journal->j_list_lock); > - err = filemap_fdatawrite_range(mapping, 0, > - i_size_read(jinode->i_vfs_inode)); > + /* > + * submit the inode data buffers. We use writepage > + * instead of writepages. Because writepages can do > + * block allocation with delalloc. We need to write > + * only allocated blocks here. > + */ Hmm, when writepage()->ext4_da_orderd_writepage() is called from here, the handle is expecting to be NULL? Otherwise block_write_full_page() could do block allocation, that's against the locking ordering...:( > + err = journal_submit_inode_data_buffers(mapping); > if (!ret) > ret = err; > spin_lock(&journal->j_list_lock); > @@ -228,7 +259,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, > struct jbd2_inode *jinode, *next_i; > int err, ret = 0; > > - /* For locking, see the comment in journal_submit_inode_data_buffers() */ > + /* For locking, see the comment in journal_submit_data_buffers() */ > spin_lock(&journal->j_list_lock); > list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { > jinode->i_flags |= JI_COMMIT_RUNNING; > @@ -431,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) > * Now start flushing things to disk, in the order they appear > * on the transaction lists. Data blocks go first. > */ > - err = journal_submit_inode_data_buffers(journal, commit_transaction); > + err = journal_submit_data_buffers(journal, commit_transaction); > if (err) > jbd2_journal_abort(journal, err); > Regards, Mingming ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] ext4: Add ordered mode support for delalloc 2008-06-13 23:01 ` Mingming @ 2008-06-14 6:34 ` Aneesh Kumar K.V 0 siblings, 0 replies; 15+ messages in thread From: Aneesh Kumar K.V @ 2008-06-14 6:34 UTC (permalink / raw) To: Mingming; +Cc: tytso, sandeen, linux-ext4 On Fri, Jun 13, 2008 at 04:01:22PM -0700, Mingming wrote: > Thanks, some comments below... > On Thu, 2008-06-12 at 20:55 +0530, Aneesh Kumar K.V wrote: > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> > > --- > > fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > > fs/jbd2/commit.c | 41 ++++++++++++-- > > 2 files changed, 198 insertions(+), 12 deletions(-) > > > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > > index 63355ab..7d87641 100644 > > --- a/fs/ext4/inode.c > > +++ b/fs/ext4/inode.c > > @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) > > return !buffer_mapped(bh) || buffer_delay(bh); > > } > > > > -/* FIXME!! only support data=writeback mode */ > > /* > > * get called vi ext4_da_writepages after taking page lock > > * We may end up doing block allocation here in case > > * mpage_da_map_blocks failed to allocate blocks. > > */ > > -static int ext4_da_writepage(struct page *page, > > +static int ext4_da_writeback_writepage(struct page *page, > > struct writeback_control *wbc) > > { > > int ret = 0; > > @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page, > > return ret; > > } > > > > +/* > > + * get called vi ext4_da_writepages after taking page lock > > + * We may end up doing block allocation here in case > > + * mpage_da_map_blocks failed to allocate blocks. > > + * > > + * We also get called via journal_submit_inode_data_buffers > > + */ > > +static int ext4_da_ordered_writepage(struct page *page, > > + struct writeback_control *wbc) > > +{ > > + int ret = 0; > > + loff_t size; > > + unsigned long len; > > + handle_t *handle = NULL; > > + struct buffer_head *page_bufs; > > + struct inode *inode = page->mapping->host; > > + > > + handle = ext4_journal_current_handle(); > > + if (!handle) { > > + /* > > + * This can happen when we aren't called via > > + * ext4_da_writepages() but directly (shrink_page_list). > > + * We cannot easily start a transaction here so we just skip > > + * writing the page in case we would have to do so. > > + */ > > + size = i_size_read(inode); > > + > > + page_bufs = page_buffers(page); > > + if (page->index == size >> PAGE_CACHE_SHIFT) > > + len = size & ~PAGE_CACHE_MASK; > > + else > > + len = PAGE_CACHE_SIZE; > > + > > + if (walk_page_buffers(NULL, page_bufs, 0, > > + len, NULL, ext4_bh_unmapped_or_delay)) { > > + /* > > + * We can't do block allocation under > > + * page lock without a handle . So redirty > > + * the page and return. > > + * We may reach here when we do a journal commit > > + * via journal_submit_inode_data_buffers. > > + * If we don't have mapping block we just ignore > > + * them > > + * > > + */ > > + redirty_page_for_writepage(wbc, page); > > + unlock_page(page); > > + return 0; > > + } > > + } > > + > It seems we missed to file the inode to the journal list before calling > block_write_full_page(), since it's possible block_write_full_page() > could do block allocation. > > something like this? > > + if (ext4_should_order_data(inode)) > + ret = ext4_jbd2_file_inode(handle, inode); > + if (ret) { > + ext4_journal_stop(handle); > + return ret; > + } > No we can't do that. The writepage get called back in the following case. a) via background_writeout from pdflush b) via do_sync via sync call c) via shrink_page_list when under memory pressure from kswapd. In both (a) and (b) we get called via writepages. That means in both (a) and (b) we can do block allocation. In case of (c) we get called directly and we can't do block allocation because we are already called with page_lock and we can't start a journal transaction. That means we do block allocation only via writepages. So in ext4_da_ordered_writepages() we do 1844 ret = ext4_jbd2_file_inode(handle, inode); for each journal start. Now if you see I have also added journal_submit_inode_data_buffers that will be called during journal commit. The function use writepage instead of writepages. The idea here is to ensure that we submit only the pages that have all the buffer_head mapped. Because we don't want to do block allocation during a journal commit. It also have the advantage that we don't wait for writing out all the buffer_head during a sync from user space. Because not all buffer_head would be mapped at that time. This should actually improve the case "sync take lot of time with ordered mode on ext4" To list out: a) We do block allocation only when called via ext4_da_*_writepages. This is true even for writeback mode. c) If we call writepage and if we find any buffer_head in the page unmapped or delayed we don't write the page. Instead we redirty the page and return. d) journal commit is updated to use writepage instead of writepages so that we don't do block allocation during commit. This will result in writeout of only those pages that have fully mapped buffer_heads. e) Since we do block allocation only via ext4_da_*_writepages, we need to add the inode to journal list only during ext4_da_*_writepages. We do that for each journal_start in the loop. > > + ret = block_write_full_page(page, ext4_da_get_block_write, wbc); > > + > > > > + return ret; > > +} > > > > It seems this code is duplicated from > ext4_da_writeback_writepage()(except for the file inode to keep the > ordering), is there any reason not to making it one function for both > ordered mode and writeback mode? Nothing particular I will merge both to one function in the next post. > > /* > > * For now just follow the DIO way to estimate the max credits > > @@ -1745,19 +1799,99 @@ static int ext4_da_writepages(struct address_space *mapping, > > return ret; > > } > > > > +static int ext4_da_ordered_writepages(struct address_space *mapping, > > + struct writeback_control *wbc) > > +{ > > + struct inode *inode = mapping->host; > > + handle_t *handle = NULL; > > + int needed_blocks; > > + int ret = 0; > > + long to_write; > > + loff_t range_start = 0; > > + > > + > > + /* > > + * No pages to write? This is mainly a kludge to avoid starting > > + * a transaction for special inodes like journal inode on last iput() > > + * because that could violate lock ordering on umount > > + */ > > + if (!mapping->nrpages) > > + return 0; > > + > > + /* > > + * Estimate the worse case needed credits to write out > > + * EXT4_MAX_BUF_BLOCKS pages > > + */ > > + needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; > > + > > + to_write = wbc->nr_to_write; > > + if (!wbc->range_cyclic) { > > + /* > > + * If range_cyclic is not set force range_cont > > + * and save the old writeback_index > > + */ > > + wbc->range_cont = 1; > > + range_start = wbc->range_start; > > + } > > + > > + while (!ret && to_write) { > > + /* start a new transaction*/ > > + handle = ext4_journal_start(inode, needed_blocks); > > + if (IS_ERR(handle)) { > > + ret = PTR_ERR(handle); > > + goto out_writepages; > > + } > > + > > + ret = ext4_jbd2_file_inode(handle, inode); > > + if (ret) { > > + ext4_journal_stop(handle); > > + goto out_writepages; > > + } > > + /* > > + * set the max dirty pages could be write at a time > > + * to fit into the reserved transaction credits > > + */ > > + if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) > > + wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; > > + > > + to_write -= wbc->nr_to_write; > > + ret = mpage_da_writepages(mapping, wbc, > > + ext4_da_get_block_write); > > + ext4_journal_stop(handle); > > + if (wbc->nr_to_write) { > > + /* > > + * There is no more writeout needed > > + * or we requested for a noblocking writeout > > + * and we found the device congested > > + */ > > + to_write += wbc->nr_to_write; > > + break; > > + } > > + wbc->nr_to_write = to_write; > > + } > > + > > +out_writepages: > > + wbc->nr_to_write = to_write; > > + if (range_start) > > + wbc->range_start = range_start; > > + return ret; > > +} > > + > > It seems this code is duplicated from > ext4_da_writeback_writepages()also. The only part different is in > ordered mode, we need to file the inode to the journal list to keep the > ordering. I think we could use existing da_writepages() function for > both ordered mode and writeback mode as well. Will do that in the next post. > > > static int ext4_da_write_begin(struct file *file, struct address_space *mapping, > > loff_t pos, unsigned len, unsigned flags, > > struct page **pagep, void **fsdata) > > { > > - int ret; > > + int ret, retries = 0; > > struct page *page; > > pgoff_t index; > > unsigned from, to; > > + struct inode *inode = mapping->host; > > > > index = pos >> PAGE_CACHE_SHIFT; > > from = pos & (PAGE_CACHE_SIZE - 1); > > to = from + len; > > > > +retry: > > page = __grab_cache_page(mapping, index); > > if (!page) > > return -ENOMEM; > > @@ -1770,6 +1904,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, > > page_cache_release(page); > > } > > > > + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) > > + goto retry; > > + > > return ret; > > } > > > > In case of ENOSPC, instead of go back and try to do reservation (which > may overestimate the total number of metablocks to reserve) again, I > think we should not doing delayed allocation, instead call the real > get_block() function to try the real block allocation. > > Just to clarify, this is not part of the ordered mode support, I think > we should make a separate patch for this kind of improvement. I already have a patch queued for the same. I have 16 small patches with various cleanups sitting in my local repo. I should be sending out the same in the next update. > > > @@ -2224,10 +2361,10 @@ static int ext4_journalled_set_page_dirty(struct page *page) > > .releasepage = ext4_releasepage, > > }; > > > > -static const struct address_space_operations ext4_da_aops = { > > +static const struct address_space_operations ext4_da_writeback_aops = { > > .readpage = ext4_readpage, > > .readpages = ext4_readpages, > > - .writepage = ext4_da_writepage, > > + .writepage = ext4_da_writeback_writepage, > > .writepages = ext4_da_writepages, > > .sync_page = block_sync_page, > > .write_begin = ext4_da_write_begin, > > @@ -2239,13 +2376,31 @@ static int ext4_journalled_set_page_dirty(struct page *page) > > .migratepage = buffer_migrate_page, > > }; > > > > +static const struct address_space_operations ext4_da_ordered_aops = { > > + .readpage = ext4_readpage, > > + .readpages = ext4_readpages, > > + .writepage = ext4_da_ordered_writepage, > > + .writepages = ext4_da_ordered_writepages, > > + .sync_page = block_sync_page, > > + .write_begin = ext4_da_write_begin, > > + .write_end = generic_write_end, > > + .bmap = ext4_bmap, > > + .invalidatepage = ext4_da_invalidatepage, > > + .releasepage = ext4_releasepage, > > + .direct_IO = ext4_direct_IO, > > + .migratepage = buffer_migrate_page, > > +}; > > + > > With the new ordered mode, we could share the same address space > operations for delayed allocation over writeback and ordered mode. > > > > void ext4_set_aops(struct inode *inode) > > { > > - if (ext4_should_order_data(inode)) > > + if (ext4_should_order_data(inode) && > > + test_opt(inode->i_sb, DELALLOC)) > > + inode->i_mapping->a_ops = &ext4_da_ordered_aops; > > + else if (ext4_should_order_data(inode)) > > inode->i_mapping->a_ops = &ext4_ordered_aops; > > else if (ext4_should_writeback_data(inode) && > > test_opt(inode->i_sb, DELALLOC)) > > - inode->i_mapping->a_ops = &ext4_da_aops; > > + inode->i_mapping->a_ops = &ext4_da_writeback_aops; > > else if (ext4_should_writeback_data(inode)) > > inode->i_mapping->a_ops = &ext4_writeback_aops; > > else > > diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c > > index 483183d..32ca3c3 100644 > > --- a/fs/jbd2/commit.c > > +++ b/fs/jbd2/commit.c > > @@ -22,6 +22,8 @@ > > #include <linux/pagemap.h> > > #include <linux/jiffies.h> > > #include <linux/crc32.h> > > +#include <linux/writeback.h> > > +#include <linux/backing-dev.h> > > > > /* > > * Default IO end handler for temporary BJ_IO buffer_heads. > > @@ -185,6 +187,30 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) > > } > > > > /* > > + * write the filemap data using writepage() address_space_operations. > > + * We don't do block allocation here even for delalloc. We don't > > + * use writepages() because with dealyed allocation we may be doing > > + * block allocation in writepages(). > > + */ > > +static int journal_submit_inode_data_buffers(struct address_space *mapping) > > +{ > > + int ret; > > + struct writeback_control wbc = { > > + .sync_mode = WB_SYNC_ALL, > > + .nr_to_write = mapping->nrpages * 2, > > + .range_start = 0, > > + .range_end = i_size_read(mapping->host), > > + .for_writepages = 1, > > + }; > > + > > + if (!mapping_cap_writeback_dirty(mapping)) > > + return 0; > > + > > + ret = generic_writepages(mapping, &wbc); > > + return ret; > > +} > > + > > +/* > > * Submit all the data buffers of inode associated with the transaction to > > * disk. > > * > > @@ -192,7 +218,7 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) > > * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently > > * operate on from being released while we write out pages. > > */ > > -static int journal_submit_inode_data_buffers(journal_t *journal, > > +static int journal_submit_data_buffers(journal_t *journal, > > transaction_t *commit_transaction) > > { > > struct jbd2_inode *jinode; > > @@ -204,8 +230,13 @@ static int journal_submit_inode_data_buffers(journal_t *journal, > > mapping = jinode->i_vfs_inode->i_mapping; > > jinode->i_flags |= JI_COMMIT_RUNNING; > > spin_unlock(&journal->j_list_lock); > > - err = filemap_fdatawrite_range(mapping, 0, > > - i_size_read(jinode->i_vfs_inode)); > > + /* > > + * submit the inode data buffers. We use writepage > > + * instead of writepages. Because writepages can do > > + * block allocation with delalloc. We need to write > > + * only allocated blocks here. > > + */ > > Hmm, when writepage()->ext4_da_orderd_writepage() is called from here, > the handle is expecting to be NULL? Otherwise block_write_full_page() > could do block allocation, that's against the locking ordering...:( I didn't quiet get that. We don't want to do block allocation when called via journal_submit_data_buffers. In writepage if handle is null and if we have some buffer_head as delayed or not mapped we don't write the page. > > > + err = journal_submit_inode_data_buffers(mapping); > > if (!ret) > > ret = err; > > spin_lock(&journal->j_list_lock); > > @@ -228,7 +259,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, > > struct jbd2_inode *jinode, *next_i; > > int err, ret = 0; > > > > - /* For locking, see the comment in journal_submit_inode_data_buffers() */ > > + /* For locking, see the comment in journal_submit_data_buffers() */ > > spin_lock(&journal->j_list_lock); > > list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { > > jinode->i_flags |= JI_COMMIT_RUNNING; > > @@ -431,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) > > * Now start flushing things to disk, in the order they appear > > * on the transaction lists. Data blocks go first. > > */ > > - err = journal_submit_inode_data_buffers(journal, commit_transaction); > > + err = journal_submit_data_buffers(journal, commit_transaction); > > if (err) > > jbd2_journal_abort(journal, err); > > -aneesh ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC] ext4: Semantics of delalloc,data=ordered 2008-06-12 15:25 [RFC PATCH] ext4: Add ordered mode support for delalloc Aneesh Kumar K.V 2008-06-13 20:53 ` Aneesh Kumar K.V 2008-06-13 23:01 ` Mingming @ 2008-06-16 15:05 ` Jan Kara 2008-06-16 16:02 ` Aneesh Kumar K.V 2008-06-16 18:55 ` Andreas Dilger 2 siblings, 2 replies; 15+ messages in thread From: Jan Kara @ 2008-06-16 15:05 UTC (permalink / raw) To: Aneesh Kumar K.V; +Cc: cmm, tytso, sandeen, linux-ext4, adilger Hi Aneesh, First, I'd like to see some short comment on what semantics delalloc,data=ordered is going to have. At least I can imagine at least two sensible approaches: 1) All we guarantee is that user is not going to see uninitialized data. We send writes to disk (and allocate blocks) whenever it fits our needs (usually when pdflush finds them). 2) We guarantee that when transaction commits, your data is on disk - i.e., we allocate actual blocks on transaction commit. Both these possibilities have their pros and cons. Most importantly, 1) gives better disk layout while 2) gives higher consistency guarantees. Note that with 1), it can under some circumstances happen, that after a crash you see block 1 and 3 of your 3-block-write on disk, while block 2 is still a hole. 1) is easy to implement (you mostly did it below), 2) is harder. I think there should be broader consensus on what the semantics should be (changed subject to catch more attention ;). A few comments to your patch are also below. Honza > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> > --- > fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > fs/jbd2/commit.c | 41 ++++++++++++-- > 2 files changed, 198 insertions(+), 12 deletions(-) > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 63355ab..7d87641 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) > return !buffer_mapped(bh) || buffer_delay(bh); > } > > -/* FIXME!! only support data=writeback mode */ > /* > * get called vi ext4_da_writepages after taking page lock > * We may end up doing block allocation here in case > * mpage_da_map_blocks failed to allocate blocks. > */ > -static int ext4_da_writepage(struct page *page, > +static int ext4_da_writeback_writepage(struct page *page, > struct writeback_control *wbc) > { > int ret = 0; > @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page, > return ret; > } > > +/* > + * get called vi ext4_da_writepages after taking page lock > + * We may end up doing block allocation here in case > + * mpage_da_map_blocks failed to allocate blocks. > + * > + * We also get called via journal_submit_inode_data_buffers > + */ > +static int ext4_da_ordered_writepage(struct page *page, > + struct writeback_control *wbc) > +{ > + int ret = 0; > + loff_t size; > + unsigned long len; > + handle_t *handle = NULL; > + struct buffer_head *page_bufs; > + struct inode *inode = page->mapping->host; > + > + handle = ext4_journal_current_handle(); > + if (!handle) { > + /* > + * This can happen when we aren't called via > + * ext4_da_writepages() but directly (shrink_page_list). > + * We cannot easily start a transaction here so we just skip > + * writing the page in case we would have to do so. > + */ > + size = i_size_read(inode); > + > + page_bufs = page_buffers(page); > + if (page->index == size >> PAGE_CACHE_SHIFT) > + len = size & ~PAGE_CACHE_MASK; > + else > + len = PAGE_CACHE_SIZE; > + > + if (walk_page_buffers(NULL, page_bufs, 0, > + len, NULL, ext4_bh_unmapped_or_delay)) { > + /* > + * We can't do block allocation under > + * page lock without a handle . So redirty > + * the page and return. > + * We may reach here when we do a journal commit > + * via journal_submit_inode_data_buffers. > + * If we don't have mapping block we just ignore > + * them > + * > + */ > + redirty_page_for_writepage(wbc, page); > + unlock_page(page); > + return 0; > + } > + } > + > + ret = block_write_full_page(page, ext4_da_get_block_write, wbc); > + > + return ret; > +} If you're going to use this writepage() implementation from commit code, you cannot simply do redirty_page_for_writepage() and bail out when there's an unmapped buffer. You must write out at least mapped buffers to satisfy ordering guarantees (think of filesystems with blocksize < page size). <snip> Please separate changes to JBD2 into a separate patch - they should be probably merged into ordered-mode rewrite patch later on... > diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c > index 483183d..32ca3c3 100644 > --- a/fs/jbd2/commit.c > +++ b/fs/jbd2/commit.c > @@ -22,6 +22,8 @@ > #include <linux/pagemap.h> > #include <linux/jiffies.h> > #include <linux/crc32.h> > +#include <linux/writeback.h> > +#include <linux/backing-dev.h> > > /* > * Default IO end handler for temporary BJ_IO buffer_heads. > @@ -185,6 +187,30 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) > } > > /* > + * write the filemap data using writepage() address_space_operations. > + * We don't do block allocation here even for delalloc. We don't > + * use writepages() because with dealyed allocation we may be doing > + * block allocation in writepages(). > + */ > +static int journal_submit_inode_data_buffers(struct address_space *mapping) > +{ > + int ret; > + struct writeback_control wbc = { > + .sync_mode = WB_SYNC_ALL, > + .nr_to_write = mapping->nrpages * 2, > + .range_start = 0, > + .range_end = i_size_read(mapping->host), > + .for_writepages = 1, > + }; > + > + if (!mapping_cap_writeback_dirty(mapping)) > + return 0; > + > + ret = generic_writepages(mapping, &wbc); > + return ret; > +} > + > +/* > * Submit all the data buffers of inode associated with the transaction to > * disk. > * > @@ -192,7 +218,7 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) > * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently > * operate on from being released while we write out pages. > */ > -static int journal_submit_inode_data_buffers(journal_t *journal, > +static int journal_submit_data_buffers(journal_t *journal, > transaction_t *commit_transaction) journal_submit_data_buffers() isn't a lucky name. This is how the old function was called and so it would clash with it in the patch series... I'd be for keeping this name and call the above journal_submit_mapping_buffers() or just fold the above function into journal_submit_inode_data_buffers(). > { > struct jbd2_inode *jinode; > @@ -204,8 +230,13 @@ static int journal_submit_inode_data_buffers(journal_t *journal, > mapping = jinode->i_vfs_inode->i_mapping; > jinode->i_flags |= JI_COMMIT_RUNNING; > spin_unlock(&journal->j_list_lock); > - err = filemap_fdatawrite_range(mapping, 0, > - i_size_read(jinode->i_vfs_inode)); > + /* > + * submit the inode data buffers. We use writepage > + * instead of writepages. Because writepages can do > + * block allocation with delalloc. We need to write > + * only allocated blocks here. > + */ > + err = journal_submit_inode_data_buffers(mapping); > if (!ret) > ret = err; > spin_lock(&journal->j_list_lock); > @@ -228,7 +259,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, > struct jbd2_inode *jinode, *next_i; > int err, ret = 0; > > - /* For locking, see the comment in journal_submit_inode_data_buffers() */ > + /* For locking, see the comment in journal_submit_data_buffers() */ > spin_lock(&journal->j_list_lock); > list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { > jinode->i_flags |= JI_COMMIT_RUNNING; > @@ -431,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) > * Now start flushing things to disk, in the order they appear > * on the transaction lists. Data blocks go first. > */ > - err = journal_submit_inode_data_buffers(journal, commit_transaction); > + err = journal_submit_data_buffers(journal, commit_transaction); > if (err) > jbd2_journal_abort(journal, err); -- Jan Kara <jack@suse.cz> SuSE CR Labs ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC] ext4: Semantics of delalloc,data=ordered 2008-06-16 15:05 ` [RFC] ext4: Semantics of delalloc,data=ordered Jan Kara @ 2008-06-16 16:02 ` Aneesh Kumar K.V 2008-06-16 17:20 ` Jan Kara 2008-06-16 18:55 ` Andreas Dilger 1 sibling, 1 reply; 15+ messages in thread From: Aneesh Kumar K.V @ 2008-06-16 16:02 UTC (permalink / raw) To: Jan Kara; +Cc: cmm, tytso, sandeen, linux-ext4, adilger On Mon, Jun 16, 2008 at 05:05:33PM +0200, Jan Kara wrote: > Hi Aneesh, > > First, I'd like to see some short comment on what semantics > delalloc,data=ordered is going to have. At least I can imagine at least > two sensible approaches: > 1) All we guarantee is that user is not going to see uninitialized data. > We send writes to disk (and allocate blocks) whenever it fits our needs > (usually when pdflush finds them). > 2) We guarantee that when transaction commits, your data is on disk - > i.e., we allocate actual blocks on transaction commit. > > Both these possibilities have their pros and cons. Most importantly, > 1) gives better disk layout while 2) gives higher consistency > guarantees. Note that with 1), it can under some circumstances happen, > that after a crash you see block 1 and 3 of your 3-block-write on disk, > while block 2 is still a hole. 1) is easy to implement (you mostly did > it below), 2) is harder. I think there should be broader consensus on > what the semantics should be (changed subject to catch more attention > ;). > > A few comments to your patch are also below. > > Honza The way I was looking at ordered mode was, we only guarantee that the meta-data blocks corresponding to the data block allocated get committed only after the data-blocks are written to the disk. As long as we don't allocate blocks corresponding to a page we don't write the page to disk. This should also speed up the "sync slowness" that lot of people are reporting with ordered mode. Can you explain " 1), it can under some circumstances happen, that after a crash you see block 1 and 3 of your 3-block-write on disk, while block 2 is still a hole. " > > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> > > --- > > fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > > fs/jbd2/commit.c | 41 ++++++++++++-- > > 2 files changed, 198 insertions(+), 12 deletions(-) > > > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > > index 63355ab..7d87641 100644 > > --- a/fs/ext4/inode.c > > +++ b/fs/ext4/inode.c > > @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) > > return !buffer_mapped(bh) || buffer_delay(bh); > > } > > > > -/* FIXME!! only support data=writeback mode */ > > /* > > * get called vi ext4_da_writepages after taking page lock > > * We may end up doing block allocation here in case > > * mpage_da_map_blocks failed to allocate blocks. > > */ > > -static int ext4_da_writepage(struct page *page, > > +static int ext4_da_writeback_writepage(struct page *page, > > struct writeback_control *wbc) > > { > > int ret = 0; > > @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page, > > return ret; > > } > > > > +/* > > + * get called vi ext4_da_writepages after taking page lock > > + * We may end up doing block allocation here in case > > + * mpage_da_map_blocks failed to allocate blocks. > > + * > > + * We also get called via journal_submit_inode_data_buffers > > + */ > > +static int ext4_da_ordered_writepage(struct page *page, > > + struct writeback_control *wbc) > > +{ > > + int ret = 0; > > + loff_t size; > > + unsigned long len; > > + handle_t *handle = NULL; > > + struct buffer_head *page_bufs; > > + struct inode *inode = page->mapping->host; > > + > > + handle = ext4_journal_current_handle(); > > + if (!handle) { > > + /* > > + * This can happen when we aren't called via > > + * ext4_da_writepages() but directly (shrink_page_list). > > + * We cannot easily start a transaction here so we just skip > > + * writing the page in case we would have to do so. > > + */ > > + size = i_size_read(inode); > > + > > + page_bufs = page_buffers(page); > > + if (page->index == size >> PAGE_CACHE_SHIFT) > > + len = size & ~PAGE_CACHE_MASK; > > + else > > + len = PAGE_CACHE_SIZE; > > + > > + if (walk_page_buffers(NULL, page_bufs, 0, > > + len, NULL, ext4_bh_unmapped_or_delay)) { > > + /* > > + * We can't do block allocation under > > + * page lock without a handle . So redirty > > + * the page and return. > > + * We may reach here when we do a journal commit > > + * via journal_submit_inode_data_buffers. > > + * If we don't have mapping block we just ignore > > + * them > > + * > > + */ > > + redirty_page_for_writepage(wbc, page); > > + unlock_page(page); > > + return 0; > > + } > > + } > > + > > + ret = block_write_full_page(page, ext4_da_get_block_write, wbc); > > + > > + return ret; > > +} > If you're going to use this writepage() implementation from commit > code, you cannot simply do redirty_page_for_writepage() and bail out > when there's an unmapped buffer. You must write out at least mapped > buffers to satisfy ordering guarantees (think of filesystems with > blocksize < page size). With delalloc is it possible to have a page that have some buffer_heads marked delay ? -aneesh ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC] ext4: Semantics of delalloc,data=ordered 2008-06-16 16:02 ` Aneesh Kumar K.V @ 2008-06-16 17:20 ` Jan Kara 2008-06-16 18:11 ` Aneesh Kumar K.V 0 siblings, 1 reply; 15+ messages in thread From: Jan Kara @ 2008-06-16 17:20 UTC (permalink / raw) To: Aneesh Kumar K.V; +Cc: cmm, tytso, sandeen, linux-ext4, adilger > On Mon, Jun 16, 2008 at 05:05:33PM +0200, Jan Kara wrote: > > Hi Aneesh, > > > > First, I'd like to see some short comment on what semantics > > delalloc,data=ordered is going to have. At least I can imagine at least > > two sensible approaches: > > 1) All we guarantee is that user is not going to see uninitialized data. > > We send writes to disk (and allocate blocks) whenever it fits our needs > > (usually when pdflush finds them). > > 2) We guarantee that when transaction commits, your data is on disk - > > i.e., we allocate actual blocks on transaction commit. > > > > Both these possibilities have their pros and cons. Most importantly, > > 1) gives better disk layout while 2) gives higher consistency > > guarantees. Note that with 1), it can under some circumstances happen, > > that after a crash you see block 1 and 3 of your 3-block-write on disk, > > while block 2 is still a hole. 1) is easy to implement (you mostly did > > it below), 2) is harder. I think there should be broader consensus on > > what the semantics should be (changed subject to catch more attention > > ;). > > > > A few comments to your patch are also below. > > > > Honza > > The way I was looking at ordered mode was, we only guarantee that the > meta-data blocks corresponding to the data block allocated get committed > only after the data-blocks are written to the disk. As long as we don't > allocate blocks corresponding to a page we don't write the page to > disk. This should also speed up the "sync slowness" that lot of people > are reporting with ordered mode. I'm not sure if it helps - tons of dirty data have to get to transaction at some time even with delayed alloc and at that moment any "interactive application" is going to be starved. > Can you explain > " > 1), it can under some circumstances happen, that after a crash you see > block 1 and 3 of your 3-block-write on disk, while block 2 is still a hole. > " Imagine you have a file with blocks 1 and 3 allocated and block 2 is a hole. You write blocks 1-3. Block 2 isn't allocated because of delalloc. Now if inode is already in the current transaction's list, during commit writes to blocks 1 and 3 will land on disk but write to block 2 will happen only after pdflush finds it. > > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> > > > --- > > > fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > > > fs/jbd2/commit.c | 41 ++++++++++++-- > > > 2 files changed, 198 insertions(+), 12 deletions(-) > > > > > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > > > index 63355ab..7d87641 100644 > > > --- a/fs/ext4/inode.c > > > +++ b/fs/ext4/inode.c > > > @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) > > > return !buffer_mapped(bh) || buffer_delay(bh); > > > } > > > > > > -/* FIXME!! only support data=writeback mode */ > > > /* > > > * get called vi ext4_da_writepages after taking page lock > > > * We may end up doing block allocation here in case > > > * mpage_da_map_blocks failed to allocate blocks. > > > */ > > > -static int ext4_da_writepage(struct page *page, > > > +static int ext4_da_writeback_writepage(struct page *page, > > > struct writeback_control *wbc) > > > { > > > int ret = 0; > > > @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page, > > > return ret; > > > } > > > > > > +/* > > > + * get called vi ext4_da_writepages after taking page lock > > > + * We may end up doing block allocation here in case > > > + * mpage_da_map_blocks failed to allocate blocks. > > > + * > > > + * We also get called via journal_submit_inode_data_buffers > > > + */ > > > +static int ext4_da_ordered_writepage(struct page *page, > > > + struct writeback_control *wbc) > > > +{ > > > + int ret = 0; > > > + loff_t size; > > > + unsigned long len; > > > + handle_t *handle = NULL; > > > + struct buffer_head *page_bufs; > > > + struct inode *inode = page->mapping->host; > > > + > > > + handle = ext4_journal_current_handle(); > > > + if (!handle) { > > > + /* > > > + * This can happen when we aren't called via > > > + * ext4_da_writepages() but directly (shrink_page_list). > > > + * We cannot easily start a transaction here so we just skip > > > + * writing the page in case we would have to do so. > > > + */ > > > + size = i_size_read(inode); > > > + > > > + page_bufs = page_buffers(page); > > > + if (page->index == size >> PAGE_CACHE_SHIFT) > > > + len = size & ~PAGE_CACHE_MASK; > > > + else > > > + len = PAGE_CACHE_SIZE; > > > + > > > + if (walk_page_buffers(NULL, page_bufs, 0, > > > + len, NULL, ext4_bh_unmapped_or_delay)) { > > > + /* > > > + * We can't do block allocation under > > > + * page lock without a handle . So redirty > > > + * the page and return. > > > + * We may reach here when we do a journal commit > > > + * via journal_submit_inode_data_buffers. > > > + * If we don't have mapping block we just ignore > > > + * them > > > + * > > > + */ > > > + redirty_page_for_writepage(wbc, page); > > > + unlock_page(page); > > > + return 0; > > > + } > > > + } > > > + > > > + ret = block_write_full_page(page, ext4_da_get_block_write, wbc); > > > + > > > + return ret; > > > +} > > If you're going to use this writepage() implementation from commit > > code, you cannot simply do redirty_page_for_writepage() and bail out > > when there's an unmapped buffer. You must write out at least mapped > > buffers to satisfy ordering guarantees (think of filesystems with > > blocksize < page size). > > With delalloc is it possible to have a page that have some buffer_heads > marked delay ? I thought more about the case where some buffers are mapped and some aren't (because there's a hole in the middle of the page)... Honza -- Jan Kara <jack@suse.cz> SuSE CR Labs ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC] ext4: Semantics of delalloc,data=ordered 2008-06-16 17:20 ` Jan Kara @ 2008-06-16 18:11 ` Aneesh Kumar K.V 2008-06-16 19:17 ` Jan Kara 0 siblings, 1 reply; 15+ messages in thread From: Aneesh Kumar K.V @ 2008-06-16 18:11 UTC (permalink / raw) To: Jan Kara; +Cc: cmm, tytso, sandeen, linux-ext4, adilger On Mon, Jun 16, 2008 at 07:20:46PM +0200, Jan Kara wrote: > > On Mon, Jun 16, 2008 at 05:05:33PM +0200, Jan Kara wrote: > > > Hi Aneesh, > > > > > > First, I'd like to see some short comment on what semantics > > > delalloc,data=ordered is going to have. At least I can imagine at least > > > two sensible approaches: > > > 1) All we guarantee is that user is not going to see uninitialized data. > > > We send writes to disk (and allocate blocks) whenever it fits our needs > > > (usually when pdflush finds them). > > > 2) We guarantee that when transaction commits, your data is on disk - > > > i.e., we allocate actual blocks on transaction commit. > > > > > > Both these possibilities have their pros and cons. Most importantly, > > > 1) gives better disk layout while 2) gives higher consistency > > > guarantees. Note that with 1), it can under some circumstances happen, > > > that after a crash you see block 1 and 3 of your 3-block-write on disk, > > > while block 2 is still a hole. 1) is easy to implement (you mostly did > > > it below), 2) is harder. I think there should be broader consensus on > > > what the semantics should be (changed subject to catch more attention > > > ;). > > > > > > A few comments to your patch are also below. > > > > > > Honza > > > > The way I was looking at ordered mode was, we only guarantee that the > > meta-data blocks corresponding to the data block allocated get committed > > only after the data-blocks are written to the disk. As long as we don't > > allocate blocks corresponding to a page we don't write the page to > > disk. This should also speed up the "sync slowness" that lot of people > > are reporting with ordered mode. > I'm not sure if it helps - tons of dirty data have to get to > transaction at some time even with delayed alloc and at that moment any > "interactive application" is going to be starved. > > > Can you explain > > " > > 1), it can under some circumstances happen, that after a crash you see > > block 1 and 3 of your 3-block-write on disk, while block 2 is still a hole. > > " > Imagine you have a file with blocks 1 and 3 allocated and block 2 is a > hole. You write blocks 1-3. Block 2 isn't allocated because of delalloc. > Now if inode is already in the current transaction's list, during commit > writes to blocks 1 and 3 will land on disk but write to block 2 will > happen only after pdflush finds it. And that should be fine with data=ordered mode right ?. Because since block 2 is not yet allocated we don't have associated meta-data. So even if we crash we have meta-data pointing to 1 and 3 and not 2. The problem is only when we write the meta-data pointing to block 2 and not block 2 itself ?. > > > > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> > > > > --- > > > > fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > > > > fs/jbd2/commit.c | 41 ++++++++++++-- > > > > 2 files changed, 198 insertions(+), 12 deletions(-) > > > > > > > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > > > > index 63355ab..7d87641 100644 > > > > --- a/fs/ext4/inode.c > > > > +++ b/fs/ext4/inode.c > > > > @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) > > > > return !buffer_mapped(bh) || buffer_delay(bh); > > > > } > > > > > > > > -/* FIXME!! only support data=writeback mode */ > > > > /* > > > > * get called vi ext4_da_writepages after taking page lock > > > > * We may end up doing block allocation here in case > > > > * mpage_da_map_blocks failed to allocate blocks. > > > > */ > > > > -static int ext4_da_writepage(struct page *page, > > > > +static int ext4_da_writeback_writepage(struct page *page, > > > > struct writeback_control *wbc) > > > > { > > > > int ret = 0; > > > > @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page, > > > > return ret; > > > > } > > > > > > > > +/* > > > > + * get called vi ext4_da_writepages after taking page lock > > > > + * We may end up doing block allocation here in case > > > > + * mpage_da_map_blocks failed to allocate blocks. > > > > + * > > > > + * We also get called via journal_submit_inode_data_buffers > > > > + */ > > > > +static int ext4_da_ordered_writepage(struct page *page, > > > > + struct writeback_control *wbc) > > > > +{ > > > > + int ret = 0; > > > > + loff_t size; > > > > + unsigned long len; > > > > + handle_t *handle = NULL; > > > > + struct buffer_head *page_bufs; > > > > + struct inode *inode = page->mapping->host; > > > > + > > > > + handle = ext4_journal_current_handle(); > > > > + if (!handle) { > > > > + /* > > > > + * This can happen when we aren't called via > > > > + * ext4_da_writepages() but directly (shrink_page_list). > > > > + * We cannot easily start a transaction here so we just skip > > > > + * writing the page in case we would have to do so. > > > > + */ > > > > + size = i_size_read(inode); > > > > + > > > > + page_bufs = page_buffers(page); > > > > + if (page->index == size >> PAGE_CACHE_SHIFT) > > > > + len = size & ~PAGE_CACHE_MASK; > > > > + else > > > > + len = PAGE_CACHE_SIZE; > > > > + > > > > + if (walk_page_buffers(NULL, page_bufs, 0, > > > > + len, NULL, ext4_bh_unmapped_or_delay)) { > > > > + /* > > > > + * We can't do block allocation under > > > > + * page lock without a handle . So redirty > > > > + * the page and return. > > > > + * We may reach here when we do a journal commit > > > > + * via journal_submit_inode_data_buffers. > > > > + * If we don't have mapping block we just ignore > > > > + * them > > > > + * > > > > + */ > > > > + redirty_page_for_writepage(wbc, page); > > > > + unlock_page(page); > > > > + return 0; > > > > + } > > > > + } > > > > + > > > > + ret = block_write_full_page(page, ext4_da_get_block_write, wbc); > > > > + > > > > + return ret; > > > > +} > > > If you're going to use this writepage() implementation from commit > > > code, you cannot simply do redirty_page_for_writepage() and bail out > > > when there's an unmapped buffer. You must write out at least mapped > > > buffers to satisfy ordering guarantees (think of filesystems with > > > blocksize < page size). > > > > With delalloc is it possible to have a page that have some buffer_heads > > marked delay ? > I thought more about the case where some buffers are mapped and some > aren't (because there's a hole in the middle of the page)... With delalloc it won't be a problem. This is what i understood. We allocate blocks only during writepages. That means we allocate blocks and write then via block_write_full_page at the same time. Also we add meta-data to the journal list. We also add inode to the journal list. We also mark page_writeback on the page. Now when the journal_commit happens we again walk through the inode and try to write the page. The page may already have finished writeback by then or it may be in writeback. In both the case we won't actually be sending any data block to disk on journal commit. If is it in writeback we would have page_writeback set and journal commit would wait via filemap_fdatawait. -aneesh ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC] ext4: Semantics of delalloc,data=ordered 2008-06-16 18:11 ` Aneesh Kumar K.V @ 2008-06-16 19:17 ` Jan Kara 2008-06-16 19:22 ` Eric Sandeen 0 siblings, 1 reply; 15+ messages in thread From: Jan Kara @ 2008-06-16 19:17 UTC (permalink / raw) To: Aneesh Kumar K.V; +Cc: cmm, tytso, sandeen, linux-ext4, adilger On Mon 16-06-08 23:41:52, Aneesh Kumar K.V wrote: > On Mon, Jun 16, 2008 at 07:20:46PM +0200, Jan Kara wrote: > > > On Mon, Jun 16, 2008 at 05:05:33PM +0200, Jan Kara wrote: > > > > Hi Aneesh, > > > > > > > > First, I'd like to see some short comment on what semantics > > > > delalloc,data=ordered is going to have. At least I can imagine at least > > > > two sensible approaches: > > > > 1) All we guarantee is that user is not going to see uninitialized data. > > > > We send writes to disk (and allocate blocks) whenever it fits our needs > > > > (usually when pdflush finds them). > > > > 2) We guarantee that when transaction commits, your data is on disk - > > > > i.e., we allocate actual blocks on transaction commit. > > > > > > > > Both these possibilities have their pros and cons. Most importantly, > > > > 1) gives better disk layout while 2) gives higher consistency > > > > guarantees. Note that with 1), it can under some circumstances happen, > > > > that after a crash you see block 1 and 3 of your 3-block-write on disk, > > > > while block 2 is still a hole. 1) is easy to implement (you mostly did > > > > it below), 2) is harder. I think there should be broader consensus on > > > > what the semantics should be (changed subject to catch more attention > > > > ;). > > > > > > > > A few comments to your patch are also below. > > > > > > > > Honza > > > > > > The way I was looking at ordered mode was, we only guarantee that the > > > meta-data blocks corresponding to the data block allocated get committed > > > only after the data-blocks are written to the disk. As long as we don't > > > allocate blocks corresponding to a page we don't write the page to > > > disk. This should also speed up the "sync slowness" that lot of people > > > are reporting with ordered mode. > > I'm not sure if it helps - tons of dirty data have to get to > > transaction at some time even with delayed alloc and at that moment any > > "interactive application" is going to be starved. > > > > > Can you explain > > > " > > > 1), it can under some circumstances happen, that after a crash you see > > > block 1 and 3 of your 3-block-write on disk, while block 2 is still a hole. > > > " > > Imagine you have a file with blocks 1 and 3 allocated and block 2 is a > > hole. You write blocks 1-3. Block 2 isn't allocated because of delalloc. > > Now if inode is already in the current transaction's list, during commit > > writes to blocks 1 and 3 will land on disk but write to block 2 will > > happen only after pdflush finds it. > > And that should be fine with data=ordered mode right ?. Because since > block 2 is not yet allocated we don't have associated meta-data. So > even if we crash we have meta-data pointing to 1 and 3 and not 2. The > problem is only when we write the meta-data pointing to block 2 and not > block 2 itself ?. Yes, it is correct. I may be just surprising (we didn't do things like this in data=ordered mode before). > > > > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> > > > > > --- > > > > > fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > > > > > fs/jbd2/commit.c | 41 ++++++++++++-- > > > > > 2 files changed, 198 insertions(+), 12 deletions(-) > > > > > > > > > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > > > > > index 63355ab..7d87641 100644 > > > > > --- a/fs/ext4/inode.c > > > > > +++ b/fs/ext4/inode.c > > > > > @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) > > > > > return !buffer_mapped(bh) || buffer_delay(bh); > > > > > } > > > > > > > > > > -/* FIXME!! only support data=writeback mode */ > > > > > /* > > > > > * get called vi ext4_da_writepages after taking page lock > > > > > * We may end up doing block allocation here in case > > > > > * mpage_da_map_blocks failed to allocate blocks. > > > > > */ > > > > > -static int ext4_da_writepage(struct page *page, > > > > > +static int ext4_da_writeback_writepage(struct page *page, > > > > > struct writeback_control *wbc) > > > > > { > > > > > int ret = 0; > > > > > @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page, > > > > > return ret; > > > > > } > > > > > > > > > > +/* > > > > > + * get called vi ext4_da_writepages after taking page lock > > > > > + * We may end up doing block allocation here in case > > > > > + * mpage_da_map_blocks failed to allocate blocks. > > > > > + * > > > > > + * We also get called via journal_submit_inode_data_buffers > > > > > + */ > > > > > +static int ext4_da_ordered_writepage(struct page *page, > > > > > + struct writeback_control *wbc) > > > > > +{ > > > > > + int ret = 0; > > > > > + loff_t size; > > > > > + unsigned long len; > > > > > + handle_t *handle = NULL; > > > > > + struct buffer_head *page_bufs; > > > > > + struct inode *inode = page->mapping->host; > > > > > + > > > > > + handle = ext4_journal_current_handle(); > > > > > + if (!handle) { > > > > > + /* > > > > > + * This can happen when we aren't called via > > > > > + * ext4_da_writepages() but directly (shrink_page_list). > > > > > + * We cannot easily start a transaction here so we just skip > > > > > + * writing the page in case we would have to do so. > > > > > + */ > > > > > + size = i_size_read(inode); > > > > > + > > > > > + page_bufs = page_buffers(page); > > > > > + if (page->index == size >> PAGE_CACHE_SHIFT) > > > > > + len = size & ~PAGE_CACHE_MASK; > > > > > + else > > > > > + len = PAGE_CACHE_SIZE; > > > > > + > > > > > + if (walk_page_buffers(NULL, page_bufs, 0, > > > > > + len, NULL, ext4_bh_unmapped_or_delay)) { > > > > > + /* > > > > > + * We can't do block allocation under > > > > > + * page lock without a handle . So redirty > > > > > + * the page and return. > > > > > + * We may reach here when we do a journal commit > > > > > + * via journal_submit_inode_data_buffers. > > > > > + * If we don't have mapping block we just ignore > > > > > + * them > > > > > + * > > > > > + */ > > > > > + redirty_page_for_writepage(wbc, page); > > > > > + unlock_page(page); > > > > > + return 0; > > > > > + } > > > > > + } > > > > > + > > > > > + ret = block_write_full_page(page, ext4_da_get_block_write, wbc); > > > > > + > > > > > + return ret; > > > > > +} > > > > If you're going to use this writepage() implementation from commit > > > > code, you cannot simply do redirty_page_for_writepage() and bail out > > > > when there's an unmapped buffer. You must write out at least mapped > > > > buffers to satisfy ordering guarantees (think of filesystems with > > > > blocksize < page size). > > > > > > With delalloc is it possible to have a page that have some buffer_heads > > > marked delay ? > > I thought more about the case where some buffers are mapped and some > > aren't (because there's a hole in the middle of the page)... > > With delalloc it won't be a problem. This is what i understood. > > We allocate blocks only during writepages. That means we allocate blocks > and write then via block_write_full_page at the same time. Also we add > meta-data to the journal list. We also add inode to the journal list. > We also mark page_writeback on the page. Now when the journal_commit > happens we again walk through the inode and try to write the page. The > page may already have finished writeback by then or it may be in > writeback. In both the case we won't actually be sending any data block to disk > on journal commit. If is it in writeback we would have page_writeback > set and journal commit would wait via filemap_fdatawait. Ah, you are right. We need to ensure ordering only for new block allocations and for them, everything is fine. Sorry for confusion. Honza -- Jan Kara <jack@suse.cz> SUSE Labs, CR ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC] ext4: Semantics of delalloc,data=ordered 2008-06-16 19:17 ` Jan Kara @ 2008-06-16 19:22 ` Eric Sandeen 2008-06-16 19:45 ` Jan Kara 0 siblings, 1 reply; 15+ messages in thread From: Eric Sandeen @ 2008-06-16 19:22 UTC (permalink / raw) To: Jan Kara; +Cc: Aneesh Kumar K.V, cmm, tytso, linux-ext4, adilger Jan Kara wrote: >>> Imagine you have a file with blocks 1 and 3 allocated and block 2 is a >>> hole. You write blocks 1-3. Block 2 isn't allocated because of delalloc. >>> Now if inode is already in the current transaction's list, during commit >>> writes to blocks 1 and 3 will land on disk but write to block 2 will >>> happen only after pdflush finds it. >> And that should be fine with data=ordered mode right ?. Because since >> block 2 is not yet allocated we don't have associated meta-data. So >> even if we crash we have meta-data pointing to 1 and 3 and not 2. The >> problem is only when we write the meta-data pointing to block 2 and not >> block 2 itself ?. > Yes, it is correct. I may be just surprising (we didn't do things like > this in data=ordered mode before). Will it even be surprising? "fill-in-hole; crash;" today may give you the same thing, right? It's just that with delalloc it might be a bigger window in time for this to happen? -Eric ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC] ext4: Semantics of delalloc,data=ordered 2008-06-16 19:22 ` Eric Sandeen @ 2008-06-16 19:45 ` Jan Kara 0 siblings, 0 replies; 15+ messages in thread From: Jan Kara @ 2008-06-16 19:45 UTC (permalink / raw) To: Eric Sandeen; +Cc: Aneesh Kumar K.V, cmm, tytso, linux-ext4, adilger On Mon 16-06-08 14:22:24, Eric Sandeen wrote: > Jan Kara wrote: > >>> Imagine you have a file with blocks 1 and 3 allocated and block 2 is a > >>> hole. You write blocks 1-3. Block 2 isn't allocated because of delalloc. > >>> Now if inode is already in the current transaction's list, during commit > >>> writes to blocks 1 and 3 will land on disk but write to block 2 will > >>> happen only after pdflush finds it. > >> And that should be fine with data=ordered mode right ?. Because since > >> block 2 is not yet allocated we don't have associated meta-data. So > >> even if we crash we have meta-data pointing to 1 and 3 and not 2. The > >> problem is only when we write the meta-data pointing to block 2 and not > >> block 2 itself ?. > > Yes, it is correct. I may be just surprising (we didn't do things like > > this in data=ordered mode before). > > Will it even be surprising? "fill-in-hole; crash;" today may give you > the same thing, right? It's just that with delalloc it might be a > bigger window in time for this to happen? Thinking more about it, yes, it could happen even with current ordered mode. If the crash happens between writeback of data buffers and commit of transaction, block in the middle would remain to be a hole. OK, so we just make the window larger. You are starting to convince me that 1) is really the better choice ;). Honza -- Jan Kara <jack@suse.cz> SUSE Labs, CR ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC] ext4: Semantics of delalloc,data=ordered 2008-06-16 15:05 ` [RFC] ext4: Semantics of delalloc,data=ordered Jan Kara 2008-06-16 16:02 ` Aneesh Kumar K.V @ 2008-06-16 18:55 ` Andreas Dilger 2008-06-16 18:58 ` Eric Sandeen 2008-06-16 19:40 ` Jan Kara 1 sibling, 2 replies; 15+ messages in thread From: Andreas Dilger @ 2008-06-16 18:55 UTC (permalink / raw) To: Jan Kara; +Cc: Aneesh Kumar K.V, cmm, tytso, sandeen, linux-ext4 On Jun 16, 2008 17:05 +0200, Jan Kara wrote: > First, I'd like to see some short comment on what semantics > delalloc,data=ordered is going to have. At least I can imagine at least > two sensible approaches: > 1) All we guarantee is that user is not going to see uninitialized data. > We send writes to disk (and allocate blocks) whenever it fits our needs > (usually when pdflush finds them). > 2) We guarantee that when transaction commits, your data is on disk - > i.e., we allocate actual blocks on transaction commit. > > Both these possibilities have their pros and cons. Most importantly, > 1) gives better disk layout while 2) gives higher consistency > guarantees. Note that with 1), it can under some circumstances happen, > that after a crash you see block 1 and 3 of your 3-block-write on disk, > while block 2 is still a hole. 1) is easy to implement (you mostly did > it below), 2) is harder. I think there should be broader consensus on > what the semantics should be (changed subject to catch more attention ;). IMHO, the semantic should be (1) and not (2). Applications don't understand "when the transaction commits" so it doesn't provide any useful guarantee to userspace, and if they actually need the data on disk (e.g. MTA) then they need to call fsync to ensure this. While I agree it is theoretically possible to have the "hole in data where there shouldn't be one" scenario, in real life these blocks would be allocated together by delalloc+mballoc and this situation should not happen. As for "sync with heavy IO causing slowness" problem of Firefox, I think that delalloc will help this noticably, but I agree we can still get into cases where a lot of dirty data was just allocated and now needs to be flushed to disk to commit the transaction. In the short term I don't think this can be completely fixed, but in the long term I think it can be fixed by having mballoc do "reservations" of space on disk, in which the dirty pages can be written. Only after the data is on disk does the "reservation" turn into an "allocation" in the journal (i.e. filesystem buffers added to transaction and modified). At that point a sync operation only has to write out the journal blocks, because all of the data is on disk already. I don't think it is a huge difference from what we have today, but I also don't think it should be in the first implementation. We would need to split up handling of the in-memory block bitmaps so that only the in-memory ones are updated first, then the on-disk bitmaps are later marked in use in a transaction after the data blocks are on disk. Cheers, Andreas -- Andreas Dilger Sr. Staff Engineer, Lustre Group Sun Microsystems of Canada, Inc. ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC] ext4: Semantics of delalloc,data=ordered 2008-06-16 18:55 ` Andreas Dilger @ 2008-06-16 18:58 ` Eric Sandeen 2008-06-16 19:22 ` Jan Kara 2008-06-16 19:40 ` Jan Kara 1 sibling, 1 reply; 15+ messages in thread From: Eric Sandeen @ 2008-06-16 18:58 UTC (permalink / raw) To: Andreas Dilger; +Cc: Jan Kara, Aneesh Kumar K.V, cmm, tytso, linux-ext4 Andreas Dilger wrote: > On Jun 16, 2008 17:05 +0200, Jan Kara wrote: >> First, I'd like to see some short comment on what semantics >> delalloc,data=ordered is going to have. At least I can imagine at least >> two sensible approaches: >> 1) All we guarantee is that user is not going to see uninitialized data. >> We send writes to disk (and allocate blocks) whenever it fits our needs >> (usually when pdflush finds them). >> 2) We guarantee that when transaction commits, your data is on disk - >> i.e., we allocate actual blocks on transaction commit. >> >> Both these possibilities have their pros and cons. Most importantly, >> 1) gives better disk layout while 2) gives higher consistency >> guarantees. Note that with 1), it can under some circumstances happen, >> that after a crash you see block 1 and 3 of your 3-block-write on disk, >> while block 2 is still a hole. 1) is easy to implement (you mostly did >> it below), 2) is harder. I think there should be broader consensus on >> what the semantics should be (changed subject to catch more attention ;). > > IMHO, the semantic should be (1) and not (2). Applications don't understand > "when the transaction commits" so it doesn't provide any useful guarantee > to userspace, and if they actually need the data on disk (e.g. MTA) then > they need to call fsync to ensure this. > > While I agree it is theoretically possible to have the "hole in data > where there shouldn't be one" scenario, in real life these blocks would be > allocated together by delalloc+mballoc and this situation should not happen. I'm not sure that's true; filling in holes is not that uncommon. But, I'm not sure that it actually leads to a problem, as the metadata gets "created" for the hole-fill-in only when the block actually gets allocated right? -Eric ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC] ext4: Semantics of delalloc,data=ordered 2008-06-16 18:58 ` Eric Sandeen @ 2008-06-16 19:22 ` Jan Kara 0 siblings, 0 replies; 15+ messages in thread From: Jan Kara @ 2008-06-16 19:22 UTC (permalink / raw) To: Eric Sandeen; +Cc: Andreas Dilger, Aneesh Kumar K.V, cmm, tytso, linux-ext4 On Mon 16-06-08 13:58:57, Eric Sandeen wrote: > Andreas Dilger wrote: > > On Jun 16, 2008 17:05 +0200, Jan Kara wrote: > >> First, I'd like to see some short comment on what semantics > >> delalloc,data=ordered is going to have. At least I can imagine at least > >> two sensible approaches: > >> 1) All we guarantee is that user is not going to see uninitialized data. > >> We send writes to disk (and allocate blocks) whenever it fits our needs > >> (usually when pdflush finds them). > >> 2) We guarantee that when transaction commits, your data is on disk - > >> i.e., we allocate actual blocks on transaction commit. > >> > >> Both these possibilities have their pros and cons. Most importantly, > >> 1) gives better disk layout while 2) gives higher consistency > >> guarantees. Note that with 1), it can under some circumstances happen, > >> that after a crash you see block 1 and 3 of your 3-block-write on disk, > >> while block 2 is still a hole. 1) is easy to implement (you mostly did > >> it below), 2) is harder. I think there should be broader consensus on > >> what the semantics should be (changed subject to catch more attention ;). > > > > IMHO, the semantic should be (1) and not (2). Applications don't understand > > "when the transaction commits" so it doesn't provide any useful guarantee > > to userspace, and if they actually need the data on disk (e.g. MTA) then > > they need to call fsync to ensure this. > > > > While I agree it is theoretically possible to have the "hole in data > > where there shouldn't be one" scenario, in real life these blocks would be > > allocated together by delalloc+mballoc and this situation should not happen. > > I'm not sure that's true; filling in holes is not that uncommon. > > But, I'm not sure that it actually leads to a problem, as the metadata > gets "created" for the hole-fill-in only when the block actually gets > allocated right? From filesystem point of view everything is correct. Just application may get confused that only a (non-continuous) subset of it's write made it to disk. This didn't use to happen before in data=ordered mode. Honza -- Jan Kara <jack@suse.cz> SUSE Labs, CR ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC] ext4: Semantics of delalloc,data=ordered 2008-06-16 18:55 ` Andreas Dilger 2008-06-16 18:58 ` Eric Sandeen @ 2008-06-16 19:40 ` Jan Kara 1 sibling, 0 replies; 15+ messages in thread From: Jan Kara @ 2008-06-16 19:40 UTC (permalink / raw) To: Andreas Dilger; +Cc: Aneesh Kumar K.V, cmm, tytso, sandeen, linux-ext4 On Mon 16-06-08 12:55:24, Andreas Dilger wrote: > On Jun 16, 2008 17:05 +0200, Jan Kara wrote: > > First, I'd like to see some short comment on what semantics > > delalloc,data=ordered is going to have. At least I can imagine at least > > two sensible approaches: > > 1) All we guarantee is that user is not going to see uninitialized data. > > We send writes to disk (and allocate blocks) whenever it fits our needs > > (usually when pdflush finds them). > > 2) We guarantee that when transaction commits, your data is on disk - > > i.e., we allocate actual blocks on transaction commit. > > > > Both these possibilities have their pros and cons. Most importantly, > > 1) gives better disk layout while 2) gives higher consistency > > guarantees. Note that with 1), it can under some circumstances happen, > > that after a crash you see block 1 and 3 of your 3-block-write on disk, > > while block 2 is still a hole. 1) is easy to implement (you mostly did > > it below), 2) is harder. I think there should be broader consensus on > > what the semantics should be (changed subject to catch more attention ;). > > IMHO, the semantic should be (1) and not (2). Applications don't understand > "when the transaction commits" so it doesn't provide any useful guarantee > to userspace, and if they actually need the data on disk (e.g. MTA) then > they need to call fsync to ensure this. Well, in principle I agree with you. But let's take example of firefox / konqueror. They call fsync() on each modification to their "browsing history" file. And when I asked one Konqueror developer why do they do such a stupid thing, he told me that XFS used to zero-out portions of the file on crash (because of delalloc) and this broke the browser in funny ways. Now the point I wanted to make is that if the filesystem behaves "reasonably" apps often do not need to call fsync (Konquerror is fine with how current ext3 in data=ordered mode behaves without using fsync, I'm not sure if it would be fine with delalloc,data=ordered of ext4). So I'm not really decided whether the additional performance is worth the decreased data consistency guarantees... > While I agree it is theoretically possible to have the "hole in data > where there shouldn't be one" scenario, in real life these blocks would be > allocated together by delalloc+mballoc and this situation should not happen. > > As for "sync with heavy IO causing slowness" problem of Firefox, I think > that delalloc will help this noticably, but I agree we can still get into > cases where a lot of dirty data was just allocated and now needs to be > flushed to disk to commit the transaction. > > In the short term I don't think this can be completely fixed, but in the > long term I think it can be fixed by having mballoc do "reservations" of > space on disk, in which the dirty pages can be written. Only after the > data is on disk does the "reservation" turn into an "allocation" in the > journal (i.e. filesystem buffers added to transaction and modified). > At that point a sync operation only has to write out the journal blocks, > because all of the data is on disk already. Interesting idea, this seems like a viable way to fix the problem. > I don't think it is a huge difference from what we have today, but I > also don't think it should be in the first implementation. We would > need to split up handling of the in-memory block bitmaps so that only > the in-memory ones are updated first, then the on-disk bitmaps are > later marked in use in a transaction after the data blocks are on disk. Honza -- Jan Kara <jack@suse.cz> SUSE Labs, CR ^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2008-06-16 19:45 UTC | newest] Thread overview: 15+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2008-06-12 15:25 [RFC PATCH] ext4: Add ordered mode support for delalloc Aneesh Kumar K.V 2008-06-13 20:53 ` Aneesh Kumar K.V 2008-06-13 23:01 ` Mingming 2008-06-14 6:34 ` Aneesh Kumar K.V 2008-06-16 15:05 ` [RFC] ext4: Semantics of delalloc,data=ordered Jan Kara 2008-06-16 16:02 ` Aneesh Kumar K.V 2008-06-16 17:20 ` Jan Kara 2008-06-16 18:11 ` Aneesh Kumar K.V 2008-06-16 19:17 ` Jan Kara 2008-06-16 19:22 ` Eric Sandeen 2008-06-16 19:45 ` Jan Kara 2008-06-16 18:55 ` Andreas Dilger 2008-06-16 18:58 ` Eric Sandeen 2008-06-16 19:22 ` Jan Kara 2008-06-16 19:40 ` Jan Kara
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).