* [PATCH] ext4: Fix file fragmentation during large file write. @ 2008-10-07 8:47 Aneesh Kumar K.V 0 siblings, 0 replies; 2+ messages in thread From: Aneesh Kumar K.V @ 2008-10-07 8:47 UTC (permalink / raw) To: cmm, tytso, sandeen; +Cc: linux-ext4, Aneesh Kumar K.V The range_cyclic writeback mode use the address_space writeback_index as the start index for writeback. With delayed allocation we were updating writeback_index wrongly resulting in highly fragmented file. Number of extents reduced from 4000 to 27 for a 3GB file with the below patch. The patch also cleanup the ext4 delayed allocation writepages by implementing write_cache_pages locally with needed changes for cleanup. Also it drops the range_cont writeback mode added for ext4 delayed allocation writeback Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> --- fs/ext4/inode.c | 192 +++++++++++++++++++++++++++++++++------------ include/linux/writeback.h | 1 - mm/page-writeback.c | 2 - 3 files changed, 143 insertions(+), 52 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 21f1d3a..b6b0985 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) int ret = 0, err, nr_pages, i; unsigned long index, end; struct pagevec pvec; + long pages_skipped; BUG_ON(mpd->next_page <= mpd->first_page); pagevec_init(&pvec, 0); @@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) end = mpd->next_page - 1; while (index <= end) { - /* XXX: optimize tail */ - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + /* + * We can use PAGECACHE_TAG_DIRTY lookup here because + * even though we have cleared the dirty flag on the page + * We still keep the page in the radix tree with tag + * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. + * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback + * which is called via the below writepage callback. 
+ */ + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) - break; - index++; - + pages_skipped = mpd->wbc->pages_skipped; err = mapping->a_ops->writepage(page, mpd->wbc); - if (!err) + if (!err && (pages_skipped == mpd->wbc->pages_skipped)) + /* + * have successfully written the page + * without skipping the same + */ mpd->pages_written++; /* * In error case, we have to continue because @@ -2088,6 +2099,100 @@ static int __mpage_da_writepage(struct page *page, return 0; } +static int ext4_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) +{ + struct pagevec pvec; + pgoff_t index, end; + long to_write = wbc->nr_to_write; + int ret = 0, done = 0, scanned = 0, nr_pages; + struct backing_dev_info *bdi = mapping->backing_dev_info; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + scanned = 1; + } +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + 
continue; + } + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + ret = (*writepage)(page, wbc, data); + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { + unlock_page(page); + ret = 0; + } + if (ret || (--(to_write) <= 0)) + /* + * writepage either failed. + * or did an extent write. + * We wrote what we are asked to + * write + */ + done = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + } + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + return ret; +} + /* * mpage_da_writepages - walk the list of dirty pages of the given * address space, allocates non-allocated blocks, maps newly-allocated @@ -2104,7 +2209,6 @@ static int mpage_da_writepages(struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd) { - long to_write; int ret; if (!mpd->get_block) @@ -2119,10 +2223,7 @@ static int mpage_da_writepages(struct address_space *mapping, mpd->pages_written = 0; mpd->retval = 0; - to_write = wbc->nr_to_write; - - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); - + ret = ext4_write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); /* * Handle last extent of pages */ @@ -2131,7 +2232,7 @@ static int mpage_da_writepages(struct address_space *mapping, mpage_da_submit_io(mpd); } - wbc->nr_to_write = to_write - mpd->pages_written; + wbc->nr_to_write -= mpd->pages_written; return ret; } @@ -2360,12 +2461,13 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) { + 
pgoff_t index; + int range_whole = 0; handle_t *handle = NULL; - loff_t range_start = 0; + long pages_written = 0; struct mpage_da_data mpd; struct inode *inode = mapping->host; int needed_blocks, ret = 0, nr_to_writebump = 0; - long to_write, pages_skipped = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); /* @@ -2385,23 +2487,18 @@ static int ext4_da_writepages(struct address_space *mapping, nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; wbc->nr_to_write = sbi->s_mb_stream_request; } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; - if (!wbc->range_cyclic) - /* - * If range_cyclic is not set force range_cont - * and save the old writeback_index - */ - wbc->range_cont = 1; - - range_start = wbc->range_start; - pages_skipped = wbc->pages_skipped; + if (wbc->range_cyclic) + index = mapping->writeback_index; + else + index = wbc->range_start >> PAGE_CACHE_SHIFT; mpd.wbc = wbc; mpd.inode = mapping->host; -restart_loop: - to_write = wbc->nr_to_write; - while (!ret && to_write > 0) { + while (!ret && wbc->nr_to_write > 0) { /* * we insert one extent at a time. 
So we need @@ -2422,48 +2519,45 @@ static int ext4_da_writepages(struct address_space *mapping, dump_stack(); goto out_writepages; } - to_write -= wbc->nr_to_write; - mpd.get_block = ext4_da_get_block_write; ret = mpage_da_writepages(mapping, wbc, &mpd); ext4_journal_stop(handle); - if (mpd.retval == -ENOSPC) + if (mpd.retval == -ENOSPC) { + /* commit the transaction which would + * free blocks released in the transaction + * and try again + */ jbd2_journal_force_commit_nested(sbi->s_journal); - - /* reset the retry count */ - if (ret == MPAGE_DA_EXTENT_TAIL) { + ret = 0; + } else if (ret == MPAGE_DA_EXTENT_TAIL) { /* * got one extent now try with * rest of the pages */ - to_write += wbc->nr_to_write; + pages_written += mpd.pages_written; ret = 0; - } else if (wbc->nr_to_write) { + } else if (wbc->nr_to_write) /* * There is no more writeout needed * or we requested for a noblocking writeout * and we found the device congested */ - to_write += wbc->nr_to_write; break; - } - wbc->nr_to_write = to_write; } - if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { - /* We skipped pages in this loop */ - wbc->range_start = range_start; - wbc->nr_to_write = to_write + - wbc->pages_skipped - pages_skipped; - wbc->pages_skipped = pages_skipped; - goto restart_loop; - } + /* Update index */ + index += pages_written; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = index; out_writepages: - wbc->nr_to_write = to_write - nr_to_writebump; - wbc->range_start = range_start; + wbc->nr_to_write -= nr_to_writebump; return ret; } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 12b15c5..bd91987 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -63,7 +63,6 @@ struct writeback_control { unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is 
cyclic */ unsigned more_io:1; /* more io to be dispatched */ - unsigned range_cont:1; }; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 24de8b6..718efa6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -961,8 +961,6 @@ int write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; - if (wbc->range_cont) - wbc->range_start = index << PAGE_CACHE_SHIFT; return ret; } EXPORT_SYMBOL(write_cache_pages); -- 1.6.0.1.285.g1070 ^ permalink raw reply related [flat|nested] 2+ messages in thread
* Patches for patchqueue @ 2008-10-14 6:33 Aneesh Kumar K.V 2008-10-14 6:33 ` [PATCH -V5] ext4: Use an rbtree for tracking blocks freed during transaction Aneesh Kumar K.V 0 siblings, 1 reply; 2+ messages in thread From: Aneesh Kumar K.V @ 2008-10-14 6:33 UTC (permalink / raw) To: aneesh.kumar, cmm, tytso, sandeen; +Cc: linux-ext4 Hi Ted, The patches below are updates to patches already in the patch queue. 0001-ext4-Use-an-rbtree-for-tracking-blocks-freed-during.patch Updated with your fix. I also removed the definition of EXT4_BB_MAX_BLOCKS. 0004-ext4-Use-tag-dirty-lookup-during-mpage_da_submit_io.patch 0005-vfs-Remove-the-range_cont-writeback-mode.patch 0006-vfs-Add-no_nrwrite_update-and-no_index_update-write.patch 0007-ext4-Fix-file-fragmentation-during-large-file-write.patch Split the single patch fix_file_fragmentation_during_large_file_write into 4 patches. The VFS changes are CC'd to linux-fsdevel@vger.kernel.org. 0010-ext4-Free-ext4_prealloc_space-using-kmem_cache_free.patch -aneesh ^ permalink raw reply [flat|nested] 2+ messages in thread
* [PATCH -V5] ext4: Use an rbtree for tracking blocks freed during transaction. 2008-10-14 6:33 Patches for patchqueue Aneesh Kumar K.V @ 2008-10-14 6:33 ` Aneesh Kumar K.V 2008-10-14 6:33 ` [PATCH] ext4: Use tag dirty lookup during mpage_da_submit_io Aneesh Kumar K.V 0 siblings, 1 reply; 2+ messages in thread From: Aneesh Kumar K.V @ 2008-10-14 6:33 UTC (permalink / raw) To: aneesh.kumar, cmm, tytso, sandeen; +Cc: linux-ext4 With this patch we track the block freed during a transaction using rb tree. We also make sure contiguous blocks freed are collected in one rb node. Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/mballoc.c | 184 +++++++++++++++++++++++++++++++++------------------- fs/ext4/mballoc.h | 26 +++++--- 2 files changed, 133 insertions(+), 77 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b580714..7023228 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + meta_group_info[i]->bb_free_root.rb_node = NULL;; #ifdef DOUBLE_CHECK { @@ -2647,13 +2648,11 @@ int ext4_mb_release(struct super_block *sb) static noinline_for_stack void ext4_mb_free_committed_blocks(struct super_block *sb) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; - int i; - int count = 0; - int count2 = 0; - struct ext4_free_metadata *md; struct ext4_buddy e4b; + struct ext4_group_info *db; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err, count = 0, count2 = 0; + struct ext4_free_data *entry; if (list_empty(&sbi->s_committed_transaction)) return; @@ -2661,44 +2660,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb) /* there is committed blocks to be freed yet */ do { /* get next array of blocks */ - md = NULL; + entry = NULL; spin_lock(&sbi->s_md_lock); if (!list_empty(&sbi->s_committed_transaction)) { - md = 
list_entry(sbi->s_committed_transaction.next, - struct ext4_free_metadata, list); - list_del(&md->list); + entry = list_entry(sbi->s_committed_transaction.next, + struct ext4_free_data, list); + list_del(&entry->list); } spin_unlock(&sbi->s_md_lock); - if (md == NULL) + if (entry == NULL) break; mb_debug("gonna free %u blocks in group %lu (0x%p):", - md->num, md->group, md); + entry->count, entry->group, entry); - err = ext4_mb_load_buddy(sb, md->group, &e4b); + err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); + db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ - count += md->num; + count += entry->count; count2++; - ext4_lock_group(sb, md->group); - for (i = 0; i < md->num; i++) { - mb_debug(" %u", md->blocks[i]); - mb_free_blocks(NULL, &e4b, md->blocks[i], 1); + ext4_lock_group(sb, entry->group); + /* Take it out of per group rb tree */ + rb_erase(&entry->node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); } - mb_debug("\n"); - ext4_unlock_group(sb, md->group); - - /* balance refcounts from ext4_mb_free_metadata() */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); + ext4_unlock_group(sb, entry->group); - kfree(md); + kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); - - } while (md); + } while (1); mb_debug("freed %u blocks in %u structures\n", count, count2); } @@ -2771,6 +2772,16 @@ int __init init_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } + + ext4_free_ext_cachep = + kmem_cache_create("ext4_free_block_extents", + sizeof(struct ext4_free_data), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (ext4_free_ext_cachep == 
NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; + } return 0; } @@ -2779,6 +2790,7 @@ void exit_ext4_mballoc(void) /* XXX: synchronize_rcu(); */ kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); + kmem_cache_destroy(ext4_free_ext_cachep); } @@ -4415,6 +4427,21 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb, ext4_mb_free_committed_blocks(sb); } +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. + */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if (entry1->t_tid == entry2->t_tid && + (entry1->group == entry2->group) && + (entry1->start_blk + entry1->count) == entry2->start_blk) + return 1; + return 0; +} + static noinline_for_stack int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, ext4_group_t group, ext4_grpblk_t block, int count) @@ -4422,57 +4449,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_free_metadata *md; - int i; + struct ext4_free_data *entry, *new_entry; + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry->start_blk = block; + new_entry->group = group; + new_entry->count = count; + new_entry->t_tid = handle->h_transaction->t_tid; + new_node = &new_entry->node; + ext4_lock_group(sb, group); - for (i = 0; i < count; i++) { - md = db->bb_md_cur; - if (md && db->bb_tid != handle->h_transaction->t_tid) { - db->bb_md_cur = NULL; - md = NULL; + if (!*n) { + /* first free block exent. 
We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, node); + if (block < entry->start_blk) + n = &(*n)->rb_left; + else if (block >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + ext4_error(sb, __func__, + "Double free of blocks %d (%d %d)\n", + block, entry->start_blk, entry->count); + return 0; } + } - if (md == NULL) { - ext4_unlock_group(sb, group); - md = kmalloc(sizeof(*md), GFP_NOFS); - if (md == NULL) - return -ENOMEM; - md->num = 0; - md->group = group; - - ext4_lock_group(sb, group); - if (db->bb_md_cur == NULL) { - spin_lock(&sbi->s_md_lock); - list_add(&md->list, &sbi->s_active_transaction); - spin_unlock(&sbi->s_md_lock); - /* protect buddy cache from being freed, - * otherwise we'll refresh it from - * on-disk bitmap and lose not-yet-available - * blocks */ - page_cache_get(e4b->bd_buddy_page); - page_cache_get(e4b->bd_bitmap_page); - db->bb_md_cur = md; - db->bb_tid = handle->h_transaction->t_tid; - mb_debug("new md 0x%p for group %lu\n", - md, md->group); - } else { - kfree(md); - md = db->bb_md_cur; - } + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } + } - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); - md->blocks[md->num] = block + i; - md->num++; - if (md->num == EXT4_BB_MAX_BLOCKS) { - /* no 
more space, put full container on a sb's list */ - db->bb_md_cur = NULL; + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } } + /* Add the extent to active_transaction list */ + spin_lock(&sbi->s_md_lock); + list_add(&new_entry->list, &sbi->s_active_transaction); + spin_unlock(&sbi->s_md_lock); ext4_unlock_group(sb, group); return 0; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b3b4828..9e815c4 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -98,23 +98,29 @@ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_ext_cachep; -#ifdef EXT4_BB_MAX_BLOCKS -#undef EXT4_BB_MAX_BLOCKS -#endif -#define EXT4_BB_MAX_BLOCKS 30 +struct ext4_free_data { + /* this links the free block information from group_info */ + struct rb_node node; -struct ext4_free_metadata { - ext4_group_t group; - unsigned short num; - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; + /* this links the free block information from ext4_sb_info */ struct list_head list; + + /* group which free block extent belongs */ + ext4_group_t group; + + /* free block extent */ + ext4_grpblk_t start_blk; + ext4_grpblk_t count; + + /* transaction which freed this extent */ + tid_t t_tid; }; struct ext4_group_info { unsigned long bb_state; - unsigned long bb_tid; - struct ext4_free_metadata *bb_md_cur; + struct rb_root bb_free_root; unsigned short bb_first_free; unsigned short bb_free; unsigned short bb_fragments; -- 1.6.0.2.526.g5c283 ^ permalink raw reply related [flat|nested] 2+ messages in thread
* [PATCH] ext4: Use tag dirty lookup during mpage_da_submit_io 2008-10-14 6:33 ` [PATCH -V5] ext4: Use an rbtree for tracking blocks freed during transaction Aneesh Kumar K.V @ 2008-10-14 6:33 ` Aneesh Kumar K.V 2008-10-14 6:33 ` [PATCH] vfs: Remove the range_cont writeback mode Aneesh Kumar K.V 0 siblings, 1 reply; 2+ messages in thread From: Aneesh Kumar K.V @ 2008-10-14 6:33 UTC (permalink / raw) To: aneesh.kumar, cmm, tytso, sandeen; +Cc: linux-ext4 This enables us to drop the range_cont writeback mode use from ext4_da_writepages. Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> --- fs/ext4/inode.c | 30 +++++++++++++----------------- 1 files changed, 13 insertions(+), 17 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7c2820e..cba7960 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1656,17 +1656,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) while (index <= end) { /* XXX: optimize tail */ - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + /* + * We can use PAGECACHE_TAG_DIRTY lookup here because + * even though we have cleared the dirty flag on the page + * We still keep the page in the radix tree with tag + * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. + * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback + * which is called via the below writepage callback. 
+ */ + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) - break; - index++; - err = mapping->a_ops->writepage(page, mpd->wbc); if (!err) mpd->pages_written++; @@ -2361,7 +2367,6 @@ static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) { handle_t *handle = NULL; - loff_t range_start = 0; struct mpage_da_data mpd; struct inode *inode = mapping->host; int needed_blocks, ret = 0, nr_to_writebump = 0; @@ -2386,14 +2391,7 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->nr_to_write = sbi->s_mb_stream_request; } - if (!wbc->range_cyclic) - /* - * If range_cyclic is not set force range_cont - * and save the old writeback_index - */ - wbc->range_cont = 1; - range_start = wbc->range_start; pages_skipped = wbc->pages_skipped; mpd.wbc = wbc; @@ -2452,9 +2450,8 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->nr_to_write = to_write; } - if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { + if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) { /* We skipped pages in this loop */ - wbc->range_start = range_start; wbc->nr_to_write = to_write + wbc->pages_skipped - pages_skipped; wbc->pages_skipped = pages_skipped; @@ -2463,7 +2460,6 @@ static int ext4_da_writepages(struct address_space *mapping, out_writepages: wbc->nr_to_write = to_write - nr_to_writebump; - wbc->range_start = range_start; return ret; } -- 1.6.0.2.526.g5c283 ^ permalink raw reply related [flat|nested] 2+ messages in thread
* [PATCH] vfs: Remove the range_cont writeback mode. 2008-10-14 6:33 ` [PATCH] ext4: Use tag dirty lookup during mpage_da_submit_io Aneesh Kumar K.V @ 2008-10-14 6:33 ` Aneesh Kumar K.V 2008-10-14 6:33 ` [PATCH] vfs: Add no_nrwrite_update and no_index_update writeback control flags Aneesh Kumar K.V 0 siblings, 1 reply; 2+ messages in thread From: Aneesh Kumar K.V @ 2008-10-14 6:33 UTC (permalink / raw) To: aneesh.kumar, cmm, tytso, sandeen; +Cc: linux-ext4, linux-fsdevel Ext4 was the only user of range_cont writeback mode and ext4 switched to a different method. So remove the range_cont mode which is not used in the kernel. Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> CC: linux-fsdevel@vger.kernel.org --- include/linux/writeback.h | 1 - mm/page-writeback.c | 2 -- 2 files changed, 0 insertions(+), 3 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 12b15c5..bd91987 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -63,7 +63,6 @@ struct writeback_control { unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ - unsigned range_cont:1; }; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 24de8b6..718efa6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -961,8 +961,6 @@ int write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; - if (wbc->range_cont) - wbc->range_start = index << PAGE_CACHE_SHIFT; return ret; } EXPORT_SYMBOL(write_cache_pages); -- 1.6.0.2.526.g5c283 ^ permalink raw reply related [flat|nested] 2+ messages in thread
* [PATCH] vfs: Add no_nrwrite_update and no_index_update writeback control flags 2008-10-14 6:33 ` [PATCH] vfs: Remove the range_cont writeback mode Aneesh Kumar K.V @ 2008-10-14 6:33 ` Aneesh Kumar K.V 2008-10-14 6:33 ` [PATCH] ext4: Fix file fragmentation during large file write Aneesh Kumar K.V 0 siblings, 1 reply; 2+ messages in thread From: Aneesh Kumar K.V @ 2008-10-14 6:33 UTC (permalink / raw) To: aneesh.kumar, cmm, tytso, sandeen; +Cc: linux-ext4, linux-fsdevel If no_nrwrite_update is set, we don't update nr_to_write in write_cache_pages. Similarly, if no_index_update is set, we don't update the address space's writeback_index. These changes enable a file system to skip these updates in write_cache_pages and do them in the writepages() callback. This patch will be followed by an ext4 patch that makes use of these new flags. Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> CC: linux-fsdevel@vger.kernel.org --- include/linux/writeback.h | 4 ++++ mm/page-writeback.c | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index bd91987..b04287e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -63,6 +63,10 @@ struct writeback_control { unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ + + /* write_cache_pages() control */ + unsigned no_nrwrite_update:1; /* don't update nr_to_write */ + unsigned no_index_update:1; /* don't update writeback_index */ }; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 718efa6..4f359f4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -876,6 +876,7 @@ int write_cache_pages(struct address_space *mapping, pgoff_t end; /* Inclusive */ int scanned = 0; int range_whole = 0; + long nr_to_write = wbc->nr_to_write; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; 
@@ -939,7 +940,7 @@ int write_cache_pages(struct address_space *mapping, unlock_page(page); ret = 0; } - if (ret || (--(wbc->nr_to_write) <= 0)) + if (ret || (--nr_to_write <= 0)) done = 1; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; @@ -958,8 +959,12 @@ int write_cache_pages(struct address_space *mapping, index = 0; goto retry; } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + if (!wbc->no_index_update && + (wbc->range_cyclic || (range_whole && nr_to_write > 0))) { mapping->writeback_index = index; + } + if (!wbc->no_nrwrite_update) + wbc->nr_to_write = nr_to_write; return ret; } -- 1.6.0.2.526.g5c283 ^ permalink raw reply related [flat|nested] 2+ messages in thread
* [PATCH] ext4: Fix file fragmentation during large file write. 2008-10-14 6:33 ` [PATCH] vfs: Add no_nrwrite_update and no_index_update writeback control flags Aneesh Kumar K.V @ 2008-10-14 6:33 ` Aneesh Kumar K.V 0 siblings, 0 replies; 2+ messages in thread From: Aneesh Kumar K.V @ 2008-10-14 6:33 UTC (permalink / raw) To: aneesh.kumar, cmm, tytso, sandeen; +Cc: linux-ext4 The range_cyclic writeback mode uses the address_space writeback_index as the start index for writeback. With delayed allocation we were updating writeback_index wrongly resulting in highly fragmented file. Number of extents reduced from 4000 to 27 for a 3GB file with the below patch. Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/inode.c | 83 +++++++++++++++++++++++++++++++++---------------------- 1 files changed, 50 insertions(+), 33 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cba7960..844c136 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) int ret = 0, err, nr_pages, i; unsigned long index, end; struct pagevec pvec; + long pages_skipped; BUG_ON(mpd->next_page <= mpd->first_page); pagevec_init(&pvec, 0); @@ -1655,7 +1656,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) end = mpd->next_page - 1; while (index <= end) { - /* XXX: optimize tail */ /* * We can use PAGECACHE_TAG_DIRTY lookup here because * even though we have cleared the dirty flag on the page @@ -1673,8 +1673,13 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + pages_skipped = mpd->wbc->pages_skipped; err = mapping->a_ops->writepage(page, mpd->wbc); - if (!err) + if (!err && (pages_skipped == mpd->wbc->pages_skipped)) + /* + * have successfully written the page + * without skipping the same + */ mpd->pages_written++; /* * In error case, we have to continue 
because @@ -2110,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd) { - long to_write; int ret; if (!mpd->get_block) @@ -2125,10 +2129,7 @@ static int mpage_da_writepages(struct address_space *mapping, mpd->pages_written = 0; mpd->retval = 0; - to_write = wbc->nr_to_write; - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); - /* * Handle last extent of pages */ @@ -2137,7 +2138,7 @@ static int mpage_da_writepages(struct address_space *mapping, mpage_da_submit_io(mpd); } - wbc->nr_to_write = to_write - mpd->pages_written; + wbc->nr_to_write -= mpd->pages_written; return ret; } @@ -2366,11 +2367,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) { + pgoff_t index; + int range_whole = 0; handle_t *handle = NULL; + long pages_written = 0; struct mpage_da_data mpd; struct inode *inode = mapping->host; + int no_nrwrite_update, no_index_update; int needed_blocks, ret = 0, nr_to_writebump = 0; - long to_write, pages_skipped = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); /* @@ -2390,16 +2394,27 @@ static int ext4_da_writepages(struct address_space *mapping, nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; wbc->nr_to_write = sbi->s_mb_stream_request; } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; - - pages_skipped = wbc->pages_skipped; + if (wbc->range_cyclic) + index = mapping->writeback_index; + else + index = wbc->range_start >> PAGE_CACHE_SHIFT; mpd.wbc = wbc; mpd.inode = mapping->host; -restart_loop: - to_write = wbc->nr_to_write; - while (!ret && to_write > 0) { + /* + * we don't want write_cache_pages to update + * nr_to_write and writeback_index + */ + no_nrwrite_update = wbc->no_nrwrite_update; + wbc->no_nrwrite_update = 1; + no_index_update = wbc->no_index_update; + wbc->no_index_update = 1; + + 
while (!ret && wbc->nr_to_write > 0) { /* * we insert one extent at a time. So we need @@ -2420,46 +2435,48 @@ static int ext4_da_writepages(struct address_space *mapping, dump_stack(); goto out_writepages; } - to_write -= wbc->nr_to_write; - mpd.get_block = ext4_da_get_block_write; ret = mpage_da_writepages(mapping, wbc, &mpd); ext4_journal_stop(handle); - if (mpd.retval == -ENOSPC) + if (mpd.retval == -ENOSPC) { + /* commit the transaction which would + * free blocks released in the transaction + * and try again + */ jbd2_journal_force_commit_nested(sbi->s_journal); - - /* reset the retry count */ - if (ret == MPAGE_DA_EXTENT_TAIL) { + ret = 0; + } else if (ret == MPAGE_DA_EXTENT_TAIL) { /* * got one extent now try with * rest of the pages */ - to_write += wbc->nr_to_write; + pages_written += mpd.pages_written; ret = 0; - } else if (wbc->nr_to_write) { + } else if (wbc->nr_to_write) /* * There is no more writeout needed * or we requested for a noblocking writeout * and we found the device congested */ - to_write += wbc->nr_to_write; break; - } - wbc->nr_to_write = to_write; - } - - if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) { - /* We skipped pages in this loop */ - wbc->nr_to_write = to_write + - wbc->pages_skipped - pages_skipped; - wbc->pages_skipped = pages_skipped; - goto restart_loop; } + /* Update index */ + index += pages_written; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = index; out_writepages: - wbc->nr_to_write = to_write - nr_to_writebump; + if (!no_nrwrite_update) + wbc->no_nrwrite_update = 0; + if (!no_index_update) + wbc->no_index_update = 0; + wbc->nr_to_write -= nr_to_writebump; return ret; } -- 1.6.0.2.526.g5c283 ^ permalink raw reply related [flat|nested] 2+ messages in thread
end of thread, other threads:[~2008-10-14 6:33 UTC | newest] Thread overview: 2+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2008-10-07 8:47 [PATCH] ext4: Fix file fragmentation during large file write Aneesh Kumar K.V -- strict thread matches above, loose matches on Subject: below -- 2008-10-14 6:33 Patches for patchqueue Aneesh Kumar K.V 2008-10-14 6:33 ` [PATCH -V5] ext4: Use an rbtree for tracking blocks freed during transaction Aneesh Kumar K.V 2008-10-14 6:33 ` [PATCH] ext4: Use tag dirty lookup during mpage_da_submit_io Aneesh Kumar K.V 2008-10-14 6:33 ` [PATCH] vfs: Remove the range_cont writeback mode Aneesh Kumar K.V 2008-10-14 6:33 ` [PATCH] vfs: Add no_nrwrite_update and no_index_update writeback control flags Aneesh Kumar K.V 2008-10-14 6:33 ` [PATCH] ext4: Fix file fragmentation during large file write Aneesh Kumar K.V
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox.