From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
To: "Theodore Ts'o" <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
Subject: Re: Problem with delayed allocation
Date: Mon, 4 Aug 2008 22:05:05 +0530 [thread overview]
Message-ID: <20080804163505.GE9397@skywalker> (raw)
In-Reply-To: <E1KPNNn-0003IN-0B@closure.thunk.org>
On Sat, Aug 02, 2008 at 04:07:19PM -0400, Theodore Ts'o wrote:
>
> Apparently __fsync_super(), which is called right before remounting a
> filesystem read-only, isn't working correctly. To reproduce, create a
> script which does this:
>
> #!/bin/sh
> DEVICE=/dev/closure/test
> mke2fs -t ext4dev /dev/closure/test
> mount $DEVICE /mnt
> cd /mnt
> tar xfj /var/tmp/linux-2.6.26.tar.gz <----- or some really big file
> du -s
> cd ..
> mount -o remount,ro /mnt
> sync
> dmesg > /tmp/dmesg.out <----- note all of the ext4_da_writepages error messages
> umount /mnt
> du -s /mnt
> sync
> mount $DEVICE /mnt
> du -s /mnt <--- note that size of the unpacked hierarcy is much smaller
>
> This doesn't happen if the ext4 filesystem is mounted with nodelalloc,
> so I assume the problem is in ext4_da_writepages().
>
This is the complete patch that I have. I haven't fully tested it
(right now waiting for the machine to be free). This should apply
after stable-boundary-undo.patch
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5665bec..bfe27d9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,6 +41,8 @@
#include "acl.h"
#include "ext4_extents.h"
+#define MPAGE_DA_EXTENT_TAIL 0x01
+
static inline int ext4_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
@@ -1580,6 +1582,8 @@ static void ext4_da_page_release_reservation(struct page *page,
unsigned long first_page, next_page; /* extent of pages */
get_block_t *get_block;
struct writeback_control *wbc;
+ int io_done;
+ long pages_written;
};
/*
@@ -1599,12 +1603,6 @@ static void ext4_da_page_release_reservation(struct page *page,
static int mpage_da_submit_io(struct mpage_da_data *mpd)
{
struct address_space *mapping = mpd->inode->i_mapping;
- struct mpage_data mpd_pp = {
- .bio = NULL,
- .last_block_in_bio = 0,
- .get_block = mpd->get_block,
- .use_writepage = 1,
- };
int ret = 0, err, nr_pages, i;
unsigned long index, end;
struct pagevec pvec;
@@ -1628,7 +1626,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
break;
index++;
- err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+ err = mapping->a_ops->writepage(page, mpd->wbc);
+ if (!err)
+ mpd->pages_written++;
/*
* In error case, we have to continue because
@@ -1640,8 +1640,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
}
pagevec_release(&pvec);
}
- if (mpd_pp.bio)
- mpage_bio_submit(WRITE, mpd_pp.bio);
return ret;
}
@@ -1708,6 +1706,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
if (buffer_delay(bh)) {
bh->b_blocknr = pblock;
clear_buffer_delay(bh);
+ bh->b_bdev = inode->i_sb->s_bdev;
+ } else if (buffer_unwritten(bh)) {
+ bh->b_blocknr = pblock;
+ clear_buffer_unwritten(bh);
+ set_buffer_mapped(bh);
+ set_buffer_new(bh);
+ bh->b_bdev = inode->i_sb->s_bdev;
} else if (buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
@@ -1748,8 +1753,8 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
*/
static void mpage_da_map_blocks(struct mpage_da_data *mpd)
{
+ int err = 0;
struct buffer_head *lbh = &mpd->lbh;
- int err = 0, remain = lbh->b_size;
sector_t next = lbh->b_blocknr;
struct buffer_head new;
@@ -1759,38 +1764,28 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
if (buffer_mapped(lbh) && !buffer_delay(lbh))
return;
- while (remain) {
- new.b_state = lbh->b_state;
- new.b_blocknr = 0;
- new.b_size = remain;
- err = mpd->get_block(mpd->inode, next, &new, 1);
- if (err) {
- /*
- * Rather than implement own error handling
- * here, we just leave remaining blocks
- * unallocated and try again with ->writepage()
- */
- break;
- }
- BUG_ON(new.b_size == 0);
+ new.b_state = lbh->b_state;
+ new.b_blocknr = 0;
+ new.b_size = lbh->b_size;
+ err = mpd->get_block(mpd->inode, next, &new, 1);
+ if (err)
+ return;
+ BUG_ON(new.b_size == 0);
- if (buffer_new(&new))
- __unmap_underlying_blocks(mpd->inode, &new);
+ if (buffer_new(&new))
+ __unmap_underlying_blocks(mpd->inode, &new);
- /*
- * If blocks are delayed marked, we need to
- * put actual blocknr and drop delayed bit
- */
- if (buffer_delay(lbh))
- mpage_put_bnr_to_bhs(mpd, next, &new);
+ /*
+ * If blocks are delayed marked, we need to
+ * put actual blocknr and drop delayed bit
+ */
+ if (buffer_delay(lbh) || buffer_unwritten(lbh))
+ mpage_put_bnr_to_bhs(mpd, next, &new);
- /* go for the remaining blocks */
- next += new.b_size >> mpd->inode->i_blkbits;
- remain -= new.b_size;
- }
+ return;
}
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay) | (1 << BH_Unwritten))
/*
* mpage_add_bh_to_extent - try to add one more block to extent of blocks
@@ -1832,13 +1827,9 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
* need to flush current extent and start new one
*/
mpage_da_map_blocks(mpd);
-
- /*
- * Now start a new extent
- */
- lbh->b_size = bh->b_size;
- lbh->b_state = bh->b_state & BH_FLAGS;
- lbh->b_blocknr = logical;
+ mpage_da_submit_io(mpd);
+ mpd->io_done = 1;
+ return;
}
/*
@@ -1858,6 +1849,17 @@ static int __mpage_da_writepage(struct page *page,
struct buffer_head *bh, *head, fake;
sector_t logical;
+ if (mpd->io_done) {
+ /*
+ * Rest of the page in the page_vec
+ * redirty then and skip then. We will
+ * try to to write them again after
+ * starting a new transaction
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return MPAGE_DA_EXTENT_TAIL;
+ }
/*
* Can we merge this page to current extent?
*/
@@ -1869,6 +1871,13 @@ static int __mpage_da_writepage(struct page *page,
if (mpd->next_page != mpd->first_page) {
mpage_da_map_blocks(mpd);
mpage_da_submit_io(mpd);
+ /*
+ * skip rest of the page in the page_vec
+ */
+ mpd->io_done = 1;
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return MPAGE_DA_EXTENT_TAIL;
}
/*
@@ -1899,6 +1908,8 @@ static int __mpage_da_writepage(struct page *page,
set_buffer_dirty(bh);
set_buffer_uptodate(bh);
mpage_add_bh_to_extent(mpd, logical, bh);
+ if (mpd->io_done)
+ return MPAGE_DA_EXTENT_TAIL;
} else {
/*
* Page with regular buffer heads, just add all dirty ones
@@ -1907,8 +1918,11 @@ static int __mpage_da_writepage(struct page *page,
bh = head;
do {
BUG_ON(buffer_locked(bh));
- if (buffer_dirty(bh))
+ if (buffer_dirty(bh) && (!buffer_mapped(bh) || buffer_delay(bh))) {
mpage_add_bh_to_extent(mpd, logical, bh);
+ if (mpd->io_done)
+ return MPAGE_DA_EXTENT_TAIL;
+ }
logical++;
} while ((bh = bh->b_this_page) != head);
}
@@ -1943,6 +1957,7 @@ static int mpage_da_writepages(struct address_space *mapping,
get_block_t get_block)
{
struct mpage_da_data mpd;
+ long to_write;
int ret;
if (!get_block)
@@ -1956,17 +1971,22 @@ static int mpage_da_writepages(struct address_space *mapping,
mpd.first_page = 0;
mpd.next_page = 0;
mpd.get_block = get_block;
+ mpd.io_done = 0;
+ mpd.pages_written = 0;
+
+ to_write = wbc->nr_to_write;
ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
/*
* Handle last extent of pages
*/
- if (mpd.next_page != mpd.first_page) {
+ if (!mpd.io_done && mpd.next_page != mpd.first_page) {
mpage_da_map_blocks(&mpd);
mpage_da_submit_io(&mpd);
}
+ wbc->nr_to_write = to_write - mpd.pages_written;
return ret;
}
@@ -2178,10 +2198,7 @@ static int ext4_da_writepages(struct address_space *mapping,
int ret = 0;
long to_write;
loff_t range_start = 0;
- int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
- int max_credit_blocks = ext4_journal_max_transaction_buffers(inode);
- int need_credits_per_page = ext4_writepages_trans_blocks(inode, 1);
- int max_writeback_pages = (max_credit_blocks / blocks_per_page) / need_credits_per_page;
+ long pages_skipped = 0;
/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -2191,11 +2208,6 @@ static int ext4_da_writepages(struct address_space *mapping,
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
- if (wbc->nr_to_write > mapping->nrpages)
- wbc->nr_to_write = mapping->nrpages;
-
- to_write = wbc->nr_to_write;
-
if (!wbc->range_cyclic) {
/*
* If range_cyclic is not set force range_cont
@@ -2204,32 +2216,27 @@ static int ext4_da_writepages(struct address_space *mapping,
wbc->range_cont = 1;
range_start = wbc->range_start;
}
+ pages_skipped = wbc->pages_skipped;
- while (!ret && to_write) {
- /*
- * set the max dirty pages could be write at a time
- * to fit into the reserved transaction credits
- */
- if (wbc->nr_to_write > max_writeback_pages)
- wbc->nr_to_write = max_writeback_pages;
+restart_loop:
+ to_write = wbc->nr_to_write;
+ while (!ret && to_write > 0) {
/*
- * Estimate the worse case needed credits to write out
- * to_write pages
+ * we insert one extent at a time. So we need
+ * credit needed for single extent allocation.
+ * journalled mode is currently not supported
+ * by delalloc
*/
- needed_blocks = ext4_writepages_trans_blocks(inode,
- wbc->nr_to_write);
- while (needed_blocks > max_credit_blocks) {
- wbc->nr_to_write --;
- needed_blocks = ext4_writepages_trans_blocks(inode,
- wbc->nr_to_write);
- }
+ BUG_ON(ext4_should_journal_data(inode));
+ needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+
/* start a new transaction*/
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- printk(KERN_EMERG "%s: Not enough credits to flush %ld pages\n", __func__,
- wbc->nr_to_write);
+ printk(KERN_EMERG "%s: Not enough credits to flush %ld pages %d\n",
+ __func__, wbc->nr_to_write , ret);
dump_stack();
goto out_writepages;
}
@@ -2251,7 +2258,14 @@ static int ext4_da_writepages(struct address_space *mapping,
ret = mpage_da_writepages(mapping, wbc,
ext4_da_get_block_write);
ext4_journal_stop(handle);
- if (wbc->nr_to_write) {
+ if (ret == MPAGE_DA_EXTENT_TAIL) {
+ /*
+ * got one extent now try with
+ * rest of the pages
+ */
+ to_write += wbc->nr_to_write;
+ ret = 0;
+ } else if (wbc->nr_to_write) {
/*
* There is no more writeout needed
* or we requested for a noblocking writeout
@@ -2263,6 +2277,15 @@ static int ext4_da_writepages(struct address_space *mapping,
wbc->nr_to_write = to_write;
}
+ if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
+ /* We skipped pages in this loop */
+ wbc->range_start = range_start;
+ wbc->nr_to_write = to_write +
+ wbc->pages_skipped - pages_skipped;
+ wbc->pages_skipped = pages_skipped;
+ goto restart_loop;
+ }
+
out_writepages:
wbc->nr_to_write = to_write;
if (range_start)
next prev parent reply other threads:[~2008-08-04 16:35 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-08-02 20:07 Problem with delayed allocation Theodore Ts'o
2008-08-02 22:40 ` Theodore Tso
2008-08-04 3:16 ` Aneesh Kumar K.V
2008-08-04 14:08 ` Theodore Tso
2008-08-04 14:52 ` Aneesh Kumar K.V
2008-08-04 15:27 ` Aneesh Kumar K.V
2008-08-04 15:33 ` Aneesh Kumar K.V
2008-08-04 16:35 ` Aneesh Kumar K.V [this message]
2008-08-05 6:44 ` Theodore Tso
2008-08-05 6:52 ` Aneesh Kumar K.V
2008-08-05 13:21 ` Aneesh Kumar K.V
2008-08-05 13:47 ` Theodore Tso
2008-08-05 14:24 ` Aneesh Kumar K.V
2008-08-05 15:16 ` Theodore Tso
2008-08-06 10:05 ` Aneesh Kumar K.V
2008-08-06 10:11 ` Aneesh Kumar K.V
2008-08-07 0:49 ` Mingming Cao
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080804163505.GE9397@skywalker \
--to=aneesh.kumar@linux.vnet.ibm.com \
--cc=linux-ext4@vger.kernel.org \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.