From: bpm@sgi.com
To: Dave Chinner <david@fromorbit.com>
Cc: xfs@oss.sgi.com
Subject: Re: Issues with delalloc->real extent allocation
Date: Fri, 14 Jan 2011 17:32:26 -0600 [thread overview]
Message-ID: <20110114233226.GO28274@sgi.com> (raw)
In-Reply-To: <20110114214334.GN28274@sgi.com>
Hi Dave,
On Fri, Jan 14, 2011 at 03:43:34PM -0600, bpm@sgi.com wrote:
> On Fri, Jan 14, 2011 at 11:29:00AM +1100, Dave Chinner wrote:
> > I'm sure there are other ways to solve these problems, but these two
> > are the ones that come to mind for me. I'm open to other solutions
> > or ways to improve on these ones, especially if they are simpler. ;)
> > Anyone got any ideas or improvements?
>
> The direction I've been taking is to use XFS_BMAPI_EXACT in
> *xfs_iomap_write_allocate to ensure that an extent covering exactly the
> pages we're prepared to write out immediately is allocated and the rest
> of the delalloc extent is left as is. This exercises some of the btree
> code more heavily and led to the discovery of the
> allocate-in-the-middle-of-a-delalloc-extent bug. It also presents a
> performance issue which I've tried to resolve by extending
> xfs_probe_cluster to probe delalloc extents-- lock up all of the pages
> to be converted before performing the allocation and hold those locks
> until they are submitted for writeback. It's not very pretty but it
> resolves the corruption.
Here's the xfs_page_state_convert side of the patch so you can get an
idea what am trying for, and how ugly it is. ;^)
I have not ported the xfs_iomap_write_allocate bits yet. It is against
an older version of xfs... but you get the idea.
-Ben
Index: sles11-src/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- sles11-src.orig/fs/xfs/linux-2.6/xfs_aops.c
+++ sles11-src/fs/xfs/linux-2.6/xfs_aops.c
@@ -585,41 +585,61 @@ STATIC unsigned int
xfs_probe_page(
struct page *page,
unsigned int pg_offset,
- int mapped)
+ int mapped,
+ unsigned int type,
+ int *entire)
{
+ struct buffer_head *bh, *head;
int ret = 0;
if (PageWriteback(page))
return 0;
+ if (!page->mapping)
+ return 0;
+ if (!PageDirty(page))
+ return 0;
+ if (!page_has_buffers(page))
+ return 0;
- if (page->mapping && PageDirty(page)) {
- if (page_has_buffers(page)) {
- struct buffer_head *bh, *head;
-
- bh = head = page_buffers(page);
- do {
- if (!buffer_uptodate(bh))
- break;
- if (mapped != buffer_mapped(bh))
- break;
- ret += bh->b_size;
- if (ret >= pg_offset)
- break;
- } while ((bh = bh->b_this_page) != head);
- } else
- ret = mapped ? 0 : PAGE_CACHE_SIZE;
- }
+ *entire = 0;
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_uptodate(bh))
+ break;
+ if (buffer_mapped(bh) != mapped)
+ break;
+ if (type == IOMAP_UNWRITTEN && !buffer_unwritten(bh))
+ break;
+ if (type == IOMAP_DELAY && !buffer_delay(bh))
+ break;
+ if (type == IOMAP_NEW && !buffer_dirty(bh))
+ break;
+
+ ret += bh->b_size;
+
+ if (ret >= pg_offset)
+ break;
+ } while ((bh = bh->b_this_page) != head);
+
+ if (bh == head)
+ *entire = 1;
return ret;
}
+#define MAX_WRITEBACK_PAGES 1024
+
STATIC size_t
xfs_probe_cluster(
struct inode *inode,
struct page *startpage,
struct buffer_head *bh,
struct buffer_head *head,
- int mapped)
+ int mapped,
+ unsigned int type,
+ struct page **pages,
+ int *pagecount,
+ struct writeback_control *wbc)
{
struct pagevec pvec;
pgoff_t tindex, tlast, tloff;
@@ -628,8 +648,15 @@ xfs_probe_cluster(
/* First sum forwards in this page */
do {
- if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh)))
+ if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh))) {
+ return total;
+ } else if (type == IOMAP_UNWRITTEN && !buffer_unwritten(bh)) {
return total;
+ } else if (type == IOMAP_DELAY && !buffer_delay(bh)) {
+ return total;
+ } else if (type == IOMAP_NEW && !buffer_dirty(bh)) {
+ return total;
+ }
total += bh->b_size;
} while ((bh = bh->b_this_page) != head);
@@ -637,8 +664,9 @@ xfs_probe_cluster(
tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
tindex = startpage->index + 1;
- /* Prune this back to avoid pathological behavior */
- tloff = min(tlast, startpage->index + 64);
+ /* Prune this back to avoid pathological behavior, subtract 1 for the
+ * first page. */
+ tloff = min(tlast, startpage->index + (pgoff_t)MAX_WRITEBACK_PAGES);
pagevec_init(&pvec, 0);
while (!done && tindex <= tloff) {
@@ -647,10 +675,10 @@ xfs_probe_cluster(
if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
break;
- for (i = 0; i < pagevec_count(&pvec); i++) {
+ for (i = 0; i < pagevec_count(&pvec) && !done; i++) {
struct page *page = pvec.pages[i];
size_t pg_offset, pg_len = 0;
-
+ int entire = 0;
if (tindex == tlast) {
pg_offset =
i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
@@ -658,20 +686,39 @@ xfs_probe_cluster(
done = 1;
break;
}
- } else
+ } else {
pg_offset = PAGE_CACHE_SIZE;
-
- if (page->index == tindex && trylock_page(page)) {
- pg_len = xfs_probe_page(page, pg_offset, mapped);
- unlock_page(page);
}
+ if (page->index == tindex &&
+ *pagecount < MAX_WRITEBACK_PAGES - 1 &&
+ trylock_page(page)) {
+ pg_len = xfs_probe_page(page, pg_offset,
+ mapped, type, &entire);
+ if (pg_len) {
+ pages[(*pagecount)++] = page;
+
+ } else {
+ unlock_page(page);
+ }
+ }
if (!pg_len) {
done = 1;
break;
}
total += pg_len;
+
+ /*
+ * if probe did not succeed on all buffers in the page
+ * we don't want to probe subsequent pages. This
+ * ensures that we don't have a mix of buffer types in
+ * the iomap.
+ */
+ if (!entire) {
+ done = 1;
+ break;
+ }
tindex++;
}
@@ -683,56 +730,19 @@ xfs_probe_cluster(
}
/*
- * Test if a given page is suitable for writing as part of an unwritten
- * or delayed allocate extent.
- */
-STATIC int
-xfs_is_delayed_page(
- struct page *page,
- unsigned int type)
-{
- if (PageWriteback(page))
- return 0;
-
- if (page->mapping && page_has_buffers(page)) {
- struct buffer_head *bh, *head;
- int acceptable = 0;
-
- bh = head = page_buffers(page);
- do {
- if (buffer_unwritten(bh))
- acceptable = (type == IOMAP_UNWRITTEN);
- else if (buffer_delay(bh))
- acceptable = (type == IOMAP_DELAY);
- else if (buffer_dirty(bh) && buffer_mapped(bh))
- acceptable = (type == IOMAP_NEW);
- else
- break;
- } while ((bh = bh->b_this_page) != head);
-
- if (acceptable)
- return 1;
- }
-
- return 0;
-}
-
-/*
* Allocate & map buffers for page given the extent map. Write it out.
* except for the original page of a writepage, this is called on
* delalloc/unwritten pages only, for the original page it is possible
* that the page has no mapping at all.
*/
-STATIC int
+STATIC void
xfs_convert_page(
struct inode *inode,
struct page *page,
- loff_t tindex,
xfs_iomap_t *mp,
xfs_ioend_t **ioendp,
struct writeback_control *wbc,
- int startio,
- int all_bh)
+ int startio)
{
struct buffer_head *bh, *head;
xfs_off_t end_offset;
@@ -740,20 +750,9 @@ xfs_convert_page(
unsigned int type;
int bbits = inode->i_blkbits;
int len, page_dirty;
- int count = 0, done = 0, uptodate = 1;
+ int count = 0, uptodate = 1;
xfs_off_t offset = page_offset(page);
- if (page->index != tindex)
- goto fail;
- if (! trylock_page(page))
- goto fail;
- if (PageWriteback(page))
- goto fail_unlock_page;
- if (page->mapping != inode->i_mapping)
- goto fail_unlock_page;
- if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
- goto fail_unlock_page;
-
/*
* page_dirty is initially a count of buffers on the page before
* EOF and is decremented as we move each into a cleanable state.
@@ -770,6 +769,8 @@ xfs_convert_page(
end_offset = min_t(unsigned long long,
(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
i_size_read(inode));
+ end_offset = min_t(unsigned long long, end_offset,
+ (mp->iomap_offset + mp->iomap_bsize));
len = 1 << inode->i_blkbits;
p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
@@ -779,12 +780,12 @@ xfs_convert_page(
bh = head = page_buffers(page);
do {
- if (offset >= end_offset)
+ if (offset >= end_offset) {
break;
+ }
if (!buffer_uptodate(bh))
uptodate = 0;
if (!(PageUptodate(page) || buffer_uptodate(bh))) {
- done = 1;
continue;
}
@@ -794,10 +795,7 @@ xfs_convert_page(
else
type = IOMAP_DELAY;
- if (!xfs_iomap_valid(mp, offset)) {
- done = 1;
- continue;
- }
+ BUG_ON(!xfs_iomap_valid(mp, offset));
ASSERT(!(mp->iomap_flags & IOMAP_HOLE));
ASSERT(!(mp->iomap_flags & IOMAP_DELAY));
@@ -805,7 +803,7 @@ xfs_convert_page(
xfs_map_at_offset(bh, offset, bbits, mp);
if (startio) {
xfs_add_to_ioend(inode, bh, offset,
- type, ioendp, done);
+ type, ioendp, 0 /* !done */);
} else {
set_buffer_dirty(bh);
unlock_buffer(bh);
@@ -814,15 +812,14 @@ xfs_convert_page(
page_dirty--;
count++;
} else {
+ WARN_ON(!xfs_iomap_valid(mp, offset));
type = IOMAP_NEW;
- if (buffer_mapped(bh) && all_bh && startio) {
+ if (buffer_mapped(bh) && buffer_dirty(bh) && startio) {
lock_buffer(bh);
xfs_add_to_ioend(inode, bh, offset,
- type, ioendp, done);
+ type, ioendp, 0 /* !done */);
count++;
page_dirty--;
- } else {
- done = 1;
}
}
} while (offset += len, (bh = bh->b_this_page) != head);
@@ -838,19 +835,16 @@ xfs_convert_page(
wbc->nr_to_write--;
if (bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
- done = 1;
} else if (wbc->nr_to_write <= 0) {
- done = 1;
+ /* XXX ignore nr_to_write
+ done = 1; */
}
}
+ /* unlocks page */
xfs_start_page_writeback(page, wbc, !page_dirty, count);
+ } else {
+ unlock_page(page);
}
-
- return done;
- fail_unlock_page:
- unlock_page(page);
- fail:
- return 1;
}
/*
@@ -860,33 +854,17 @@ xfs_convert_page(
STATIC void
xfs_cluster_write(
struct inode *inode,
- pgoff_t tindex,
+ struct page **pages,
+ int pagecount,
xfs_iomap_t *iomapp,
xfs_ioend_t **ioendp,
struct writeback_control *wbc,
- int startio,
- int all_bh,
- pgoff_t tlast)
+ int startio)
{
- struct pagevec pvec;
- int done = 0, i;
-
- pagevec_init(&pvec, 0);
- while (!done && tindex <= tlast) {
- unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
- if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
- break;
-
- for (i = 0; i < pagevec_count(&pvec); i++) {
- done = xfs_convert_page(inode, pvec.pages[i], tindex++,
- iomapp, ioendp, wbc, startio, all_bh);
- if (done)
- break;
- }
+ int i;
- pagevec_release(&pvec);
- cond_resched();
+ for (i = 0; i < pagecount; i++) {
+ xfs_convert_page(inode, pages[i], iomapp, ioendp, wbc, startio);
}
}
@@ -908,7 +886,6 @@ xfs_cluster_write(
* bh->b_states's will not agree and only ones setup by BPW/BCW will have
* valid state, thus the whole page must be written out thing.
*/
-
STATIC int
xfs_page_state_convert(
struct inode *inode,
@@ -924,12 +901,13 @@ xfs_page_state_convert(
unsigned long p_offset = 0;
unsigned int type;
__uint64_t end_offset;
- pgoff_t end_index, last_index, tlast;
+ pgoff_t end_index, last_index;
ssize_t size, len;
int flags, err, iomap_valid = 0, uptodate = 1;
int page_dirty, count = 0;
int trylock = 0;
- int all_bh = unmapped;
+ struct page **pages;
+ int pagecount = 0, i;
if (startio) {
if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
@@ -949,6 +927,9 @@ xfs_page_state_convert(
}
}
+ pages = kmem_zalloc(sizeof(struct page*) * MAX_WRITEBACK_PAGES, KM_NOFS);
+
+
/*
* page_dirty is initially a count of buffers on the page before
* EOF and is decremented as we move each into a cleanable state.
@@ -1036,14 +1017,23 @@ xfs_page_state_convert(
* for unwritten extent conversion.
*/
new_ioend = 1;
- if (type == IOMAP_NEW) {
- size = xfs_probe_cluster(inode,
- page, bh, head, 0);
- } else {
- size = len;
+ size = 0;
+ if (type == IOMAP_NEW && !pagecount) {
+ size = xfs_probe_cluster(inode, page,
+ bh, head,
+ 0 /* !mapped */, type,
+ pages, &pagecount, wbc);
+ } else if ((type == IOMAP_DELAY ||
+ type == IOMAP_UNWRITTEN) &&
+ !pagecount) {
+ size = xfs_probe_cluster(inode, page,
+ bh, head,
+ 1 /* mapped */, type,
+ pages, &pagecount, wbc);
}
- err = xfs_map_blocks(inode, offset, size,
+ err = xfs_map_blocks(inode, offset,
+ size ? size : len,
&iomap, flags);
if (err)
goto error;
@@ -1072,9 +1062,16 @@ xfs_page_state_convert(
*/
if (!iomap_valid || flags != BMAPI_READ) {
flags = BMAPI_READ;
- size = xfs_probe_cluster(inode, page, bh,
- head, 1);
- err = xfs_map_blocks(inode, offset, size,
+ size = 0;
+ if (!pagecount) {
+ size = xfs_probe_cluster(inode, page,
+ bh, head,
+ 1 /* mapped */,
+ IOMAP_NEW,
+ pages, &pagecount, wbc);
+ }
+ err = xfs_map_blocks(inode, offset,
+ size ? size : len,
&iomap, flags);
if (err)
goto error;
@@ -1092,8 +1089,6 @@ xfs_page_state_convert(
type = IOMAP_NEW;
if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
ASSERT(buffer_mapped(bh));
- if (iomap_valid)
- all_bh = 1;
xfs_add_to_ioend(inode, bh, offset, type,
&ioend, !iomap_valid);
page_dirty--;
@@ -1104,6 +1099,8 @@ xfs_page_state_convert(
} else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
(unmapped || startio)) {
iomap_valid = 0;
+ } else {
+ WARN_ON(1);
}
if (!iohead)
@@ -1117,13 +1114,11 @@ xfs_page_state_convert(
if (startio)
xfs_start_page_writeback(page, wbc, 1, count);
- if (ioend && iomap_valid) {
- offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
- PAGE_CACHE_SHIFT;
- tlast = min_t(pgoff_t, offset, last_index);
- xfs_cluster_write(inode, page->index + 1, &iomap, &ioend,
- wbc, startio, all_bh, tlast);
+ if (ioend && iomap_valid && pagecount) {
+ xfs_cluster_write(inode, pages, pagecount, &iomap, &ioend,
+ wbc, startio);
}
+ kmem_free(pages, sizeof(struct page *) * MAX_WRITEBACK_PAGES);
if (iohead)
xfs_submit_ioend(iohead);
@@ -1133,7 +1128,12 @@ xfs_page_state_convert(
error:
if (iohead)
xfs_cancel_ioend(iohead);
-
+ if (pages) {
+ for (i = 0; i < pagecount; i++) {
+ unlock_page(pages[i]);
+ }
+ kmem_free(pages, sizeof(struct page *) * MAX_WRITEBACK_PAGES);
+ }
return err;
}
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
next prev parent reply other threads:[~2011-01-14 23:30 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-01-14 0:29 Issues with delalloc->real extent allocation Dave Chinner
2011-01-14 16:40 ` Geoffrey Wehrman
2011-01-14 22:59 ` Dave Chinner
2011-01-15 4:16 ` Geoffrey Wehrman
2011-01-17 5:18 ` Dave Chinner
2011-01-17 14:37 ` Geoffrey Wehrman
2011-01-18 0:24 ` Dave Chinner
2011-01-18 14:30 ` Geoffrey Wehrman
2011-01-18 20:40 ` Christoph Hellwig
2011-01-18 22:03 ` Dave Chinner
2011-01-14 21:43 ` bpm
2011-01-14 23:32 ` bpm [this message]
2011-01-14 23:50 ` bpm
2011-01-14 23:55 ` Dave Chinner
2011-01-17 20:12 ` bpm
2011-01-18 1:44 ` Dave Chinner
2011-01-18 20:47 ` Christoph Hellwig
2011-01-18 23:18 ` Dave Chinner
2011-01-19 12:03 ` Christoph Hellwig
2011-01-19 13:31 ` Dave Chinner
2011-01-19 13:55 ` Christoph Hellwig
2011-01-20 1:33 ` Dave Chinner
2011-01-20 11:16 ` Christoph Hellwig
2011-01-21 1:59 ` Dave Chinner
2011-01-20 14:45 ` Geoffrey Wehrman
2011-01-21 2:51 ` Dave Chinner
2011-01-21 14:41 ` Geoffrey Wehrman
2011-01-23 23:26 ` Dave Chinner
2011-01-17 0:28 ` Lachlan McIlroy
2011-01-17 4:37 ` Dave Chinner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20110114233226.GO28274@sgi.com \
--to=bpm@sgi.com \
--cc=david@fromorbit.com \
--cc=xfs@oss.sgi.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.