All of lore.kernel.org
 help / color / mirror / Atom feed
From: bpm@sgi.com
To: Dave Chinner <david@fromorbit.com>
Cc: xfs@oss.sgi.com
Subject: Re: Issues with delalloc->real extent allocation
Date: Fri, 14 Jan 2011 17:32:26 -0600	[thread overview]
Message-ID: <20110114233226.GO28274@sgi.com> (raw)
In-Reply-To: <20110114214334.GN28274@sgi.com>

Hi Dave,

On Fri, Jan 14, 2011 at 03:43:34PM -0600, bpm@sgi.com wrote:
> On Fri, Jan 14, 2011 at 11:29:00AM +1100, Dave Chinner wrote:
> > I'm sure there are other ways to solve these problems, but these two
> > are the ones that come to mind for me.  I'm open to other solutions
> > or ways to improve on these ones, especially if they are simpler. ;)
> > Anyone got any ideas or improvements?
> 
> The direction I've been taking is to use XFS_BMAPI_EXACT in
> *xfs_iomap_write_allocate to ensure that an extent covering exactly the
> pages we're prepared to write out immediately is allocated and the rest
> of the delalloc extent is left as is.  This exercises some of the btree
> code more heavily and led to the discovery of the
> allocate-in-the-middle-of-a-delalloc-extent bug.  It also presents a
> performance issue which I've tried to resolve by extending
> xfs_probe_cluster to probe delalloc extents-- lock up all of the pages
> to be converted before performing the allocation and hold those locks
> until they are submitted for writeback.  It's not very pretty but it
> resolves the corruption.

Here's the xfs_page_state_convert side of the patch so you can get an
idea what am trying for, and how ugly it is.  ;^)

I have not ported the xfs_iomap_write_allocate bits yet.  It is against
an older version of xfs... but you get the idea.

-Ben

Index: sles11-src/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- sles11-src.orig/fs/xfs/linux-2.6/xfs_aops.c
+++ sles11-src/fs/xfs/linux-2.6/xfs_aops.c
@@ -585,41 +585,61 @@ STATIC unsigned int
 xfs_probe_page(
 	struct page		*page,
 	unsigned int		pg_offset,
-	int			mapped)
+	int			mapped,
+	unsigned int		type,
+	int			*entire)
 {
+	struct buffer_head	*bh, *head;
 	int			ret = 0;
 
 	if (PageWriteback(page))
 		return 0;
+	if (!page->mapping)
+		return 0;
+	if (!PageDirty(page))
+		return 0;
+	if (!page_has_buffers(page))
+		return 0;
 
-	if (page->mapping && PageDirty(page)) {
-		if (page_has_buffers(page)) {
-			struct buffer_head	*bh, *head;
-
-			bh = head = page_buffers(page);
-			do {
-				if (!buffer_uptodate(bh))
-					break;
-				if (mapped != buffer_mapped(bh))
-					break;
-				ret += bh->b_size;
-				if (ret >= pg_offset)
-					break;
-			} while ((bh = bh->b_this_page) != head);
-		} else
-			ret = mapped ? 0 : PAGE_CACHE_SIZE;
-	}
+	*entire = 0;
+	bh = head = page_buffers(page);
+	do {
+		if (!buffer_uptodate(bh))
+			break;
+		if (buffer_mapped(bh) != mapped)
+			break;
+		if (type == IOMAP_UNWRITTEN && !buffer_unwritten(bh))
+			break;
+		if (type == IOMAP_DELAY && !buffer_delay(bh))
+			break;
+		if (type == IOMAP_NEW && !buffer_dirty(bh))
+			break;
+
+		ret += bh->b_size;
+
+		if (ret >= pg_offset)
+			break;
+	} while ((bh = bh->b_this_page) != head);
+
+	if (bh == head)
+		*entire = 1;
 
 	return ret;
 }
 
+#define MAX_WRITEBACK_PAGES	1024
+
 STATIC size_t
 xfs_probe_cluster(
 	struct inode		*inode,
 	struct page		*startpage,
 	struct buffer_head	*bh,
 	struct buffer_head	*head,
-	int			mapped)
+	int			mapped,
+	unsigned int		type,
+	struct page		**pages,
+	int			*pagecount,
+	struct writeback_control *wbc)
 {
 	struct pagevec		pvec;
 	pgoff_t			tindex, tlast, tloff;
@@ -628,8 +648,15 @@ xfs_probe_cluster(
 
 	/* First sum forwards in this page */
 	do {
-		if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh)))
+		if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh))) {
+			return total;
+		} else if (type == IOMAP_UNWRITTEN && !buffer_unwritten(bh)) {
 			return total;
+		} else if (type == IOMAP_DELAY && !buffer_delay(bh)) {
+			return total;
+		} else if (type == IOMAP_NEW && !buffer_dirty(bh)) {
+			return total;
+		}
 		total += bh->b_size;
 	} while ((bh = bh->b_this_page) != head);
 
@@ -637,8 +664,9 @@ xfs_probe_cluster(
 	tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
 	tindex = startpage->index + 1;
 
-	/* Prune this back to avoid pathological behavior */
-	tloff = min(tlast, startpage->index + 64);
+	/* Prune this back to avoid pathological behavior, subtract 1 for the
+	 * first page. */
+	tloff = min(tlast, startpage->index + (pgoff_t)MAX_WRITEBACK_PAGES);
 
 	pagevec_init(&pvec, 0);
 	while (!done && tindex <= tloff) {
@@ -647,10 +675,10 @@ xfs_probe_cluster(
 		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
 			break;
 
-		for (i = 0; i < pagevec_count(&pvec); i++) {
+		for (i = 0; i < pagevec_count(&pvec) && !done; i++) {
 			struct page *page = pvec.pages[i];
 			size_t pg_offset, pg_len = 0;
-
+			int	entire = 0;
 			if (tindex == tlast) {
 				pg_offset =
 				    i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
@@ -658,20 +686,39 @@ xfs_probe_cluster(
 					done = 1;
 					break;
 				}
-			} else
+			} else {
 				pg_offset = PAGE_CACHE_SIZE;
-
-			if (page->index == tindex && trylock_page(page)) {
-				pg_len = xfs_probe_page(page, pg_offset, mapped);
-				unlock_page(page);
 			}
 
+			if (page->index == tindex &&
+			    *pagecount < MAX_WRITEBACK_PAGES - 1 &&
+			    trylock_page(page)) {
+	 			pg_len = xfs_probe_page(page, pg_offset,
+						mapped, type, &entire);
+				if (pg_len) {
+					pages[(*pagecount)++] = page;
+
+				} else {
+					unlock_page(page);
+				}
+			}
 			if (!pg_len) {
 				done = 1;
 				break;
 			}
 
 			total += pg_len;
+
+			/*
+			 * if probe did not succeed on all buffers in the page
+			 * we don't want to probe subsequent pages.  This
+			 * ensures that we don't have a mix of buffer types in
+			 * the iomap.
+			 */
+			if (!entire) {
+				done = 1;
+				break;
+			}
 			tindex++;
 		}
 
@@ -683,56 +730,19 @@ xfs_probe_cluster(
 }
 
 /*
- * Test if a given page is suitable for writing as part of an unwritten
- * or delayed allocate extent.
- */
-STATIC int
-xfs_is_delayed_page(
-	struct page		*page,
-	unsigned int		type)
-{
-	if (PageWriteback(page))
-		return 0;
-
-	if (page->mapping && page_has_buffers(page)) {
-		struct buffer_head	*bh, *head;
-		int			acceptable = 0;
-
-		bh = head = page_buffers(page);
-		do {
-			if (buffer_unwritten(bh))
-				acceptable = (type == IOMAP_UNWRITTEN);
-			else if (buffer_delay(bh))
-				acceptable = (type == IOMAP_DELAY);
-			else if (buffer_dirty(bh) && buffer_mapped(bh))
-				acceptable = (type == IOMAP_NEW);
-			else
-				break;
-		} while ((bh = bh->b_this_page) != head);
-
-		if (acceptable)
-			return 1;
-	}
-
-	return 0;
-}
-
-/*
  * Allocate & map buffers for page given the extent map. Write it out.
  * except for the original page of a writepage, this is called on
  * delalloc/unwritten pages only, for the original page it is possible
  * that the page has no mapping at all.
  */
-STATIC int
+STATIC void
 xfs_convert_page(
 	struct inode		*inode,
 	struct page		*page,
-	loff_t			tindex,
 	xfs_iomap_t		*mp,
 	xfs_ioend_t		**ioendp,
 	struct writeback_control *wbc,
-	int			startio,
-	int			all_bh)
+	int			startio)
 {
 	struct buffer_head	*bh, *head;
 	xfs_off_t		end_offset;
@@ -740,20 +750,9 @@ xfs_convert_page(
 	unsigned int		type;
 	int			bbits = inode->i_blkbits;
 	int			len, page_dirty;
-	int			count = 0, done = 0, uptodate = 1;
+	int			count = 0, uptodate = 1;
  	xfs_off_t		offset = page_offset(page);
 
-	if (page->index != tindex)
-		goto fail;
-	if (! trylock_page(page))
-		goto fail;
-	if (PageWriteback(page))
-		goto fail_unlock_page;
-	if (page->mapping != inode->i_mapping)
-		goto fail_unlock_page;
-	if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
-		goto fail_unlock_page;
-
 	/*
 	 * page_dirty is initially a count of buffers on the page before
 	 * EOF and is decremented as we move each into a cleanable state.
@@ -770,6 +769,8 @@ xfs_convert_page(
 	end_offset = min_t(unsigned long long,
 			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
 			i_size_read(inode));
+	end_offset = min_t(unsigned long long, end_offset,
+			(mp->iomap_offset + mp->iomap_bsize));
 
 	len = 1 << inode->i_blkbits;
 	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
@@ -779,12 +780,12 @@ xfs_convert_page(
 
 	bh = head = page_buffers(page);
 	do {
-		if (offset >= end_offset)
+		if (offset >= end_offset) {
 			break;
+		}
 		if (!buffer_uptodate(bh))
 			uptodate = 0;
 		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
-			done = 1;
 			continue;
 		}
 
@@ -794,10 +795,7 @@ xfs_convert_page(
 			else
 				type = IOMAP_DELAY;
 
-			if (!xfs_iomap_valid(mp, offset)) {
-				done = 1;
-				continue;
-			}
+			BUG_ON(!xfs_iomap_valid(mp, offset));
 
 			ASSERT(!(mp->iomap_flags & IOMAP_HOLE));
 			ASSERT(!(mp->iomap_flags & IOMAP_DELAY));
@@ -805,7 +803,7 @@ xfs_convert_page(
 			xfs_map_at_offset(bh, offset, bbits, mp);
 			if (startio) {
 				xfs_add_to_ioend(inode, bh, offset,
-						type, ioendp, done);
+						type, ioendp, 0 /* !done */);
 			} else {
 				set_buffer_dirty(bh);
 				unlock_buffer(bh);
@@ -814,15 +812,14 @@ xfs_convert_page(
 			page_dirty--;
 			count++;
 		} else {
+			WARN_ON(!xfs_iomap_valid(mp, offset));
 			type = IOMAP_NEW;
-			if (buffer_mapped(bh) && all_bh && startio) {
+			if (buffer_mapped(bh) && buffer_dirty(bh) && startio) {
 				lock_buffer(bh);
 				xfs_add_to_ioend(inode, bh, offset,
-						type, ioendp, done);
+						type, ioendp, 0 /* !done */);
 				count++;
 				page_dirty--;
-			} else {
-				done = 1;
 			}
 		}
 	} while (offset += len, (bh = bh->b_this_page) != head);
@@ -838,19 +835,16 @@ xfs_convert_page(
 			wbc->nr_to_write--;
 			if (bdi_write_congested(bdi)) {
 				wbc->encountered_congestion = 1;
-				done = 1;
 			} else if (wbc->nr_to_write <= 0) {
-				done = 1;
+				/* XXX ignore nr_to_write
+				done = 1; */
 			}
 		}
+		/* unlocks page */
 		xfs_start_page_writeback(page, wbc, !page_dirty, count);
+	} else {
+		unlock_page(page);
 	}
-
-	return done;
- fail_unlock_page:
-	unlock_page(page);
- fail:
-	return 1;
 }
 
 /*
@@ -860,33 +854,17 @@ xfs_convert_page(
 STATIC void
 xfs_cluster_write(
 	struct inode		*inode,
-	pgoff_t			tindex,
+	struct page		**pages,
+	int			pagecount,
 	xfs_iomap_t		*iomapp,
 	xfs_ioend_t		**ioendp,
 	struct writeback_control *wbc,
-	int			startio,
-	int			all_bh,
-	pgoff_t			tlast)
+	int			startio)
 {
-	struct pagevec		pvec;
-	int			done = 0, i;
-
-	pagevec_init(&pvec, 0);
-	while (!done && tindex <= tlast) {
-		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
-		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
-			break;
-
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-					iomapp, ioendp, wbc, startio, all_bh);
-			if (done)
-				break;
-		}
+	int			i;
 
-		pagevec_release(&pvec);
-		cond_resched();
+	for (i = 0; i < pagecount; i++) {
+		xfs_convert_page(inode, pages[i], iomapp, ioendp, wbc, startio);
 	}
 }
 
@@ -908,7 +886,6 @@ xfs_cluster_write(
  * bh->b_states's will not agree and only ones setup by BPW/BCW will have
  * valid state, thus the whole page must be written out thing.
  */
-
 STATIC int
 xfs_page_state_convert(
 	struct inode	*inode,
@@ -924,12 +901,13 @@ xfs_page_state_convert(
 	unsigned long           p_offset = 0;
 	unsigned int		type;
 	__uint64_t              end_offset;
-	pgoff_t                 end_index, last_index, tlast;
+	pgoff_t                 end_index, last_index;
 	ssize_t			size, len;
 	int			flags, err, iomap_valid = 0, uptodate = 1;
 	int			page_dirty, count = 0;
 	int			trylock = 0;
-	int			all_bh = unmapped;
+	struct page		**pages;
+	int			pagecount = 0, i;
 
 	if (startio) {
 		if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
@@ -949,6 +927,9 @@ xfs_page_state_convert(
 		}
 	}
 
+	pages = kmem_zalloc(sizeof(struct page*) * MAX_WRITEBACK_PAGES, KM_NOFS);
+
+
 	/*
 	 * page_dirty is initially a count of buffers on the page before
 	 * EOF and is decremented as we move each into a cleanable state.
@@ -1036,14 +1017,23 @@ xfs_page_state_convert(
 				 * for unwritten extent conversion.
 				 */
 				new_ioend = 1;
-				if (type == IOMAP_NEW) {
-					size = xfs_probe_cluster(inode,
-							page, bh, head, 0);
-				} else {
-					size = len;
+				size = 0;
+				if (type == IOMAP_NEW && !pagecount) {
+					size = xfs_probe_cluster(inode, page,
+							bh, head,
+							0 /* !mapped */, type,
+						       	pages, &pagecount, wbc);
+				} else if ((type == IOMAP_DELAY ||
+					    type == IOMAP_UNWRITTEN) &&
+						!pagecount) {
+					size = xfs_probe_cluster(inode, page,
+							bh, head,
+							1 /* mapped */, type,
+							pages, &pagecount, wbc);
 				}
 
-				err = xfs_map_blocks(inode, offset, size,
+				err = xfs_map_blocks(inode, offset,
+						size ? size : len,
 						&iomap, flags);
 				if (err)
 					goto error;
@@ -1072,9 +1062,16 @@ xfs_page_state_convert(
 			 */
 			if (!iomap_valid || flags != BMAPI_READ) {
 				flags = BMAPI_READ;
-				size = xfs_probe_cluster(inode, page, bh,
-								head, 1);
-				err = xfs_map_blocks(inode, offset, size,
+				size = 0;
+				if (!pagecount) {
+					size = xfs_probe_cluster(inode, page,
+							bh, head,
+							1 /* mapped */,
+							IOMAP_NEW,
+							pages, &pagecount, wbc);
+				}
+				err = xfs_map_blocks(inode, offset,
+						size ? size : len,
 						&iomap, flags);
 				if (err)
 					goto error;
@@ -1092,8 +1089,6 @@ xfs_page_state_convert(
 			type = IOMAP_NEW;
 			if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
 				ASSERT(buffer_mapped(bh));
-				if (iomap_valid)
-					all_bh = 1;
 				xfs_add_to_ioend(inode, bh, offset, type,
 						&ioend, !iomap_valid);
 				page_dirty--;
@@ -1104,6 +1099,8 @@ xfs_page_state_convert(
 		} else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
 			   (unmapped || startio)) {
 			iomap_valid = 0;
+		} else {
+			WARN_ON(1);
 		}
 
 		if (!iohead)
@@ -1117,13 +1114,11 @@ xfs_page_state_convert(
 	if (startio)
 		xfs_start_page_writeback(page, wbc, 1, count);
 
-	if (ioend && iomap_valid) {
-		offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
-					PAGE_CACHE_SHIFT;
-		tlast = min_t(pgoff_t, offset, last_index);
-		xfs_cluster_write(inode, page->index + 1, &iomap, &ioend,
-					wbc, startio, all_bh, tlast);
+	if (ioend && iomap_valid && pagecount) {
+		xfs_cluster_write(inode, pages, pagecount, &iomap, &ioend,
+					wbc, startio);
 	}
+	kmem_free(pages, sizeof(struct page *) * MAX_WRITEBACK_PAGES);
 
 	if (iohead)
 		xfs_submit_ioend(iohead);
@@ -1133,7 +1128,12 @@ xfs_page_state_convert(
 error:
 	if (iohead)
 		xfs_cancel_ioend(iohead);
-
+	if (pages) {
+		for (i = 0; i < pagecount; i++) {
+			unlock_page(pages[i]);
+		}
+		kmem_free(pages, sizeof(struct page *) * MAX_WRITEBACK_PAGES);
+	}
 	return err;
 }
 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

  reply	other threads:[~2011-01-14 23:30 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-01-14  0:29 Issues with delalloc->real extent allocation Dave Chinner
2011-01-14 16:40 ` Geoffrey Wehrman
2011-01-14 22:59   ` Dave Chinner
2011-01-15  4:16     ` Geoffrey Wehrman
2011-01-17  5:18       ` Dave Chinner
2011-01-17 14:37         ` Geoffrey Wehrman
2011-01-18  0:24           ` Dave Chinner
2011-01-18 14:30             ` Geoffrey Wehrman
2011-01-18 20:40               ` Christoph Hellwig
2011-01-18 22:03                 ` Dave Chinner
2011-01-14 21:43 ` bpm
2011-01-14 23:32   ` bpm [this message]
2011-01-14 23:50   ` bpm
2011-01-14 23:55   ` Dave Chinner
2011-01-17 20:12     ` bpm
2011-01-18  1:44       ` Dave Chinner
2011-01-18 20:47     ` Christoph Hellwig
2011-01-18 23:18       ` Dave Chinner
2011-01-19 12:03         ` Christoph Hellwig
2011-01-19 13:31           ` Dave Chinner
2011-01-19 13:55             ` Christoph Hellwig
2011-01-20  1:33               ` Dave Chinner
2011-01-20 11:16                 ` Christoph Hellwig
2011-01-21  1:59                   ` Dave Chinner
2011-01-20 14:45                 ` Geoffrey Wehrman
2011-01-21  2:51                   ` Dave Chinner
2011-01-21 14:41                     ` Geoffrey Wehrman
2011-01-23 23:26                       ` Dave Chinner
2011-01-17  0:28   ` Lachlan McIlroy
2011-01-17  4:37     ` Dave Chinner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110114233226.GO28274@sgi.com \
    --to=bpm@sgi.com \
    --cc=david@fromorbit.com \
    --cc=xfs@oss.sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.