linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch][rfc] mm: hold page lock over page_mkwrite
@ 2009-02-25  9:36 Nick Piggin
  2009-02-25 16:42 ` Zach Brown
                   ` (3 more replies)
  0 siblings, 4 replies; 19+ messages in thread
From: Nick Piggin @ 2009-02-25  9:36 UTC (permalink / raw)
  To: linux-fsdevel, Linux Memory Management List


I want to have the page be protected by page lock between page_mkwrite
notification to the filesystem, and the actual setting of the page
dirty. Do this by holding the page lock over page_mkwrite, and keep it
held until after set_page_dirty.

I need this in fsblock because I am working to ensure filesystem metadata
can be correctly allocated and refcounted. This means that page cleaning
should not require memory allocation (to be really robust).

Without this patch, then for example we could get a concurrent writeout
after the page_mkwrite (which allocates page metadata required to clean
it), but before the set_page_dirty. The writeout will clean the page and
notice that the metadata is now unused and may be deallocated (because
it appears clean as set_page_dirty hasn't been called yet). So at this
point the page may be dirtied via the pte without enough metadata to be
able to write it back.

---
 fs/buffer.c |   11 +++++------
 mm/memory.c |   51 ++++++++++++++++++++++++++++-----------------------
 2 files changed, 33 insertions(+), 29 deletions(-)

Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -2474,12 +2474,12 @@ block_page_mkwrite(struct vm_area_struct
 	loff_t size;
 	int ret = -EINVAL;
 
-	lock_page(page);
+	BUG_ON(page->mapping != inode->i_mapping);
+
 	size = i_size_read(inode);
-	if ((page->mapping != inode->i_mapping) ||
-	    (page_offset(page) > size)) {
+	if (page_offset(page) > size) {
 		/* page got truncated out from underneath us */
-		goto out_unlock;
+		goto out;
 	}
 
 	/* page is wholly or partially inside EOF */
@@ -2492,8 +2492,7 @@ block_page_mkwrite(struct vm_area_struct
 	if (!ret)
 		ret = block_commit_write(page, 0, end);
 
-out_unlock:
-	unlock_page(page);
+out:
 	return ret;
 }
 
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -1948,9 +1948,16 @@ static int do_wp_page(struct mm_struct *
 			 */
 			page_cache_get(old_page);
 			pte_unmap_unlock(page_table, ptl);
+			lock_page(old_page);
+			if (!old_page->mapping) {
+				ret = 0;
+				goto out_error;
+			}
 
-			if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
-				goto unwritable_page;
+			if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) {
+				ret = VM_FAULT_SIGBUS;
+				goto out_error;
+			}
 
 			/*
 			 * Since we dropped the lock we need to revalidate
@@ -1960,9 +1967,11 @@ static int do_wp_page(struct mm_struct *
 			 */
 			page_table = pte_offset_map_lock(mm, pmd, address,
 							 &ptl);
-			page_cache_release(old_page);
-			if (!pte_same(*page_table, orig_pte))
+			if (!pte_same(*page_table, orig_pte)) {
+				unlock_page(old_page);
+				page_cache_release(old_page);
 				goto unlock;
+			}
 
 			page_mkwrite = 1;
 		}
@@ -2085,19 +2094,30 @@ unlock:
 		 *
 		 * do_no_page is protected similarly.
 		 */
-		wait_on_page_locked(dirty_page);
+		if (!page_mkwrite)
+			wait_on_page_locked(dirty_page);
 		set_page_dirty_balance(dirty_page, page_mkwrite);
 		put_page(dirty_page);
+		if (page_mkwrite) {
+			unlock_page(old_page);
+			page_cache_release(old_page);
+		}
 	}
 	return ret;
 oom_free_new:
 	page_cache_release(new_page);
 oom:
-	if (old_page)
+	if (old_page) {
+		if (page_mkwrite) {
+			unlock_page(old_page);
+			page_cache_release(old_page);
+		}
 		page_cache_release(old_page);
+	}
 	return VM_FAULT_OOM;
 
-unwritable_page:
+out_error:
+	unlock_page(old_page);
 	page_cache_release(old_page);
 	return VM_FAULT_SIGBUS;
 }
@@ -2643,23 +2663,9 @@ static int __do_fault(struct mm_struct *
 			 * to become writable
 			 */
 			if (vma->vm_ops->page_mkwrite) {
-				unlock_page(page);
 				if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
 					ret = VM_FAULT_SIGBUS;
 					anon = 1; /* no anon but release vmf.page */
-					goto out_unlocked;
-				}
-				lock_page(page);
-				/*
-				 * XXX: this is not quite right (racy vs
-				 * invalidate) to unlock and relock the page
-				 * like this, however a better fix requires
-				 * reworking page_mkwrite locking API, which
-				 * is better done later.
-				 */
-				if (!page->mapping) {
-					ret = 0;
-					anon = 1; /* no anon but release vmf.page */
 					goto out;
 				}
 				page_mkwrite = 1;
@@ -2713,8 +2719,6 @@ static int __do_fault(struct mm_struct *
 	pte_unmap_unlock(page_table, ptl);
 
 out:
-	unlock_page(vmf.page);
-out_unlocked:
 	if (anon)
 		page_cache_release(vmf.page);
 	else if (dirty_page) {
@@ -2724,6 +2728,7 @@ out_unlocked:
 		set_page_dirty_balance(dirty_page, page_mkwrite);
 		put_page(dirty_page);
 	}
+	unlock_page(vmf.page);
 
 	return ret;
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-02-25  9:36 [patch][rfc] mm: hold page lock over page_mkwrite Nick Piggin
@ 2009-02-25 16:42 ` Zach Brown
  2009-02-25 16:55   ` Nick Piggin
  2009-02-25 16:48 ` Chris Mason
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 19+ messages in thread
From: Zach Brown @ 2009-02-25 16:42 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-fsdevel, Linux Memory Management List, Mark Fasheh,
	Sage Weil

Nick Piggin wrote:
> I want to have the page be protected by page lock between page_mkwrite
> notification to the filesystem, and the actual setting of the page
> dirty. Do this by holding the page lock over page_mkwrite, and keep it
> held until after set_page_dirty.

I fear that this will end up creating lock inversions with file systems
who grab cross-node locks, which are ordered outside of the page lock,
inside their ->page_mkwrite().  See ocfs2's call of ocfs2_inode_lock()
from ocfs2_page_mkwrite().

In a sense, it's prepare_write() all over again.  Please don't call into
file systems after having acquired the page lock.

Instead, can we get a helper that the file system would call after
having acquired its locks and the page lock?  Something like that.

- z

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-02-25  9:36 [patch][rfc] mm: hold page lock over page_mkwrite Nick Piggin
  2009-02-25 16:42 ` Zach Brown
@ 2009-02-25 16:48 ` Chris Mason
  2009-02-26  9:20 ` Peter Zijlstra
  2009-03-01  8:17 ` Dave Chinner
  3 siblings, 0 replies; 19+ messages in thread
From: Chris Mason @ 2009-02-25 16:48 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-fsdevel, Linux Memory Management List

On Wed, 2009-02-25 at 10:36 +0100, Nick Piggin wrote:
> I want to have the page be protected by page lock between page_mkwrite
> notification to the filesystem, and the actual setting of the page
> dirty. Do this by holding the page lock over page_mkwrite, and keep it
> held until after set_page_dirty.

Are any of the filesystems ordering the journal lock outside the page
lock?  I thought ocfs2 and ext4 were either doing this or discussing it.
If they are, this will make fsblock hard to use for them.

-chris


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-02-25 16:42 ` Zach Brown
@ 2009-02-25 16:55   ` Nick Piggin
  2009-02-25 16:58     ` Zach Brown
  0 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2009-02-25 16:55 UTC (permalink / raw)
  To: Zach Brown
  Cc: linux-fsdevel, Linux Memory Management List, Mark Fasheh,
	Sage Weil

On Wed, Feb 25, 2009 at 08:42:50AM -0800, Zach Brown wrote:
> Nick Piggin wrote:
> > I want to have the page be protected by page lock between page_mkwrite
> > notification to the filesystem, and the actual setting of the page
> > dirty. Do this by holding the page lock over page_mkwrite, and keep it
> > held until after set_page_dirty.
> 
> I fear that this will end up creating lock inversions with file systems
> who grab cross-node locks, which are ordered outside of the page lock,
> inside their ->page_mkwrite().  See ocfs2's call of ocfs2_inode_lock()
> from ocfs2_page_mkwrite().

Is ocfs2 immune to the races that get covered by this patch?

 
> In a sense, it's prepare_write() all over again.  Please don't call into
> file systems after having acquired the page lock.

It is very much an opt-out thing with page_mkwrite. The filesystem
if it thinks it is really smart can unlock the page if it likes. I'm
sure it will probably introduce some obscure race or another, but hey ;)


> Instead, can we get a helper that the file system would call after
> having acquired its locks and the page lock?  Something like that.

Well, the critical part is holding the same page lock over the
"important" part of the page_mkwrite operation and setting the
pte dirty and marking the page dirty. So there isn't really a
helper it can call.

Hmm, actually possibly we can enter page_mkwrite with the page unlocked,
but exit with the page locked? Slightly more complex, but should save
complexity elsewhere. Yes I think this might be the best way to go.

Thanks,
Nick

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-02-25 16:55   ` Nick Piggin
@ 2009-02-25 16:58     ` Zach Brown
  2009-02-25 17:02       ` Nick Piggin
  0 siblings, 1 reply; 19+ messages in thread
From: Zach Brown @ 2009-02-25 16:58 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-fsdevel, Linux Memory Management List, Mark Fasheh,
	Sage Weil


> Is ocfs2 immune to the races that get covered by this patch?

I haven't the slightest idea.

> Hmm, actually possibly we can enter page_mkwrite with the page unlocked,
> but exit with the page locked? Slightly more complex, but should save
> complexity elsewhere. Yes I think this might be the best way to go.

That sounds like it would work on first glance, yeah.  Mark will yell at
us if we've gotten it wrong ;).

- z

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-02-25 16:58     ` Zach Brown
@ 2009-02-25 17:02       ` Nick Piggin
  2009-02-25 22:35         ` Mark Fasheh
  0 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2009-02-25 17:02 UTC (permalink / raw)
  To: Zach Brown
  Cc: linux-fsdevel, Linux Memory Management List, Mark Fasheh,
	Sage Weil

(Chris, your point about lock ordering is good. Zach raised the
same one. Thanks for the good catch there guys).

On Wed, Feb 25, 2009 at 08:58:06AM -0800, Zach Brown wrote:
> 
> > Is ocfs2 immune to the races that get covered by this patch?
> 
> I haven't the slightest idea.

Well, they would be covered with the new scheme anyway:

 
> > Hmm, actually possibly we can enter page_mkwrite with the page unlocked,
> > but exit with the page locked? Slightly more complex, but should save
> > complexity elsewhere. Yes I think this might be the best way to go.
> 
> That sounds like it would work on first glance, yeah.  Mark will yell at
> us if we've gotten it wrong ;).

OK, thanks. I'll go with that approach and see what happens.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-02-25 17:02       ` Nick Piggin
@ 2009-02-25 22:35         ` Mark Fasheh
  0 siblings, 0 replies; 19+ messages in thread
From: Mark Fasheh @ 2009-02-25 22:35 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Zach Brown, linux-fsdevel, Linux Memory Management List,
	Sage Weil

On Wed, Feb 25, 2009 at 06:02:40PM +0100, Nick Piggin wrote:
> > > Hmm, actually possibly we can enter page_mkwrite with the page unlocked,
> > > but exit with the page locked? Slightly more complex, but should save
> > > complexity elsewhere. Yes I think this might be the best way to go.
> > 
> > That sounds like it would work on first glance, yeah.  Mark will yell at
> > us if we've gotten it wrong ;).
> 
> OK, thanks. I'll go with that approach and see what happens.

Yeah, that sounds reasonable... Thanks for pointing this out Zach!
	--Mark

--
Mark Fasheh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-02-25  9:36 [patch][rfc] mm: hold page lock over page_mkwrite Nick Piggin
  2009-02-25 16:42 ` Zach Brown
  2009-02-25 16:48 ` Chris Mason
@ 2009-02-26  9:20 ` Peter Zijlstra
  2009-02-26 11:09   ` Nick Piggin
  2009-03-01  8:17 ` Dave Chinner
  3 siblings, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2009-02-26  9:20 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-fsdevel, Linux Memory Management List

On Wed, 2009-02-25 at 10:36 +0100, Nick Piggin wrote:
> +               if (!page_mkwrite)
> +                       wait_on_page_locked(dirty_page);
>                 set_page_dirty_balance(dirty_page, page_mkwrite);
>                 put_page(dirty_page);
> +               if (page_mkwrite) {
> +                       unlock_page(old_page);
> +                       page_cache_release(old_page);
> +               }

We're calling into the whole balance_dirty_pages() writeout path with a
page locked.. is that sensible?


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-02-26  9:20 ` Peter Zijlstra
@ 2009-02-26 11:09   ` Nick Piggin
  0 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2009-02-26 11:09 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-fsdevel, Linux Memory Management List

On Thu, Feb 26, 2009 at 10:20:18AM +0100, Peter Zijlstra wrote:
> On Wed, 2009-02-25 at 10:36 +0100, Nick Piggin wrote:
> > +               if (!page_mkwrite)
> > +                       wait_on_page_locked(dirty_page);
> >                 set_page_dirty_balance(dirty_page, page_mkwrite);
> >                 put_page(dirty_page);
> > +               if (page_mkwrite) {
> > +                       unlock_page(old_page);
> > +                       page_cache_release(old_page);
> > +               }
> 
> We're calling into the whole balance_dirty_pages() writeout path with a
> page locked.. is that sensible?

Yeah, probably should move the balance out of there.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-02-25  9:36 [patch][rfc] mm: hold page lock over page_mkwrite Nick Piggin
                   ` (2 preceding siblings ...)
  2009-02-26  9:20 ` Peter Zijlstra
@ 2009-03-01  8:17 ` Dave Chinner
  2009-03-01 13:50   ` Nick Piggin
  3 siblings, 1 reply; 19+ messages in thread
From: Dave Chinner @ 2009-03-01  8:17 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-fsdevel, Linux Memory Management List

On Wed, Feb 25, 2009 at 10:36:29AM +0100, Nick Piggin wrote:
> I need this in fsblock because I am working to ensure filesystem metadata
> can be correctly allocated and refcounted. This means that page cleaning
> should not require memory allocation (to be really robust).

Which, unfortunately, is just a dream for any filesystem that uses
delayed allocation. i.e. they have to walk the free space trees
which may need to be read from disk and therefore require memory
to succeed....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-03-01  8:17 ` Dave Chinner
@ 2009-03-01 13:50   ` Nick Piggin
  2009-03-02  8:19     ` Dave Chinner
  0 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2009-03-01 13:50 UTC (permalink / raw)
  To: linux-fsdevel, Linux Memory Management List

On Sun, Mar 01, 2009 at 07:17:44PM +1100, Dave Chinner wrote:
> On Wed, Feb 25, 2009 at 10:36:29AM +0100, Nick Piggin wrote:
> > I need this in fsblock because I am working to ensure filesystem metadata
> > can be correctly allocated and refcounted. This means that page cleaning
> > should not require memory allocation (to be really robust).
> 
> Which, unfortunately, is just a dream for any filesystem that uses
> delayed allocation. i.e. they have to walk the free space trees
> which may need to be read from disk and therefore require memory
> to succeed....

Well it's a dream because probably none of them get it right, but
that doesn't mean it's impossible.

You don't need complete memory allocation up-front to be robust,
but having reserves or degraded modes that simply guarantee
forward progress is enough.

For example, if you need to read/write filesystem metadata to find
and allocate free space, then you really only need a page to do all
the IO.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-03-01 13:50   ` Nick Piggin
@ 2009-03-02  8:19     ` Dave Chinner
  2009-03-02  8:37       ` Nick Piggin
  0 siblings, 1 reply; 19+ messages in thread
From: Dave Chinner @ 2009-03-02  8:19 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-fsdevel, Linux Memory Management List

On Sun, Mar 01, 2009 at 02:50:57PM +0100, Nick Piggin wrote:
> On Sun, Mar 01, 2009 at 07:17:44PM +1100, Dave Chinner wrote:
> > On Wed, Feb 25, 2009 at 10:36:29AM +0100, Nick Piggin wrote:
> > > I need this in fsblock because I am working to ensure filesystem metadata
> > > can be correctly allocated and refcounted. This means that page cleaning
> > > should not require memory allocation (to be really robust).
> > 
> > Which, unfortunately, is just a dream for any filesystem that uses
> > delayed allocation. i.e. they have to walk the free space trees
> > which may need to be read from disk and therefore require memory
> > to succeed....
> 
> Well it's a dream because probably none of them get it right, but
> > that doesn't mean it's impossible.
> 
> You don't need complete memory allocation up-front to be robust,
> but having reserves or degraded modes that simply guarantee
> forward progress is enough.
> 
> For example, if you need to read/write filesystem metadata to find
> and allocate free space, then you really only need a page to do all
> the IO.

For journalling filesystems, dirty metadata is pinned for at least the
duration of the transaction and in many cases it is pinned for
multiple transactions (i.e. in memory aggregation of commits like
XFS does). And then once the transaction is complete, it can't be
reused until it is written to disk.

For the worst case usage in XFS, think about a complete btree split
of both free space trees, plus a complete btree split of the extent
tree.  That is two buffers per level per btree that are pinned by
the transaction.

The free space trees are bound in depth by the AG size so the limit
is (IIRC) 15 buffers per tree at 1TB AG size. However, the inode
extent tree can be deeper than that (bound by filesystem size). In
effect, writing back a single page could require memory allocation
of 30-40 pages just for metadata that is dirtied by the allocation
transaction.

And then the next page written back goes into a different
AG and splits the trees there. And then the next does the same.

Luckily, this sort of thing doesn't happen very often, but it does
serve to demonstrate how difficult it is to quantify how much memory
the writeback path really needs to guarantee forward progress.
Hence the dream......

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-03-02  8:19     ` Dave Chinner
@ 2009-03-02  8:37       ` Nick Piggin
  2009-03-02 15:26         ` jim owens
  0 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2009-03-02  8:37 UTC (permalink / raw)
  To: linux-fsdevel, Linux Memory Management List

On Mon, Mar 02, 2009 at 07:19:53PM +1100, Dave Chinner wrote:
> On Sun, Mar 01, 2009 at 02:50:57PM +0100, Nick Piggin wrote:
> > On Sun, Mar 01, 2009 at 07:17:44PM +1100, Dave Chinner wrote:
> > > On Wed, Feb 25, 2009 at 10:36:29AM +0100, Nick Piggin wrote:
> > > > I need this in fsblock because I am working to ensure filesystem metadata
> > > > can be correctly allocated and refcounted. This means that page cleaning
> > > > should not require memory allocation (to be really robust).
> > > 
> > > Which, unfortunately, is just a dream for any filesystem that uses
> > > delayed allocation. i.e. they have to walk the free space trees
> > > which may need to be read from disk and therefore require memory
> > > to succeed....
> > 
> > Well it's a dream because probably none of them get it right, but
> > > that doesn't mean it's impossible.
> > 
> > You don't need complete memory allocation up-front to be robust,
> > but having reserves or degraded modes that simply guarantee
> > forward progress is enough.
> > 
> > For example, if you need to read/write filesystem metadata to find
> > and allocate free space, then you really only need a page to do all
> > the IO.
> 
> For journalling filesystems, dirty metadata is pinned for at least the
> duration of the transaction and in many cases it is pinned for
> multiple transactions (i.e. in memory aggregation of commits like
> XFS does). And then once the transaction is complete, it can't be
> reused until it is written to disk.
> 
> For the worst case usage in XFS, think about a complete btree split
> of both free space trees, plus a complete btree split of the extent
> tree.  That is two buffers per level per btree that are pinned by
> the transaction.
> 
> The free space trees are bound in depth by the AG size so the limit
> is (IIRC) 15 buffers per tree at 1TB AG size. However, the inode
> extent tree can be deeper than that (bound by filesystem size). In
> effect, writing back a single page could require memory allocation
> of 30-40 pages just for metadata that is dirtied by the allocation
> transaction.
> 
> And then the next page written back goes into a different
> AG and splits the trees there. And then the next does the same.

So assuming there is no reasonable way to do out of core algorithms
on the filesystem metadata (and likely you don't want to anyway
because it would be a significant slowdown or diverge of code
paths), you still only need to reserve one set of those 30-40 pages
for the entire kernel.

You only ever need to reserve enough memory for a *single* page
to be processed. In the worst case that there are multiple pages
under writeout but can't allocate memory, only one will be allowed
access to reserves and the others will block until it is finished
and can unpin them all.


> Luckily, this sort of thing doesn't happen very often, but it does
> serve to demonstrate how difficult it is to quantify how much memory
> the writeback path really needs to guarantee forward progress.
> Hence the dream......

Well I'm not saying it is an immediate problem or it would be a
good use of anybody's time to rush out and try to redesign their
fs code to fix it ;) But at least for any new core/generic library
functionality like fsblock, it would be silly not to close the hole
there (not least because the problem is simpler here than in a
complex fs).


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-03-02  8:37       ` Nick Piggin
@ 2009-03-02 15:26         ` jim owens
  2009-03-03  4:33           ` Nick Piggin
  0 siblings, 1 reply; 19+ messages in thread
From: jim owens @ 2009-03-02 15:26 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-fsdevel, Linux Memory Management List

Nick Piggin wrote:
> 
> So assuming there is no reasonable way to do out of core algorithms
> on the filesystem metadata (and likely you don't want to anyway
> because it would be a significant slowdown or diverge of code
> paths), you still only need to reserve one set of those 30-40 pages
> for the entire kernel.
> 
> You only ever need to reserve enough memory for a *single* page
> to be processed. In the worst case that there are multiple pages
> under writeout but can't allocate memory, only one will be allowed
> access to reserves and the others will block until it is finished
> and can unpin them all.

Sure, nobody will mind seeing lots of extra pinned memory ;)

Don't forget to add the space for data transforms and raid
driver operations in the write stack, and whatever else we
may not have thought of.  With good engineering we can make
it so "we can always make forward progress".  But it won't
matter because once a real user drives the system off this
cliff there is no difference between "hung" and "really slow
progress".  They are going to crash it and report a hang.

> Well I'm not saying it is an immediate problem or it would be a
> good use of anybody's time to rush out and try to redesign their
> fs code to fix it ;) But at least for any new core/generic library
> functionality like fsblock, it would be silly not to close the hole
> there (not least because the problem is simpler here than in a
> complex fs).

Hey, I appreciate anything you do in VM to make the ugly
dance with filesystems (my area) a little less ugly.

I'm sure you also appreciate that every time VM tries to
save 32 bytes, someone else tries to take 32 K-bytes.
As they say... memory is cheap :)

jim


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-03-02 15:26         ` jim owens
@ 2009-03-03  4:33           ` Nick Piggin
  2009-03-03 17:25             ` Jamie Lokier
  0 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2009-03-03  4:33 UTC (permalink / raw)
  To: jim owens; +Cc: linux-fsdevel, Linux Memory Management List

On Mon, Mar 02, 2009 at 10:26:21AM -0500, jim owens wrote:
> Nick Piggin wrote:
> >
> >So assuming there is no reasonable way to do out of core algorithms
> >on the filesystem metadata (and likely you don't want to anyway
> >because it would be a significant slowdown or diverge of code
> >paths), you still only need to reserve one set of those 30-40 pages
> >for the entire kernel.
> >
> >You only ever need to reserve enough memory for a *single* page
> >to be processed. In the worst case that there are multiple pages
> >under writeout but can't allocate memory, only one will be allowed
> >access to reserves and the others will block until it is finished
> >and can unpin them all.
> 
> Sure, nobody will mind seeing lots of extra pinned memory ;)

40 pages (160k) isn't a huge amount. You could always have a
boot option to disable the memory reserve if it is a big
deal.

 
> Don't forget to add the space for data transforms and raid
> driver operations in the write stack, and whatever else we
> may not have thought of.  With good engineering we can make

The block layer below the filesystem should be robust. Well
actually the core block layer is (except maybe for the new
bio integrity stuff that looks pretty nasty). Not sure about
md/dm, but they really should be safe (they use mempools etc).


> it so "we can always make forward progress".  But it won't
> matter because once a real user drives the system off this
> cliff there is no difference between "hung" and "really slow
> progress".  They are going to crash it and report a hang.

I don't think that is the case. These are situations that
would be *really* rare and transient. It is not like thrashing
in that your working set size exceeds physical RAM, but just
a combination of conditions that causes an unusual spike in the
required memory to clean some dirty pages (eg. Dave's example
of several IOs requiring btree splits over several AGs). Could
cause a resource deadlock.


> >Well I'm not saying it is an immediate problem or it would be a
> >good use of anybody's time to rush out and try to redesign their
> >fs code to fix it ;) But at least for any new core/generic library
> >functionality like fsblock, it would be silly not to close the hole
> >there (not least because the problem is simpler here than in a
> >complex fs).
> 
> Hey, I appreciate anything you do in VM to make the ugly
> dance with filesystems (my area) a little less ugly.

Well thanks.


> I'm sure you also appreciate that every time VM tries to
> save 32 bytes, someone else tries to take 32 K-bytes.
> As they say... memory is cheap :)

Well that's OK. If core vm/fs code saves a little bit of memory
that enables something else like a filesystem to use it to cache
a tiny bit more useful data, then I think it is a good result :)


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-03-03  4:33           ` Nick Piggin
@ 2009-03-03 17:25             ` Jamie Lokier
  2009-03-04  4:37               ` Dave Chinner
  2009-03-04  9:23               ` Nick Piggin
  0 siblings, 2 replies; 19+ messages in thread
From: Jamie Lokier @ 2009-03-03 17:25 UTC (permalink / raw)
  To: Nick Piggin; +Cc: jim owens, linux-fsdevel, Linux Memory Management List

Nick Piggin wrote:
> > >You only ever need to reserve enough memory for a *single* page
> > >to be processed. In the worst case that there are multiple pages
> > >under writeout but can't allocate memory, only one will be allowed
> > >access to reserves and the others will block until it is finished
> > >and can unpin them all.
> > 
> > Sure, nobody will mind seeing lots of extra pinned memory ;)
> 
> 40 pages (160k) isn't a huge amount. You could always have a
> boot option to disable the memory reserve if it is a big
> deal.
>  
> > Don't forget to add the space for data transforms and raid
> > driver operations in the write stack, and whatever else we
> > may not have thought of.  With good engineering we can make
> 
> The block layer below the filesystem should be robust. Well
> actually the core block layer is (except maybe for the new
> bio integrity stuff that looks pretty nasty). Not sure about
> md/dm, but they really should be safe (they use mempools etc).

Are mempools fully safe, or just statistically safer?

> > it so "we can always make forward progress".  But it won't
> > matter because once a real user drives the system off this
> > cliff there is no difference between "hung" and "really slow
> > progress".  They are going to crash it and report a hang.
> 
> I don't think that is the case. These are situations that
> would be *really* rare and transient. It is not like thrashing
> in that your working set size exceeds physical RAM, but just
> a combination of conditions that causes an unusual spike in the
> required memory to clean some dirty pages (eg. Dave's example
> of several IOs requiring btree splits over several AGs). Could
> cause a resource deadlock.

Suppose the system has two pages to be written.  The first must
_reserve_ 40 pages of scratch space just in case the operation will
need them.  If the second page write is initiated concurrently with
the first, the second must reserve another 40 pages concurrently.

If 10 page writes are concurrent, that's 400 pages of scratch space
needed in reserve...

Your idea that the system can serialise page writes will limit the
scratch space used.  But how does it know when to serialise page
writes, if not by reserving 40 pages of memory per written page,
until it knows better?

In other words, do you need 40 pages of metadata space times the
number of concurrent page writes - until the write logic for each page
reaches the point of "unreserving" because they've worked out which
metadata they don't need to touch?

It sounds like a classic concurrency problem - how many things to
start in parallel, each having a large upper bound on the memory they
may need when they are started, even though the average is much lower.

A solution to that is each concurrent thing being able to reserve
memory "I may need X memory later" and block at reservation time, so
it doesn't have to block when it's later allocating and using memory.
Then to keep up global concurrency, each thing is able to unreserve
memory as it progresses.  Or reserve some more.  The point is
reservation can block, while later processing which may need the
memory doesn't block.

-- Jamie


> 
> 
> > >Well I'm not saying it is an immediate problem or it would be a
> > >good use of anybody's time to rush out and try to redesign their
> > >fs code to fix it ;) But at least for any new core/generic library
> > >functionality like fsblock, it would be silly not to close the hole
> > >there (not least because the problem is simpler here than in a
> > >complex fs).
> > 
> > Hey, I appreciate anything you do in VM to make the ugly
> > dance with filesystems (my area) a little less ugly.
> 
> Well thanks.
> 
> 
> > I'm sure you also appreciate that every time VM tries to
> > save 32 bytes, someone else tries to take 32 K-bytes.
> > As they say... memory is cheap :)
> 
> Well that's OK. If core vm/fs code saves a little bit of memory
> that enables something else like a filesystem to use it to cache
> a tiny bit more useful data, then I think it is a good result :)
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-03-03 17:25             ` Jamie Lokier
@ 2009-03-04  4:37               ` Dave Chinner
  2009-03-04  9:23               ` Nick Piggin
  1 sibling, 0 replies; 19+ messages in thread
From: Dave Chinner @ 2009-03-04  4:37 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Nick Piggin, jim owens, linux-fsdevel,
	Linux Memory Management List

On Tue, Mar 03, 2009 at 05:25:36PM +0000, Jamie Lokier wrote:
> > > it so "we can always make forward progress".  But it won't
> > > matter because once a real user drives the system off this
> > > cliff there is no difference between "hung" and "really slow
> > > progress".  They are going to crash it and report a hang.
> > 
> > I don't think that is the case. These are situations that
> > would be *really* rare and transient. It is not like thrashing
> > in that your working set size exceeds physical RAM, but just
> > a combination of conditions that causes an unusual spike in the
> > required memory to clean some dirty pages (eg. Dave's example
> > of several IOs requiring btree splits over several AGs). Could
> > cause a resource deadlock.
> 
> Suppose the systems has two pages to be written.  The first must
> _reserve_ 40 pages of scratch space just in case the operation will
> need them.  If the second page write is initiated concurrently with
> the first, the second must reserve another 40 pages concurrently.
> 
> If 10 page writes are concurrent, that's 400 pages of scratch space
> needed in reserve...

Therein lies the problem. XFS can do this in parallel in every AG at
the same time. i.e. the reserve is per AG. The maximum number of AGs
in XFS is 2^32, and I know of filesystems out there that have
thousands of AGs in them. Hence reserving 40 pages per AG is
definitely unreasonable. ;)

Even if we look at concurrent allocations as the upper bound, I've
seen an 8p machine with several hundred concurrent allocation
transactions in progress. Even that is unreasonable if you consider
machines with 64k pages - it's hundreds of megabytes of RAM that are
mostly going to be unused.

Specifying a pool of pages is not a guaranteed solution, either,
as someone will always exhaust it as we can't guarantee any given
transaction will complete before the pool is exhausted. i.e.
the mempool design as it stands can't be used.

AFAIC, "should never allocate during writeback" is a great goal, but
it is one that we will never be able to reach without throwing
everything away and starting again. Minimising allocation is
something we can do but we can't avoid it entirely. The higher
layers need to understand this, not assert that the lower layers
must conform to an impossible constraint and break if they don't.....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-03-03 17:25             ` Jamie Lokier
  2009-03-04  4:37               ` Dave Chinner
@ 2009-03-04  9:23               ` Nick Piggin
  2009-03-04 18:13                 ` Jamie Lokier
  1 sibling, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2009-03-04  9:23 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: jim owens, linux-fsdevel, Linux Memory Management List

On Tue, Mar 03, 2009 at 05:25:36PM +0000, Jamie Lokier wrote:
> Nick Piggin wrote:
> > The block layer below the filesystem should be robust. Well
> > actually the core block layer is (except maybe for the new
> > bio integrity stuff that looks pretty nasty). Not sure about
> > md/dm, but they really should be safe (they use mempools etc).
> 
> Are mempools fully safe, or just statistically safer?

They will guarantee forward progress if used correctly, so
yes fully safe.

 
> > > it so "we can always make forward progress".  But it won't
> > > matter because once a real user drives the system off this
> > > cliff there is no difference between "hung" and "really slow
> > > progress".  They are going to crash it and report a hang.
> > 
> > I don't think that is the case. These are situations that
> > would be *really* rare and transient. It is not like thrashing
> > in that your working set size exceeds physical RAM, but just
> > a combination of conditions that causes an unusual spike in the
> > required memory to clean some dirty pages (eg. Dave's example
> > of several IOs requiring btree splits over several AGs). Could
> > cause a resource deadlock.
> 
> Suppose the systems has two pages to be written.  The first must
> _reserve_ 40 pages of scratch space just in case the operation will
> need them.  If the second page write is initiated concurrently with
> the first, the second must reserve another 40 pages concurrently.
> 
> If 10 page writes are concurrent, that's 400 pages of scratch space
> needed in reserve...

You only need to guarantee forward progress, so you would reserve
40 pages up front for the entire machine (some mempools have more
memory than strictly needed to improve performance, so you could
toy with that, but let's just describe the baseline).

So allocations happen as normal, except when an allocation fails,
then the task which fails the allocation is given access to this
reserve memory, any other task requiring reserve will then block.

Now the reserve provides enough pages to guarantee forward progress,
so that one task is going to be able to proceed and eventually its
pages will become freeable and can be returned to the reserve. Once
the writeout has finished, the reserve will become available to
other tasks.

So this way you only have to reserve enough to write out 1 page,
and you only start blocking things when their memory allocations
would have failed *anyway*. And you guarantee forward progress.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch][rfc] mm: hold page lock over page_mkwrite
  2009-03-04  9:23               ` Nick Piggin
@ 2009-03-04 18:13                 ` Jamie Lokier
  0 siblings, 0 replies; 19+ messages in thread
From: Jamie Lokier @ 2009-03-04 18:13 UTC (permalink / raw)
  To: Nick Piggin; +Cc: jim owens, linux-fsdevel, Linux Memory Management List

Nick Piggin wrote:
> > Suppose the systems has two pages to be written.  The first must
> > _reserve_ 40 pages of scratch space just in case the operation will
> > need them.  If the second page write is initiated concurrently with
> > the first, the second must reserve another 40 pages concurrently.
> > 
> > If 10 page writes are concurrent, that's 400 pages of scratch space
> > needed in reserve...
> 
> You only need to guarantee forward progress, so you would reserve
> 40 pages up front for the entire machine (some mempools have more
> memory than strictly needed to improve performance, so you could
> toy with that, but let's just describe the baseline).
> 
> So allocations happen as normal, except when an allocation fails,
> then the task which fails the allocation is given access to this
> reserve memory, any other task requiring reserve will then block.
> 
> Now the reserve provides enough pages to guarantee forward progress,
> so that one task is going to be able to proceed and eventually its
> pages will become freeable and can be returned to the reserve. Once
> the writeout has finished, the reserve will become available to
> other tasks.
> 
> So this way you only have to reserve enough to write out 1 page,
> and you only start blocking things when their memory allocations
> wolud have failed *anyway*. And you guarantee forward progress.

Ah.  *light bulb*  In the writing tasks, memory allocation is
forbidden, but blocking is allowed.  That's what I was missing :-)

Forward progress marches on.

-- Jamie

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2009-03-04 18:13 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-02-25  9:36 [patch][rfc] mm: hold page lock over page_mkwrite Nick Piggin
2009-02-25 16:42 ` Zach Brown
2009-02-25 16:55   ` Nick Piggin
2009-02-25 16:58     ` Zach Brown
2009-02-25 17:02       ` Nick Piggin
2009-02-25 22:35         ` Mark Fasheh
2009-02-25 16:48 ` Chris Mason
2009-02-26  9:20 ` Peter Zijlstra
2009-02-26 11:09   ` Nick Piggin
2009-03-01  8:17 ` Dave Chinner
2009-03-01 13:50   ` Nick Piggin
2009-03-02  8:19     ` Dave Chinner
2009-03-02  8:37       ` Nick Piggin
2009-03-02 15:26         ` jim owens
2009-03-03  4:33           ` Nick Piggin
2009-03-03 17:25             ` Jamie Lokier
2009-03-04  4:37               ` Dave Chinner
2009-03-04  9:23               ` Nick Piggin
2009-03-04 18:13                 ` Jamie Lokier

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).