[rfc][patch] unlock_page speedup

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [rfc][patch] unlock_page speedup
@ 2008-12-19  7:29 Nick Piggin
  2008-12-19  7:35 ` Andrew Morton
  0 siblings, 1 reply; 9+ messages in thread
From: Nick Piggin @ 2008-12-19  7:29 UTC (permalink / raw)
  To: Linux Memory Management List, linux-fsdevel, Andrew Morton,
	Linus Torvalds

Here is another one (background: lmbench lat_mmap primarily MAP_SHARED fault
benchmark has regressed quite a bit since anybody last cared to look, I'm
trying to find some improvements).

OK, so one thing we do is lock the page in the page fault path to cover some
nasty races... This patch gets back another 2% (453.12us to 442.9us), by
speeding up unlock_page by using an extra bit to signal contention. This
actually helps powerpc far more than x86, but I'm glad to see x86 still
has a nice win from avoiding the function call and hash cacheline lookup.

lat_mmap profile goes from looking like this (after the mnt_want_write patches)
CPU: AMD64 family10, speed 2000 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 10000
samples  %        symbol name
254150   14.5889  __do_fault
163003    9.3568  unmap_vmas
110232    6.3276  mark_page_accessed
77864     4.4696  __up_read
75864     4.3548  page_waitqueue     <<<<
69984     4.0173  handle_mm_fault
66945     3.8428  do_page_fault
66457     3.8148  retint_swapgs
65413     3.7549  shmem_getpage
62904     3.6109  file_update_time
61430     3.5262  set_page_dirty
53425     3.0667  unlock_page        <<<<

To this:
3119      0.1430  unlock_page
0         0.0000  page_waitqueue

--
Introduce a new page flag, PG_waiters, to signal there are processes waiting on
PG_lock; and use it to avoid memory barriers and waitqueue hash lookup in the
unlock_page fastpath.


---
 include/linux/page-flags.h |    4 +-
 include/linux/pagemap.h    |    7 ++-
 kernel/wait.c              |    3 -
 mm/filemap.c               |   89 ++++++++++++++++++++++++++++++++++++---------
 4 files changed, 81 insertions(+), 22 deletions(-)

Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h
+++ linux-2.6/include/linux/page-flags.h
@@ -71,6 +71,7 @@
  */
 enum pageflags {
 	PG_locked,		/* Page is locked. Don't touch. */
+	PG_waiters,		/* Page has PG_locked waiters. */
 	PG_error,
 	PG_referenced,
 	PG_uptodate,
@@ -181,6 +182,7 @@ static inline int TestClearPage##uname(s
 struct page;	/* forward declaration */
 
 TESTPAGEFLAG(Locked, locked)
+PAGEFLAG(Waiters, waiters)
 PAGEFLAG(Error, error)
 PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
 PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
@@ -373,7 +375,7 @@ static inline void __ClearPageTail(struc
 #endif
 
 #define PAGE_FLAGS	(1 << PG_lru   | 1 << PG_private   | 1 << PG_locked | \
-			 1 << PG_buddy | 1 << PG_writeback | \
+			 1 << PG_waiters | 1 << PG_buddy | 1 << PG_writeback | \
 			 1 << PG_slab  | 1 << PG_swapcache | 1 << PG_active | \
 			 __PG_UNEVICTABLE | __PG_MLOCKED)
 
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -189,7 +189,7 @@ static inline int page_cache_add_specula
 	if (unlikely(!atomic_add_unless(&page->_count, count, 0)))
 		return 0;
 #endif
-	VM_BUG_ON(PageCompound(page) && page != compound_head(page));
+	VM_BUG_ON(PageTail(page));
 
 	return 1;
 }
@@ -353,6 +353,7 @@ static inline void lock_page_nosync(stru
  * Never use this directly!
  */
 extern void wait_on_page_bit(struct page *page, int bit_nr);
+extern void __wait_on_page_locked(struct page *page);
 
 /* 
  * Wait for a page to be unlocked.
@@ -363,8 +364,9 @@ extern void wait_on_page_bit(struct page
  */
 static inline void wait_on_page_locked(struct page *page)
 {
+	might_sleep();
 	if (PageLocked(page))
-		wait_on_page_bit(page, PG_locked);
+		__wait_on_page_locked(page);
 }
 
 /* 
@@ -372,6 +374,7 @@ static inline void wait_on_page_locked(s
  */
 static inline void wait_on_page_writeback(struct page *page)
 {
+	might_sleep();
 	if (PageWriteback(page))
 		wait_on_page_bit(page, PG_writeback);
 }
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -180,6 +180,7 @@ static int sync_page(void *word)
 	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 		mapping->a_ops->sync_page(page);
 	io_schedule();
+
 	return 0;
 }
 
@@ -526,12 +527,6 @@ struct page *__page_cache_alloc(gfp_t gf
 EXPORT_SYMBOL(__page_cache_alloc);
 #endif
 
-static int __sleep_on_page_lock(void *word)
-{
-	io_schedule();
-	return 0;
-}
-
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -564,6 +559,22 @@ void wait_on_page_bit(struct page *page,
 }
 EXPORT_SYMBOL(wait_on_page_bit);
 
+/*
+ * If PageWaiters was found to be set at unlock time, __wake_page_waiters
+ * should be called to actually perform the wakeup of waiters.
+ */
+static void __wake_page_waiters(struct page *page)
+{
+	ClearPageWaiters(page);
+	/*
+	 * The smp_mb() is necessary to enforce ordering between the clear_bit
+	 * and the read of the waitqueue (to avoid SMP races with a parallel
+	 * __wait_on_page_locked()).
+	 */
+	smp_mb__after_clear_bit();
+	wake_up_page(page, PG_locked);
+}
+
 /**
  * unlock_page - unlock a locked page
  * @page: the page
@@ -580,8 +591,8 @@ void unlock_page(struct page *page)
 {
 	VM_BUG_ON(!PageLocked(page));
 	clear_bit_unlock(PG_locked, &page->flags);
-	smp_mb__after_clear_bit();
-	wake_up_page(page, PG_locked);
+	if (unlikely(PageWaiters(page)))
+		__wake_page_waiters(page);
 }
 EXPORT_SYMBOL(unlock_page);
 
@@ -611,22 +622,59 @@ EXPORT_SYMBOL(end_page_writeback);
  * chances are that on the second loop, the block layer's plug list is empty,
  * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
  */
-void __lock_page(struct page *page)
+void  __lock_page(struct page *page)
 {
+	wait_queue_head_t *wq = page_waitqueue(page);
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
-	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
-							TASK_UNINTERRUPTIBLE);
+	do {
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		SetPageWaiters(page);
+		if (likely(PageLocked(page)))
+			sync_page(page);
+	} while (!trylock_page(page));
+	finish_wait(wq, &wait.wait);
 }
 EXPORT_SYMBOL(__lock_page);
 
-int __lock_page_killable(struct page *page)
+int  __lock_page_killable(struct page *page)
+{
+	wait_queue_head_t *wq = page_waitqueue(page);
+	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+	int err = 0;
+
+	do {
+		prepare_to_wait(wq, &wait.wait, TASK_KILLABLE);
+		SetPageWaiters(page);
+		if (likely(PageLocked(page))) {
+			err = sync_page_killable(page);
+			if (err)
+				break;
+		}
+	} while (!trylock_page(page));
+	finish_wait(wq, &wait.wait);
+
+	return err;
+}
+
+void  __wait_on_page_locked(struct page *page)
 {
+	wait_queue_head_t *wq = page_waitqueue(page);
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
-	return __wait_on_bit_lock(page_waitqueue(page), &wait,
-					sync_page_killable, TASK_KILLABLE);
+	do {
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		SetPageWaiters(page);
+		if (likely(PageLocked(page)))
+			sync_page(page);
+	} while (PageLocked(page));
+	finish_wait(wq, &wait.wait);
+
+	/* Clean up a potentially dangling PG_waiters */
+	if (unlikely(PageWaiters(page)))
+		__wake_page_waiters(page);
 }
+EXPORT_SYMBOL(__wait_on_page_locked);
 
 /**
  * __lock_page_nosync - get a lock on the page, without calling sync_page()
@@ -635,11 +683,18 @@ int __lock_page_killable(struct page *pa
  * Variant of lock_page that does not require the caller to hold a reference
  * on the page's mapping.
  */
-void __lock_page_nosync(struct page *page)
+void  __lock_page_nosync(struct page *page)
 {
+	wait_queue_head_t *wq = page_waitqueue(page);
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
-							TASK_UNINTERRUPTIBLE);
+
+	do {
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		SetPageWaiters(page);
+		if (likely(PageLocked(page)))
+			io_schedule();
+	} while (!trylock_page(page));
+	finish_wait(wq, &wait.wait);
 }
 
 /**
Index: linux-2.6/kernel/wait.c
===================================================================
--- linux-2.6.orig/kernel/wait.c
+++ linux-2.6/kernel/wait.c
@@ -134,8 +134,7 @@ int wake_bit_function(wait_queue_t *wait
 		= container_of(wait, struct wait_bit_queue, wait);
 
 	if (wait_bit->key.flags != key->flags ||
-			wait_bit->key.bit_nr != key->bit_nr ||
-			test_bit(key->bit_nr, key->flags))
+			wait_bit->key.bit_nr != key->bit_nr)
 		return 0;
 	else
 		return autoremove_wake_function(wait, mode, sync, key);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [rfc][patch] unlock_page speedup
  2008-12-19  7:29 [rfc][patch] unlock_page speedup Nick Piggin
@ 2008-12-19  7:35 ` Andrew Morton
  2008-12-19  7:53   ` Nick Piggin
  2008-12-19 17:35   ` Linus Torvalds
  0 siblings, 2 replies; 9+ messages in thread
From: Andrew Morton @ 2008-12-19  7:35 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Memory Management List, linux-fsdevel, Linus Torvalds

On Fri, 19 Dec 2008 08:29:09 +0100 Nick Piggin <npiggin@suse.de> wrote:

> Introduce a new page flag, PG_waiters

Leaving how many?  fs-cache wants to take two more.

How's about we actually work this out, then make PG_waiters the
highest-numbered free one?

	PG_free1,
	PG_free2,
	...
	PG_waiters
};

(or even something really sensitive, like PG_lru)

So that

a) we can see how many are left in a robust fashion and

b) we find out whether PG_waiters (PG_lru?) gets scribbled on by architectures
   which borrow upper bits from page.flags for other nefarious purposes.

?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [rfc][patch] unlock_page speedup
  2008-12-19  7:35 ` Andrew Morton
@ 2008-12-19  7:53   ` Nick Piggin
  2008-12-19  7:59     ` Andrew Morton
  2008-12-19 17:35   ` Linus Torvalds
  1 sibling, 1 reply; 9+ messages in thread
From: Nick Piggin @ 2008-12-19  7:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Memory Management List, linux-fsdevel, Linus Torvalds

On Thu, Dec 18, 2008 at 11:35:49PM -0800, Andrew Morton wrote:
> On Fri, 19 Dec 2008 08:29:09 +0100 Nick Piggin <npiggin@suse.de> wrote:
> 
> > Introduce a new page flag, PG_waiters
> 
> Leaving how many?

Don't know... I thought the page-flags.h obfuscation project was
supposed to make that clearer to work out. There are what, 21 flags
used now. If everything is coded properly, then the memory model
should automatically kick its metadata out of page flags if it gets
too big. But most likely it will just blow up. Probably we want
at least a few flags for memory model on 32-bit for smaller systems
(big NUMA 32-bit systems probably don't matter much anymore).

>  fs-cache wants to take two more.

fs-cache is getting merged? Wow, I've wanted to review that. When?
Aside from artificial benchmarks (which are obviously going to be
good), does anybody actually deploy it? What do their before/afters
look like I wonder?

> How's about we actually work this out, then make PG_waiters the
> highest-numbered free one?
> 
> 	PG_free1,
> 	PG_free2,
> 	...
> 	PG_waiters
> };
> 
> (or even something really sensitive, like PG_lru)
> 
> So that
> 
> a) we can see how many are left in a robust fashion and
> 
> b) we find out whether PG_waiters (PG_lru?) gets scribbled on by architectures
>    which borrow upper bits from page.flags for other nefarious purposes.

I think if we can just get the memory-model people involved, they
can assure us their code automatically scales to any value of __NR_PAGEFLAGS,
and suggest a reasonable number of flags we should leave for 32-bit systems.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [rfc][patch] unlock_page speedup
  2008-12-19  7:53   ` Nick Piggin
@ 2008-12-19  7:59     ` Andrew Morton
  2008-12-19  8:53       ` Nick Piggin
  0 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2008-12-19  7:59 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Memory Management List, linux-fsdevel, Linus Torvalds

On Fri, 19 Dec 2008 08:53:28 +0100 Nick Piggin <npiggin@suse.de> wrote:

> On Thu, Dec 18, 2008 at 11:35:49PM -0800, Andrew Morton wrote:
> > On Fri, 19 Dec 2008 08:29:09 +0100 Nick Piggin <npiggin@suse.de> wrote:
> > 
> > > Introduce a new page flag, PG_waiters
> > 
> > Leaving how many?
> 
> Don't know...

Need to know!  page.flags is prime real estate and we should decide
whether gaining 2% in a particular microbenchmark is our best use of it

> I thought the page-flags.h obfuscation project was
> supposed to make that clearer to work out. There are what, 21 flags
> used now. If everything is coded properly, then the memory model
> should automatically kick its metadata out of page flags if it gets
> too big.

That would be nice :)

> But most likely it will just blow up.

If we use them all _now_, as I proposed, we'll find out about that.

> Probably we want
> at least a few flags for memory model on 32-bit for smaller systems
> (big NUMA 32-bit systems probably don't matter much anymore).
> 
> 
> >  fs-cache wants to take two more.
> 
> fs-cache is getting merged?

See thread titled "Pull request for FS-Cache, including NFS patches"

> Wow, I've wanted to review that.

That would be good.



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [rfc][patch] unlock_page speedup
  2008-12-19  7:59     ` Andrew Morton
@ 2008-12-19  8:53       ` Nick Piggin
  0 siblings, 0 replies; 9+ messages in thread
From: Nick Piggin @ 2008-12-19  8:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Memory Management List, linux-fsdevel, Linus Torvalds

On Thu, Dec 18, 2008 at 11:59:57PM -0800, Andrew Morton wrote:
> On Fri, 19 Dec 2008 08:53:28 +0100 Nick Piggin <npiggin@suse.de> wrote:
> 
> > On Thu, Dec 18, 2008 at 11:35:49PM -0800, Andrew Morton wrote:
> > > On Fri, 19 Dec 2008 08:29:09 +0100 Nick Piggin <npiggin@suse.de> wrote:
> > > 
> > > > Introduce a new page flag, PG_waiters
> > > 
> > > Leaving how many?
> > 
> > Don't know...
> 
> Need to know!  page.flags is prime real estate and we should decide
> whether gaining 2% in a particular microbenchmark is our best use of it

OK, good question. Honest answer is I don't really know. all
the different archs and memory models.. it's not something
I can exactly work out at a glance.

Keep in mind that it is page lock. We unlock it for every
file backed fault, COW fault, every truncate or invalidate,
every pagecache IO, write(2) etc etc. So the gain is not huge,
but it is definitely a common workload.

On powerpc the gain is much larger, because smp_mb__after_clear_bit
is really heavyweight in comparison to the lock release ordering.
Same would apply to ia64.

> > I thought the page-flags.h obfuscation project was
> > supposed to make that clearer to work out. There are what, 21 flags
> > used now. If everything is coded properly, then the memory model
> > should automatically kick its metadata out of page flags if it gets
> > too big.
> 
> That would be nice :)
> 
> > But most likely it will just blow up.
> 
> If we use them all _now_, as I proposed, we'll find out about that.

Well but it would push out the memory model metadata out of line
sooner (on smaller configs) which is undesirable.

But I think any extra page flags are always a bad thing, whether
or not we've reached the limit. Conceptually it is still using
a resource, even if not in reality.

But one nice thing about this flag is that it's simple to drop if we
ever need to reclaim it. Unlike "feature" features, that can never
be dropped and must always be maintained...

> > Probably we want
> > at least a few flags for memory model on 32-bit for smaller systems
> > (big NUMA 32-bit systems probably don't matter much anymore).
> > 
> > 
> > >  fs-cache wants to take two more.
> > 
> > fs-cache is getting merged?
> 
> See thread titled "Pull request for FS-Cache, including NFS patches"

Oh, ok thanks.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [rfc][patch] unlock_page speedup
  2008-12-19  7:35 ` Andrew Morton
  2008-12-19  7:53   ` Nick Piggin
@ 2008-12-19 17:35   ` Linus Torvalds
  2008-12-19 17:55     ` Linus Torvalds
  2008-12-22  3:51     ` Nick Piggin
  1 sibling, 2 replies; 9+ messages in thread
From: Linus Torvalds @ 2008-12-19 17:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Nick Piggin, Linux Memory Management List, linux-fsdevel

On Thu, 18 Dec 2008, Andrew Morton wrote:
>
> On Fri, 19 Dec 2008 08:29:09 +0100 Nick Piggin <npiggin@suse.de> wrote:
> 
> > Introduce a new page flag, PG_waiters
> 
> Leaving how many?  fs-cache wants to take two more.

Hmm. Do we ever use lock_page() on anything but page-cache pages and the 
buffer cache?

We _could_ decide to try to move the whole locking into the "mapping" 
field, and use a few more bits in the low bits of the pointer. Right now 
we just use one bit (PAGE_MAPPING_ANON), but if we just make the rule be 
that "struct address_space" has to be 8-byte aligned, then we'd have two 
more bits available there, and we could hide the lock bit and the 
contention bit there too.

This actually would have a _really_ nice effect, in that if we do this, 
then I suspect that we could eventually even make the bits in "flags" be 
non-atomic. The lock bit really is special. The other bits tend to be 
either pretty static over allocation, or things that should be set only 
when the page is locked.

I dunno. But it sounds like a reasonable thing to do, and it would free 
one bit from the page flags, rather than use yet another one. And because 
locking is special and because we already have to access that "mapping" 
pointer specially, I don't think the impact would be very invasive.

				Linus

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [rfc][patch] unlock_page speedup
  2008-12-19 17:35   ` Linus Torvalds
@ 2008-12-19 17:55     ` Linus Torvalds
  2008-12-23  0:46       ` Hugh Dickins
  2008-12-22  3:51     ` Nick Piggin
  1 sibling, 1 reply; 9+ messages in thread
From: Linus Torvalds @ 2008-12-19 17:55 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Nick Piggin, Linux Memory Management List, linux-fsdevel

On Fri, 19 Dec 2008, Linus Torvalds wrote:
> 
> Hmm. Do we ever use lock_page() on anything but page-cache pages and the 
> buffer cache?

Looking closer, I don't think we do. 

The issue with using the low bits in page->mapping is that sometimes that 
field doesn't exist, and we use other members of that union:

 - spinlock_t ptr	 	- page table pages
 - struct kmem_cache *slab	- slab allocations
 - struct page *first_page	- compound tail pages

but I cannot see how lock_page() could ever be valid for any of them. We 
use lock_page() for things that we do IO on, or that are mapped into user 
space. And while we can map compound pages into user space, we'd better 
not be locking random parts of it - we have to lock the whole thing (ie 
the first one, not the tails).

And we even have a way to verify it - we can make lock_page() verify the 
page flags for at least things like "not a slab page or a compound tail". 
I guess we don't mark page table pages any special way, so I don't see how 
we can add an assert for that use, but verifying that we never do 
lock_page() on a page table page should be trivial.

So it should work.

That said, I did notice a problem. Namely that while the VM code is good 
about looking at ->mapping (because it doesn't know whether the page is 
anonymous or a true mapping), much of the filesystem code is _not_ careful 
about page->mapping, since the filesystem code knows a-priori that the 
mapping pointer must be an inode mapping (or we'd not have called it).

So filesystems do tend to do things like

	struct inode *inode = page->mapping->host;

and while the low bit of mapping is magic, those code-paths don't care 
because they depend on it being zero.

So hiding the lock bit there would involve a lot more work than I naïvely 
expected before looking closer. We'd have to change the name (to 
"_mapping", presumably), and make all users use an accessor function to 
make code like the above do

	struct inode *inode = page_mapping(page)->host;

or something (we migth want to have a "page_host_inode()" helper to do it, 
it seems to be the most common reason for accessing "->mapping" that 
there is.

So it could be done pretty mechanically, but it's still a _big_ change. 
Maybe not worth it, unless we can really translate it into some other 
advantage (ie real simplification of page flag access)

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [rfc][patch] unlock_page speedup
  2008-12-19 17:35   ` Linus Torvalds
  2008-12-19 17:55     ` Linus Torvalds
@ 2008-12-22  3:51     ` Nick Piggin
  1 sibling, 0 replies; 9+ messages in thread
From: Nick Piggin @ 2008-12-22  3:51 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Linux Memory Management List, linux-fsdevel

On Fri, Dec 19, 2008 at 09:35:14AM -0800, Linus Torvalds wrote:
> 
> 
> On Thu, 18 Dec 2008, Andrew Morton wrote:
> >
> > On Fri, 19 Dec 2008 08:29:09 +0100 Nick Piggin <npiggin@suse.de> wrote:
> > 
> > > Introduce a new page flag, PG_waiters
> > 
> > Leaving how many?  fs-cache wants to take two more.
> 
> Hmm. Do we ever use lock_page() on anything but page-cache pages and the 
> buffer cache?
> 
> We _could_ decide to try to move the whole locking into the "mapping" 
> field, and use a few more bits in the low bits of the pointer. Right now 
> we just use one bit (PAGE_MAPPING_ANON), but if we just make the rule be 
> that "struct address_space" has to be 8-byte aligned, then we'd have two 
> more bits available there, and we could hide the lock bit and the 
> contention bit there too.
> 
> This actually would have a _really_ nice effect, in that if we do this, 
> then I suspect that we could eventually even make the bits in "flags" be 
> non-atomic. The lock bit really is special. The other bits tend to be 
> either pretty static over allocation, or things that should be set only 
> when the page is locked.
> 
> I dunno. But it sounds like a reasonable thing to do, and it would free 
> one bit from the page flags, rather than use yet another one. And because 
> locking is special and because we already have to access that "mapping" 
> pointer specially, I don't think the impact would be very invasive.

I did a patch for that at one point. It doesn't go very far to allowing
non-atomic page flags, but it allows non-atomic unlock_page. But Hugh
wanted to put PG_swapcache in there, so I put it on the shelf for a while.



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [rfc][patch] unlock_page speedup
  2008-12-19 17:55     ` Linus Torvalds
@ 2008-12-23  0:46       ` Hugh Dickins
  0 siblings, 0 replies; 9+ messages in thread
From: Hugh Dickins @ 2008-12-23  0:46 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Nick Piggin, Linux Memory Management List,
	linux-fsdevel

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2074 bytes --]

On Fri, 19 Dec 2008, Linus Torvalds wrote:
> 
> That said, I did notice a problem. Namely that while the VM code is good 
> about looking at ->mapping (because it doesn't know whether the page is 
> anonymous or a true mapping), much of the filesystem code is _not_ careful 
> about page->mapping, since the filesystem code knows a-priori that the 
> mapping pointer must be an inode mapping (or we'd not have called it).
> 
> So filesystems do tend to do things like
> 
> 	struct inode *inode = page->mapping->host;
> 
> and while the low bit of mapping is magic, those code-paths don't care 
> because they depend on it being zero.
> 
> So hiding the lock bit there would involve a lot more work than I naïvely 
> expected before looking closer. We'd have to change the name (to 
> "_mapping", presumably), and make all users use an accessor function to 
> make code like the above do
> 
> 	struct inode *inode = page_mapping(page)->host;
> 
> or something (we migth want to have a "page_host_inode()" helper to do it, 
> it seems to be the most common reason for accessing "->mapping" that 
> there is.
> 
> So it could be done pretty mechanically, but it's still a _big_ change. 
> Maybe not worth it, unless we can really translate it into some other 
> advantage (ie real simplification of page flag access)

Yes, it's messy, particularly given out-of-tree filesystems.

Perhaps there's somewhere else in the struct page you could keep the
lock bit and get the advantage you're looking for: playing with the
low bits of page->mapping is best left to anon pages.

I did have a patch to keep PG_swapcache there: that got stalled on
dreaming up enough memorable names for variants of page_mapping()
that I turned out to need, then it got buried under other work.

I'll resurrect it in the next month or so; it would be nice to
keep PG_swapbacked there too, but it's not quite as easy since
Rik's split LRU categorizes shmem/tmpfs pages as swapbacked i.e.
that flag applies to anon and also to one particular filesystem.

Hugh

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2008-12-23  0:45 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-12-19  7:29 [rfc][patch] unlock_page speedup Nick Piggin
2008-12-19  7:35 ` Andrew Morton
2008-12-19  7:53   ` Nick Piggin
2008-12-19  7:59     ` Andrew Morton
2008-12-19  8:53       ` Nick Piggin
2008-12-19 17:35   ` Linus Torvalds
2008-12-19 17:55     ` Linus Torvalds
2008-12-23  0:46       ` Hugh Dickins
2008-12-22  3:51     ` Nick Piggin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).