* [rfc] lock bitops
@ 2007-05-08 11:37 Nick Piggin
2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
` (3 more replies)
0 siblings, 4 replies; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 11:37 UTC (permalink / raw)
To: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
Linux Kernel Mailing List
Hi,
This patch (along with the subsequent one to optimise unlock_page) reduces
the overhead of lock_page/unlock_page (measured with page faults and a patch
to lock the page in the fault handler) by about 425 cycles on my 2-way G5.
It also improves `dd if=big-sparse-file of=/dev/null` performance from
626MB/s to 683MB/s (+9.1%) by reducing lock_page/unlock_page overhead by
651 cycles on the same machine.
Lastly, kbuild elapsed times are improved by maybe half a percent on the
same system, from:
236.21user 18.68system 2:09.38elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
236.21user 18.68system 2:09.15elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
236.12user 18.69system 2:09.06elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
236.22user 18.70system 2:09.14elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
236.24user 18.70system 2:09.20elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
to:
235.13user 18.38system 2:08.49elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
235.25user 18.39system 2:08.63elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
235.19user 18.41system 2:08.66elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
235.16user 18.40system 2:08.57elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
235.21user 18.39system 2:08.47elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
--
Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics.
Add non-trivial implementations for powerpc and ia64. Convert page lock, buffer lock,
bit_spin_lock, and tasklet locks to use the new operations.
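For illustration, here is how the new pair is meant to compose into a lock
(a sketch only, not part of the patch; the example_* wrapper names are made
up). test_and_set_bit_lock() returning 0 means the bit was taken with acquire
semantics, and clear_bit_unlock() releases it, so the critical section needs
no extra barriers:

	/* Sketch: a minimal bit lock built on the new operations. */
	static inline void example_bit_lock(unsigned long *word)
	{
		while (test_and_set_bit_lock(0, word))	/* acquire on success */
			cpu_relax();			/* spin while bit 0 is set */
	}

	static inline void example_bit_unlock(unsigned long *word)
	{
		clear_bit_unlock(0, word);		/* release */
	}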
---
Documentation/atomic_ops.txt | 9 ++++++++
drivers/scsi/sg.c | 2 -
fs/buffer.c | 7 ++----
fs/cifs/file.c | 2 -
fs/jbd/commit.c | 2 -
fs/jbd2/commit.c | 2 -
fs/xfs/linux-2.6/xfs_aops.c | 4 +--
include/asm-alpha/bitops.h | 1
include/asm-arm/bitops.h | 1
include/asm-arm26/bitops.h | 1
include/asm-avr32/bitops.h | 1
include/asm-cris/bitops.h | 1
include/asm-frv/bitops.h | 1
include/asm-generic/bitops.h | 1
include/asm-generic/bitops/lock.h | 16 +++++++++++++++
include/asm-h8300/bitops.h | 1
include/asm-i386/bitops.h | 1
include/asm-ia64/bitops.h | 33 ++++++++++++++++++++++++++++++++
include/asm-m32r/bitops.h | 1
include/asm-m68k/bitops.h | 1
include/asm-m68knommu/bitops.h | 1
include/asm-mips/bitops.h | 1
include/asm-parisc/bitops.h | 1
include/asm-powerpc/bitops.h | 39 ++++++++++++++++++++++++++++++++++++++
include/asm-s390/bitops.h | 1
include/asm-sh/bitops.h | 1
include/asm-sh64/bitops.h | 1
include/asm-sparc/bitops.h | 1
include/asm-sparc64/bitops.h | 1
include/asm-v850/bitops.h | 1
include/asm-x86_64/bitops.h | 1
include/asm-xtensa/bitops.h | 1
include/linux/bit_spinlock.h | 11 +++++-----
include/linux/buffer_head.h | 8 +++++--
include/linux/interrupt.h | 5 +---
include/linux/page-flags.h | 4 +++
include/linux/pagemap.h | 9 ++++++--
kernel/wait.c | 2 -
mm/filemap.c | 13 ++++--------
mm/memory.c | 2 -
mm/migrate.c | 4 +--
mm/rmap.c | 2 -
mm/shmem.c | 4 +--
mm/swap.c | 2 -
mm/swap_state.c | 2 -
mm/swapfile.c | 2 -
mm/truncate.c | 4 +--
mm/vmscan.c | 4 +--
48 files changed, 172 insertions(+), 44 deletions(-)
Index: linux-2.6/include/asm-powerpc/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-powerpc/bitops.h 2007-05-08 13:33:18.000000000 +1000
+++ linux-2.6/include/asm-powerpc/bitops.h 2007-05-08 13:45:31.000000000 +1000
@@ -87,6 +87,24 @@
: "cc" );
}
+static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr)
+{
+ unsigned long old;
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ __asm__ __volatile__(
+ LWSYNC_ON_SMP
+"1:" PPC_LLARX "%0,0,%3 # clear_bit_unlock\n"
+ "andc %0,%0,%2\n"
+ PPC405_ERR77(0,%3)
+ PPC_STLCX "%0,0,%3\n"
+ "bne- 1b"
+ : "=&r" (old), "+m" (*p)
+ : "r" (mask), "r" (p)
+ : "cc" );
+}
+
static __inline__ void change_bit(int nr, volatile unsigned long *addr)
{
unsigned long old;
@@ -126,6 +144,27 @@
return (old & mask) != 0;
}
+static __inline__ int test_and_set_bit_lock(unsigned long nr,
+ volatile unsigned long *addr)
+{
+ unsigned long old, t;
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ __asm__ __volatile__(
+"1:" PPC_LLARX "%0,0,%3 # test_and_set_bit_lock\n"
+ "or %1,%0,%2 \n"
+ PPC405_ERR77(0,%3)
+ PPC_STLCX "%1,0,%3 \n"
+ "bne- 1b"
+ ISYNC_ON_SMP
+ : "=&r" (old), "=&r" (t)
+ : "r" (mask), "r" (p)
+ : "cc", "memory");
+
+ return (old & mask) != 0;
+}
+
static __inline__ int test_and_clear_bit(unsigned long nr,
volatile unsigned long *addr)
{
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h 2007-05-08 13:33:18.000000000 +1000
+++ linux-2.6/include/linux/pagemap.h 2007-05-08 15:30:39.000000000 +1000
@@ -135,13 +135,18 @@
extern void FASTCALL(__lock_page_nosync(struct page *page));
extern void FASTCALL(unlock_page(struct page *page));
+static inline int trylock_page(struct page *page)
+{
+ return (likely(!TestSetPageLocked_Lock(page)));
+}
+
/*
* lock_page may only be called if we have the page's inode pinned.
*/
static inline void lock_page(struct page *page)
{
might_sleep();
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
__lock_page(page);
}
@@ -152,7 +157,7 @@
static inline void lock_page_nosync(struct page *page)
{
might_sleep();
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
__lock_page_nosync(page);
}
Index: linux-2.6/drivers/scsi/sg.c
===================================================================
--- linux-2.6.orig/drivers/scsi/sg.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/drivers/scsi/sg.c 2007-05-08 13:45:31.000000000 +1000
@@ -1734,7 +1734,7 @@
*/
flush_dcache_page(pages[i]);
/* ?? Is locking needed? I don't think so */
- /* if (TestSetPageLocked(pages[i]))
+ /* if (!trylock_page(pages[i]))
goto out_unlock; */
}
Index: linux-2.6/fs/cifs/file.c
===================================================================
--- linux-2.6.orig/fs/cifs/file.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/fs/cifs/file.c 2007-05-08 13:45:31.000000000 +1000
@@ -1229,7 +1229,7 @@
if (first < 0)
lock_page(page);
- else if (TestSetPageLocked(page))
+ else if (!trylock_page(page))
break;
if (unlikely(page->mapping != mapping)) {
Index: linux-2.6/fs/jbd/commit.c
===================================================================
--- linux-2.6.orig/fs/jbd/commit.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/fs/jbd/commit.c 2007-05-08 15:19:20.000000000 +1000
@@ -64,7 +64,7 @@
goto nope;
/* OK, it's a truncated page */
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto nope;
page_cache_get(page);
@@ -209,7 +209,7 @@
* blocking lock_buffer().
*/
if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
BUFFER_TRACE(bh, "needs blocking lock");
spin_unlock(&journal->j_list_lock);
/* Write out all data to prevent deadlocks */
Index: linux-2.6/fs/jbd2/commit.c
===================================================================
--- linux-2.6.orig/fs/jbd2/commit.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/fs/jbd2/commit.c 2007-05-08 15:19:12.000000000 +1000
@@ -64,7 +64,7 @@
goto nope;
/* OK, it's a truncated page */
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto nope;
page_cache_get(page);
@@ -209,7 +209,7 @@
* blocking lock_buffer().
*/
if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
BUFFER_TRACE(bh, "needs blocking lock");
spin_unlock(&journal->j_list_lock);
/* Write out all data to prevent deadlocks */
Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c 2007-05-08 13:45:31.000000000 +1000
@@ -601,7 +601,7 @@
} else
pg_offset = PAGE_CACHE_SIZE;
- if (page->index == tindex && !TestSetPageLocked(page)) {
+ if (page->index == tindex && trylock_page(page)) {
len = xfs_probe_page(page, pg_offset, mapped);
unlock_page(page);
}
@@ -685,7 +685,7 @@
if (page->index != tindex)
goto fail;
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto fail;
if (PageWriteback(page))
goto fail_unlock_page;
Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h 2007-05-08 13:33:18.000000000 +1000
+++ linux-2.6/include/linux/page-flags.h 2007-05-08 15:30:39.000000000 +1000
@@ -114,8 +114,12 @@
set_bit(PG_locked, &(page)->flags)
#define TestSetPageLocked(page) \
test_and_set_bit(PG_locked, &(page)->flags)
+#define TestSetPageLocked_Lock(page) \
+ test_and_set_bit_lock(PG_locked, &(page)->flags)
#define ClearPageLocked(page) \
clear_bit(PG_locked, &(page)->flags)
+#define ClearPageLocked_Unlock(page) \
+ clear_bit_unlock(PG_locked, &(page)->flags)
#define TestClearPageLocked(page) \
test_and_clear_bit(PG_locked, &(page)->flags)
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2007-05-08 13:45:15.000000000 +1000
+++ linux-2.6/mm/memory.c 2007-05-08 13:45:31.000000000 +1000
@@ -1550,7 +1550,7 @@
* not dirty accountable.
*/
if (PageAnon(old_page)) {
- if (!TestSetPageLocked(old_page)) {
+ if (trylock_page(old_page)) {
reuse = can_share_swap_page(old_page);
unlock_page(old_page);
}
Index: linux-2.6/mm/migrate.c
===================================================================
--- linux-2.6.orig/mm/migrate.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/mm/migrate.c 2007-05-08 13:45:31.000000000 +1000
@@ -569,7 +569,7 @@
* establishing additional references. We are the only one
* holding a reference to the new page at this point.
*/
- if (TestSetPageLocked(newpage))
+ if (!trylock_page(newpage))
BUG();
/* Prepare mapping for the new page.*/
@@ -621,7 +621,7 @@
goto move_newpage;
rc = -EAGAIN;
- if (TestSetPageLocked(page)) {
+ if (!trylock_page(page)) {
if (!force)
goto move_newpage;
lock_page(page);
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/mm/rmap.c 2007-05-08 13:45:31.000000000 +1000
@@ -426,7 +426,7 @@
referenced += page_referenced_anon(page);
else if (is_locked)
referenced += page_referenced_file(page);
- else if (TestSetPageLocked(page))
+ else if (!trylock_page(page))
referenced++;
else {
if (page->mapping)
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c 2007-05-08 13:45:15.000000000 +1000
+++ linux-2.6/mm/shmem.c 2007-05-08 13:45:31.000000000 +1000
@@ -1154,7 +1154,7 @@
}
/* We have to do this with page locked to prevent races */
- if (TestSetPageLocked(swappage)) {
+ if (!trylock_page(swappage)) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
wait_on_page_locked(swappage);
@@ -1213,7 +1213,7 @@
shmem_swp_unmap(entry);
filepage = find_get_page(mapping, idx);
if (filepage &&
- (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
+ (!PageUptodate(filepage) || !trylock_page(filepage))) {
spin_unlock(&info->lock);
wait_on_page_locked(filepage);
page_cache_release(filepage);
Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/mm/swap.c 2007-05-08 13:45:31.000000000 +1000
@@ -412,7 +412,7 @@
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- if (PagePrivate(page) && !TestSetPageLocked(page)) {
+ if (PagePrivate(page) && trylock_page(page)) {
if (PagePrivate(page))
try_to_release_page(page, 0);
unlock_page(page);
Index: linux-2.6/mm/swap_state.c
===================================================================
--- linux-2.6.orig/mm/swap_state.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/mm/swap_state.c 2007-05-08 13:45:31.000000000 +1000
@@ -252,7 +252,7 @@
*/
static inline void free_swap_cache(struct page *page)
{
- if (PageSwapCache(page) && !TestSetPageLocked(page)) {
+ if (PageSwapCache(page) && trylock_page(page)) {
remove_exclusive_swap_page(page);
unlock_page(page);
}
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/mm/swapfile.c 2007-05-08 13:45:31.000000000 +1000
@@ -401,7 +401,7 @@
if (p) {
if (swap_entry_free(p, swp_offset(entry)) == 1) {
page = find_get_page(&swapper_space, entry.val);
- if (page && unlikely(TestSetPageLocked(page))) {
+ if (page && unlikely(!trylock_page(page))) {
page_cache_release(page);
page = NULL;
}
Index: linux-2.6/mm/truncate.c
===================================================================
--- linux-2.6.orig/mm/truncate.c 2007-05-08 13:45:15.000000000 +1000
+++ linux-2.6/mm/truncate.c 2007-05-08 13:45:31.000000000 +1000
@@ -185,7 +185,7 @@
if (page_index > next)
next = page_index;
next++;
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
continue;
if (PageWriteback(page)) {
unlock_page(page);
@@ -281,7 +281,7 @@
pgoff_t index;
int lock_failed;
- lock_failed = TestSetPageLocked(page);
+ lock_failed = !trylock_page(page);
/*
* We really shouldn't be looking at the ->index of an
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/mm/vmscan.c 2007-05-08 13:45:31.000000000 +1000
@@ -466,7 +466,7 @@
page = lru_to_page(page_list);
list_del(&page->lru);
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto keep;
VM_BUG_ON(PageActive(page));
@@ -538,7 +538,7 @@
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
*/
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto keep;
if (PageDirty(page) || PageWriteback(page))
goto keep_locked;
Index: linux-2.6/include/linux/bit_spinlock.h
===================================================================
--- linux-2.6.orig/include/linux/bit_spinlock.h 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/include/linux/bit_spinlock.h 2007-05-08 15:00:18.000000000 +1000
@@ -18,7 +18,7 @@
*/
preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
- while (test_and_set_bit(bitnum, addr)) {
+ while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
while (test_bit(bitnum, addr)) {
preempt_enable();
cpu_relax();
@@ -36,7 +36,7 @@
{
preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
- if (test_and_set_bit(bitnum, addr)) {
+ if (test_and_set_bit_lock(bitnum, addr)) {
preempt_enable();
return 0;
}
@@ -50,10 +50,11 @@
*/
static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+#ifdef CONFIG_DEBUG_SPINLOCK
BUG_ON(!test_bit(bitnum, addr));
- smp_mb__before_clear_bit();
- clear_bit(bitnum, addr);
+#endif
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+ clear_bit_unlock(bitnum, addr);
#endif
preempt_enable();
__release(bitlock);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c 2007-05-08 13:47:21.000000000 +1000
+++ linux-2.6/mm/filemap.c 2007-05-08 15:30:55.000000000 +1000
@@ -525,17 +525,14 @@
* mechananism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
*
- * The first mb is necessary to safely close the critical section opened by the
- * TestSetPageLocked(), the second mb is necessary to enforce ordering between
- * the clear_bit and the read of the waitqueue (to avoid SMP races with a
- * parallel wait_on_page_locked()).
+ * The mb is necessary to enforce ordering between the clear_bit and the read
+ * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
*/
void fastcall unlock_page(struct page *page)
{
- smp_mb__before_clear_bit();
- if (!TestClearPageLocked(page))
- BUG();
- smp_mb__after_clear_bit();
+ VM_BUG_ON(!PageLocked(page));
+ ClearPageLocked_Unlock(page);
+ smp_mb__after_clear_bit();
wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
@@ -625,7 +622,7 @@
page = radix_tree_lookup(&mapping->page_tree, offset);
if (page) {
page_cache_get(page);
- if (TestSetPageLocked(page)) {
+ if (!trylock_page(page)) {
read_unlock_irq(&mapping->tree_lock);
__lock_page(page);
read_lock_irq(&mapping->tree_lock);
@@ -798,7 +795,7 @@
struct page *page = find_get_page(mapping, index);
if (page) {
- if (!TestSetPageLocked(page))
+ if (trylock_page(page))
return page;
page_cache_release(page);
return NULL;
Index: linux-2.6/include/asm-alpha/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-alpha/bitops.h 2006-10-10 19:35:15.000000000 +1000
+++ linux-2.6/include/asm-alpha/bitops.h 2007-05-08 14:22:29.000000000 +1000
@@ -359,6 +359,7 @@
#else
#include <asm-generic/bitops/hweight.h>
#endif
+#include <asm-generic/bitops/lock.h>
#endif /* __KERNEL__ */
Index: linux-2.6/include/asm-arm/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-arm/bitops.h 2007-01-29 17:30:03.000000000 +1100
+++ linux-2.6/include/asm-arm/bitops.h 2007-05-08 14:21:11.000000000 +1000
@@ -286,6 +286,7 @@
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
/*
* Ext2 is defined to use little-endian byte ordering.
Index: linux-2.6/include/asm-arm26/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-arm26/bitops.h 2006-05-02 14:30:50.000000000 +1000
+++ linux-2.6/include/asm-arm26/bitops.h 2007-05-08 14:20:54.000000000 +1000
@@ -167,6 +167,7 @@
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
/*
* Ext2 is defined to use little-endian byte ordering.
Index: linux-2.6/include/asm-avr32/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-avr32/bitops.h 2007-01-29 17:30:03.000000000 +1100
+++ linux-2.6/include/asm-avr32/bitops.h 2007-05-08 14:21:18.000000000 +1000
@@ -288,6 +288,7 @@
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#include <asm-generic/bitops/ext2-atomic.h>
Index: linux-2.6/include/asm-cris/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-cris/bitops.h 2006-05-02 14:30:50.000000000 +1000
+++ linux-2.6/include/asm-cris/bitops.h 2007-05-08 14:21:24.000000000 +1000
@@ -154,6 +154,7 @@
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/hweight.h>
#include <asm-generic/bitops/find.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
Index: linux-2.6/include/asm-frv/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-frv/bitops.h 2007-01-29 17:35:54.000000000 +1100
+++ linux-2.6/include/asm-frv/bitops.h 2007-05-08 14:21:29.000000000 +1000
@@ -302,6 +302,7 @@
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
Index: linux-2.6/include/asm-generic/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-generic/bitops.h 2006-05-02 14:30:50.000000000 +1000
+++ linux-2.6/include/asm-generic/bitops.h 2007-05-08 14:21:47.000000000 +1000
@@ -22,6 +22,7 @@
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#include <asm-generic/bitops/ext2-atomic.h>
Index: linux-2.6/include/asm-generic/bitops/lock.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/asm-generic/bitops/lock.h 2007-05-08 14:17:44.000000000 +1000
@@ -0,0 +1,16 @@
+#ifndef _ASM_GENERIC_BITOPS_LOCK_H_
+#define _ASM_GENERIC_BITOPS_LOCK_H_
+
+static inline int test_and_set_bit_lock(unsigned long nr, volatile void *addr)
+{
+ return test_and_set_bit(nr, addr);
+}
+
+static inline void clear_bit_unlock(unsigned long nr, volatile void *addr)
+{
+ smp_mb__before_clear_bit();
+ clear_bit(nr, addr);
+}
+
+#endif /* _ASM_GENERIC_BITOPS_LOCK_H_ */
+
Index: linux-2.6/include/asm-h8300/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-h8300/bitops.h 2006-10-10 19:35:15.000000000 +1000
+++ linux-2.6/include/asm-h8300/bitops.h 2007-05-08 14:21:54.000000000 +1000
@@ -194,6 +194,7 @@
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#include <asm-generic/bitops/ext2-atomic.h>
#include <asm-generic/bitops/minix.h>
Index: linux-2.6/include/asm-i386/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-i386/bitops.h 2007-02-26 11:43:54.000000000 +1100
+++ linux-2.6/include/asm-i386/bitops.h 2007-05-08 14:22:18.000000000 +1000
@@ -402,6 +402,7 @@
}
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#endif /* __KERNEL__ */
Index: linux-2.6/include/asm-ia64/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-ia64/bitops.h 2006-05-02 14:30:58.000000000 +1000
+++ linux-2.6/include/asm-ia64/bitops.h 2007-05-08 14:25:10.000000000 +1000
@@ -94,6 +94,30 @@
}
/**
+ * clear_bit_unlock - Clears a bit in memory with release
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit_unlock() is atomic and may not be reordered. It does
+ * contain a memory barrier suitable for unlock type operations.
+ */
+static __inline__ void
+clear_bit_unlock (int nr, volatile void *addr)
+{
+ __u32 mask, old, new;
+ volatile __u32 *m;
+ CMPXCHG_BUGCHECK_DECL
+
+ m = (volatile __u32 *) addr + (nr >> 5);
+ mask = ~(1 << (nr & 31));
+ do {
+ CMPXCHG_BUGCHECK(m);
+ old = *m;
+ new = old & mask;
+ } while (cmpxchg_rel(m, old, new) != old);
+}
+
+/**
* __clear_bit - Clears a bit in memory (non-atomic version)
*/
static __inline__ void
@@ -170,6 +194,15 @@
}
/**
+ * test_and_set_bit_lock - Set a bit and return its old value for lock
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This is the same as test_and_set_bit on ia64
+ */
+#define test_and_set_bit_lock test_and_set_bit
+
+/**
* __test_and_set_bit - Set a bit and return its old value
* @nr: Bit to set
* @addr: Address to count from
Index: linux-2.6/include/asm-m32r/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-m32r/bitops.h 2006-10-10 19:35:15.000000000 +1000
+++ linux-2.6/include/asm-m32r/bitops.h 2007-05-08 14:25:18.000000000 +1000
@@ -255,6 +255,7 @@
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#endif /* __KERNEL__ */
Index: linux-2.6/include/asm-m68k/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-m68k/bitops.h 2006-05-02 14:30:50.000000000 +1000
+++ linux-2.6/include/asm-m68k/bitops.h 2007-05-08 14:25:25.000000000 +1000
@@ -314,6 +314,7 @@
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
/* Bitmap functions for the minix filesystem */
Index: linux-2.6/include/asm-m68knommu/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-m68knommu/bitops.h 2007-02-26 11:43:54.000000000 +1100
+++ linux-2.6/include/asm-m68knommu/bitops.h 2007-05-08 14:25:36.000000000 +1000
@@ -160,6 +160,7 @@
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
static __inline__ int ext2_set_bit(int nr, volatile void * addr)
{
Index: linux-2.6/include/asm-mips/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-mips/bitops.h 2007-03-30 11:31:29.000000000 +1000
+++ linux-2.6/include/asm-mips/bitops.h 2007-05-08 14:25:51.000000000 +1000
@@ -569,6 +569,7 @@
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#include <asm-generic/bitops/ext2-atomic.h>
#include <asm-generic/bitops/minix.h>
Index: linux-2.6/include/asm-parisc/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-parisc/bitops.h 2007-03-05 12:46:03.000000000 +1100
+++ linux-2.6/include/asm-parisc/bitops.h 2007-05-08 14:25:56.000000000 +1000
@@ -208,6 +208,7 @@
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/sched.h>
#endif /* __KERNEL__ */
Index: linux-2.6/include/asm-s390/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-s390/bitops.h 2007-01-29 17:30:03.000000000 +1100
+++ linux-2.6/include/asm-s390/bitops.h 2007-05-08 14:26:14.000000000 +1000
@@ -746,6 +746,7 @@
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
/*
* ATTENTION: intel byte ordering convention for ext2 and minix !!
Index: linux-2.6/include/asm-sh/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-sh/bitops.h 2007-01-29 17:30:03.000000000 +1100
+++ linux-2.6/include/asm-sh/bitops.h 2007-05-08 14:26:25.000000000 +1000
@@ -137,6 +137,7 @@
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#include <asm-generic/bitops/ext2-atomic.h>
Index: linux-2.6/include/asm-sh64/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-sh64/bitops.h 2006-05-02 14:30:51.000000000 +1000
+++ linux-2.6/include/asm-sh64/bitops.h 2007-05-08 14:26:20.000000000 +1000
@@ -136,6 +136,7 @@
#include <asm-generic/bitops/__ffs.h>
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
Index: linux-2.6/include/asm-sparc/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-sparc/bitops.h 2007-01-29 17:35:57.000000000 +1100
+++ linux-2.6/include/asm-sparc/bitops.h 2007-05-08 14:27:11.000000000 +1000
@@ -96,6 +96,7 @@
#include <asm-generic/bitops/fls.h>
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#include <asm-generic/bitops/ext2-atomic.h>
Index: linux-2.6/include/asm-sparc64/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-sparc64/bitops.h 2006-10-10 19:35:16.000000000 +1000
+++ linux-2.6/include/asm-sparc64/bitops.h 2007-05-08 14:27:06.000000000 +1000
@@ -81,6 +81,7 @@
#include <asm-generic/bitops/hweight.h>
#endif
+#include <asm-generic/bitops/lock.h>
#endif /* __KERNEL__ */
#include <asm-generic/bitops/find.h>
Index: linux-2.6/include/asm-v850/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-v850/bitops.h 2006-10-10 19:35:16.000000000 +1000
+++ linux-2.6/include/asm-v850/bitops.h 2007-05-08 14:27:16.000000000 +1000
@@ -145,6 +145,7 @@
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a)
Index: linux-2.6/include/asm-x86_64/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-x86_64/bitops.h 2007-02-26 11:43:54.000000000 +1100
+++ linux-2.6/include/asm-x86_64/bitops.h 2007-05-08 14:27:22.000000000 +1000
@@ -408,6 +408,7 @@
#define ARCH_HAS_FAST_MULTIPLIER 1
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#endif /* __KERNEL__ */
Index: linux-2.6/include/asm-xtensa/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-xtensa/bitops.h 2006-05-02 14:30:51.000000000 +1000
+++ linux-2.6/include/asm-xtensa/bitops.h 2007-05-08 14:27:24.000000000 +1000
@@ -115,6 +115,7 @@
#endif
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/minix.h>
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c 2007-04-24 10:39:55.000000000 +1000
+++ linux-2.6/fs/buffer.c 2007-05-08 14:51:42.000000000 +1000
@@ -78,8 +78,7 @@
void fastcall unlock_buffer(struct buffer_head *bh)
{
- smp_mb__before_clear_bit();
- clear_buffer_locked(bh);
+ clear_bit_unlock(BH_Lock, &bh->b_state);
smp_mb__after_clear_bit();
wake_up_bit(&bh->b_state, BH_Lock);
}
@@ -1664,7 +1663,7 @@
*/
if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
lock_buffer(bh);
- } else if (test_set_buffer_locked(bh)) {
+ } else if (!trylock_buffer(bh)) {
redirty_page_for_writepage(wbc, page);
continue;
}
@@ -2719,7 +2718,7 @@
if (rw == SWRITE)
lock_buffer(bh);
- else if (test_set_buffer_locked(bh))
+ else if (!trylock_buffer(bh))
continue;
if (rw == WRITE || rw == SWRITE) {
Index: linux-2.6/include/linux/buffer_head.h
===================================================================
--- linux-2.6.orig/include/linux/buffer_head.h 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/include/linux/buffer_head.h 2007-05-08 14:50:41.000000000 +1000
@@ -115,7 +115,6 @@
BUFFER_FNS(Dirty, dirty)
TAS_BUFFER_FNS(Dirty, dirty)
BUFFER_FNS(Lock, locked)
-TAS_BUFFER_FNS(Lock, locked)
BUFFER_FNS(Req, req)
TAS_BUFFER_FNS(Req, req)
BUFFER_FNS(Mapped, mapped)
@@ -301,10 +300,15 @@
__wait_on_buffer(bh);
}
+static inline int trylock_buffer(struct buffer_head *bh)
+{
+ return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state));
+}
+
static inline void lock_buffer(struct buffer_head *bh)
{
might_sleep();
- if (test_set_buffer_locked(bh))
+ if (!trylock_buffer(bh))
__lock_buffer(bh);
}
Index: linux-2.6/include/linux/interrupt.h
===================================================================
--- linux-2.6.orig/include/linux/interrupt.h 2007-04-12 14:35:11.000000000 +1000
+++ linux-2.6/include/linux/interrupt.h 2007-05-08 14:46:57.000000000 +1000
@@ -310,13 +310,12 @@
#ifdef CONFIG_SMP
static inline int tasklet_trylock(struct tasklet_struct *t)
{
- return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
+ return !test_and_set_bit_lock(TASKLET_STATE_RUN, &(t)->state);
}
static inline void tasklet_unlock(struct tasklet_struct *t)
{
- smp_mb__before_clear_bit();
- clear_bit(TASKLET_STATE_RUN, &(t)->state);
+ clear_bit_unlock(TASKLET_STATE_RUN, &(t)->state);
}
static inline void tasklet_unlock_wait(struct tasklet_struct *t)
Index: linux-2.6/kernel/wait.c
===================================================================
--- linux-2.6.orig/kernel/wait.c 2006-10-10 19:35:16.000000000 +1000
+++ linux-2.6/kernel/wait.c 2007-05-08 14:38:25.000000000 +1000
@@ -195,7 +195,7 @@
if ((ret = (*action)(q->key.flags)))
break;
}
- } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
+ } while (test_and_set_bit_lock(q->key.bit_nr, q->key.flags));
finish_wait(wq, &q->wait);
return ret;
}
Index: linux-2.6/Documentation/atomic_ops.txt
===================================================================
--- linux-2.6.orig/Documentation/atomic_ops.txt 2007-04-12 14:35:03.000000000 +1000
+++ linux-2.6/Documentation/atomic_ops.txt 2007-05-08 14:58:40.000000000 +1000
@@ -369,6 +369,15 @@
*/
smp_mb__after_clear_bit();
+There are two special bitops with lock barrier semantics (acquire/release,
+same as spinlocks). These operate in the same way as their non-_lock/unlock
+postfixed variants, except that they are to provide acquire/release semantics,
+respectively. This means they can be used for bit_spin_trylock and
+bit_spin_unlock type operations without specifying any more barriers.
+
+ int test_and_set_bit_lock(unsigned long nr, unsigned long *addr);
+ void clear_bit_unlock(unsigned long nr, unsigned long *addr);
+
Finally, there are non-atomic versions of the bitmask operations
provided. They are used in contexts where some other higher-level SMP
locking scheme is being used to protect the bitmask, and thus less
Index: linux-2.6/fs/ntfs/aops.c
===================================================================
--- linux-2.6.orig/fs/ntfs/aops.c 2007-03-05 15:17:25.000000000 +1100
+++ linux-2.6/fs/ntfs/aops.c 2007-05-08 15:18:18.000000000 +1000
@@ -1199,7 +1199,7 @@
tbh = bhs[i];
if (!tbh)
continue;
- if (unlikely(test_set_buffer_locked(tbh)))
+ if (!trylock_buffer(tbh))
BUG();
/* The buffer dirty state is now irrelevant, just clean it. */
clear_buffer_dirty(tbh);
Index: linux-2.6/fs/ntfs/compress.c
===================================================================
--- linux-2.6.orig/fs/ntfs/compress.c 2007-03-05 15:17:25.000000000 +1100
+++ linux-2.6/fs/ntfs/compress.c 2007-05-08 15:18:13.000000000 +1000
@@ -655,7 +655,7 @@
for (i = 0; i < nr_bhs; i++) {
struct buffer_head *tbh = bhs[i];
- if (unlikely(test_set_buffer_locked(tbh)))
+ if (!trylock_buffer(tbh))
continue;
if (unlikely(buffer_uptodate(tbh))) {
unlock_buffer(tbh);
Index: linux-2.6/fs/ntfs/mft.c
===================================================================
--- linux-2.6.orig/fs/ntfs/mft.c 2007-02-06 16:20:16.000000000 +1100
+++ linux-2.6/fs/ntfs/mft.c 2007-05-08 15:18:05.000000000 +1000
@@ -586,7 +586,7 @@
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
struct buffer_head *tbh = bhs[i_bhs];
- if (unlikely(test_set_buffer_locked(tbh)))
+ if (!trylock_buffer(tbh))
BUG();
BUG_ON(!buffer_uptodate(tbh));
clear_buffer_dirty(tbh);
@@ -779,7 +779,7 @@
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
struct buffer_head *tbh = bhs[i_bhs];
- if (unlikely(test_set_buffer_locked(tbh)))
+ if (!trylock_buffer(tbh))
BUG();
BUG_ON(!buffer_uptodate(tbh));
clear_buffer_dirty(tbh);
Index: linux-2.6/fs/reiserfs/inode.c
===================================================================
--- linux-2.6.orig/fs/reiserfs/inode.c 2007-03-05 15:17:25.000000000 +1100
+++ linux-2.6/fs/reiserfs/inode.c 2007-05-08 15:19:00.000000000 +1000
@@ -2448,7 +2448,7 @@
if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
lock_buffer(bh);
} else {
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
redirty_page_for_writepage(wbc, page);
continue;
}
Index: linux-2.6/fs/reiserfs/journal.c
===================================================================
--- linux-2.6.orig/fs/reiserfs/journal.c 2007-01-29 17:35:54.000000000 +1100
+++ linux-2.6/fs/reiserfs/journal.c 2007-05-08 15:19:34.000000000 +1000
@@ -830,7 +830,7 @@
jh = JH_ENTRY(list->next);
bh = jh->bh;
get_bh(bh);
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
if (!buffer_dirty(bh)) {
list_move(&jh->list, &tmp);
goto loop_next;
@@ -3838,7 +3838,7 @@
{
PROC_INFO_INC(p_s_sb, journal.prepare);
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
if (!wait)
return 0;
lock_buffer(bh);
* [rfc] optimise unlock_page
  2007-05-08 11:37 [rfc] lock bitops Nick Piggin
@ 2007-05-08 11:40 ` Nick Piggin
  2007-05-08 12:13   ` David Howells
                     ` (2 more replies)
  2007-05-08 12:22 ` [rfc] lock bitops David Howells
                     ` (2 subsequent siblings)
  3 siblings, 3 replies; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 11:40 UTC (permalink / raw)
  To: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
      Linux Kernel Mailing List, Linux Memory Management List

This patch trades a page flag for a significant improvement in the unlock_page
fastpath. Various problems in the previous version were spotted by Hugh and
Ben (and fixed in this one).

Comments?

--

Speed up unlock_page by introducing a new page flag to signal that there are
page waitqueue waiters for PG_locked. This means a memory barrier and a random
waitqueue hash cacheline load can be avoided in the fastpath when there is no
contention.

Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h 2007-05-08 15:30:39.000000000 +1000
+++ linux-2.6/include/linux/page-flags.h 2007-05-08 15:31:00.000000000 +1000
@@ -91,6 +91,8 @@
#define PG_nosave_free 18 /* Used for system suspend/resume */
#define PG_buddy 19 /* Page is free, on buddy lists */
+#define PG_waiters 20 /* Page has PG_locked waiters */
+
/* PG_owner_priv_1 users should have descriptive aliases */
#define PG_checked PG_owner_priv_1 /* Used by some filesystems */
@@ -123,6 +125,10 @@
#define TestClearPageLocked(page) \
test_and_clear_bit(PG_locked, &(page)->flags)
+#define PageWaiters(page) test_bit(PG_waiters, &(page)->flags)
+#define SetPageWaiters(page) set_bit(PG_waiters, &(page)->flags)
+#define ClearPageWaiters(page) clear_bit(PG_waiters, &(page)->flags)
+
#define PageError(page) test_bit(PG_error, &(page)->flags)
#define SetPageError(page) set_bit(PG_error, &(page)->flags)
#define ClearPageError(page) clear_bit(PG_error, &(page)->flags)
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h 2007-05-08 15:30:39.000000000 +1000
+++ linux-2.6/include/linux/pagemap.h 2007-05-08 16:55:07.000000000 +1000
@@ -133,7 +133,8 @@
extern void FASTCALL(__lock_page(struct page *page));
extern void FASTCALL(__lock_page_nosync(struct page *page));
-extern void FASTCALL(unlock_page(struct page *page));
+extern void FASTCALL(__wait_on_page_locked(struct page *page));
+extern void FASTCALL(__unlock_page(struct page *page));
static inline int trylock_page(struct page *page)
{
@@ -160,7 +161,15 @@
if (!trylock_page(page))
__lock_page_nosync(page);
}
-
+
+static inline void unlock_page(struct page *page)
+{
+ VM_BUG_ON(!PageLocked(page));
+ ClearPageLocked_Unlock(page);
+ if (unlikely(PageWaiters(page)))
+ __unlock_page(page);
+}
+
/*
* This is exported only for wait_on_page_locked/wait_on_page_writeback.
* Never use this directly!
@@ -176,8 +185,9 @@
*/
static inline void wait_on_page_locked(struct page *page)
{
+ might_sleep();
if (PageLocked(page))
- wait_on_page_bit(page, PG_locked);
+ __wait_on_page_locked(page);
}
/*
@@ -185,6 +195,7 @@
*/
static inline void wait_on_page_writeback(struct page *page)
{
+ might_sleep();
if (PageWriteback(page))
wait_on_page_bit(page, PG_writeback);
}
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c 2007-05-08 15:30:55.000000000 +1000
+++ linux-2.6/mm/filemap.c 2007-05-08 18:04:20.000000000 +1000
@@ -169,6 +169,7 @@
return 0;
}
+
/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
@@ -478,12 +479,6 @@
EXPORT_SYMBOL(__page_cache_alloc);
#endif
-static int __sleep_on_page_lock(void *word)
-{
- io_schedule();
- return 0;
-}
-
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
@@ -516,26 +511,22 @@
}
EXPORT_SYMBOL(wait_on_page_bit);
-/**
- * unlock_page - unlock a locked page
- * @page: the page
- *
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup
- * mechananism between PageLocked pages and PageWriteback pages is shared.
- * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
- *
- * The mb is necessary to enforce ordering between the clear_bit and the read
- * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
+/*
+ * If PageWaiters was found to be set at unlock-time, __unlock_page should
+ * be called to actually perform the wakeups of waiters.
*/
-void fastcall unlock_page(struct page *page)
+void fastcall __unlock_page(struct page *page)
{
- VM_BUG_ON(!PageLocked(page));
- ClearPageLocked_Unlock(page);
+ ClearPageWaiters(page);
+ /*
+ * The mb is necessary to enforce ordering between the clear_bit and
+ * the read of the waitqueue (to avoid SMP races with a parallel
+ * wait_on_page_locked()
+ */
smp_mb__after_clear_bit();
wake_up_page(page, PG_locked);
}
-EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(__unlock_page);
/**
* end_page_writeback - end writeback against a page
@@ -563,10 +554,16 @@
*/
void fastcall __lock_page(struct page *page)
{
+ wait_queue_head_t *wq = page_waitqueue(page);
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
- TASK_UNINTERRUPTIBLE);
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ SetPageWaiters(page);
+ if (likely(PageLocked(page)))
+ sync_page(page);
+ } while (!trylock_page(page));
+ finish_wait(wq, &wait.wait);
}
EXPORT_SYMBOL(__lock_page);
@@ -576,10 +573,39 @@
*/
void fastcall __lock_page_nosync(struct page *page)
{
+ wait_queue_head_t *wq = page_waitqueue(page);
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
- TASK_UNINTERRUPTIBLE);
+
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ SetPageWaiters(page);
+ if (likely(PageLocked(page)))
+ io_schedule();
+ } while (!trylock_page(page));
+ finish_wait(wq, &wait.wait);
+}
+
+void fastcall __wait_on_page_locked(struct page *page)
+{
+ wait_queue_head_t *wq = page_waitqueue(page);
+ DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ SetPageWaiters(page);
+ if (likely(PageLocked(page)))
+ sync_page(page);
+ } while (PageLocked(page));
+ finish_wait(wq, &wait.wait);
+
+ /*
+ * Could skip this, but that would leave PG_waiters dangling
+ * for random pages. This keeps it tidy.
+ */
+ if (unlikely(PageWaiters(page)))
+ __unlock_page(page);
}
+EXPORT_SYMBOL(__wait_on_page_locked);
/**
* find_get_page - find and get a page reference
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2007-05-08 15:30:39.000000000 +1000
+++ linux-2.6/mm/page_alloc.c 2007-05-08 15:31:00.000000000 +1000
@@ -203,7 +203,8 @@
1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback |
- 1 << PG_buddy );
+ 1 << PG_buddy |
+ 1 << PG_waiters );
set_page_count(page, 0);
reset_page_mapcount(page);
page->mapping = NULL;
@@ -438,7 +439,8 @@
1 << PG_swapcache |
1 << PG_writeback |
1 << PG_reserved |
- 1 << PG_buddy ))))
+ 1 << PG_buddy |
+ 1 << PG_waiters ))))
bad_page(page);
if (PageDirty(page))
__ClearPageDirty(page);
@@ -588,7 +590,8 @@
1 << PG_swapcache |
1 << PG_writeback |
1 << PG_reserved |
- 1 << PG_buddy ))))
+ 1 << PG_buddy |
+ 1 << PG_waiters ))))
bad_page(page);
/*
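To recap what the fastpath gains, distilled from the hunks above (a sketch
of the before/after, not additional code):

	/* Before: every unlock pays two barriers plus the hashed
	 * waitqueue probe in wake_up_page(). */
	void unlock_page_before(struct page *page)
	{
		smp_mb__before_clear_bit();
		if (!TestClearPageLocked(page))
			BUG();
		smp_mb__after_clear_bit();
		wake_up_page(page, PG_locked);
	}

	/* After: the uncontended case is one release-semantics clear
	 * plus a test of a flags word that is already cache-hot. */
	void unlock_page_after(struct page *page)
	{
		VM_BUG_ON(!PageLocked(page));
		ClearPageLocked_Unlock(page);
		if (unlikely(PageWaiters(page)))
			__unlock_page(page);	/* slowpath: barrier + wakeup */
	}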
* Re: [rfc] optimise unlock_page
  2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
@ 2007-05-08 12:13   ` David Howells
  2007-05-08 22:35     ` Nick Piggin
  2007-05-08 20:08   ` Hugh Dickins
  2007-05-08 21:30   ` Benjamin Herrenschmidt
  2 siblings, 1 reply; 34+ messages in thread
From: David Howells @ 2007-05-08 12:13 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
      Linux Kernel Mailing List, Linux Memory Management List

Nick Piggin <npiggin@suse.de> wrote:

> This patch trades a page flag for a significant improvement in the unlock_page
> fastpath. Various problems in the previous version were spotted by Hugh and
> Ben (and fixed in this one).

It looks reasonable at first glance, though it does consume yet another page
flag :-/ However, I think that's probably a worthy trade.

> }
> -
> +
> +static inline void unlock_page(struct page *page)
> +{
> + VM_BUG_ON(!PageLocked(page));
> + ClearPageLocked_Unlock(page);
> + if (unlikely(PageWaiters(page)))
> + __unlock_page(page);
> +}
> +

Please don't simply discard the documentation, we have little enough as it
is:

> -/**
> - * unlock_page - unlock a locked page
> - * @page: the page
> - *
> - * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
> - * Also wakes sleepers in wait_on_page_writeback() because the wakeup
> - * mechananism between PageLocked pages and PageWriteback pages is shared.
> - * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
> - *
> - * The mb is necessary to enforce ordering between the clear_bit and the read
> - * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).

David
* Re: [rfc] optimise unlock_page
  2007-05-08 12:13 ` David Howells
@ 2007-05-08 22:35   ` Nick Piggin
  0 siblings, 0 replies; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 22:35 UTC (permalink / raw)
  To: David Howells
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
      Linux Kernel Mailing List, Linux Memory Management List

On Tue, May 08, 2007 at 01:13:35PM +0100, David Howells wrote:
>
> Nick Piggin <npiggin@suse.de> wrote:
>
> > This patch trades a page flag for a significant improvement in the unlock_page
> > fastpath. Various problems in the previous version were spotted by Hugh and
> > Ben (and fixed in this one).
>
> It looks reasonable at first glance, though it does consume yet another page
> flag :-/ However, I think that's probably a worthy trade.

Well, that's the big question :)

> > }
> > -
> > +
> > +static inline void unlock_page(struct page *page)
> > +{
> > + VM_BUG_ON(!PageLocked(page));
> > + ClearPageLocked_Unlock(page);
> > + if (unlikely(PageWaiters(page)))
> > + __unlock_page(page);
> > +}
> > +
>
> Please don't simply discard the documentation, we have little enough as it is:

Oops, right.
* Re: [rfc] optimise unlock_page
  2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
  2007-05-08 12:13   ` David Howells
@ 2007-05-08 20:08   ` Hugh Dickins
  2007-05-08 21:30   ` Benjamin Herrenschmidt
  2 siblings, 0 replies; 34+ messages in thread
From: Hugh Dickins @ 2007-05-08 20:08 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
      Linux Kernel Mailing List, Linux Memory Management List

On Tue, 8 May 2007, Nick Piggin wrote:
> This patch trades a page flag for a significant improvement in the unlock_page
> fastpath. Various problems in the previous version were spotted by Hugh and
> Ben (and fixed in this one).
>
> Comments?

Seems there's still a bug there. I get hangs on the page lock, on i386
and on x86_64 and on powerpc: sometimes they unhang themselves after a
while (presume other activity does the wakeup). Obvious even while
booting (Starting udevd). Sorry, not had time to investigate.

Hugh
* Re: [rfc] optimise unlock_page
  2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
  2007-05-08 12:13   ` David Howells
  2007-05-08 20:08   ` Hugh Dickins
@ 2007-05-08 21:30   ` Benjamin Herrenschmidt
  2007-05-08 22:41     ` Nick Piggin
  2 siblings, 1 reply; 34+ messages in thread
From: Benjamin Herrenschmidt @ 2007-05-08 21:30 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Andrew Morton, Linux Kernel Mailing List,
      Linux Memory Management List

On Tue, 2007-05-08 at 13:40 +0200, Nick Piggin wrote:
> This patch trades a page flag for a significant improvement in the unlock_page
> fastpath. Various problems in the previous version were spotted by Hugh and
> Ben (and fixed in this one).
>
> Comments?
>
> --
>
> Speed up unlock_page by introducing a new page flag to signal that there are
> page waitqueue waiters for PG_locked. This means a memory barrier and a random
> waitqueue hash cacheline load can be avoided in the fastpath when there is no
> contention.

I'm not 100% familiar with the exclusive vs. non exclusive wait thingy
but wake_up_page() does __wake_up_bit() which calls __wake_up() with
nr_exclusive set to 1. Doesn't that mean that only one waiter will be
woken up ?

If that's the case, then we lose because we'll have clear PG_waiters but
only wake up one of them.

Waking them all would fix it but at the risk of causing other
problems... Maybe PG_waiters need to actually be a counter but if that
is the case, then it complicates things even more.

Any smart idea ?

Ben.
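For reference, the path Ben is describing looks roughly like this
(kernel/wait.c of this vintage, quoted from memory, so details may differ;
the third argument to __wake_up() is nr_exclusive):

	void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
	{
		struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);

		if (waitqueue_active(wq))
			/* at most one exclusive waiter is woken per call */
			__wake_up(wq, TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE,
					1, &key);
	}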
* Re: [rfc] optimise unlock_page
  2007-05-08 21:30 ` Benjamin Herrenschmidt
@ 2007-05-08 22:41   ` Nick Piggin
  2007-05-08 22:50     ` Nick Piggin
  0 siblings, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 22:41 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linux-arch, Andrew Morton, Linux Kernel Mailing List,
      Linux Memory Management List

On Wed, May 09, 2007 at 07:30:27AM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2007-05-08 at 13:40 +0200, Nick Piggin wrote:
> > This patch trades a page flag for a significant improvement in the unlock_page
> > fastpath. Various problems in the previous version were spotted by Hugh and
> > Ben (and fixed in this one).
> >
> > Comments?
> >
> > --
> >
> > Speed up unlock_page by introducing a new page flag to signal that there are
> > page waitqueue waiters for PG_locked. This means a memory barrier and a random
> > waitqueue hash cacheline load can be avoided in the fastpath when there is no
> > contention.
>
> I'm not 100% familiar with the exclusive vs. non exclusive wait thingy
> but wake_up_page() does __wake_up_bit() which calls __wake_up() with
> nr_exclusive set to 1. Doesn't that mean that only one waiter will be
> woken up ?
>
> If that's the case, then we lose because we'll have clear PG_waiters but
> only wake up one of them.
>
> Waking them all would fix it but at the risk of causing other
> problems... Maybe PG_waiters need to actually be a counter but if that
> is the case, then it complicates things even more.
>
> Any smart idea ?

It will wake up 1 exclusive waiter, but no limit on non exclusive waiters.
Hmm, but it won't wake up waiters behind the exclusive guy... maybe the
wake up code can check whether the waitqueue is still active after the
wakeup, and set PG_waiters again in that case?
* Re: [rfc] optimise unlock_page
  2007-05-08 22:41 ` Nick Piggin
@ 2007-05-08 22:50   ` Nick Piggin
  2007-05-09 19:33     ` Hugh Dickins
  0 siblings, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 22:50 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linux-arch, Andrew Morton, Linux Kernel Mailing List,
      Linux Memory Management List

On Wed, May 09, 2007 at 12:41:24AM +0200, Nick Piggin wrote:
> On Wed, May 09, 2007 at 07:30:27AM +1000, Benjamin Herrenschmidt wrote:
> > On Tue, 2007-05-08 at 13:40 +0200, Nick Piggin wrote:
> > > This patch trades a page flag for a significant improvement in the unlock_page
> > > fastpath. Various problems in the previous version were spotted by Hugh and
> > > Ben (and fixed in this one).
> > >
> > > Comments?
> > >
> > > --
> > >
> > > Speed up unlock_page by introducing a new page flag to signal that there are
> > > page waitqueue waiters for PG_locked. This means a memory barrier and a random
> > > waitqueue hash cacheline load can be avoided in the fastpath when there is no
> > > contention.
> >
> > I'm not 100% familiar with the exclusive vs. non exclusive wait thingy
> > but wake_up_page() does __wake_up_bit() which calls __wake_up() with
> > nr_exclusive set to 1. Doesn't that mean that only one waiter will be
> > woken up ?
> >
> > If that's the case, then we lose because we'll have clear PG_waiters but
> > only wake up one of them.
> >
> > Waking them all would fix it but at the risk of causing other
> > problems... Maybe PG_waiters need to actually be a counter but if that
> > is the case, then it complicates things even more.
> >
> > Any smart idea ?
>
> It will wake up 1 exclusive waiter, but no limit on non exclusive waiters.
> Hmm, but it won't wake up waiters behind the exclusive guy... maybe the
> wake up code can check whether the waitqueue is still active after the
> wakeup, and set PG_waiters again in that case?

Hm, I don't know if we can do that without a race either...

OTOH, waking all non exclusive waiters may not be a really bad idea.
* Re: [rfc] optimise unlock_page 2007-05-08 22:50 ` Nick Piggin @ 2007-05-09 19:33 ` Hugh Dickins 2007-05-09 21:21 ` Benjamin Herrenschmidt 2007-05-10 3:37 ` Nick Piggin 0 siblings, 2 replies; 34+ messages in thread From: Hugh Dickins @ 2007-05-09 19:33 UTC (permalink / raw) To: Nick Piggin Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Wed, 9 May 2007, Nick Piggin wrote: > On Wed, May 09, 2007 at 12:41:24AM +0200, Nick Piggin wrote: > > On Wed, May 09, 2007 at 07:30:27AM +1000, Benjamin Herrenschmidt wrote: > > > > > > Waking them all would fix it but at the risk of causing other > > > problems... Maybe PG_waiters need to actually be a counter but if that > > > is the case, then it complicates things even more. > > > > It will wake up 1 exclusive waiter, but no limit on non exclusive waiters. > > Hmm, but it won't wake up waiters behind the exclusive guy... maybe the > > wake up code can check whether the waitqueue is still active after the > > wakeup, and set PG_waiters again in that case? > > Hm, I don't know if we can do that without a race either... > > OTOH, waking all non exclusive waiters may not be a really bad idea. Not good enough, I'm afraid. It looks like Ben's right and you need a count - and counts in the page struct are a lot harder to add than page flags. I've now played around with the hangs on my three 4CPU machines (all of them in io_schedule below __lock_page, waiting on pages which were neither PG_locked nor PG_waiters when I looked). Seeing Ben's mail, I thought the answer would be just to remove the "_exclusive" from your three prepare_to_wait_exclusive()s. That helped, but it didn't eliminate the hangs. After fiddling around with different ideas for some while, I came to realize that the ClearPageWaiters (in very misleadingly named __unlock_page) is hopeless. It's just so easy for it to clear the PG_waiters that a third task relies upon for wakeup (and which cannot loop around to set it again, because it simply won't be woken by unlock_page/__unlock_page without it already being set). Below is the patch I've applied to see some tests actually running with your patches, but it's just a joke: absurdly racy and presumptuous in itself (the "3" stands for us and the cache and one waiter; I deleted the neighbouring mb and comment, not because I disagree, but because it's ridiculous to pay so much attention to such unlikely races when there's much worse nearby). Though I've not checked: if I've got the counting wrong, then maybe all my pages are left marked PG_waiters by now. (I did imagine we could go back to prepare_to_wait_exclusive once I'd put in the page_count test before ClearPageWaiters; but apparently not, that still hung.) My intention had been to apply the patches to what I tested before with lmbench, to get comparative numbers; but I don't think this is worth the time, it's too far from being a real solution. I was puzzled as to how you came up with any performance numbers yourself, when I could hardly boot. I see you mentioned 2CPU G5, I guess you need a CPU or two more; or maybe it's that you didn't watch what happened as it booted, often those hangs recover later. 
Hugh

--- a/mm/filemap.c	2007-05-08 20:17:31.000000000 +0100
+++ b/mm/filemap.c	2007-05-09 19:14:03.000000000 +0100
@@ -517,13 +517,8 @@ EXPORT_SYMBOL(wait_on_page_bit);
  */
 void fastcall __unlock_page(struct page *page)
 {
-	ClearPageWaiters(page);
-	/*
-	 * The mb is necessary to enforce ordering between the clear_bit and
-	 * the read of the waitqueue (to avoid SMP races with a parallel
-	 * wait_on_page_locked()
-	 */
-	smp_mb__after_clear_bit();
+	if (page_count(page) <= 3 + page_has_buffers(page)+page_mapcount(page))
+		ClearPageWaiters(page);
 	wake_up_page(page, PG_locked);
 }
 EXPORT_SYMBOL(__unlock_page);
@@ -558,7 +553,7 @@ void fastcall __lock_page(struct page *p
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
 	do {
-		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 		SetPageWaiters(page);
 		if (likely(PageLocked(page)))
 			sync_page(page);
@@ -577,7 +572,7 @@ void fastcall __lock_page_nosync(struct
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
 	do {
-		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 		SetPageWaiters(page);
 		if (likely(PageLocked(page)))
 			io_schedule();
@@ -591,7 +586,7 @@ void fastcall __wait_on_page_locked(stru
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
 	do {
-		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 		SetPageWaiters(page);
 		if (likely(PageLocked(page)))
 			sync_page(page);

^ permalink raw reply	[flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page
  2007-05-09 19:33         ` Hugh Dickins
@ 2007-05-09 21:21           ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 34+ messages in thread
From: Benjamin Herrenschmidt @ 2007-05-09 21:21 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Nick Piggin, linux-arch, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

> Not good enough, I'm afraid.  It looks like Ben's right and you need
> a count - and counts in the page struct are a lot harder to add than
> page flags.
>
> I've now played around with the hangs on my three 4CPU machines
> (all of them in io_schedule below __lock_page, waiting on pages
> which were neither PG_locked nor PG_waiters when I looked).
>
> Seeing Ben's mail, I thought the answer would be just to remove
> the "_exclusive" from your three prepare_to_wait_exclusive()s.
> That helped, but it didn't eliminate the hangs.

There might be a way ... by having the flags manipulation always
atomically deal with PG_locked and PG_waiters together. This is possible
but we would need even more weirdo bitops abstractions from the arch I'm
afraid... unless we start using atomic_* rather than bitops in order to
manipulate multiple bits at a time.

Ben.

^ permalink raw reply	[flat|nested] 34+ messages in thread
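[Aside: one literal reading of Ben's "deal with PG_locked and PG_waiters
together" is a cmpxchg loop over the whole flags word, so the unlocker
drops the lock bit and samples the waiters bit in a single atomic
operation. A schematic only; nothing like this is in the posted patches.]

	static inline int unlock_page_and_test_waiters(struct page *page)
	{
		unsigned long old, new;

		do {
			old = page->flags;
			new = old & ~(1UL << PG_locked);
		} while (cmpxchg(&page->flags, old, new) != old);

		/* PG_waiters was sampled atomically with the unlock */
		return (old & (1UL << PG_waiters)) != 0;
	}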
* Re: [rfc] optimise unlock_page
  2007-05-09 19:33         ` Hugh Dickins
  2007-05-09 21:21           ` Benjamin Herrenschmidt
@ 2007-05-10  3:37           ` Nick Piggin
  2007-05-10 19:14             ` Hugh Dickins
  1 sibling, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-05-10 3:37 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Wed, May 09, 2007 at 08:33:15PM +0100, Hugh Dickins wrote:
>
> Not good enough, I'm afraid.  It looks like Ben's right and you need
> a count - and counts in the page struct are a lot harder to add than
> page flags.
>
> I've now played around with the hangs on my three 4CPU machines
> (all of them in io_schedule below __lock_page, waiting on pages
> which were neither PG_locked nor PG_waiters when I looked).
>
> Seeing Ben's mail, I thought the answer would be just to remove
> the "_exclusive" from your three prepare_to_wait_exclusive()s.
> That helped, but it didn't eliminate the hangs.
>
> After fiddling around with different ideas for some while, I came
> to realize that the ClearPageWaiters (in very misleadingly named
> __unlock_page) is hopeless.  It's just so easy for it to clear the
> PG_waiters that a third task relies upon for wakeup (and which
> cannot loop around to set it again, because it simply won't be
> woken by unlock_page/__unlock_page without it already being set).

OK, I found a simple bug after pulling out my hair for a while :)
With this, a 4-way system survives a couple of concurrent make -j250s
quite nicely (whereas they eventually locked up before).

The problem is that the bit wakeup function did not go through with
the wakeup if it found the bit (ie. PG_locked) set. This meant that
waiters would not get a chance to reset PG_waiters.

However you probably weren't referring to that particular problem
when you imagined the need for a full count, or the slippery 3rd
task... I wasn't able to derive any such problems with the basic
logic, so if there was a bug there, it would still be unfixed in
this patch.

---

Speed up unlock_page by introducing a new page flag to signal that there are
page waitqueue waiters for PG_locked. This means a memory barrier and a random
waitqueue hash cacheline load can be avoided in the fastpath when there is no
contention.
Index: linux-2.6/include/linux/page-flags.h =================================================================== --- linux-2.6.orig/include/linux/page-flags.h 2007-05-10 10:22:05.000000000 +1000 +++ linux-2.6/include/linux/page-flags.h 2007-05-10 10:22:06.000000000 +1000 @@ -91,6 +91,8 @@ #define PG_nosave_free 18 /* Used for system suspend/resume */ #define PG_buddy 19 /* Page is free, on buddy lists */ +#define PG_waiters 20 /* Page has PG_locked waiters */ + /* PG_owner_priv_1 users should have descriptive aliases */ #define PG_checked PG_owner_priv_1 /* Used by some filesystems */ @@ -113,6 +115,10 @@ #define SetPageLocked(page) \ set_bit(PG_locked, &(page)->flags) +#define PageWaiters(page) test_bit(PG_waiters, &(page)->flags) +#define SetPageWaiters(page) set_bit(PG_waiters, &(page)->flags) +#define ClearPageWaiters(page) clear_bit(PG_waiters, &(page)->flags) + #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) #define ClearPageError(page) clear_bit(PG_error, &(page)->flags) Index: linux-2.6/include/linux/pagemap.h =================================================================== --- linux-2.6.orig/include/linux/pagemap.h 2007-05-10 10:22:05.000000000 +1000 +++ linux-2.6/include/linux/pagemap.h 2007-05-10 10:22:06.000000000 +1000 @@ -133,7 +133,8 @@ extern void FASTCALL(__lock_page(struct page *page)); extern void FASTCALL(__lock_page_nosync(struct page *page)); -extern void FASTCALL(unlock_page(struct page *page)); +extern void FASTCALL(__wait_on_page_locked(struct page *page)); +extern void FASTCALL(__unlock_page(struct page *page)); static inline int trylock_page(struct page *page) { @@ -160,7 +161,15 @@ if (!trylock_page(page)) __lock_page_nosync(page); } - + +static inline void unlock_page(struct page *page) +{ + VM_BUG_ON(!PageLocked(page)); + clear_bit_unlock(PG_locked, &page->flags); + if (unlikely(PageWaiters(page))) + __unlock_page(page); +} + /* * This is exported only for wait_on_page_locked/wait_on_page_writeback. * Never use this directly! @@ -176,8 +185,9 @@ */ static inline void wait_on_page_locked(struct page *page) { + might_sleep(); if (PageLocked(page)) - wait_on_page_bit(page, PG_locked); + __wait_on_page_locked(page); } /* @@ -185,6 +195,7 @@ */ static inline void wait_on_page_writeback(struct page *page) { + might_sleep(); if (PageWriteback(page)) wait_on_page_bit(page, PG_writeback); } Index: linux-2.6/mm/filemap.c =================================================================== --- linux-2.6.orig/mm/filemap.c 2007-05-10 10:22:05.000000000 +1000 +++ linux-2.6/mm/filemap.c 2007-05-10 11:03:10.000000000 +1000 @@ -165,10 +165,12 @@ mapping = page_mapping(page); if (mapping && mapping->a_ops && mapping->a_ops->sync_page) mapping->a_ops->sync_page(page); - io_schedule(); + if (io_schedule_timeout(20*HZ) == 0) + printk("page->flags = %lx\n", page->flags); return 0; } + /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write @@ -478,12 +480,6 @@ EXPORT_SYMBOL(__page_cache_alloc); #endif -static int __sleep_on_page_lock(void *word) -{ - io_schedule(); - return 0; -} - /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of @@ -516,26 +512,23 @@ } EXPORT_SYMBOL(wait_on_page_bit); -/** - * unlock_page - unlock a locked page - * @page: the page - * - * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). 
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup - * mechananism between PageLocked pages and PageWriteback pages is shared. - * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. - * - * The mb is necessary to enforce ordering between the clear_bit and the read - * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). +/* + * If PageWaiters was found to be set at unlock-time, __unlock_page should + * be called to actually perform the wakeups of waiters. */ -void fastcall unlock_page(struct page *page) +void fastcall __unlock_page(struct page *page) { - VM_BUG_ON(!PageLocked(page)); - clear_bit_unlock(PG_locked, &page->flags); + ClearPageWaiters(page); + /* + * The mb is necessary to enforce ordering between the clear_bit and + * the read of the waitqueue (to avoid SMP races with a parallel + * wait_on_page_locked() + */ smp_mb__after_clear_bit(); + wake_up_page(page, PG_locked); } -EXPORT_SYMBOL(unlock_page); +EXPORT_SYMBOL(__unlock_page); /** * end_page_writeback - end writeback against a page @@ -563,10 +556,16 @@ */ void fastcall __lock_page(struct page *page) { + wait_queue_head_t *wq = page_waitqueue(page); DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, - TASK_UNINTERRUPTIBLE); + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + SetPageWaiters(page); + if (likely(PageLocked(page))) + sync_page(page); + } while (!trylock_page(page)); + finish_wait(wq, &wait.wait); } EXPORT_SYMBOL(__lock_page); @@ -576,10 +575,41 @@ */ void fastcall __lock_page_nosync(struct page *page) { + wait_queue_head_t *wq = page_waitqueue(page); DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, - TASK_UNINTERRUPTIBLE); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + SetPageWaiters(page); + if (likely(PageLocked(page))) { + if (io_schedule_timeout(20*HZ) == 0) + printk("page->flags = %lx\n", page->flags); + } + } while (!trylock_page(page)); + finish_wait(wq, &wait.wait); +} + +void fastcall __wait_on_page_locked(struct page *page) +{ + wait_queue_head_t *wq = page_waitqueue(page); + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + SetPageWaiters(page); + if (likely(PageLocked(page))) + sync_page(page); + } while (PageLocked(page)); + finish_wait(wq, &wait.wait); + + /* + * Could skip this, but that would leave PG_waiters dangling + * for random pages. This keeps it tidy. 
+ */ + if (unlikely(PageWaiters(page))) + __unlock_page(page); } +EXPORT_SYMBOL(__wait_on_page_locked); /** * find_get_page - find and get a page reference Index: linux-2.6/mm/page_alloc.c =================================================================== --- linux-2.6.orig/mm/page_alloc.c 2007-05-10 10:21:32.000000000 +1000 +++ linux-2.6/mm/page_alloc.c 2007-05-10 10:22:06.000000000 +1000 @@ -203,7 +203,8 @@ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_buddy ); + 1 << PG_buddy | + 1 << PG_waiters ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -438,7 +439,8 @@ 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved | - 1 << PG_buddy )))) + 1 << PG_buddy | + 1 << PG_waiters )))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); @@ -588,7 +590,8 @@ 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved | - 1 << PG_buddy )))) + 1 << PG_buddy | + 1 << PG_waiters )))) bad_page(page); /* Index: linux-2.6/kernel/wait.c =================================================================== --- linux-2.6.orig/kernel/wait.c 2007-05-10 11:06:43.000000000 +1000 +++ linux-2.6/kernel/wait.c 2007-05-10 11:17:25.000000000 +1000 @@ -144,8 +144,7 @@ = container_of(wait, struct wait_bit_queue, wait); if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - test_bit(key->bit_nr, key->flags)) + wait_bit->key.bit_nr != key->bit_nr) return 0; else return autoremove_wake_function(wait, mode, sync, key); ^ permalink raw reply [flat|nested] 34+ messages in thread
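[Aside: the crux of the fix is the final kernel/wait.c hunk. With the
test_bit() clause gone, the wake function reads as below (surrounding
lines reconstructed from the 2.6.21-era kernel/wait.c, not part of this
patch):]

	int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
	{
		struct wait_bit_key *key = arg;
		struct wait_bit_queue *wait_bit
			= container_of(wait, struct wait_bit_queue, wait);

		if (wait_bit->key.flags != key->flags ||
		    wait_bit->key.bit_nr != key->bit_nr)
			return 0;	/* sleeping on a different bit or word */
		/*
		 * No test_bit(key->bit_nr, key->flags) check any more: wake
		 * the task even if PG_locked has been taken again, so it can
		 * loop around and re-set PG_waiters before sleeping.
		 */
		return autoremove_wake_function(wait, mode, sync, key);
	}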
* Re: [rfc] optimise unlock_page 2007-05-10 3:37 ` Nick Piggin @ 2007-05-10 19:14 ` Hugh Dickins 2007-05-11 8:54 ` Nick Piggin 0 siblings, 1 reply; 34+ messages in thread From: Hugh Dickins @ 2007-05-10 19:14 UTC (permalink / raw) To: Nick Piggin Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Thu, 10 May 2007, Nick Piggin wrote: > > OK, I found a simple bug after pulling out my hair for a while :) > With this, a 4-way system survives a couple of concurrent make -j250s > quite nicely (wheras they eventually locked up before). > > The problem is that the bit wakeup function did not go through with > the wakeup if it found the bit (ie. PG_locked) set. This meant that > waiters would not get a chance to reset PG_waiters. That makes a lot of sense. And this version seems stable to me, I've found no problems so far: magic! Well, on the x86_64 I have seen a few of your io_schedule_timeout printks under load; but suspect those are no fault of your changes, but reflect some actual misbehaviour down towards the disk end (when kernel default moved from AS to CFQ, I had to stick with AS because CFQ ran my tests very much slower on that one machine: something odd going on that I've occasionally wasted time looking into but never tracked down - certainly long-locked pages are a feature of that). > However you probably weren't referring to that particular problem > when you imagined the need for a full count, or the slippery 3rd > task... I wasn't able to derive any such problems with the basic > logic, so if there was a bug there, it would still be unfixed in this > patch. I've been struggling to conjure up and exorcise the race that seemed so obvious to me yesterday. I was certainly imagining one task on its way between SetPageWaiters and io_schedule, when the unlock_page comes, wakes, and lets another waiter take the lock. Probably I was forgetting the essence of prepare_to_wait, that this task would then fall through io_schedule as if woken as part of that batch. Until demonstrated otherwise, let's assume I was utterly mistaken. In addition to 3 hours of load on the three machines, I've gone back and applied this new patch (and the lock bitops; remembering to shift PG_waiters up) to 2.6.21-rc3-mm2 on which I did the earlier lmbench testing, on those three machines. On the PowerPC G5, these changes pretty much balance out your earlier changes (not just the one fix-fault-vs-invalidate patch, but the whole group which came in with that - it'd take me a while to tell exactly what, easiest to send you a diff if you want it), in those lmbench fork, exec, sh, mmap, fault tests. On the P4 Xeons, they improve the numbers significantly, but only retrieve half the regression. So here it looks like a good change; but not enough to atone ;) Hugh ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-10 19:14 ` Hugh Dickins @ 2007-05-11 8:54 ` Nick Piggin 2007-05-11 13:15 ` Hugh Dickins 0 siblings, 1 reply; 34+ messages in thread From: Nick Piggin @ 2007-05-11 8:54 UTC (permalink / raw) To: Hugh Dickins Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Thu, May 10, 2007 at 08:14:52PM +0100, Hugh Dickins wrote: > On Thu, 10 May 2007, Nick Piggin wrote: > > > > OK, I found a simple bug after pulling out my hair for a while :) > > With this, a 4-way system survives a couple of concurrent make -j250s > > quite nicely (wheras they eventually locked up before). > > > > The problem is that the bit wakeup function did not go through with > > the wakeup if it found the bit (ie. PG_locked) set. This meant that > > waiters would not get a chance to reset PG_waiters. > > That makes a lot of sense. And this version seems stable to me, > I've found no problems so far: magic! > > Well, on the x86_64 I have seen a few of your io_schedule_timeout > printks under load; but suspect those are no fault of your changes, Hmm, I see... well I forgot to remove those from the page I sent, the timeouts will kick things off again if they get stalled, so maybe it just hides a problem? (OTOH, I *think* the logic is pretty sound). > In addition to 3 hours of load on the three machines, I've gone back > and applied this new patch (and the lock bitops; remembering to shift > PG_waiters up) to 2.6.21-rc3-mm2 on which I did the earlier lmbench > testing, on those three machines. > > On the PowerPC G5, these changes pretty much balance out your earlier > changes (not just the one fix-fault-vs-invalidate patch, but the whole > group which came in with that - it'd take me a while to tell exactly > what, easiest to send you a diff if you want it), in those lmbench > fork, exec, sh, mmap, fault tests. On the P4 Xeons, they improve > the numbers significantly, but only retrieve half the regression. > > So here it looks like a good change; but not enough to atone ;) Don't worry, I'm only just beginning ;) Can we then do something crazy like this? (working on x86-64 only, so far. It seems to eliminate lat_pagefault and lat_proc regressions here). What architecture and workloads are you testing with, btw? -- Put PG_locked in its own byte from other PG_bits, so we can use non-atomic stores to unlock it. Index: linux-2.6/include/asm-x86_64/bitops.h =================================================================== --- linux-2.6.orig/include/asm-x86_64/bitops.h +++ linux-2.6/include/asm-x86_64/bitops.h @@ -68,6 +68,38 @@ static __inline__ void clear_bit(int nr, :"dIr" (nr)); } +/** + * clear_bit_unlock - Clears a bit in memory with unlock semantics + * @nr: Bit to clear + * @addr: Address to start counting from + */ +static __inline__ void clear_bit_unlock(int nr, volatile void * addr) +{ + barrier(); + __asm__ __volatile__( LOCK_PREFIX + "btrl %1,%0" + :ADDR + :"dIr" (nr)); +} + +/** + * __clear_bit_unlock_byte - same as clear_bit_unlock but uses a byte sized + * non-atomic store + * @nr: Bit to clear + * @addr: Address to start counting from + * + * __clear_bit_unlock() is non-atomic, however it implements unlock ordering, + * so it cannot be reordered arbitrarily. 
+ */ +static __inline__ void __clear_bit_unlock_byte(int nr, void *addr) +{ + unsigned char mask = 1UL << (nr % BITS_PER_BYTE); + unsigned char *p = addr + nr / BITS_PER_BYTE; + + barrier(); + *p &= ~mask; +} + static __inline__ void __clear_bit(int nr, volatile void * addr) { __asm__ __volatile__( @@ -132,6 +164,26 @@ static __inline__ int test_and_set_bit(i return oldbit; } + +/** + * test_and_set_bit_lock - Set a bit and return its old value for locking + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and has lock barrier semantics. + */ +static __inline__ int test_and_set_bit_lock(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__( LOCK_PREFIX + "btsl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),ADDR + :"dIr" (nr)); + barrier(); + return oldbit; +} + /** * __test_and_set_bit - Set a bit and return its old value * @nr: Bit to set @@ -408,7 +460,6 @@ static __inline__ int fls(int x) #define ARCH_HAS_FAST_MULTIPLIER 1 #include <asm-generic/bitops/hweight.h> -#include <asm-generic/bitops/lock.h> #endif /* __KERNEL__ */ Index: linux-2.6/include/linux/mmzone.h =================================================================== --- linux-2.6.orig/include/linux/mmzone.h +++ linux-2.6/include/linux/mmzone.h @@ -615,13 +615,13 @@ extern struct zone *next_zone(struct zon * with 32 bit page->flags field, we reserve 9 bits for node/zone info. * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes. */ -#define FLAGS_RESERVED 9 +#define FLAGS_RESERVED 7 #elif BITS_PER_LONG == 64 /* * with 64 bit flags field, there's plenty of room. */ -#define FLAGS_RESERVED 32 +#define FLAGS_RESERVED 31 #else Index: linux-2.6/include/linux/page-flags.h =================================================================== --- linux-2.6.orig/include/linux/page-flags.h +++ linux-2.6/include/linux/page-flags.h @@ -67,7 +67,6 @@ * FLAGS_RESERVED which defines the width of the fields section * (see linux/mmzone.h). New flags must _not_ overlap with this area. */ -#define PG_locked 0 /* Page is locked. Don't touch. */ #define PG_error 1 #define PG_referenced 2 #define PG_uptodate 3 @@ -104,6 +103,14 @@ * 63 32 0 */ #define PG_uncached 31 /* Page has been mapped as uncached */ + +/* + * PG_locked sits in a different byte to the rest of the flags. This allows + * optimised implementations to use a non-atomic store to unlock. 
+ */ +#define PG_locked 32 +#else +#define PG_locked 24 #endif /* Index: linux-2.6/include/linux/pagemap.h =================================================================== --- linux-2.6.orig/include/linux/pagemap.h +++ linux-2.6/include/linux/pagemap.h @@ -187,7 +187,11 @@ static inline void lock_page_nosync(stru static inline void unlock_page(struct page *page) { VM_BUG_ON(!PageLocked(page)); - clear_bit_unlock(PG_locked, &page->flags); + /* + * PG_locked sits in its own byte in page->flags, away from normal + * flags, so we can do a non-atomic unlock here + */ + __clear_bit_unlock_byte(PG_locked, &page->flags); if (unlikely(PageWaiters(page))) __unlock_page(page); } Index: linux-2.6/mm/page_alloc.c =================================================================== --- linux-2.6.orig/mm/page_alloc.c +++ linux-2.6/mm/page_alloc.c @@ -192,17 +192,17 @@ static void bad_page(struct page *page) (unsigned long)page->flags, page->mapping, page_mapcount(page), page_count(page)); dump_stack(); - page->flags &= ~(1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_reclaim | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_buddy | - 1 << PG_waiters ); + page->flags &= ~(1UL << PG_lru | + 1UL << PG_private| + 1UL << PG_locked | + 1UL << PG_active | + 1UL << PG_dirty | + 1UL << PG_reclaim| + 1UL << PG_slab | + 1UL << PG_swapcache| + 1UL << PG_writeback| + 1UL << PG_buddy | + 1UL << PG_waiters ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -427,19 +427,19 @@ static inline void __free_one_page(struc static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | - (page->mapping != NULL) | - (page_count(page) != 0) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved | - 1 << PG_buddy | - 1 << PG_waiters )))) + 1UL << PG_lru | + 1UL << PG_private| + 1UL << PG_locked | + 1UL << PG_active | + 1UL << PG_slab | + 1UL << PG_swapcache| + 1UL << PG_writeback| + 1UL << PG_reserved| + 1UL << PG_buddy | + 1UL << PG_waiters )))) bad_page(page); /* * PageReclaim == PageTail. 
It is only an error @@ -582,21 +582,21 @@ static inline void expand(struct zone *z static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | - (page->mapping != NULL) | - (page_count(page) != 0) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_reclaim | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved | - 1 << PG_buddy | - 1 << PG_waiters )))) + 1UL << PG_lru | + 1UL << PG_private| + 1UL << PG_locked | + 1UL << PG_active | + 1UL << PG_dirty | + 1UL << PG_reclaim| + 1UL << PG_slab | + 1UL << PG_swapcache| + 1UL << PG_writeback| + 1UL << PG_reserved| + 1UL << PG_buddy | + 1UL << PG_waiters )))) bad_page(page); /* @@ -606,9 +606,9 @@ static int prep_new_page(struct page *pa if (PageReserved(page)) return 1; - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | - 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); + page->flags &= ~(1UL << PG_uptodate | 1UL << PG_error | + 1UL << PG_referenced | 1UL << PG_arch_1 | + 1UL << PG_owner_priv_1 | 1UL << PG_mappedtodisk); set_page_private(page, 0); set_page_refcounted(page); ^ permalink raw reply [flat|nested] 34+ messages in thread
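[Aside: a quick check of the arithmetic, assuming the little-endian
x86-64 layout the patch targets. With PG_locked moved to bit 32,
__clear_bit_unlock_byte() resolves to]

	mask = 1 << (32 % BITS_PER_BYTE);	/* == 0x01, bit 0 of its byte */
	p    = addr + 32 / BITS_PER_BYTE;	/* == addr + 4, flag bits 32..39 */

[so the plain "*p &= ~mask" store touches only byte four. The rest of
that byte belongs to the node/zone fields section (FLAGS_RESERVED is
trimmed from 32 to 31 precisely to free bit 32), which is written once
when the page is set up, so the non-atomic read-modify-write of the byte
has no concurrent update to lose.]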
* Re: [rfc] optimise unlock_page 2007-05-11 8:54 ` Nick Piggin @ 2007-05-11 13:15 ` Hugh Dickins 2007-05-13 3:32 ` Nick Piggin 0 siblings, 1 reply; 34+ messages in thread From: Hugh Dickins @ 2007-05-11 13:15 UTC (permalink / raw) To: Nick Piggin Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Fri, 11 May 2007, Nick Piggin wrote: > On Thu, May 10, 2007 at 08:14:52PM +0100, Hugh Dickins wrote: > > > > Well, on the x86_64 I have seen a few of your io_schedule_timeout > > printks under load; but suspect those are no fault of your changes, > > Hmm, I see... well I forgot to remove those from the page I sent, > the timeouts will kick things off again if they get stalled, so > maybe it just hides a problem? (OTOH, I *think* the logic is pretty > sound). As I said in what you snipped, I believe your debug there is showing up an existing problem on my machine, not a problem in your changes. > > So here it looks like a good change; but not enough to atone ;) > > Don't worry, I'm only just beginning ;) Can we then do something crazy > like this? (working on x86-64 only, so far. It seems to eliminate > lat_pagefault and lat_proc regressions here). I think Mr __NickPiggin_Lock is squirming ever more desperately. So, in essence, you'd like to expand PG_locked from 1 to 8 bits, despite the fact that page flags are known to be in short supply? Ah, no, you're keeping it near the static mmzone FLAGS_RESERVED. Hmm, well, I think that's fairly horrid, and would it even be guaranteed to work on all architectures? Playing with one char of an unsigned long in one way, while playing with the whole of the unsigned long in another way (bitops) sounds very dodgy to me. I think I'd rather just accept that your changes have slowed some microbenchmarks down: it is not always possible to fix a serious bug without slowing something down. That's probably what you're trying to push me into saying by this patch ;) But again I wonder just what the gain has been, once your double unmap_mapping_range is factored in. When I suggested before that perhaps the double (well, treble including the one in truncate.c) unmap_mapping_range might solve the problem you set out to solve (I've lost sight of that!) without pagelock when faulting, you said: > Well aside from being terribly ugly, it means we can still drop > the dirty bit where we'd otherwise rather not, so I don't think > we can do that. but that didn't give me enough information to agree or disagree. > > What architecture and workloads are you testing with, btw? i386 (2*HT P4 Xeons), x86_64 (2*HT P4 Xeons), PowerPC (G5 Quad). Workloads mostly lmbench and my usual pair of make -j20 kernel builds, one to tmpfs and one to ext2 looped on tmpfs, restricted to 512M RAM plus swap. Which is ever so old but still finds enough to keep me busy. Hugh > -- > > Put PG_locked in its own byte from other PG_bits, so we can use non-atomic > stores to unlock it. ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-11 13:15 ` Hugh Dickins @ 2007-05-13 3:32 ` Nick Piggin 2007-05-13 4:39 ` Hugh Dickins 2007-05-16 17:21 ` Hugh Dickins 0 siblings, 2 replies; 34+ messages in thread From: Nick Piggin @ 2007-05-13 3:32 UTC (permalink / raw) To: Hugh Dickins Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote: > On Fri, 11 May 2007, Nick Piggin wrote: > > > > Don't worry, I'm only just beginning ;) Can we then do something crazy > > like this? (working on x86-64 only, so far. It seems to eliminate > > lat_pagefault and lat_proc regressions here). > > I think Mr __NickPiggin_Lock is squirming ever more desperately. Really? I thought it was pretty cool to be able to shave several hundreds of cycles off our page lock :) > So, in essence, you'd like to expand PG_locked from 1 to 8 bits, > despite the fact that page flags are known to be in short supply? > Ah, no, you're keeping it near the static mmzone FLAGS_RESERVED. Yep, no flags bloating at all. > Hmm, well, I think that's fairly horrid, and would it even be > guaranteed to work on all architectures? Playing with one char > of an unsigned long in one way, while playing with the whole of > the unsigned long in another way (bitops) sounds very dodgy to me. Of course not, but they can just use a regular atomic word sized bitop. The problem with i386 is that its atomic ops also imply memory barriers that you obviously don't need on unlock. So I think getting rid of them is pretty good. A grep of mm/ and fs/ for lock_page tells me we want this to be as fast as possible even if it isn't being used in the nopage fastpath. > I think I'd rather just accept that your changes have slowed some > microbenchmarks down: it is not always possible to fix a serious > bug without slowing something down. That's probably what you're > trying to push me into saying by this patch ;) Well I was resigned to that few % regression in the page fault path until the G5 numbers showed that we needed to improve things. But now it looks like (at least on my 2*HT P4 Xeon) that we don't have to have any regression there. > But again I wonder just what the gain has been, once your double > unmap_mapping_range is factored in. When I suggested before that > perhaps the double (well, treble including the one in truncate.c) > unmap_mapping_range might solve the problem you set out to solve > (I've lost sight of that!) without pagelock when faulting, you said: > > > Well aside from being terribly ugly, it means we can still drop > > the dirty bit where we'd otherwise rather not, so I don't think > > we can do that. > > but that didn't give me enough information to agree or disagree. Oh, well invalidate wants to be able to skip dirty pages or have the filesystem do something special with them first. Once you have taken the page out of the pagecache but still mapped shared, then blowing it away doesn't actually solve the data loss problem... only makes the window of VM inconsistency smaller. > > What architecture and workloads are you testing with, btw? > > i386 (2*HT P4 Xeons), x86_64 (2*HT P4 Xeons), PowerPC (G5 Quad). > > Workloads mostly lmbench and my usual pair of make -j20 kernel builds, > one to tmpfs and one to ext2 looped on tmpfs, restricted to 512M RAM > plus swap. Which is ever so old but still finds enough to keep me busy. Thanks, Nick ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page
  2007-05-13  3:32             ` Nick Piggin
@ 2007-05-13  4:39               ` Hugh Dickins
  2007-05-13  6:52                 ` Nick Piggin
  0 siblings, 1 reply; 34+ messages in thread
From: Hugh Dickins @ 2007-05-13 4:39 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Sun, 13 May 2007, Nick Piggin wrote:
> On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote:
> >
> > Hmm, well, I think that's fairly horrid, and would it even be
> > guaranteed to work on all architectures?  Playing with one char
> > of an unsigned long in one way, while playing with the whole of
> > the unsigned long in another way (bitops) sounds very dodgy to me.
>
> Of course not, but they can just use a regular atomic word sized
> bitop. The problem with i386 is that its atomic ops also imply
> memory barriers that you obviously don't need on unlock.

But is it even a valid procedure on i386?

Hugh

^ permalink raw reply	[flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page
  2007-05-13  4:39               ` Hugh Dickins
@ 2007-05-13  6:52                 ` Nick Piggin
  2007-05-16 17:54                   ` Hugh Dickins
  0 siblings, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-05-13 6:52 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Sun, May 13, 2007 at 05:39:03AM +0100, Hugh Dickins wrote:
> On Sun, 13 May 2007, Nick Piggin wrote:
> > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote:
> > >
> > > Hmm, well, I think that's fairly horrid, and would it even be
> > > guaranteed to work on all architectures?  Playing with one char
> > > of an unsigned long in one way, while playing with the whole of
> > > the unsigned long in another way (bitops) sounds very dodgy to me.
> >
> > Of course not, but they can just use a regular atomic word sized
> > bitop. The problem with i386 is that its atomic ops also imply
> > memory barriers that you obviously don't need on unlock.
>
> But is it even a valid procedure on i386?

Well I think so, but not completely sure. OTOH, I admit this is one
of the more contentious speedups ;) It is likely to vary a lot by
the arch (I think the P4 is infamous for expensive locked ops, others
may prefer not to mix the byte sized ops with word length ones).

But that aside, I'd still like to do the lock page in nopage and get
this bug fixed. Now it is possible to fix some other way, eg we could
use another page flag (I'd say it would be better to use that flag for
PG_waiters and speed up all PG_locked users), however I think it is
fine to lock the page over fault. It gets rid of some complexity of
memory ordering there, and we already have to do the
wait_on_page_locked thing to prevent the page_mkclean data loss thingy.

I haven't seen a non-microbenchmark where it hurts.

^ permalink raw reply	[flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-13 6:52 ` Nick Piggin @ 2007-05-16 17:54 ` Hugh Dickins 2007-05-16 18:18 ` Nick Piggin 0 siblings, 1 reply; 34+ messages in thread From: Hugh Dickins @ 2007-05-16 17:54 UTC (permalink / raw) To: Nick Piggin Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Sun, 13 May 2007, Nick Piggin wrote: > On Sun, May 13, 2007 at 05:39:03AM +0100, Hugh Dickins wrote: > > On Sun, 13 May 2007, Nick Piggin wrote: > > > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote: > > > > > > > Hmm, well, I think that's fairly horrid, and would it even be > > > > guaranteed to work on all architectures? Playing with one char > > > > of an unsigned long in one way, while playing with the whole of > > > > the unsigned long in another way (bitops) sounds very dodgy to me. > > > > > > Of course not, but they can just use a regular atomic word sized > > > bitop. The problem with i386 is that its atomic ops also imply > > > memory barriers that you obviously don't need on unlock. > > > > But is it even a valid procedure on i386? > > Well I think so, but not completely sure. That's not quite enough to convince me! I do retract my "fairly horrid" remark, that was a kneejerk reaction to cleverness; it's quite nice, if it can be guaranteed to work (and if lowering FLAGS_RESERVED from 9 to 7 doesn't upset whoever carefully chose 9). Please seek out those guarantees. Like you, I can't really see how it would go wrong (how could moving in the unlocked char mess with the flag bits in the rest of the long? how could atomically modifying the long have a chance of undoing that move?), but it feels like it might take us into errata territory. Hugh > OTOH, I admit this is one > of the more contentious speedups ;) It is likely to be vary a lot by > the arch (I think the P4 is infamous for expensive locked ops, others > may prefer not to mix the byte sized ops with word length ones). ^ permalink raw reply [flat|nested] 34+ messages in thread
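[Aside: the exception Hugh points at. On CONFIG_X86_OOSTORE /
CONFIG_X86_PPRO_FENCE builds, asm-i386/spinlock.h does not trust a plain
store and unlocks with an exchange instead; quoted from memory of the
2.6 tree, so check the file for the exact text:]

	static inline void __raw_spin_unlock(raw_spinlock_t *lock)
	{
		char oldval = 1;

		asm volatile("xchgb %b0, %1"
			     : "=q" (oldval), "+m" (lock->slock)
			     : "0" (oldval) : "memory");
	}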
* Re: [rfc] optimise unlock_page 2007-05-16 17:54 ` Hugh Dickins @ 2007-05-16 18:18 ` Nick Piggin 2007-05-16 19:28 ` Hugh Dickins 0 siblings, 1 reply; 34+ messages in thread From: Nick Piggin @ 2007-05-16 18:18 UTC (permalink / raw) To: Hugh Dickins Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List, Linus Torvalds On Wed, May 16, 2007 at 06:54:15PM +0100, Hugh Dickins wrote: > On Sun, 13 May 2007, Nick Piggin wrote: > > On Sun, May 13, 2007 at 05:39:03AM +0100, Hugh Dickins wrote: > > > On Sun, 13 May 2007, Nick Piggin wrote: > > > > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote: > > > > > > > > > Hmm, well, I think that's fairly horrid, and would it even be > > > > > guaranteed to work on all architectures? Playing with one char > > > > > of an unsigned long in one way, while playing with the whole of > > > > > the unsigned long in another way (bitops) sounds very dodgy to me. > > > > > > > > Of course not, but they can just use a regular atomic word sized > > > > bitop. The problem with i386 is that its atomic ops also imply > > > > memory barriers that you obviously don't need on unlock. > > > > > > But is it even a valid procedure on i386? > > > > Well I think so, but not completely sure. > > That's not quite enough to convince me! I did ask Linus, and he was very sure it works. > I do retract my "fairly horrid" remark, that was a kneejerk reaction > to cleverness; it's quite nice, if it can be guaranteed to work (and > if lowering FLAGS_RESERVED from 9 to 7 doesn't upset whoever carefully > chose 9). Hmm, it is a _little_ bit horrid ;) Maybe slightly clever, but definitely slightly horrid as well! Not so much from a high level view (although it does put more constraints on the flags layout), but from a CPU level... the way we intermix different sized loads and stores can run into store forwarding issues[*] which might be expensive as well. Not to mention that we can't do the non-atomic unlock on all architectures. OTOH, it did _seem_ to eliminate the pagefault regression on my P4 Xeon here, in one round of tests. The other option of moving the bit into ->mapping hopefully avoids all the issues, and would probably be a little faster again on the P4, at the expense of being a more intrusive (but it doesn't look too bad, at first glance)... [*] I did mention to Linus that we might be able to avoid the store forwarding stall by loading just a single byte to test PG_waiters after the byte store to clear PG_locked. At this point he puked. We decided that using another field altogether might be better. > Please seek out those guarantees. Like you, I can't really see how > it would go wrong (how could moving in the unlocked char mess with > the flag bits in the rest of the long? how could atomically modifying > the long have a chance of undoing that move?), but it feels like it > might take us into errata territory. I think we can just rely on the cache coherency protocol taking care of it for us, on x86. movb would not affect other data other than the dest. A non-atomic op _could_ of course undo the movb, but it could likewise undo any other store to the word or byte. An atomic op on the flags does not modify the movb byte so the movb before/after possibilities should look exactly the same regardless of the atomic operations happening. ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-16 18:18 ` Nick Piggin @ 2007-05-16 19:28 ` Hugh Dickins 2007-05-16 19:47 ` Linus Torvalds 0 siblings, 1 reply; 34+ messages in thread From: Hugh Dickins @ 2007-05-16 19:28 UTC (permalink / raw) To: Nick Piggin Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List, Linus Torvalds On Wed, 16 May 2007, Nick Piggin wrote: > On Wed, May 16, 2007 at 06:54:15PM +0100, Hugh Dickins wrote: > > On Sun, 13 May 2007, Nick Piggin wrote: > > > > > > Well I think so, but not completely sure. > > > > That's not quite enough to convince me! > > I did ask Linus, and he was very sure it works. Good, that's very encouraging. > Not so much from a high level view (although it does put more constraints > on the flags layout), but from a CPU level... the way we intermix different > sized loads and stores can run into store forwarding issues[*] which might > be expensive as well. Not to mention that we can't do the non-atomic > unlock on all architectures. Ah yes, that's easier to envisage than an actual correctness problem. > The other option of moving the bit into ->mapping hopefully avoids all > the issues, and would probably be a little faster again on the P4, at the > expense of being a more intrusive (but it doesn't look too bad, at first > glance)... Hmm, I'm so happy with PG_swapcache in there, that I'm reluctant to cede it to your PG_locked, though I can't deny your use should take precedence. Perhaps we could enforce 8-byte alignment of struct address_space and struct anon_vma to make both bits available (along with the anon bit). But I think you may not be appreciating how intrusive PG_locked will be. There are many references to page->mapping (often ->host) throughout fs/ : when we keep anon/swap flags in page->mapping, we know the filesystems will never see those bits set in their pages, so no page_mapping-like conversion is needed; just a few places in common code need to adapt. And given our deprecation discipline for in-kernel interfaces, wouldn't we have to wait a similar period before making page->mapping unavailable to out-of-tree filesystems? > > Please seek out those guarantees. Like you, I can't really see how > > it would go wrong (how could moving in the unlocked char mess with > > the flag bits in the rest of the long? how could atomically modifying > > the long have a chance of undoing that move?), but it feels like it > > might take us into errata territory. > > I think we can just rely on the cache coherency protocol taking care of > it for us, on x86. movb would not affect other data other than the dest. > A non-atomic op _could_ of course undo the movb, but it could likewise > undo any other store to the word or byte. An atomic op on the flags does > not modify the movb byte so the movb before/after possibilities should > look exactly the same regardless of the atomic operations happening. Yes, I've gone through that same thought process (my questions were intended as rhetorical exclamations of inconceivabilty, rather than actual queries). But if you do go that way, I'd still like you to check with Intel and AMD for errata. See include/asm-i386/spinlock.h for the CONFIG_X86_OOSTORE || CONFIG_X86_PPRO_FENCE __raw_spin_unlock using xchgb: doesn't that hint that exceptions may be needed? Hugh ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-16 19:28 ` Hugh Dickins @ 2007-05-16 19:47 ` Linus Torvalds 2007-05-17 6:27 ` Nick Piggin 0 siblings, 1 reply; 34+ messages in thread From: Linus Torvalds @ 2007-05-16 19:47 UTC (permalink / raw) To: Hugh Dickins Cc: Nick Piggin, Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Wed, 16 May 2007, Hugh Dickins wrote: > On Wed, 16 May 2007, Nick Piggin wrote: > > On Wed, May 16, 2007 at 06:54:15PM +0100, Hugh Dickins wrote: > > > On Sun, 13 May 2007, Nick Piggin wrote: > > > > > > > > Well I think so, but not completely sure. > > > > > > That's not quite enough to convince me! > > > > I did ask Linus, and he was very sure it works. > > Good, that's very encouraging. Note that our default spinlocks _depend_ on a bog-standard store just working as an unlock, so this wouldn't even be anything half-way new: static inline void __raw_spin_unlock(raw_spinlock_t *lock) { asm volatile("movb $1,%0" : "+m" (lock->slock) :: "memory"); } There are some Opteron errata (and a really old P6 bug) wrt this, but they are definitely CPU bugs, and we haven't really worked out what the Opteron solution should be (the bug is apparently pretty close to impossible to trigger in practice, so it's not been a high priority). > > The other option of moving the bit into ->mapping hopefully avoids all > > the issues, and would probably be a little faster again on the P4, at the > > expense of being a more intrusive (but it doesn't look too bad, at first > > glance)... > > Hmm, I'm so happy with PG_swapcache in there, that I'm reluctant to > cede it to your PG_locked, though I can't deny your use should take > precedence. Perhaps we could enforce 8-byte alignment of struct > address_space and struct anon_vma to make both bits available > (along with the anon bit). We probably could. It should be easy enough to mark "struct address_space" to be 8-byte aligned. > But I think you may not be appreciating how intrusive PG_locked > will be. There are many references to page->mapping (often ->host) > throughout fs/ : when we keep anon/swap flags in page->mapping, we > know the filesystems will never see those bits set in their pages, > so no page_mapping-like conversion is needed; just a few places in > common code need to adapt. You're right, it could be really painful. We'd have to rename the field, and use some inline function to access it (which masks off the low bits). Linus ^ permalink raw reply [flat|nested] 34+ messages in thread
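[Aside: the masking accessor Linus describes might look like this,
assuming struct address_space is forced to 8-byte alignment so the
bottom three bits of page->mapping are free; the names and bit
assignments here are hypothetical:]

	#define PAGE_MAPPING_BITS	0x7UL	/* e.g. locked, anon, swapcache */

	static inline struct address_space *page_mapping_of(struct page *page)
	{
		return (struct address_space *)
			((unsigned long)page->mapping & ~PAGE_MAPPING_BITS);
	}

[Stores would go through a set_page_mapping() counterpart, and any flag
that changes while the page is in use - PG_locked above all - would need
atomic ops on the word rather than a plain assignment.]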
* Re: [rfc] optimise unlock_page 2007-05-16 19:47 ` Linus Torvalds @ 2007-05-17 6:27 ` Nick Piggin 0 siblings, 0 replies; 34+ messages in thread From: Nick Piggin @ 2007-05-17 6:27 UTC (permalink / raw) To: Linus Torvalds Cc: Hugh Dickins, Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Wed, May 16, 2007 at 12:47:54PM -0700, Linus Torvalds wrote: > > On Wed, 16 May 2007, Hugh Dickins wrote: > > > > The other option of moving the bit into ->mapping hopefully avoids all > > > the issues, and would probably be a little faster again on the P4, at the > > > expense of being a more intrusive (but it doesn't look too bad, at first > > > glance)... > > > > Hmm, I'm so happy with PG_swapcache in there, that I'm reluctant to > > cede it to your PG_locked, though I can't deny your use should take > > precedence. Perhaps we could enforce 8-byte alignment of struct > > address_space and struct anon_vma to make both bits available > > (along with the anon bit). > > We probably could. It should be easy enough to mark "struct address_space" > to be 8-byte aligned. Yeah, it might be worthwhile, because I agree that PG_swapcache would work nicely there too. > > But I think you may not be appreciating how intrusive PG_locked > > will be. There are many references to page->mapping (often ->host) > > throughout fs/ : when we keep anon/swap flags in page->mapping, we > > know the filesystems will never see those bits set in their pages, > > so no page_mapping-like conversion is needed; just a few places in > > common code need to adapt. > > You're right, it could be really painful. We'd have to rename the field, > and use some inline function to access it (which masks off the low bits). Yeah, I realise that the change is intrusive in terms of lines touched, but AFAIKS, it should not be much more complex than a search/replace... As far as deprecating things goes... I don't think we have to wait too long, its more for features, drivers, or more fundamental APIs isn't it? If we just point out that one must use set_page_mapping/page_mapping rather than page->mapping, it is trivial to fix any out of tree breakage. ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-13 3:32 ` Nick Piggin 2007-05-13 4:39 ` Hugh Dickins @ 2007-05-16 17:21 ` Hugh Dickins 2007-05-16 17:38 ` Nick Piggin 1 sibling, 1 reply; 34+ messages in thread From: Hugh Dickins @ 2007-05-16 17:21 UTC (permalink / raw) To: Nick Piggin Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Sun, 13 May 2007, Nick Piggin wrote: > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote: > > > But again I wonder just what the gain has been, once your double > > unmap_mapping_range is factored in. When I suggested before that > > perhaps the double (well, treble including the one in truncate.c) > > unmap_mapping_range might solve the problem you set out to solve > > (I've lost sight of that!) without pagelock when faulting, you said: > > > > > Well aside from being terribly ugly, it means we can still drop > > > the dirty bit where we'd otherwise rather not, so I don't think > > > we can do that. > > > > but that didn't give me enough information to agree or disagree. > > Oh, well invalidate wants to be able to skip dirty pages or have the > filesystem do something special with them first. Once you have taken > the page out of the pagecache but still mapped shared, then blowing > it away doesn't actually solve the data loss problem... only makes > the window of VM inconsistency smaller. Right, I think I see what you mean now, thanks: userspace must not for a moment be allowed to write to orphaned pages. Whereas it's not an issue for the privately COWed pages you added the second unmap_mapping_range for: because it's only truncation that has to worry about them, so they're heading for SIGBUS anyway. Yes, and the page_mapped tests in mm/truncate.c are just racy heuristics without the page lock you now put into faulting. Hugh ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-16 17:21 ` Hugh Dickins @ 2007-05-16 17:38 ` Nick Piggin 0 siblings, 0 replies; 34+ messages in thread From: Nick Piggin @ 2007-05-16 17:38 UTC (permalink / raw) To: Hugh Dickins Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Wed, May 16, 2007 at 06:21:09PM +0100, Hugh Dickins wrote: > On Sun, 13 May 2007, Nick Piggin wrote: > > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote: > > > > > But again I wonder just what the gain has been, once your double > > > unmap_mapping_range is factored in. When I suggested before that > > > perhaps the double (well, treble including the one in truncate.c) > > > unmap_mapping_range might solve the problem you set out to solve > > > (I've lost sight of that!) without pagelock when faulting, you said: > > > > > > > Well aside from being terribly ugly, it means we can still drop > > > > the dirty bit where we'd otherwise rather not, so I don't think > > > > we can do that. > > > > > > but that didn't give me enough information to agree or disagree. > > > > Oh, well invalidate wants to be able to skip dirty pages or have the > > filesystem do something special with them first. Once you have taken > > the page out of the pagecache but still mapped shared, then blowing > > it away doesn't actually solve the data loss problem... only makes > > the window of VM inconsistency smaller. > > Right, I think I see what you mean now, thanks: userspace > must not for a moment be allowed to write to orphaned pages. Yep. > Whereas it's not an issue for the privately COWed pages you added > the second unmap_mapping_range for: because it's only truncation > that has to worry about them, so they're heading for SIGBUS anyway. > > Yes, and the page_mapped tests in mm/truncate.c are just racy > heuristics without the page lock you now put into faulting. Yes. ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] lock bitops
  2007-05-08 11:37 [rfc] lock bitops Nick Piggin
  2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
@ 2007-05-08 12:22 ` David Howells
  2007-05-08 22:33   ` Nick Piggin
  2007-05-09  6:18   ` Nick Piggin
  2007-05-08 15:06 ` Matthew Wilcox
  2007-05-09 12:08 ` Nikita Danilov
  3 siblings, 2 replies; 34+ messages in thread
From: David Howells @ 2007-05-08 12:22 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

Nick Piggin <npiggin@suse.de> wrote:

> This patch (along with the subsequent one to optimise unlock_page) reduces
> the overhead of lock_page/unlock_page (measured with page faults and a patch
> to lock the page in the fault handler) by about 425 cycles on my 2-way G5.

Seems reasonable, though test_and_set_lock_bit() might be a better name.

> +There are two special bitops with lock barrier semantics (acquire/release,
> +same as spinlocks).

You should update Documentation/memory-barriers.txt also.

> #define TestSetPageLocked(page) \
> 	test_and_set_bit(PG_locked, &(page)->flags)
> +#define TestSetPageLocked_Lock(page) \
> +	test_and_set_bit_lock(PG_locked, &(page)->flags)

Can we get away with just moving TestSetPageLocked() to the new function
rather than adding another accessor?  Or how about LockPageLocked() and
UnlockPageLocked() rather than SetPageLocked_Lock()?  That last looks
wrong somehow.

The FRV changes look reasonable, btw.

David

^ permalink raw reply	[flat|nested] 34+ messages in thread
* Re: [rfc] lock bitops 2007-05-08 12:22 ` [rfc] lock bitops David Howells @ 2007-05-08 22:33 ` Nick Piggin 2007-05-09 6:18 ` Nick Piggin 1 sibling, 0 replies; 34+ messages in thread From: Nick Piggin @ 2007-05-08 22:33 UTC (permalink / raw) To: David Howells Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton, Linux Kernel Mailing List On Tue, May 08, 2007 at 01:22:56PM +0100, David Howells wrote: > Nick Piggin <npiggin@suse.de> wrote: > > > This patch (along with the subsequent one to optimise unlock_page) reduces > > the overhead of lock_page/unlock_page (measured with page faults and a patch > > to lock the page in the fault handler) by about 425 cycles on my 2-way G5. > > Seems reasonable, though test_and_set_lock_bit() might be a better name. The postfix attaches lock semantics to the test_and_set_bit operation, so I don't think it is necessarily wrong to have this name. But it doesn't matter too much I guess. > > +There are two special bitops with lock barrier semantics (acquire/release, > > +same as spinlocks). > > You should update Documentation/memory-barriers.txt also. Will do. > > #define TestSetPageLocked(page) \ > > test_and_set_bit(PG_locked, &(page)->flags) > > +#define TestSetPageLocked_Lock(page) \ > > + test_and_set_bit_lock(PG_locked, &(page)->flags) > > Can we get away with just moving TestSetPageLocked() to the new function > rather than adding another accessor? Or how about LockPageLocked() and > UnlockPageLocked() rather than SetPageLocked_Lock() that last looks wrong > somehow. The problem is that it implies some semantics that it may not have. Possibly better is to just remove them all and use only trylock/lock/unlock_page. ^ permalink raw reply [flat|nested] 34+ messages in thread
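[Aside: for architectures that take the asm-generic/bitops/lock.h
fallback listed in the diffstat of the repost below, the two new
operations can be little more than the existing atomics plus release
ordering. A sketch of the idea, not the exact file text:]

	/*
	 * test_and_set_bit() already implies a full barrier, so it is
	 * acquire-safe as it stands.
	 */
	#define test_and_set_bit_lock(nr, addr)	test_and_set_bit(nr, addr)

	/* order all prior stores before the releasing clear */
	#define clear_bit_unlock(nr, addr)		\
	do {						\
		smp_mb__before_clear_bit();		\
		clear_bit(nr, addr);			\
	} while (0)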
* Re: [rfc] lock bitops 2007-05-08 12:22 ` [rfc] lock bitops David Howells 2007-05-08 22:33 ` Nick Piggin @ 2007-05-09 6:18 ` Nick Piggin 1 sibling, 0 replies; 34+ messages in thread From: Nick Piggin @ 2007-05-09 6:18 UTC (permalink / raw) To: David Howells Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton, Linux Kernel Mailing List On Tue, May 08, 2007 at 01:22:56PM +0100, David Howells wrote: > Nick Piggin <npiggin@suse.de> wrote: > > > +There are two special bitops with lock barrier semantics (acquire/release, > > +same as spinlocks). > > You should update Documentation/memory-barriers.txt also. > > > #define TestSetPageLocked(page) \ > > test_and_set_bit(PG_locked, &(page)->flags) > > +#define TestSetPageLocked_Lock(page) \ > > + test_and_set_bit_lock(PG_locked, &(page)->flags) > > Can we get away with just moving TestSetPageLocked() to the new function > rather than adding another accessor? Or how about LockPageLocked() and > UnlockPageLocked() rather than SetPageLocked_Lock() that last looks wrong Updated memory-barriers.txt, made the generic routines #defines because smp_mb isn't always included early enough, and culled some of the page- flags.h macros in favour of coding the lock operations directly. -- Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics. Add non-trivial implementations for powerpc and ia64. Convert page lock, buffer lock, bit_spin_lock, tasklet locks to use the new locks. --- Documentation/atomic_ops.txt | 9 +++++++ Documentation/memory-barriers.txt | 14 ++++++++++-- drivers/scsi/sg.c | 2 - fs/buffer.c | 7 ++---- fs/cifs/file.c | 2 - fs/jbd/commit.c | 4 +-- fs/jbd2/commit.c | 4 +-- fs/ntfs/aops.c | 2 - fs/ntfs/compress.c | 2 - fs/ntfs/mft.c | 4 +-- fs/reiserfs/inode.c | 2 - fs/reiserfs/journal.c | 4 +-- fs/xfs/linux-2.6/xfs_aops.c | 4 +-- include/asm-alpha/bitops.h | 1 include/asm-arm/bitops.h | 1 include/asm-arm26/bitops.h | 1 include/asm-avr32/bitops.h | 1 include/asm-cris/bitops.h | 1 include/asm-frv/bitops.h | 1 include/asm-generic/bitops.h | 1 include/asm-generic/bitops/lock.h | 13 +++++++++++ include/asm-h8300/bitops.h | 1 include/asm-i386/bitops.h | 1 include/asm-ia64/bitops.h | 33 ++++++++++++++++++++++++++++ include/asm-m32r/bitops.h | 1 include/asm-m68k/bitops.h | 1 include/asm-m68knommu/bitops.h | 1 include/asm-mips/bitops.h | 1 include/asm-parisc/bitops.h | 1 include/asm-powerpc/bitops.h | 44 ++++++++++++++++++++++++++++++++++++++ include/asm-s390/bitops.h | 1 include/asm-sh/bitops.h | 1 include/asm-sh64/bitops.h | 1 include/asm-sparc/bitops.h | 1 include/asm-sparc64/bitops.h | 1 include/asm-v850/bitops.h | 1 include/asm-x86_64/bitops.h | 1 include/asm-xtensa/bitops.h | 1 include/linux/bit_spinlock.h | 11 +++++---- include/linux/buffer_head.h | 8 +++++- include/linux/interrupt.h | 5 +--- include/linux/page-flags.h | 6 ----- include/linux/pagemap.h | 9 ++++++- kernel/wait.c | 2 - mm/filemap.c | 17 ++++++-------- mm/memory.c | 2 - mm/migrate.c | 4 +-- mm/rmap.c | 2 - mm/shmem.c | 4 +-- mm/swap.c | 2 - mm/swap_state.c | 2 - mm/swapfile.c | 2 - mm/truncate.c | 4 +-- mm/vmscan.c | 4 +-- 54 files changed, 193 insertions(+), 63 deletions(-) Index: linux-2.6/include/asm-powerpc/bitops.h =================================================================== --- linux-2.6.orig/include/asm-powerpc/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-powerpc/bitops.h 2007-05-09 14:00:42.000000000 +1000 @@ -87,6 +87,24 @@ : "cc" ); } +static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr) +{ + 
unsigned long old; + unsigned long mask = BITOP_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + + __asm__ __volatile__( + LWSYNC_ON_SMP +"1:" PPC_LLARX "%0,0,%3 # clear_bit_unlock\n" + "andc %0,%0,%2\n" + PPC405_ERR77(0,%3) + PPC_STLCX "%0,0,%3\n" + "bne- 1b" + : "=&r" (old), "+m" (*p) + : "r" (mask), "r" (p) + : "cc" ); +} + static __inline__ void change_bit(int nr, volatile unsigned long *addr) { unsigned long old; @@ -126,6 +144,27 @@ return (old & mask) != 0; } +static __inline__ int test_and_set_bit_lock(unsigned long nr, + volatile unsigned long *addr) +{ + unsigned long old, t; + unsigned long mask = BITOP_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + + __asm__ __volatile__( +"1:" PPC_LLARX "%0,0,%3 # test_and_set_bit_lock\n" + "or %1,%0,%2 \n" + PPC405_ERR77(0,%3) + PPC_STLCX "%1,0,%3 \n" + "bne- 1b" + ISYNC_ON_SMP + : "=&r" (old), "=&r" (t) + : "r" (mask), "r" (p) + : "cc", "memory"); + + return (old & mask) != 0; +} + static __inline__ int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { Index: linux-2.6/include/linux/pagemap.h =================================================================== --- linux-2.6.orig/include/linux/pagemap.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/linux/pagemap.h 2007-05-09 13:57:33.000000000 +1000 @@ -135,13 +135,18 @@ extern void FASTCALL(__lock_page_nosync(struct page *page)); extern void FASTCALL(unlock_page(struct page *page)); +static inline int trylock_page(struct page *page) +{ + return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); +} + /* * lock_page may only be called if we have the page's inode pinned. */ static inline void lock_page(struct page *page) { might_sleep(); - if (TestSetPageLocked(page)) + if (!trylock_page(page)) __lock_page(page); } @@ -152,7 +157,7 @@ static inline void lock_page_nosync(struct page *page) { might_sleep(); - if (TestSetPageLocked(page)) + if (!trylock_page(page)) __lock_page_nosync(page); } Index: linux-2.6/drivers/scsi/sg.c =================================================================== --- linux-2.6.orig/drivers/scsi/sg.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/drivers/scsi/sg.c 2007-05-09 13:57:33.000000000 +1000 @@ -1734,7 +1734,7 @@ */ flush_dcache_page(pages[i]); /* ?? Is locking needed? I don't think so */ - /* if (TestSetPageLocked(pages[i])) + /* if (!trylock_page(pages[i])) goto out_unlock; */ } Index: linux-2.6/fs/cifs/file.c =================================================================== --- linux-2.6.orig/fs/cifs/file.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/cifs/file.c 2007-05-09 13:57:33.000000000 +1000 @@ -1229,7 +1229,7 @@ if (first < 0) lock_page(page); - else if (TestSetPageLocked(page)) + else if (!trylock_page(page)) break; if (unlikely(page->mapping != mapping)) { Index: linux-2.6/fs/jbd/commit.c =================================================================== --- linux-2.6.orig/fs/jbd/commit.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/jbd/commit.c 2007-05-09 13:57:33.000000000 +1000 @@ -64,7 +64,7 @@ goto nope; /* OK, it's a truncated page */ - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto nope; page_cache_get(page); @@ -209,7 +209,7 @@ * blocking lock_buffer(). 
*/ if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); /* Write out all data to prevent deadlocks */ Index: linux-2.6/fs/jbd2/commit.c =================================================================== --- linux-2.6.orig/fs/jbd2/commit.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/jbd2/commit.c 2007-05-09 13:57:33.000000000 +1000 @@ -64,7 +64,7 @@ goto nope; /* OK, it's a truncated page */ - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto nope; page_cache_get(page); @@ -209,7 +209,7 @@ * blocking lock_buffer(). */ if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); /* Write out all data to prevent deadlocks */ Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c =================================================================== --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c 2007-05-09 13:57:33.000000000 +1000 @@ -601,7 +601,7 @@ } else pg_offset = PAGE_CACHE_SIZE; - if (page->index == tindex && !TestSetPageLocked(page)) { + if (page->index == tindex && trylock_page(page)) { len = xfs_probe_page(page, pg_offset, mapped); unlock_page(page); } @@ -685,7 +685,7 @@ if (page->index != tindex) goto fail; - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto fail; if (PageWriteback(page)) goto fail_unlock_page; Index: linux-2.6/include/linux/page-flags.h =================================================================== --- linux-2.6.orig/include/linux/page-flags.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/linux/page-flags.h 2007-05-09 13:57:33.000000000 +1000 @@ -112,12 +112,6 @@ test_bit(PG_locked, &(page)->flags) #define SetPageLocked(page) \ set_bit(PG_locked, &(page)->flags) -#define TestSetPageLocked(page) \ - test_and_set_bit(PG_locked, &(page)->flags) -#define ClearPageLocked(page) \ - clear_bit(PG_locked, &(page)->flags) -#define TestClearPageLocked(page) \ - test_and_clear_bit(PG_locked, &(page)->flags) #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) Index: linux-2.6/mm/memory.c =================================================================== --- linux-2.6.orig/mm/memory.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/memory.c 2007-05-09 13:57:33.000000000 +1000 @@ -1550,7 +1550,7 @@ * not dirty accountable. */ if (PageAnon(old_page)) { - if (!TestSetPageLocked(old_page)) { + if (trylock_page(old_page)) { reuse = can_share_swap_page(old_page); unlock_page(old_page); } Index: linux-2.6/mm/migrate.c =================================================================== --- linux-2.6.orig/mm/migrate.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/migrate.c 2007-05-09 13:57:33.000000000 +1000 @@ -569,7 +569,7 @@ * establishing additional references. We are the only one * holding a reference to the new page at this point. 
*/ - if (TestSetPageLocked(newpage)) + if (!trylock_page(newpage)) BUG(); /* Prepare mapping for the new page.*/ @@ -621,7 +621,7 @@ goto move_newpage; rc = -EAGAIN; - if (TestSetPageLocked(page)) { + if (!trylock_page(page)) { if (!force) goto move_newpage; lock_page(page); Index: linux-2.6/mm/rmap.c =================================================================== --- linux-2.6.orig/mm/rmap.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/rmap.c 2007-05-09 13:57:33.000000000 +1000 @@ -426,7 +426,7 @@ referenced += page_referenced_anon(page); else if (is_locked) referenced += page_referenced_file(page); - else if (TestSetPageLocked(page)) + else if (!trylock_page(page)) referenced++; else { if (page->mapping) Index: linux-2.6/mm/shmem.c =================================================================== --- linux-2.6.orig/mm/shmem.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/shmem.c 2007-05-09 13:57:33.000000000 +1000 @@ -1154,7 +1154,7 @@ } /* We have to do this with page locked to prevent races */ - if (TestSetPageLocked(swappage)) { + if (!trylock_page(swappage)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); wait_on_page_locked(swappage); @@ -1213,7 +1213,7 @@ shmem_swp_unmap(entry); filepage = find_get_page(mapping, idx); if (filepage && - (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { + (!PageUptodate(filepage) || !trylock_page(filepage))) { spin_unlock(&info->lock); wait_on_page_locked(filepage); page_cache_release(filepage); Index: linux-2.6/mm/swap.c =================================================================== --- linux-2.6.orig/mm/swap.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/swap.c 2007-05-09 13:57:33.000000000 +1000 @@ -412,7 +412,7 @@ for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (PagePrivate(page) && !TestSetPageLocked(page)) { + if (PagePrivate(page) && trylock_page(page)) { if (PagePrivate(page)) try_to_release_page(page, 0); unlock_page(page); Index: linux-2.6/mm/swap_state.c =================================================================== --- linux-2.6.orig/mm/swap_state.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/swap_state.c 2007-05-09 13:57:33.000000000 +1000 @@ -252,7 +252,7 @@ */ static inline void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && !TestSetPageLocked(page)) { + if (PageSwapCache(page) && trylock_page(page)) { remove_exclusive_swap_page(page); unlock_page(page); } Index: linux-2.6/mm/swapfile.c =================================================================== --- linux-2.6.orig/mm/swapfile.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/swapfile.c 2007-05-09 13:57:33.000000000 +1000 @@ -401,7 +401,7 @@ if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { page = find_get_page(&swapper_space, entry.val); - if (page && unlikely(TestSetPageLocked(page))) { + if (page && unlikely(!trylock_page(page))) { page_cache_release(page); page = NULL; } Index: linux-2.6/mm/truncate.c =================================================================== --- linux-2.6.orig/mm/truncate.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/truncate.c 2007-05-09 13:57:33.000000000 +1000 @@ -185,7 +185,7 @@ if (page_index > next) next = page_index; next++; - if (TestSetPageLocked(page)) + if (!trylock_page(page)) continue; if (PageWriteback(page)) { unlock_page(page); @@ -281,7 +281,7 @@ pgoff_t index; int lock_failed; - lock_failed = TestSetPageLocked(page); + lock_failed = !trylock_page(page); /* * We really shouldn't be looking at 
the ->index of an Index: linux-2.6/mm/vmscan.c =================================================================== --- linux-2.6.orig/mm/vmscan.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/vmscan.c 2007-05-09 13:57:33.000000000 +1000 @@ -466,7 +466,7 @@ page = lru_to_page(page_list); list_del(&page->lru); - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto keep; VM_BUG_ON(PageActive(page)); @@ -538,7 +538,7 @@ * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. */ - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto keep; if (PageDirty(page) || PageWriteback(page)) goto keep_locked; Index: linux-2.6/include/linux/bit_spinlock.h =================================================================== --- linux-2.6.orig/include/linux/bit_spinlock.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/linux/bit_spinlock.h 2007-05-09 13:57:33.000000000 +1000 @@ -18,7 +18,7 @@ */ preempt_disable(); #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) - while (test_and_set_bit(bitnum, addr)) { + while (unlikely(test_and_set_bit_lock(bitnum, addr))) { while (test_bit(bitnum, addr)) { preempt_enable(); cpu_relax(); @@ -36,7 +36,7 @@ { preempt_disable(); #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) - if (test_and_set_bit(bitnum, addr)) { + if (test_and_set_bit_lock(bitnum, addr)) { preempt_enable(); return 0; } @@ -50,10 +50,11 @@ */ static inline void bit_spin_unlock(int bitnum, unsigned long *addr) { -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +#ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); - smp_mb__before_clear_bit(); - clear_bit(bitnum, addr); +#endif +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + clear_bit_unlock(bitnum, addr); #endif preempt_enable(); __release(bitlock); Index: linux-2.6/mm/filemap.c =================================================================== --- linux-2.6.orig/mm/filemap.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/filemap.c 2007-05-09 13:57:33.000000000 +1000 @@ -525,17 +525,14 @@ * mechananism between PageLocked pages and PageWriteback pages is shared. * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. * - * The first mb is necessary to safely close the critical section opened by the - * TestSetPageLocked(), the second mb is necessary to enforce ordering between - * the clear_bit and the read of the waitqueue (to avoid SMP races with a - * parallel wait_on_page_locked()). + * The mb is necessary to enforce ordering between the clear_bit and the read + * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). 
*/ void fastcall unlock_page(struct page *page) { - smp_mb__before_clear_bit(); - if (!TestClearPageLocked(page)) - BUG(); - smp_mb__after_clear_bit(); + VM_BUG_ON(!PageLocked(page)); + clear_bit_unlock(PG_locked, &page->flags); + smp_mb__after_clear_bit(); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -625,7 +622,7 @@ page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); - if (TestSetPageLocked(page)) { + if (!trylock_page(page)) { read_unlock_irq(&mapping->tree_lock); __lock_page(page); read_lock_irq(&mapping->tree_lock); @@ -798,7 +795,7 @@ struct page *page = find_get_page(mapping, index); if (page) { - if (!TestSetPageLocked(page)) + if (trylock_page(page)) return page; page_cache_release(page); return NULL; Index: linux-2.6/include/asm-alpha/bitops.h =================================================================== --- linux-2.6.orig/include/asm-alpha/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-alpha/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -359,6 +359,7 @@ #else #include <asm-generic/bitops/hweight.h> #endif +#include <asm-generic/bitops/lock.h> #endif /* __KERNEL__ */ Index: linux-2.6/include/asm-arm/bitops.h =================================================================== --- linux-2.6.orig/include/asm-arm/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-arm/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -286,6 +286,7 @@ #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> /* * Ext2 is defined to use little-endian byte ordering. Index: linux-2.6/include/asm-arm26/bitops.h =================================================================== --- linux-2.6.orig/include/asm-arm26/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-arm26/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -167,6 +167,7 @@ #include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> /* * Ext2 is defined to use little-endian byte ordering. 
Index: linux-2.6/include/asm-avr32/bitops.h =================================================================== --- linux-2.6.orig/include/asm-avr32/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-avr32/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -288,6 +288,7 @@ #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> #include <asm-generic/bitops/ext2-atomic.h> Index: linux-2.6/include/asm-cris/bitops.h =================================================================== --- linux-2.6.orig/include/asm-cris/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-cris/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -154,6 +154,7 @@ #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/hweight.h> #include <asm-generic/bitops/find.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> Index: linux-2.6/include/asm-frv/bitops.h =================================================================== --- linux-2.6.orig/include/asm-frv/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-frv/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -302,6 +302,7 @@ #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> Index: linux-2.6/include/asm-generic/bitops.h =================================================================== --- linux-2.6.orig/include/asm-generic/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-generic/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -22,6 +22,7 @@ #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> #include <asm-generic/bitops/ext2-atomic.h> Index: linux-2.6/include/asm-generic/bitops/lock.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6/include/asm-generic/bitops/lock.h 2007-05-09 13:57:33.000000000 +1000 @@ -0,0 +1,13 @@ +#ifndef _ASM_GENERIC_BITOPS_LOCK_H_ +#define _ASM_GENERIC_BITOPS_LOCK_H_ + +#define test_and_set_bit_lock(nr, addr) test_and_set_bit(nr, addr) + +#define clear_bit_unlock(nr, addr) \ +do { \ + smp_mb__before_clear_bit(); \ + clear_bit(nr, addr); \ +} while (0) + +#endif /* _ASM_GENERIC_BITOPS_LOCK_H_ */ + Index: linux-2.6/include/asm-h8300/bitops.h =================================================================== --- linux-2.6.orig/include/asm-h8300/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-h8300/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -194,6 +194,7 @@ #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> #include <asm-generic/bitops/ext2-atomic.h> #include <asm-generic/bitops/minix.h> Index: linux-2.6/include/asm-i386/bitops.h =================================================================== --- linux-2.6.orig/include/asm-i386/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-i386/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -402,6 +402,7 @@ } #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #endif /* __KERNEL__ 
*/ Index: linux-2.6/include/asm-ia64/bitops.h =================================================================== --- linux-2.6.orig/include/asm-ia64/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-ia64/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -94,6 +94,30 @@ } /** + * clear_bit_unlock - Clears a bit in memory with release + * @nr: Bit to clear + * @addr: Address to start counting from + * + * clear_bit() is atomic and may not be reordered. It does + * contain a memory barrier suitable for unlock type operations. + */ +static __inline__ void +clear_bit_unlock (int nr, volatile void *addr) +{ + __u32 mask, old, new; + volatile __u32 *m; + CMPXCHG_BUGCHECK_DECL + + m = (volatile __u32 *) addr + (nr >> 5); + mask = ~(1 << (nr & 31)); + do { + CMPXCHG_BUGCHECK(m); + old = *m; + new = old & mask; + } while (cmpxchg_rel(m, old, new) != old); +} + +/** * __clear_bit - Clears a bit in memory (non-atomic version) */ static __inline__ void @@ -170,6 +194,15 @@ } /** + * test_and_set_bit_lock - Set a bit and return its old value for lock + * @nr: Bit to set + * @addr: Address to count from + * + * This is the same as test_and_set_bit on ia64 + */ +#define test_and_set_bit_lock test_and_set_bit + +/** * __test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from Index: linux-2.6/include/asm-m32r/bitops.h =================================================================== --- linux-2.6.orig/include/asm-m32r/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-m32r/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -255,6 +255,7 @@ #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #endif /* __KERNEL__ */ Index: linux-2.6/include/asm-m68k/bitops.h =================================================================== --- linux-2.6.orig/include/asm-m68k/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-m68k/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -314,6 +314,7 @@ #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> /* Bitmap functions for the minix filesystem */ Index: linux-2.6/include/asm-m68knommu/bitops.h =================================================================== --- linux-2.6.orig/include/asm-m68knommu/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-m68knommu/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -160,6 +160,7 @@ #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> static __inline__ int ext2_set_bit(int nr, volatile void * addr) { Index: linux-2.6/include/asm-mips/bitops.h =================================================================== --- linux-2.6.orig/include/asm-mips/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-mips/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -569,6 +569,7 @@ #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> #include <asm-generic/bitops/ext2-atomic.h> #include <asm-generic/bitops/minix.h> Index: linux-2.6/include/asm-parisc/bitops.h =================================================================== --- linux-2.6.orig/include/asm-parisc/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ 
linux-2.6/include/asm-parisc/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -208,6 +208,7 @@ #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/sched.h> #endif /* __KERNEL__ */ Index: linux-2.6/include/asm-s390/bitops.h =================================================================== --- linux-2.6.orig/include/asm-s390/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-s390/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -746,6 +746,7 @@ #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> /* * ATTENTION: intel byte ordering convention for ext2 and minix !! Index: linux-2.6/include/asm-sh/bitops.h =================================================================== --- linux-2.6.orig/include/asm-sh/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-sh/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -137,6 +137,7 @@ #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/ext2-non-atomic.h> #include <asm-generic/bitops/ext2-atomic.h> Index: linux-2.6/include/asm-sh64/bitops.h =================================================================== --- linux-2.6.orig/include/asm-sh64/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-sh64/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -136,6 +136,7 @@ #include <asm-generic/bitops/__ffs.h> #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/ext2-non-atomic.h> Index: linux-2.6/include/asm-sparc/bitops.h =================================================================== --- linux-2.6.orig/include/asm-sparc/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-sparc/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -96,6 +96,7 @@ #include <asm-generic/bitops/fls.h> #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/ext2-non-atomic.h> #include <asm-generic/bitops/ext2-atomic.h> Index: linux-2.6/include/asm-sparc64/bitops.h =================================================================== --- linux-2.6.orig/include/asm-sparc64/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-sparc64/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -81,6 +81,7 @@ #include <asm-generic/bitops/hweight.h> #endif +#include <asm-generic/bitops/lock.h> #endif /* __KERNEL__ */ #include <asm-generic/bitops/find.h> Index: linux-2.6/include/asm-v850/bitops.h =================================================================== --- linux-2.6.orig/include/asm-v850/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-v850/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -145,6 +145,7 @@ #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> #define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a) Index: linux-2.6/include/asm-x86_64/bitops.h =================================================================== --- 
linux-2.6.orig/include/asm-x86_64/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-x86_64/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -408,6 +408,7 @@ #define ARCH_HAS_FAST_MULTIPLIER 1 #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #endif /* __KERNEL__ */ Index: linux-2.6/include/asm-xtensa/bitops.h =================================================================== --- linux-2.6.orig/include/asm-xtensa/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-xtensa/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -115,6 +115,7 @@ #endif #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/minix.h> Index: linux-2.6/fs/buffer.c =================================================================== --- linux-2.6.orig/fs/buffer.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/buffer.c 2007-05-09 13:57:33.000000000 +1000 @@ -78,8 +78,7 @@ void fastcall unlock_buffer(struct buffer_head *bh) { - smp_mb__before_clear_bit(); - clear_buffer_locked(bh); + clear_bit_unlock(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); wake_up_bit(&bh->b_state, BH_Lock); } @@ -1664,7 +1663,7 @@ */ if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { lock_buffer(bh); - } else if (test_set_buffer_locked(bh)) { + } else if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); continue; } @@ -2719,7 +2718,7 @@ if (rw == SWRITE) lock_buffer(bh); - else if (test_set_buffer_locked(bh)) + else if (!trylock_buffer(bh)) continue; if (rw == WRITE || rw == SWRITE) { Index: linux-2.6/include/linux/buffer_head.h =================================================================== --- linux-2.6.orig/include/linux/buffer_head.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/linux/buffer_head.h 2007-05-09 13:57:33.000000000 +1000 @@ -115,7 +115,6 @@ BUFFER_FNS(Dirty, dirty) TAS_BUFFER_FNS(Dirty, dirty) BUFFER_FNS(Lock, locked) -TAS_BUFFER_FNS(Lock, locked) BUFFER_FNS(Req, req) TAS_BUFFER_FNS(Req, req) BUFFER_FNS(Mapped, mapped) @@ -301,10 +300,15 @@ __wait_on_buffer(bh); } +static inline int trylock_buffer(struct buffer_head *bh) +{ + return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state)); +} + static inline void lock_buffer(struct buffer_head *bh) { might_sleep(); - if (test_set_buffer_locked(bh)) + if (!trylock_buffer(bh)) __lock_buffer(bh); } Index: linux-2.6/include/linux/interrupt.h =================================================================== --- linux-2.6.orig/include/linux/interrupt.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/linux/interrupt.h 2007-05-09 13:57:33.000000000 +1000 @@ -310,13 +310,12 @@ #ifdef CONFIG_SMP static inline int tasklet_trylock(struct tasklet_struct *t) { - return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); + return !test_and_set_bit_lock(TASKLET_STATE_RUN, &(t)->state); } static inline void tasklet_unlock(struct tasklet_struct *t) { - smp_mb__before_clear_bit(); - clear_bit(TASKLET_STATE_RUN, &(t)->state); + clear_bit_unlock(TASKLET_STATE_RUN, &(t)->state); } static inline void tasklet_unlock_wait(struct tasklet_struct *t) Index: linux-2.6/kernel/wait.c =================================================================== --- linux-2.6.orig/kernel/wait.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/kernel/wait.c 2007-05-09 13:57:33.000000000 +1000 @@ -195,7 +195,7 @@ if ((ret = (*action)(q->key.flags))) break; } - } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); + 
} while (test_and_set_bit_lock(q->key.bit_nr, q->key.flags)); finish_wait(wq, &q->wait); return ret; } Index: linux-2.6/Documentation/atomic_ops.txt =================================================================== --- linux-2.6.orig/Documentation/atomic_ops.txt 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/Documentation/atomic_ops.txt 2007-05-09 13:57:33.000000000 +1000 @@ -369,6 +369,15 @@ */ smp_mb__after_clear_bit(); +There are two special bitops with lock barrier semantics (acquire/release, +same as spinlocks). These operate in the same way as their non-_lock/unlock +postfixed variants, except that they are to provide acquire/release semantics, +respectively. This means they can be used for bit_spin_trylock and +bit_spin_unlock type operations without specifying any more barriers. + + int test_and_set_bit_lock(unsigned long nr, unsigned long *addr); + void clear_bit_unlock(unsigned long nr, unsigned long *addr); + Finally, there are non-atomic versions of the bitmask operations provided. They are used in contexts where some other higher-level SMP locking scheme is being used to protect the bitmask, and thus less Index: linux-2.6/fs/ntfs/aops.c =================================================================== --- linux-2.6.orig/fs/ntfs/aops.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/ntfs/aops.c 2007-05-09 13:57:33.000000000 +1000 @@ -1199,7 +1199,7 @@ tbh = bhs[i]; if (!tbh) continue; - if (unlikely(test_set_buffer_locked(tbh))) + if (!trylock_buffer(tbh)) BUG(); /* The buffer dirty state is now irrelevant, just clean it. */ clear_buffer_dirty(tbh); Index: linux-2.6/fs/ntfs/compress.c =================================================================== --- linux-2.6.orig/fs/ntfs/compress.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/ntfs/compress.c 2007-05-09 13:57:33.000000000 +1000 @@ -655,7 +655,7 @@ for (i = 0; i < nr_bhs; i++) { struct buffer_head *tbh = bhs[i]; - if (unlikely(test_set_buffer_locked(tbh))) + if (!trylock_buffer(tbh)) continue; if (unlikely(buffer_uptodate(tbh))) { unlock_buffer(tbh); Index: linux-2.6/fs/ntfs/mft.c =================================================================== --- linux-2.6.orig/fs/ntfs/mft.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/ntfs/mft.c 2007-05-09 13:57:33.000000000 +1000 @@ -586,7 +586,7 @@ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; - if (unlikely(test_set_buffer_locked(tbh))) + if (!trylock_buffer(tbh)) BUG(); BUG_ON(!buffer_uptodate(tbh)); clear_buffer_dirty(tbh); @@ -779,7 +779,7 @@ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; - if (unlikely(test_set_buffer_locked(tbh))) + if (!trylock_buffer(tbh)) BUG(); BUG_ON(!buffer_uptodate(tbh)); clear_buffer_dirty(tbh); Index: linux-2.6/fs/reiserfs/inode.c =================================================================== --- linux-2.6.orig/fs/reiserfs/inode.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/reiserfs/inode.c 2007-05-09 13:57:33.000000000 +1000 @@ -2448,7 +2448,7 @@ if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { lock_buffer(bh); } else { - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); continue; } Index: linux-2.6/fs/reiserfs/journal.c =================================================================== --- linux-2.6.orig/fs/reiserfs/journal.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/reiserfs/journal.c 2007-05-09 13:57:33.000000000 +1000 @@ -830,7 +830,7 @@ jh = JH_ENTRY(list->next); 
bh = jh->bh; get_bh(bh); - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { if (!buffer_dirty(bh)) { list_move(&jh->list, &tmp); goto loop_next; @@ -3838,7 +3838,7 @@ { PROC_INFO_INC(p_s_sb, journal.prepare); - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { if (!wait) return 0; lock_buffer(bh); Index: linux-2.6/Documentation/memory-barriers.txt =================================================================== --- linux-2.6.orig/Documentation/memory-barriers.txt 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/Documentation/memory-barriers.txt 2007-05-09 13:57:33.000000000 +1000 @@ -1479,7 +1479,8 @@ Any atomic operation that modifies some state in memory and returns information about the state (old or new) implies an SMP-conditional general memory barrier -(smp_mb()) on each side of the actual operation. These include: +(smp_mb()) on each side of the actual operation (with the exception of +explicit lock operations, described later). These include: xchg(); cmpxchg(); @@ -1536,10 +1537,19 @@ do need memory barriers as a lock primitive generally has to do things in a specific order. - Basically, each usage case has to be carefully considered as to whether memory barriers are needed or not. +The following operations are special locking primitives: + + test_and_set_bit_lock(); + clear_bit_unlock(); + +These implement LOCK-class and UNLOCK-class operations, respectively. These +should be used in preference to other operations when implementing locking +primitives, because their implementations can be optimised on many +architectures. + [!] Note that special memory barrier primitives are available for these situations because on some CPUs the atomic instructions used imply full memory barriers, and so barrier instructions are superfluous in conjunction with them, ^ permalink raw reply [flat|nested] 34+ messages in thread
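As a concrete reading of the memory-barriers.txt text above: because
test_and_set_bit_lock() has acquire semantics and clear_bit_unlock() has
release semantics, a minimal bit lock built on them needs no explicit
smp_mb__before_clear_bit()/smp_mb__after_clear_bit() calls at all. A
sketch only -- the my_bit_lock/my_bit_unlock names are invented, and this
is not code from the patch:

	#define MY_LOCK_BIT	0

	static void my_bit_lock(unsigned long *word)
	{
		/* acquire: critical-section accesses cannot float above this */
		while (test_and_set_bit_lock(MY_LOCK_BIT, word))
			cpu_relax();	/* spin until the holder releases */
	}

	static void my_bit_unlock(unsigned long *word)
	{
		/* release: critical-section accesses cannot float below this */
		clear_bit_unlock(MY_LOCK_BIT, word);
	}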
* Re: [rfc] lock bitops
  2007-05-08 11:37 [rfc] lock bitops Nick Piggin
  2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
  2007-05-08 12:22 ` [rfc] lock bitops David Howells
@ 2007-05-08 15:06 ` Matthew Wilcox
  2007-05-08 21:23   ` Benjamin Herrenschmidt
  2007-05-08 22:29   ` Nick Piggin
  2007-05-09 12:08 ` Nikita Danilov
  3 siblings, 2 replies; 34+ messages in thread
From: Matthew Wilcox @ 2007-05-08 15:06 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

On Tue, May 08, 2007 at 01:37:09PM +0200, Nick Piggin wrote:
> --
> Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics.
> Add non-trivial for powerpc and ia64. Convert page lock, buffer lock,
> bit_spin_lock, tasklet locks to use the new locks.

The names are a bit clumsy.  How about naming them after the effect,
rather than the implementation?  It struck me that really these things
are bit mutexes -- you can sleep while holding the lock.  How about
calling them bit_mutex_trylock() and bit_mutex_unlock()?
* Re: [rfc] lock bitops
  2007-05-08 15:06 ` Matthew Wilcox
@ 2007-05-08 21:23   ` Benjamin Herrenschmidt
  2007-05-08 22:29   ` Nick Piggin
  1 sibling, 0 replies; 34+ messages in thread
From: Benjamin Herrenschmidt @ 2007-05-08 21:23 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Nick Piggin, linux-arch, Andrew Morton, Linux Kernel Mailing List

On Tue, 2007-05-08 at 09:06 -0600, Matthew Wilcox wrote:
> On Tue, May 08, 2007 at 01:37:09PM +0200, Nick Piggin wrote:
> > --
> > Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics.
> > Add non-trivial for powerpc and ia64. Convert page lock, buffer lock,
> > bit_spin_lock, tasklet locks to use the new locks.
>
> The names are a bit clumsy.  How about naming them after the effect,
> rather than the implementation?  It struck me that really these things
> are bit mutexes -- you can sleep while holding the lock.  How about
> calling them bit_mutex_trylock() and bit_mutex_unlock()?

Hrm... spin_trylock vs. mutex_trylock... what difference? :-)

Note that if we're gonna generalize the usage as a mutex, we might want
to extend the unlock semantics to return the flags word atomically, so
that the caller can test other bits without having to read the word
back (which might be a performance hit on CPUs without store
forwarding). That's already what Nick's new unlock_page() does (testing
the flag again right after unlock).

At first, I thought it would make clear_bit_unlock() semantics a bit
too clumsy, but maybe it's worth it.

Ben.
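A primitive with the extended semantics Ben suggests might look something
like the sketch below. The function name and the cmpxchg()-based fallback
are both invented for illustration -- nothing like this is in the patch --
and cmpxchg() implies a full barrier, which is stronger than the release
ordering actually required (a real implementation would use an
architecture's release form, e.g. cmpxchg_rel on ia64):

	/*
	 * Clear the lock bit with (at least) release semantics and return
	 * the word as it was left by the unlock, so the caller can test
	 * other flag bits without a separate load of the word.
	 */
	static inline unsigned long
	clear_bit_unlock_return_word(int nr, volatile unsigned long *addr)
	{
		unsigned long old, mask = 1UL << nr;

		do {
			old = *addr;
		} while (cmpxchg(addr, old, old & ~mask) != old);

		return old & ~mask;
	}

unlock_page() could then test something like a waiters bit in the returned
word, instead of reloading page->flags after the unlock.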
* Re: [rfc] lock bitops
  2007-05-08 15:06 ` Matthew Wilcox
  2007-05-08 21:23   ` Benjamin Herrenschmidt
@ 2007-05-08 22:29   ` Nick Piggin
  2007-05-08 22:40     ` Matthew Wilcox
  1 sibling, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 22:29 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

On Tue, May 08, 2007 at 09:06:32AM -0600, Matthew Wilcox wrote:
> On Tue, May 08, 2007 at 01:37:09PM +0200, Nick Piggin wrote:
> > --
> > Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics.
> > Add non-trivial for powerpc and ia64. Convert page lock, buffer lock,
> > bit_spin_lock, tasklet locks to use the new locks.
>
> The names are a bit clumsy.  How about naming them after the effect,
> rather than the implementation?  It struck me that really these things
> are bit mutexes -- you can sleep while holding the lock.  How about
> calling them bit_mutex_trylock() and bit_mutex_unlock()?

bit_spin_trylock / bit_spin_unlock be OK? ;)
* Re: [rfc] lock bitops
  2007-05-08 22:29   ` Nick Piggin
@ 2007-05-08 22:40     ` Matthew Wilcox
  2007-05-08 22:45       ` Nick Piggin
  0 siblings, 1 reply; 34+ messages in thread
From: Matthew Wilcox @ 2007-05-08 22:40 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

On Wed, May 09, 2007 at 12:29:27AM +0200, Nick Piggin wrote:
> On Tue, May 08, 2007 at 09:06:32AM -0600, Matthew Wilcox wrote:
> > On Tue, May 08, 2007 at 01:37:09PM +0200, Nick Piggin wrote:
> > > --
> > > Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics.
> > > Add non-trivial for powerpc and ia64. Convert page lock, buffer lock,
> > > bit_spin_lock, tasklet locks to use the new locks.
> >
> > The names are a bit clumsy.  How about naming them after the effect,
> > rather than the implementation?  It struck me that really these things
> > are bit mutexes -- you can sleep while holding the lock.  How about
> > calling them bit_mutex_trylock() and bit_mutex_unlock()?
>
> bit_spin_trylock / bit_spin_unlock be OK? ;)

We already have a bit_spin_trylock -- it keeps preempt disabled until
you bit_spin_unlock.  Oh, and it only actually sets a bit if you've got
SMP or lock debugging on.  Nice try though ;-)
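The behaviour Matthew describes can be seen in the shape of
bit_spin_trylock(), roughly as it stands after this patch's conversion
(a sketch; some annotations elided):

	static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
	{
		preempt_disable();	/* stays disabled while the lock is held */
	#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
		if (test_and_set_bit_lock(bitnum, addr)) {
			preempt_enable();	/* busy: back out and fail */
			return 0;
		}
	#endif
		/* on UP without spinlock debugging, no bit is actually set */
		__acquire(bitlock);
		return 1;
	}

Preemption is only re-enabled by the matching bit_spin_unlock(), which is
what makes it a spinning lock rather than a sleepable mutex.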
* Re: [rfc] lock bitops
  2007-05-08 22:40     ` Matthew Wilcox
@ 2007-05-08 22:45       ` Nick Piggin
  0 siblings, 0 replies; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 22:45 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

On Tue, May 08, 2007 at 04:40:36PM -0600, Matthew Wilcox wrote:
> On Wed, May 09, 2007 at 12:29:27AM +0200, Nick Piggin wrote:
> > On Tue, May 08, 2007 at 09:06:32AM -0600, Matthew Wilcox wrote:
> > > On Tue, May 08, 2007 at 01:37:09PM +0200, Nick Piggin wrote:
> > > > --
> > > > Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics.
> > > > Add non-trivial for powerpc and ia64. Convert page lock, buffer lock,
> > > > bit_spin_lock, tasklet locks to use the new locks.
> > >
> > > The names are a bit clumsy.  How about naming them after the effect,
> > > rather than the implementation?  It struck me that really these things
> > > are bit mutexes -- you can sleep while holding the lock.  How about
> > > calling them bit_mutex_trylock() and bit_mutex_unlock()?
> >
> > bit_spin_trylock / bit_spin_unlock be OK? ;)
>
> We already have a bit_spin_trylock -- it keeps preempt disabled until
> you bit_spin_unlock.  Oh, and it only actually sets a bit if you've got
> SMP or lock debugging on.  Nice try though ;-)

OK, I'll be blunt then.  I think s/test_and_set_bit_lock/bit_mutex_trylock/
is much worse ;)
* Re: [rfc] lock bitops
  2007-05-08 11:37 [rfc] lock bitops Nick Piggin
                   ` (2 preceding siblings ...)
  2007-05-08 15:06 ` Matthew Wilcox
@ 2007-05-09 12:08 ` Nikita Danilov
  2007-05-09 12:20   ` Nick Piggin
  3 siblings, 1 reply; 34+ messages in thread
From: Nikita Danilov @ 2007-05-09 12:08 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

Nick Piggin writes:
 > Hi,

[...]

 >  /**
 > + * clear_bit_unlock - Clears a bit in memory with release
 > + * @nr: Bit to clear
 > + * @addr: Address to start counting from
 > + *
 > + * clear_bit() is atomic and may not be reordered. It does

s/clear_bit/clear_bit_unlock/ ?

 > + * contain a memory barrier suitable for unlock type operations.
 > + */
 > +static __inline__ void
 > +clear_bit_unlock (int nr, volatile void *addr)
 > +{

Nikita.
* Re: [rfc] lock bitops
  2007-05-09 12:08 ` Nikita Danilov
@ 2007-05-09 12:20   ` Nick Piggin
  0 siblings, 0 replies; 34+ messages in thread
From: Nick Piggin @ 2007-05-09 12:20 UTC (permalink / raw)
  To: Nikita Danilov
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

On Wed, May 09, 2007 at 04:08:41PM +0400, Nikita Danilov wrote:
> Nick Piggin writes:
>  > Hi,
>
> [...]
>
>  >  /**
>  > + * clear_bit_unlock - Clears a bit in memory with release
>  > + * @nr: Bit to clear
>  > + * @addr: Address to start counting from
>  > + *
>  > + * clear_bit() is atomic and may not be reordered. It does
>
> s/clear_bit/clear_bit_unlock/ ?

Yes :)
Thread overview: 34+ messages

2007-05-08 11:37 [rfc] lock bitops Nick Piggin
2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
2007-05-08 12:13 ` David Howells
2007-05-08 22:35 ` Nick Piggin
2007-05-08 20:08 ` Hugh Dickins
2007-05-08 21:30 ` Benjamin Herrenschmidt
2007-05-08 22:41 ` Nick Piggin
2007-05-08 22:50 ` Nick Piggin
2007-05-09 19:33 ` Hugh Dickins
2007-05-09 21:21 ` Benjamin Herrenschmidt
2007-05-10  3:37 ` Nick Piggin
2007-05-10 19:14 ` Hugh Dickins
2007-05-11  8:54 ` Nick Piggin
2007-05-11 13:15 ` Hugh Dickins
2007-05-13  3:32 ` Nick Piggin
2007-05-13  4:39 ` Hugh Dickins
2007-05-13  6:52 ` Nick Piggin
2007-05-16 17:54 ` Hugh Dickins
2007-05-16 18:18 ` Nick Piggin
2007-05-16 19:28 ` Hugh Dickins
2007-05-16 19:47 ` Linus Torvalds
2007-05-17  6:27 ` Nick Piggin
2007-05-16 17:21 ` Hugh Dickins
2007-05-16 17:38 ` Nick Piggin
2007-05-08 12:22 ` [rfc] lock bitops David Howells
2007-05-08 22:33 ` Nick Piggin
2007-05-09  6:18 ` Nick Piggin
2007-05-08 15:06 ` Matthew Wilcox
2007-05-08 21:23 ` Benjamin Herrenschmidt
2007-05-08 22:29 ` Nick Piggin
2007-05-08 22:40 ` Matthew Wilcox
2007-05-08 22:45 ` Nick Piggin
2007-05-09 12:08 ` Nikita Danilov
2007-05-09 12:20 ` Nick Piggin