* [rfc] lock bitops
@ 2007-05-08 11:37 Nick Piggin
2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
` (3 more replies)
0 siblings, 4 replies; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 11:37 UTC (permalink / raw)
To: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
Linux Kernel Mailing List
Hi,
This patch (along with the subsequent one to optimise unlock_page) reduces
the overhead of lock_page/unlock_page (measured with page faults and a patch
to lock the page in the fault handler) by about 425 cycles on my 2-way G5.
It also improves `dd if=big-sparse-file of=/dev/null` performance from
626MB/s to 683MB/s (+9.1%) by reducing lock_page/unlock_page overhead by
651 cycles on the same machine.
Lastly, kbuild elapsed times are improved by maybe half a percent on the
same system, from:
236.21user 18.68system 2:09.38elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
236.21user 18.68system 2:09.15elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
236.12user 18.69system 2:09.06elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
236.22user 18.70system 2:09.14elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
236.24user 18.70system 2:09.20elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
to:
235.13user 18.38system 2:08.49elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
235.25user 18.39system 2:08.63elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
235.19user 18.41system 2:08.66elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
235.16user 18.40system 2:08.57elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
235.21user 18.39system 2:08.47elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
--
Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics.
Add non-trivial implementations for powerpc and ia64. Convert page lock, buffer lock,
bit_spin_lock, and tasklet locks to use the new operations.
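For illustration, here is how the new pair is meant to compose into a lock
(a sketch only, not part of the patch; the example_* wrapper names are made
up). test_and_set_bit_lock() returning 0 means the bit was taken with acquire
semantics, and clear_bit_unlock() releases it, so the critical section needs
no extra barriers:

	/* Sketch: a minimal bit lock built on the new operations. */
	static inline void example_bit_lock(unsigned long *word)
	{
		while (test_and_set_bit_lock(0, word))	/* acquire on success */
			cpu_relax();			/* spin while bit 0 is set */
	}

	static inline void example_bit_unlock(unsigned long *word)
	{
		clear_bit_unlock(0, word);		/* release */
	}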
---
Documentation/atomic_ops.txt | 9 ++++++++
drivers/scsi/sg.c | 2 -
fs/buffer.c | 7 ++----
fs/cifs/file.c | 2 -
fs/jbd/commit.c | 2 -
fs/jbd2/commit.c | 2 -
fs/xfs/linux-2.6/xfs_aops.c | 4 +--
include/asm-alpha/bitops.h | 1
include/asm-arm/bitops.h | 1
include/asm-arm26/bitops.h | 1
include/asm-avr32/bitops.h | 1
include/asm-cris/bitops.h | 1
include/asm-frv/bitops.h | 1
include/asm-generic/bitops.h | 1
include/asm-generic/bitops/lock.h | 16 +++++++++++++++
include/asm-h8300/bitops.h | 1
include/asm-i386/bitops.h | 1
include/asm-ia64/bitops.h | 33 ++++++++++++++++++++++++++++++++
include/asm-m32r/bitops.h | 1
include/asm-m68k/bitops.h | 1
include/asm-m68knommu/bitops.h | 1
include/asm-mips/bitops.h | 1
include/asm-parisc/bitops.h | 1
include/asm-powerpc/bitops.h | 39 ++++++++++++++++++++++++++++++++++++++
include/asm-s390/bitops.h | 1
include/asm-sh/bitops.h | 1
include/asm-sh64/bitops.h | 1
include/asm-sparc/bitops.h | 1
include/asm-sparc64/bitops.h | 1
include/asm-v850/bitops.h | 1
include/asm-x86_64/bitops.h | 1
include/asm-xtensa/bitops.h | 1
include/linux/bit_spinlock.h | 11 +++++-----
include/linux/buffer_head.h | 8 +++++--
include/linux/interrupt.h | 5 +---
include/linux/page-flags.h | 4 +++
include/linux/pagemap.h | 9 ++++++--
kernel/wait.c | 2 -
mm/filemap.c | 13 ++++--------
mm/memory.c | 2 -
mm/migrate.c | 4 +--
mm/rmap.c | 2 -
mm/shmem.c | 4 +--
mm/swap.c | 2 -
mm/swap_state.c | 2 -
mm/swapfile.c | 2 -
mm/truncate.c | 4 +--
mm/vmscan.c | 4 +--
48 files changed, 172 insertions(+), 44 deletions(-)
Index: linux-2.6/include/asm-powerpc/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-powerpc/bitops.h 2007-05-08 13:33:18.000000000 +1000
+++ linux-2.6/include/asm-powerpc/bitops.h 2007-05-08 13:45:31.000000000 +1000
@@ -87,6 +87,24 @@
: "cc" );
}
+static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr)
+{
+ unsigned long old;
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ __asm__ __volatile__(
+ LWSYNC_ON_SMP
+"1:" PPC_LLARX "%0,0,%3 # clear_bit_unlock\n"
+ "andc %0,%0,%2\n"
+ PPC405_ERR77(0,%3)
+ PPC_STLCX "%0,0,%3\n"
+ "bne- 1b"
+ : "=&r" (old), "+m" (*p)
+ : "r" (mask), "r" (p)
+ : "cc" );
+}
+
static __inline__ void change_bit(int nr, volatile unsigned long *addr)
{
unsigned long old;
@@ -126,6 +144,27 @@
return (old & mask) != 0;
}
+static __inline__ int test_and_set_bit_lock(unsigned long nr,
+ volatile unsigned long *addr)
+{
+ unsigned long old, t;
+ unsigned long mask = BITOP_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+
+ __asm__ __volatile__(
+"1:" PPC_LLARX "%0,0,%3 # test_and_set_bit_lock\n"
+ "or %1,%0,%2 \n"
+ PPC405_ERR77(0,%3)
+ PPC_STLCX "%1,0,%3 \n"
+ "bne- 1b"
+ ISYNC_ON_SMP
+ : "=&r" (old), "=&r" (t)
+ : "r" (mask), "r" (p)
+ : "cc", "memory");
+
+ return (old & mask) != 0;
+}
+
static __inline__ int test_and_clear_bit(unsigned long nr,
volatile unsigned long *addr)
{
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h 2007-05-08 13:33:18.000000000 +1000
+++ linux-2.6/include/linux/pagemap.h 2007-05-08 15:30:39.000000000 +1000
@@ -135,13 +135,18 @@
extern void FASTCALL(__lock_page_nosync(struct page *page));
extern void FASTCALL(unlock_page(struct page *page));
+static inline int trylock_page(struct page *page)
+{
+ return (likely(!TestSetPageLocked_Lock(page)));
+}
+
/*
* lock_page may only be called if we have the page's inode pinned.
*/
static inline void lock_page(struct page *page)
{
might_sleep();
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
__lock_page(page);
}
@@ -152,7 +157,7 @@
static inline void lock_page_nosync(struct page *page)
{
might_sleep();
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
__lock_page_nosync(page);
}
Index: linux-2.6/drivers/scsi/sg.c
===================================================================
--- linux-2.6.orig/drivers/scsi/sg.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/drivers/scsi/sg.c 2007-05-08 13:45:31.000000000 +1000
@@ -1734,7 +1734,7 @@
*/
flush_dcache_page(pages[i]);
/* ?? Is locking needed? I don't think so */
- /* if (TestSetPageLocked(pages[i]))
+ /* if (!trylock_page(pages[i]))
goto out_unlock; */
}
Index: linux-2.6/fs/cifs/file.c
===================================================================
--- linux-2.6.orig/fs/cifs/file.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/fs/cifs/file.c 2007-05-08 13:45:31.000000000 +1000
@@ -1229,7 +1229,7 @@
if (first < 0)
lock_page(page);
- else if (TestSetPageLocked(page))
+ else if (!trylock_page(page))
break;
if (unlikely(page->mapping != mapping)) {
Index: linux-2.6/fs/jbd/commit.c
===================================================================
--- linux-2.6.orig/fs/jbd/commit.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/fs/jbd/commit.c 2007-05-08 15:19:20.000000000 +1000
@@ -64,7 +64,7 @@
goto nope;
/* OK, it's a truncated page */
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto nope;
page_cache_get(page);
@@ -209,7 +209,7 @@
* blocking lock_buffer().
*/
if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
BUFFER_TRACE(bh, "needs blocking lock");
spin_unlock(&journal->j_list_lock);
/* Write out all data to prevent deadlocks */
Index: linux-2.6/fs/jbd2/commit.c
===================================================================
--- linux-2.6.orig/fs/jbd2/commit.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/fs/jbd2/commit.c 2007-05-08 15:19:12.000000000 +1000
@@ -64,7 +64,7 @@
goto nope;
/* OK, it's a truncated page */
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto nope;
page_cache_get(page);
@@ -209,7 +209,7 @@
* blocking lock_buffer().
*/
if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
BUFFER_TRACE(bh, "needs blocking lock");
spin_unlock(&journal->j_list_lock);
/* Write out all data to prevent deadlocks */
Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c 2007-05-08 13:45:31.000000000 +1000
@@ -601,7 +601,7 @@
} else
pg_offset = PAGE_CACHE_SIZE;
- if (page->index == tindex && !TestSetPageLocked(page)) {
+ if (page->index == tindex && trylock_page(page)) {
len = xfs_probe_page(page, pg_offset, mapped);
unlock_page(page);
}
@@ -685,7 +685,7 @@
if (page->index != tindex)
goto fail;
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto fail;
if (PageWriteback(page))
goto fail_unlock_page;
Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h 2007-05-08 13:33:18.000000000 +1000
+++ linux-2.6/include/linux/page-flags.h 2007-05-08 15:30:39.000000000 +1000
@@ -114,8 +114,12 @@
set_bit(PG_locked, &(page)->flags)
#define TestSetPageLocked(page) \
test_and_set_bit(PG_locked, &(page)->flags)
+#define TestSetPageLocked_Lock(page) \
+ test_and_set_bit_lock(PG_locked, &(page)->flags)
#define ClearPageLocked(page) \
clear_bit(PG_locked, &(page)->flags)
+#define ClearPageLocked_Unlock(page) \
+ clear_bit_unlock(PG_locked, &(page)->flags)
#define TestClearPageLocked(page) \
test_and_clear_bit(PG_locked, &(page)->flags)
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2007-05-08 13:45:15.000000000 +1000
+++ linux-2.6/mm/memory.c 2007-05-08 13:45:31.000000000 +1000
@@ -1550,7 +1550,7 @@
* not dirty accountable.
*/
if (PageAnon(old_page)) {
- if (!TestSetPageLocked(old_page)) {
+ if (trylock_page(old_page)) {
reuse = can_share_swap_page(old_page);
unlock_page(old_page);
}
Index: linux-2.6/mm/migrate.c
===================================================================
--- linux-2.6.orig/mm/migrate.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/mm/migrate.c 2007-05-08 13:45:31.000000000 +1000
@@ -569,7 +569,7 @@
* establishing additional references. We are the only one
* holding a reference to the new page at this point.
*/
- if (TestSetPageLocked(newpage))
+ if (!trylock_page(newpage))
BUG();
/* Prepare mapping for the new page.*/
@@ -621,7 +621,7 @@
goto move_newpage;
rc = -EAGAIN;
- if (TestSetPageLocked(page)) {
+ if (!trylock_page(page)) {
if (!force)
goto move_newpage;
lock_page(page);
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/mm/rmap.c 2007-05-08 13:45:31.000000000 +1000
@@ -426,7 +426,7 @@
referenced += page_referenced_anon(page);
else if (is_locked)
referenced += page_referenced_file(page);
- else if (TestSetPageLocked(page))
+ else if (!trylock_page(page))
referenced++;
else {
if (page->mapping)
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c 2007-05-08 13:45:15.000000000 +1000
+++ linux-2.6/mm/shmem.c 2007-05-08 13:45:31.000000000 +1000
@@ -1154,7 +1154,7 @@
}
/* We have to do this with page locked to prevent races */
- if (TestSetPageLocked(swappage)) {
+ if (!trylock_page(swappage)) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
wait_on_page_locked(swappage);
@@ -1213,7 +1213,7 @@
shmem_swp_unmap(entry);
filepage = find_get_page(mapping, idx);
if (filepage &&
- (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
+ (!PageUptodate(filepage) || !trylock_page(filepage))) {
spin_unlock(&info->lock);
wait_on_page_locked(filepage);
page_cache_release(filepage);
Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/mm/swap.c 2007-05-08 13:45:31.000000000 +1000
@@ -412,7 +412,7 @@
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- if (PagePrivate(page) && !TestSetPageLocked(page)) {
+ if (PagePrivate(page) && trylock_page(page)) {
if (PagePrivate(page))
try_to_release_page(page, 0);
unlock_page(page);
Index: linux-2.6/mm/swap_state.c
===================================================================
--- linux-2.6.orig/mm/swap_state.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/mm/swap_state.c 2007-05-08 13:45:31.000000000 +1000
@@ -252,7 +252,7 @@
*/
static inline void free_swap_cache(struct page *page)
{
- if (PageSwapCache(page) && !TestSetPageLocked(page)) {
+ if (PageSwapCache(page) && trylock_page(page)) {
remove_exclusive_swap_page(page);
unlock_page(page);
}
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/mm/swapfile.c 2007-05-08 13:45:31.000000000 +1000
@@ -401,7 +401,7 @@
if (p) {
if (swap_entry_free(p, swp_offset(entry)) == 1) {
page = find_get_page(&swapper_space, entry.val);
- if (page && unlikely(TestSetPageLocked(page))) {
+ if (page && unlikely(!trylock_page(page))) {
page_cache_release(page);
page = NULL;
}
Index: linux-2.6/mm/truncate.c
===================================================================
--- linux-2.6.orig/mm/truncate.c 2007-05-08 13:45:15.000000000 +1000
+++ linux-2.6/mm/truncate.c 2007-05-08 13:45:31.000000000 +1000
@@ -185,7 +185,7 @@
if (page_index > next)
next = page_index;
next++;
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
continue;
if (PageWriteback(page)) {
unlock_page(page);
@@ -281,7 +281,7 @@
pgoff_t index;
int lock_failed;
- lock_failed = TestSetPageLocked(page);
+ lock_failed = !trylock_page(page);
/*
* We really shouldn't be looking at the ->index of an
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/mm/vmscan.c 2007-05-08 13:45:31.000000000 +1000
@@ -466,7 +466,7 @@
page = lru_to_page(page_list);
list_del(&page->lru);
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto keep;
VM_BUG_ON(PageActive(page));
@@ -538,7 +538,7 @@
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
*/
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto keep;
if (PageDirty(page) || PageWriteback(page))
goto keep_locked;
Index: linux-2.6/include/linux/bit_spinlock.h
===================================================================
--- linux-2.6.orig/include/linux/bit_spinlock.h 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/include/linux/bit_spinlock.h 2007-05-08 15:00:18.000000000 +1000
@@ -18,7 +18,7 @@
*/
preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
- while (test_and_set_bit(bitnum, addr)) {
+ while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
while (test_bit(bitnum, addr)) {
preempt_enable();
cpu_relax();
@@ -36,7 +36,7 @@
{
preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
- if (test_and_set_bit(bitnum, addr)) {
+ if (test_and_set_bit_lock(bitnum, addr)) {
preempt_enable();
return 0;
}
@@ -50,10 +50,11 @@
*/
static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+#ifdef CONFIG_DEBUG_SPINLOCK
BUG_ON(!test_bit(bitnum, addr));
- smp_mb__before_clear_bit();
- clear_bit(bitnum, addr);
+#endif
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+ clear_bit_unlock(bitnum, addr);
#endif
preempt_enable();
__release(bitlock);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c 2007-05-08 13:47:21.000000000 +1000
+++ linux-2.6/mm/filemap.c 2007-05-08 15:30:55.000000000 +1000
@@ -525,17 +525,14 @@
* mechananism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
*
- * The first mb is necessary to safely close the critical section opened by the
- * TestSetPageLocked(), the second mb is necessary to enforce ordering between
- * the clear_bit and the read of the waitqueue (to avoid SMP races with a
- * parallel wait_on_page_locked()).
+ * The mb is necessary to enforce ordering between the clear_bit and the read
+ * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
*/
void fastcall unlock_page(struct page *page)
{
- smp_mb__before_clear_bit();
- if (!TestClearPageLocked(page))
- BUG();
- smp_mb__after_clear_bit();
+ VM_BUG_ON(!PageLocked(page));
+ ClearPageLocked_Unlock(page);
+ smp_mb__after_clear_bit();
wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
@@ -625,7 +622,7 @@
page = radix_tree_lookup(&mapping->page_tree, offset);
if (page) {
page_cache_get(page);
- if (TestSetPageLocked(page)) {
+ if (!trylock_page(page)) {
read_unlock_irq(&mapping->tree_lock);
__lock_page(page);
read_lock_irq(&mapping->tree_lock);
@@ -798,7 +795,7 @@
struct page *page = find_get_page(mapping, index);
if (page) {
- if (!TestSetPageLocked(page))
+ if (trylock_page(page))
return page;
page_cache_release(page);
return NULL;
Index: linux-2.6/include/asm-alpha/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-alpha/bitops.h 2006-10-10 19:35:15.000000000 +1000
+++ linux-2.6/include/asm-alpha/bitops.h 2007-05-08 14:22:29.000000000 +1000
@@ -359,6 +359,7 @@
#else
#include <asm-generic/bitops/hweight.h>
#endif
+#include <asm-generic/bitops/lock.h>
#endif /* __KERNEL__ */
Index: linux-2.6/include/asm-arm/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-arm/bitops.h 2007-01-29 17:30:03.000000000 +1100
+++ linux-2.6/include/asm-arm/bitops.h 2007-05-08 14:21:11.000000000 +1000
@@ -286,6 +286,7 @@
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
/*
* Ext2 is defined to use little-endian byte ordering.
Index: linux-2.6/include/asm-arm26/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-arm26/bitops.h 2006-05-02 14:30:50.000000000 +1000
+++ linux-2.6/include/asm-arm26/bitops.h 2007-05-08 14:20:54.000000000 +1000
@@ -167,6 +167,7 @@
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
/*
* Ext2 is defined to use little-endian byte ordering.
Index: linux-2.6/include/asm-avr32/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-avr32/bitops.h 2007-01-29 17:30:03.000000000 +1100
+++ linux-2.6/include/asm-avr32/bitops.h 2007-05-08 14:21:18.000000000 +1000
@@ -288,6 +288,7 @@
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#include <asm-generic/bitops/ext2-atomic.h>
Index: linux-2.6/include/asm-cris/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-cris/bitops.h 2006-05-02 14:30:50.000000000 +1000
+++ linux-2.6/include/asm-cris/bitops.h 2007-05-08 14:21:24.000000000 +1000
@@ -154,6 +154,7 @@
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/hweight.h>
#include <asm-generic/bitops/find.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
Index: linux-2.6/include/asm-frv/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-frv/bitops.h 2007-01-29 17:35:54.000000000 +1100
+++ linux-2.6/include/asm-frv/bitops.h 2007-05-08 14:21:29.000000000 +1000
@@ -302,6 +302,7 @@
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
Index: linux-2.6/include/asm-generic/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-generic/bitops.h 2006-05-02 14:30:50.000000000 +1000
+++ linux-2.6/include/asm-generic/bitops.h 2007-05-08 14:21:47.000000000 +1000
@@ -22,6 +22,7 @@
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#include <asm-generic/bitops/ext2-atomic.h>
Index: linux-2.6/include/asm-generic/bitops/lock.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/asm-generic/bitops/lock.h 2007-05-08 14:17:44.000000000 +1000
@@ -0,0 +1,16 @@
+#ifndef _ASM_GENERIC_BITOPS_LOCK_H_
+#define _ASM_GENERIC_BITOPS_LOCK_H_
+
+static inline int test_and_set_bit_lock(unsigned long nr, volatile void *addr)
+{
+ return test_and_set_bit(nr, addr);
+}
+
+static inline void clear_bit_unlock(unsigned long nr, volatile void *addr)
+{
+ smp_mb__before_clear_bit();
+ clear_bit(nr, addr);
+}
+
+#endif /* _ASM_GENERIC_BITOPS_LOCK_H_ */
+
Index: linux-2.6/include/asm-h8300/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-h8300/bitops.h 2006-10-10 19:35:15.000000000 +1000
+++ linux-2.6/include/asm-h8300/bitops.h 2007-05-08 14:21:54.000000000 +1000
@@ -194,6 +194,7 @@
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#include <asm-generic/bitops/ext2-atomic.h>
#include <asm-generic/bitops/minix.h>
Index: linux-2.6/include/asm-i386/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-i386/bitops.h 2007-02-26 11:43:54.000000000 +1100
+++ linux-2.6/include/asm-i386/bitops.h 2007-05-08 14:22:18.000000000 +1000
@@ -402,6 +402,7 @@
}
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#endif /* __KERNEL__ */
Index: linux-2.6/include/asm-ia64/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-ia64/bitops.h 2006-05-02 14:30:58.000000000 +1000
+++ linux-2.6/include/asm-ia64/bitops.h 2007-05-08 14:25:10.000000000 +1000
@@ -94,6 +94,30 @@
}
/**
+ * clear_bit_unlock - Clears a bit in memory with release
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit_unlock() is atomic and may not be reordered. It does
+ * contain a memory barrier suitable for unlock type operations.
+ */
+static __inline__ void
+clear_bit_unlock (int nr, volatile void *addr)
+{
+ __u32 mask, old, new;
+ volatile __u32 *m;
+ CMPXCHG_BUGCHECK_DECL
+
+ m = (volatile __u32 *) addr + (nr >> 5);
+ mask = ~(1 << (nr & 31));
+ do {
+ CMPXCHG_BUGCHECK(m);
+ old = *m;
+ new = old & mask;
+ } while (cmpxchg_rel(m, old, new) != old);
+}
+
+/**
* __clear_bit - Clears a bit in memory (non-atomic version)
*/
static __inline__ void
@@ -170,6 +194,15 @@
}
/**
+ * test_and_set_bit_lock - Set a bit and return its old value for lock
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This is the same as test_and_set_bit on ia64
+ */
+#define test_and_set_bit_lock test_and_set_bit
+
+/**
* __test_and_set_bit - Set a bit and return its old value
* @nr: Bit to set
* @addr: Address to count from
Index: linux-2.6/include/asm-m32r/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-m32r/bitops.h 2006-10-10 19:35:15.000000000 +1000
+++ linux-2.6/include/asm-m32r/bitops.h 2007-05-08 14:25:18.000000000 +1000
@@ -255,6 +255,7 @@
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#endif /* __KERNEL__ */
Index: linux-2.6/include/asm-m68k/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-m68k/bitops.h 2006-05-02 14:30:50.000000000 +1000
+++ linux-2.6/include/asm-m68k/bitops.h 2007-05-08 14:25:25.000000000 +1000
@@ -314,6 +314,7 @@
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
/* Bitmap functions for the minix filesystem */
Index: linux-2.6/include/asm-m68knommu/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-m68knommu/bitops.h 2007-02-26 11:43:54.000000000 +1100
+++ linux-2.6/include/asm-m68knommu/bitops.h 2007-05-08 14:25:36.000000000 +1000
@@ -160,6 +160,7 @@
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
static __inline__ int ext2_set_bit(int nr, volatile void * addr)
{
Index: linux-2.6/include/asm-mips/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-mips/bitops.h 2007-03-30 11:31:29.000000000 +1000
+++ linux-2.6/include/asm-mips/bitops.h 2007-05-08 14:25:51.000000000 +1000
@@ -569,6 +569,7 @@
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#include <asm-generic/bitops/ext2-atomic.h>
#include <asm-generic/bitops/minix.h>
Index: linux-2.6/include/asm-parisc/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-parisc/bitops.h 2007-03-05 12:46:03.000000000 +1100
+++ linux-2.6/include/asm-parisc/bitops.h 2007-05-08 14:25:56.000000000 +1000
@@ -208,6 +208,7 @@
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/sched.h>
#endif /* __KERNEL__ */
Index: linux-2.6/include/asm-s390/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-s390/bitops.h 2007-01-29 17:30:03.000000000 +1100
+++ linux-2.6/include/asm-s390/bitops.h 2007-05-08 14:26:14.000000000 +1000
@@ -746,6 +746,7 @@
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
/*
* ATTENTION: intel byte ordering convention for ext2 and minix !!
Index: linux-2.6/include/asm-sh/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-sh/bitops.h 2007-01-29 17:30:03.000000000 +1100
+++ linux-2.6/include/asm-sh/bitops.h 2007-05-08 14:26:25.000000000 +1000
@@ -137,6 +137,7 @@
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#include <asm-generic/bitops/ext2-atomic.h>
Index: linux-2.6/include/asm-sh64/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-sh64/bitops.h 2006-05-02 14:30:51.000000000 +1000
+++ linux-2.6/include/asm-sh64/bitops.h 2007-05-08 14:26:20.000000000 +1000
@@ -136,6 +136,7 @@
#include <asm-generic/bitops/__ffs.h>
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
Index: linux-2.6/include/asm-sparc/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-sparc/bitops.h 2007-01-29 17:35:57.000000000 +1100
+++ linux-2.6/include/asm-sparc/bitops.h 2007-05-08 14:27:11.000000000 +1000
@@ -96,6 +96,7 @@
#include <asm-generic/bitops/fls.h>
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#include <asm-generic/bitops/ext2-atomic.h>
Index: linux-2.6/include/asm-sparc64/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-sparc64/bitops.h 2006-10-10 19:35:16.000000000 +1000
+++ linux-2.6/include/asm-sparc64/bitops.h 2007-05-08 14:27:06.000000000 +1000
@@ -81,6 +81,7 @@
#include <asm-generic/bitops/hweight.h>
#endif
+#include <asm-generic/bitops/lock.h>
#endif /* __KERNEL__ */
#include <asm-generic/bitops/find.h>
Index: linux-2.6/include/asm-v850/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-v850/bitops.h 2006-10-10 19:35:16.000000000 +1000
+++ linux-2.6/include/asm-v850/bitops.h 2007-05-08 14:27:16.000000000 +1000
@@ -145,6 +145,7 @@
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a)
Index: linux-2.6/include/asm-x86_64/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-x86_64/bitops.h 2007-02-26 11:43:54.000000000 +1100
+++ linux-2.6/include/asm-x86_64/bitops.h 2007-05-08 14:27:22.000000000 +1000
@@ -408,6 +408,7 @@
#define ARCH_HAS_FAST_MULTIPLIER 1
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#endif /* __KERNEL__ */
Index: linux-2.6/include/asm-xtensa/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-xtensa/bitops.h 2006-05-02 14:30:51.000000000 +1000
+++ linux-2.6/include/asm-xtensa/bitops.h 2007-05-08 14:27:24.000000000 +1000
@@ -115,6 +115,7 @@
#endif
#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/minix.h>
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c 2007-04-24 10:39:55.000000000 +1000
+++ linux-2.6/fs/buffer.c 2007-05-08 14:51:42.000000000 +1000
@@ -78,8 +78,7 @@
void fastcall unlock_buffer(struct buffer_head *bh)
{
- smp_mb__before_clear_bit();
- clear_buffer_locked(bh);
+ clear_bit_unlock(BH_Lock, &bh->b_state);
smp_mb__after_clear_bit();
wake_up_bit(&bh->b_state, BH_Lock);
}
@@ -1664,7 +1663,7 @@
*/
if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
lock_buffer(bh);
- } else if (test_set_buffer_locked(bh)) {
+ } else if (!trylock_buffer(bh)) {
redirty_page_for_writepage(wbc, page);
continue;
}
@@ -2719,7 +2718,7 @@
if (rw == SWRITE)
lock_buffer(bh);
- else if (test_set_buffer_locked(bh))
+ else if (!trylock_buffer(bh))
continue;
if (rw == WRITE || rw == SWRITE) {
Index: linux-2.6/include/linux/buffer_head.h
===================================================================
--- linux-2.6.orig/include/linux/buffer_head.h 2007-05-08 13:33:05.000000000 +1000
+++ linux-2.6/include/linux/buffer_head.h 2007-05-08 14:50:41.000000000 +1000
@@ -115,7 +115,6 @@
BUFFER_FNS(Dirty, dirty)
TAS_BUFFER_FNS(Dirty, dirty)
BUFFER_FNS(Lock, locked)
-TAS_BUFFER_FNS(Lock, locked)
BUFFER_FNS(Req, req)
TAS_BUFFER_FNS(Req, req)
BUFFER_FNS(Mapped, mapped)
@@ -301,10 +300,15 @@
__wait_on_buffer(bh);
}
+static inline int trylock_buffer(struct buffer_head *bh)
+{
+ return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state));
+}
+
static inline void lock_buffer(struct buffer_head *bh)
{
might_sleep();
- if (test_set_buffer_locked(bh))
+ if (!trylock_buffer(bh))
__lock_buffer(bh);
}
Index: linux-2.6/include/linux/interrupt.h
===================================================================
--- linux-2.6.orig/include/linux/interrupt.h 2007-04-12 14:35:11.000000000 +1000
+++ linux-2.6/include/linux/interrupt.h 2007-05-08 14:46:57.000000000 +1000
@@ -310,13 +310,12 @@
#ifdef CONFIG_SMP
static inline int tasklet_trylock(struct tasklet_struct *t)
{
- return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
+ return !test_and_set_bit_lock(TASKLET_STATE_RUN, &(t)->state);
}
static inline void tasklet_unlock(struct tasklet_struct *t)
{
- smp_mb__before_clear_bit();
- clear_bit(TASKLET_STATE_RUN, &(t)->state);
+ clear_bit_unlock(TASKLET_STATE_RUN, &(t)->state);
}
static inline void tasklet_unlock_wait(struct tasklet_struct *t)
Index: linux-2.6/kernel/wait.c
===================================================================
--- linux-2.6.orig/kernel/wait.c 2006-10-10 19:35:16.000000000 +1000
+++ linux-2.6/kernel/wait.c 2007-05-08 14:38:25.000000000 +1000
@@ -195,7 +195,7 @@
if ((ret = (*action)(q->key.flags)))
break;
}
- } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
+ } while (test_and_set_bit_lock(q->key.bit_nr, q->key.flags));
finish_wait(wq, &q->wait);
return ret;
}
Index: linux-2.6/Documentation/atomic_ops.txt
===================================================================
--- linux-2.6.orig/Documentation/atomic_ops.txt 2007-04-12 14:35:03.000000000 +1000
+++ linux-2.6/Documentation/atomic_ops.txt 2007-05-08 14:58:40.000000000 +1000
@@ -369,6 +369,15 @@
*/
smp_mb__after_clear_bit();
+There are two special bitops with lock barrier semantics (acquire/release,
+same as spinlocks). These operate in the same way as their non-_lock/unlock
+postfixed variants, except that they are to provide acquire/release semantics,
+respectively. This means they can be used for bit_spin_trylock and
+bit_spin_unlock type operations without specifying any more barriers.
+
+ int test_and_set_bit_lock(unsigned long nr, unsigned long *addr);
+ void clear_bit_unlock(unsigned long nr, unsigned long *addr);
+
Finally, there are non-atomic versions of the bitmask operations
provided. They are used in contexts where some other higher-level SMP
locking scheme is being used to protect the bitmask, and thus less
Index: linux-2.6/fs/ntfs/aops.c
===================================================================
--- linux-2.6.orig/fs/ntfs/aops.c 2007-03-05 15:17:25.000000000 +1100
+++ linux-2.6/fs/ntfs/aops.c 2007-05-08 15:18:18.000000000 +1000
@@ -1199,7 +1199,7 @@
tbh = bhs[i];
if (!tbh)
continue;
- if (unlikely(test_set_buffer_locked(tbh)))
+ if (!trylock_buffer(tbh))
BUG();
/* The buffer dirty state is now irrelevant, just clean it. */
clear_buffer_dirty(tbh);
Index: linux-2.6/fs/ntfs/compress.c
===================================================================
--- linux-2.6.orig/fs/ntfs/compress.c 2007-03-05 15:17:25.000000000 +1100
+++ linux-2.6/fs/ntfs/compress.c 2007-05-08 15:18:13.000000000 +1000
@@ -655,7 +655,7 @@
for (i = 0; i < nr_bhs; i++) {
struct buffer_head *tbh = bhs[i];
- if (unlikely(test_set_buffer_locked(tbh)))
+ if (!trylock_buffer(tbh))
continue;
if (unlikely(buffer_uptodate(tbh))) {
unlock_buffer(tbh);
Index: linux-2.6/fs/ntfs/mft.c
===================================================================
--- linux-2.6.orig/fs/ntfs/mft.c 2007-02-06 16:20:16.000000000 +1100
+++ linux-2.6/fs/ntfs/mft.c 2007-05-08 15:18:05.000000000 +1000
@@ -586,7 +586,7 @@
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
struct buffer_head *tbh = bhs[i_bhs];
- if (unlikely(test_set_buffer_locked(tbh)))
+ if (!trylock_buffer(tbh))
BUG();
BUG_ON(!buffer_uptodate(tbh));
clear_buffer_dirty(tbh);
@@ -779,7 +779,7 @@
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
struct buffer_head *tbh = bhs[i_bhs];
- if (unlikely(test_set_buffer_locked(tbh)))
+ if (!trylock_buffer(tbh))
BUG();
BUG_ON(!buffer_uptodate(tbh));
clear_buffer_dirty(tbh);
Index: linux-2.6/fs/reiserfs/inode.c
===================================================================
--- linux-2.6.orig/fs/reiserfs/inode.c 2007-03-05 15:17:25.000000000 +1100
+++ linux-2.6/fs/reiserfs/inode.c 2007-05-08 15:19:00.000000000 +1000
@@ -2448,7 +2448,7 @@
if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
lock_buffer(bh);
} else {
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
redirty_page_for_writepage(wbc, page);
continue;
}
Index: linux-2.6/fs/reiserfs/journal.c
===================================================================
--- linux-2.6.orig/fs/reiserfs/journal.c 2007-01-29 17:35:54.000000000 +1100
+++ linux-2.6/fs/reiserfs/journal.c 2007-05-08 15:19:34.000000000 +1000
@@ -830,7 +830,7 @@
jh = JH_ENTRY(list->next);
bh = jh->bh;
get_bh(bh);
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
if (!buffer_dirty(bh)) {
list_move(&jh->list, &tmp);
goto loop_next;
@@ -3838,7 +3838,7 @@
{
PROC_INFO_INC(p_s_sb, journal.prepare);
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
if (!wait)
return 0;
lock_buffer(bh);
* [rfc] optimise unlock_page
  2007-05-08 11:37 [rfc] lock bitops Nick Piggin
@ 2007-05-08 11:40 ` Nick Piggin
  2007-05-08 12:13   ` David Howells
                     ` (2 more replies)
  2007-05-08 12:22 ` [rfc] lock bitops David Howells
                     ` (2 subsequent siblings)
  3 siblings, 3 replies; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 11:40 UTC (permalink / raw)
  To: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
      Linux Kernel Mailing List, Linux Memory Management List

This patch trades a page flag for a significant improvement in the unlock_page
fastpath. Various problems in the previous version were spotted by Hugh and
Ben (and fixed in this one).

Comments?

--

Speed up unlock_page by introducing a new page flag to signal that there are
page waitqueue waiters for PG_locked. This means a memory barrier and a random
waitqueue hash cacheline load can be avoided in the fastpath when there is no
contention.

Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h 2007-05-08 15:30:39.000000000 +1000
+++ linux-2.6/include/linux/page-flags.h 2007-05-08 15:31:00.000000000 +1000
@@ -91,6 +91,8 @@
#define PG_nosave_free 18 /* Used for system suspend/resume */
#define PG_buddy 19 /* Page is free, on buddy lists */
+#define PG_waiters 20 /* Page has PG_locked waiters */
+
/* PG_owner_priv_1 users should have descriptive aliases */
#define PG_checked PG_owner_priv_1 /* Used by some filesystems */
@@ -123,6 +125,10 @@
#define TestClearPageLocked(page) \
test_and_clear_bit(PG_locked, &(page)->flags)
+#define PageWaiters(page) test_bit(PG_waiters, &(page)->flags)
+#define SetPageWaiters(page) set_bit(PG_waiters, &(page)->flags)
+#define ClearPageWaiters(page) clear_bit(PG_waiters, &(page)->flags)
+
#define PageError(page) test_bit(PG_error, &(page)->flags)
#define SetPageError(page) set_bit(PG_error, &(page)->flags)
#define ClearPageError(page) clear_bit(PG_error, &(page)->flags)
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h 2007-05-08 15:30:39.000000000 +1000
+++ linux-2.6/include/linux/pagemap.h 2007-05-08 16:55:07.000000000 +1000
@@ -133,7 +133,8 @@
extern void FASTCALL(__lock_page(struct page *page));
extern void FASTCALL(__lock_page_nosync(struct page *page));
-extern void FASTCALL(unlock_page(struct page *page));
+extern void FASTCALL(__wait_on_page_locked(struct page *page));
+extern void FASTCALL(__unlock_page(struct page *page));
static inline int trylock_page(struct page *page)
{
@@ -160,7 +161,15 @@
if (!trylock_page(page))
__lock_page_nosync(page);
}
-
+
+static inline void unlock_page(struct page *page)
+{
+ VM_BUG_ON(!PageLocked(page));
+ ClearPageLocked_Unlock(page);
+ if (unlikely(PageWaiters(page)))
+ __unlock_page(page);
+}
+
/*
* This is exported only for wait_on_page_locked/wait_on_page_writeback.
* Never use this directly!
@@ -176,8 +185,9 @@
*/
static inline void wait_on_page_locked(struct page *page)
{
+ might_sleep();
if (PageLocked(page))
- wait_on_page_bit(page, PG_locked);
+ __wait_on_page_locked(page);
}
/*
@@ -185,6 +195,7 @@
*/
static inline void wait_on_page_writeback(struct page *page)
{
+ might_sleep();
if (PageWriteback(page))
wait_on_page_bit(page, PG_writeback);
}
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c 2007-05-08 15:30:55.000000000 +1000
+++ linux-2.6/mm/filemap.c 2007-05-08 18:04:20.000000000 +1000
@@ -169,6 +169,7 @@
return 0;
}
+
/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
@@ -478,12 +479,6 @@
EXPORT_SYMBOL(__page_cache_alloc);
#endif
-static int __sleep_on_page_lock(void *word)
-{
- io_schedule();
- return 0;
-}
-
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
@@ -516,26 +511,22 @@
}
EXPORT_SYMBOL(wait_on_page_bit);
-/**
- * unlock_page - unlock a locked page
- * @page: the page
- *
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup
- * mechananism between PageLocked pages and PageWriteback pages is shared.
- * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
- *
- * The mb is necessary to enforce ordering between the clear_bit and the read
- * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
+/*
+ * If PageWaiters was found to be set at unlock-time, __unlock_page should
+ * be called to actually perform the wakeups of waiters.
*/
-void fastcall unlock_page(struct page *page)
+void fastcall __unlock_page(struct page *page)
{
- VM_BUG_ON(!PageLocked(page));
- ClearPageLocked_Unlock(page);
+ ClearPageWaiters(page);
+ /*
+ * The mb is necessary to enforce ordering between the clear_bit and
+ * the read of the waitqueue (to avoid SMP races with a parallel
+ * wait_on_page_locked()
+ */
smp_mb__after_clear_bit();
wake_up_page(page, PG_locked);
}
-EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(__unlock_page);
/**
* end_page_writeback - end writeback against a page
@@ -563,10 +554,16 @@
*/
void fastcall __lock_page(struct page *page)
{
+ wait_queue_head_t *wq = page_waitqueue(page);
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
- TASK_UNINTERRUPTIBLE);
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ SetPageWaiters(page);
+ if (likely(PageLocked(page)))
+ sync_page(page);
+ } while (!trylock_page(page));
+ finish_wait(wq, &wait.wait);
}
EXPORT_SYMBOL(__lock_page);
@@ -576,10 +573,39 @@
*/
void fastcall __lock_page_nosync(struct page *page)
{
+ wait_queue_head_t *wq = page_waitqueue(page);
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
- TASK_UNINTERRUPTIBLE);
+
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ SetPageWaiters(page);
+ if (likely(PageLocked(page)))
+ io_schedule();
+ } while (!trylock_page(page));
+ finish_wait(wq, &wait.wait);
+}
+
+void fastcall __wait_on_page_locked(struct page *page)
+{
+ wait_queue_head_t *wq = page_waitqueue(page);
+ DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ SetPageWaiters(page);
+ if (likely(PageLocked(page)))
+ sync_page(page);
+ } while (PageLocked(page));
+ finish_wait(wq, &wait.wait);
+
+ /*
+ * Could skip this, but that would leave PG_waiters dangling
+ * for random pages. This keeps it tidy.
+ */
+ if (unlikely(PageWaiters(page)))
+ __unlock_page(page);
}
+EXPORT_SYMBOL(__wait_on_page_locked);
/**
* find_get_page - find and get a page reference
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2007-05-08 15:30:39.000000000 +1000
+++ linux-2.6/mm/page_alloc.c 2007-05-08 15:31:00.000000000 +1000
@@ -203,7 +203,8 @@
1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback |
- 1 << PG_buddy );
+ 1 << PG_buddy |
+ 1 << PG_waiters );
set_page_count(page, 0);
reset_page_mapcount(page);
page->mapping = NULL;
@@ -438,7 +439,8 @@
1 << PG_swapcache |
1 << PG_writeback |
1 << PG_reserved |
- 1 << PG_buddy ))))
+ 1 << PG_buddy |
+ 1 << PG_waiters ))))
bad_page(page);
if (PageDirty(page))
__ClearPageDirty(page);
@@ -588,7 +590,8 @@
1 << PG_swapcache |
1 << PG_writeback |
1 << PG_reserved |
- 1 << PG_buddy ))))
+ 1 << PG_buddy |
+ 1 << PG_waiters ))))
bad_page(page);
/*
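To recap what the fastpath gains, distilled from the hunks above (a sketch
of the before/after, not additional code):

	/* Before: every unlock pays two barriers plus the hashed
	 * waitqueue probe in wake_up_page(). */
	void unlock_page_before(struct page *page)
	{
		smp_mb__before_clear_bit();
		if (!TestClearPageLocked(page))
			BUG();
		smp_mb__after_clear_bit();
		wake_up_page(page, PG_locked);
	}

	/* After: the uncontended case is one release-semantics clear
	 * plus a test of a flags word that is already cache-hot. */
	void unlock_page_after(struct page *page)
	{
		VM_BUG_ON(!PageLocked(page));
		ClearPageLocked_Unlock(page);
		if (unlikely(PageWaiters(page)))
			__unlock_page(page);	/* slowpath: barrier + wakeup */
	}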
* Re: [rfc] optimise unlock_page
  2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
@ 2007-05-08 12:13   ` David Howells
  2007-05-08 22:35     ` Nick Piggin
  2007-05-08 20:08   ` Hugh Dickins
  2007-05-08 21:30   ` Benjamin Herrenschmidt
  2 siblings, 1 reply; 34+ messages in thread
From: David Howells @ 2007-05-08 12:13 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
      Linux Kernel Mailing List, Linux Memory Management List

Nick Piggin <npiggin@suse.de> wrote:

> This patch trades a page flag for a significant improvement in the unlock_page
> fastpath. Various problems in the previous version were spotted by Hugh and
> Ben (and fixed in this one).

It looks reasonable at first glance, though it does consume yet another page
flag :-/ However, I think that's probably a worthy trade.

> }
> -
> +
> +static inline void unlock_page(struct page *page)
> +{
> + VM_BUG_ON(!PageLocked(page));
> + ClearPageLocked_Unlock(page);
> + if (unlikely(PageWaiters(page)))
> + __unlock_page(page);
> +}
> +

Please don't simply discard the documentation, we have little enough as it
is:

> -/**
> - * unlock_page - unlock a locked page
> - * @page: the page
> - *
> - * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
> - * Also wakes sleepers in wait_on_page_writeback() because the wakeup
> - * mechananism between PageLocked pages and PageWriteback pages is shared.
> - * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
> - *
> - * The mb is necessary to enforce ordering between the clear_bit and the read
> - * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).

David
* Re: [rfc] optimise unlock_page
  2007-05-08 12:13 ` David Howells
@ 2007-05-08 22:35   ` Nick Piggin
  0 siblings, 0 replies; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 22:35 UTC (permalink / raw)
  To: David Howells
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
      Linux Kernel Mailing List, Linux Memory Management List

On Tue, May 08, 2007 at 01:13:35PM +0100, David Howells wrote:
>
> Nick Piggin <npiggin@suse.de> wrote:
>
> > This patch trades a page flag for a significant improvement in the unlock_page
> > fastpath. Various problems in the previous version were spotted by Hugh and
> > Ben (and fixed in this one).
>
> It looks reasonable at first glance, though it does consume yet another page
> flag :-/ However, I think that's probably a worthy trade.

Well, that's the big question :)

> > }
> > -
> > +
> > +static inline void unlock_page(struct page *page)
> > +{
> > + VM_BUG_ON(!PageLocked(page));
> > + ClearPageLocked_Unlock(page);
> > + if (unlikely(PageWaiters(page)))
> > + __unlock_page(page);
> > +}
> > +
>
> Please don't simply discard the documentation, we have little enough as it is:

Oops, right.
* Re: [rfc] optimise unlock_page
  2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
  2007-05-08 12:13   ` David Howells
@ 2007-05-08 20:08   ` Hugh Dickins
  2007-05-08 21:30   ` Benjamin Herrenschmidt
  2 siblings, 0 replies; 34+ messages in thread
From: Hugh Dickins @ 2007-05-08 20:08 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
      Linux Kernel Mailing List, Linux Memory Management List

On Tue, 8 May 2007, Nick Piggin wrote:
> This patch trades a page flag for a significant improvement in the unlock_page
> fastpath. Various problems in the previous version were spotted by Hugh and
> Ben (and fixed in this one).
>
> Comments?

Seems there's still a bug there. I get hangs on the page lock, on i386
and on x86_64 and on powerpc: sometimes they unhang themselves after a
while (presume other activity does the wakeup). Obvious even while
booting (Starting udevd). Sorry, not had time to investigate.

Hugh
* Re: [rfc] optimise unlock_page
  2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
  2007-05-08 12:13   ` David Howells
  2007-05-08 20:08   ` Hugh Dickins
@ 2007-05-08 21:30   ` Benjamin Herrenschmidt
  2007-05-08 22:41     ` Nick Piggin
  2 siblings, 1 reply; 34+ messages in thread
From: Benjamin Herrenschmidt @ 2007-05-08 21:30 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Andrew Morton, Linux Kernel Mailing List,
      Linux Memory Management List

On Tue, 2007-05-08 at 13:40 +0200, Nick Piggin wrote:
> This patch trades a page flag for a significant improvement in the unlock_page
> fastpath. Various problems in the previous version were spotted by Hugh and
> Ben (and fixed in this one).
>
> Comments?
>
> --
>
> Speed up unlock_page by introducing a new page flag to signal that there are
> page waitqueue waiters for PG_locked. This means a memory barrier and a random
> waitqueue hash cacheline load can be avoided in the fastpath when there is no
> contention.

I'm not 100% familiar with the exclusive vs. non exclusive wait thingy
but wake_up_page() does __wake_up_bit() which calls __wake_up() with
nr_exclusive set to 1. Doesn't that mean that only one waiter will be
woken up ?

If that's the case, then we lose because we'll have clear PG_waiters but
only wake up one of them.

Waking them all would fix it but at the risk of causing other
problems... Maybe PG_waiters need to actually be a counter but if that
is the case, then it complicates things even more.

Any smart idea ?

Ben.
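For reference, the path Ben is describing looks roughly like this
(kernel/wait.c of this vintage, quoted from memory, so details may differ;
the third argument to __wake_up() is nr_exclusive):

	void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
	{
		struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);

		if (waitqueue_active(wq))
			/* at most one exclusive waiter is woken per call */
			__wake_up(wq, TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE,
					1, &key);
	}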
* Re: [rfc] optimise unlock_page
  2007-05-08 21:30 ` Benjamin Herrenschmidt
@ 2007-05-08 22:41   ` Nick Piggin
  2007-05-08 22:50     ` Nick Piggin
  0 siblings, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 22:41 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linux-arch, Andrew Morton, Linux Kernel Mailing List,
      Linux Memory Management List

On Wed, May 09, 2007 at 07:30:27AM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2007-05-08 at 13:40 +0200, Nick Piggin wrote:
> > This patch trades a page flag for a significant improvement in the unlock_page
> > fastpath. Various problems in the previous version were spotted by Hugh and
> > Ben (and fixed in this one).
> >
> > Comments?
> >
> > --
> >
> > Speed up unlock_page by introducing a new page flag to signal that there are
> > page waitqueue waiters for PG_locked. This means a memory barrier and a random
> > waitqueue hash cacheline load can be avoided in the fastpath when there is no
> > contention.
>
> I'm not 100% familiar with the exclusive vs. non exclusive wait thingy
> but wake_up_page() does __wake_up_bit() which calls __wake_up() with
> nr_exclusive set to 1. Doesn't that mean that only one waiter will be
> woken up ?
>
> If that's the case, then we lose because we'll have clear PG_waiters but
> only wake up one of them.
>
> Waking them all would fix it but at the risk of causing other
> problems... Maybe PG_waiters need to actually be a counter but if that
> is the case, then it complicates things even more.
>
> Any smart idea ?

It will wake up 1 exclusive waiter, but no limit on non exclusive waiters.
Hmm, but it won't wake up waiters behind the exclusive guy... maybe the
wake up code can check whether the waitqueue is still active after the
wakeup, and set PG_waiters again in that case?
* Re: [rfc] optimise unlock_page
  2007-05-08 22:41 ` Nick Piggin
@ 2007-05-08 22:50   ` Nick Piggin
  2007-05-09 19:33     ` Hugh Dickins
  0 siblings, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 22:50 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linux-arch, Andrew Morton, Linux Kernel Mailing List,
      Linux Memory Management List

On Wed, May 09, 2007 at 12:41:24AM +0200, Nick Piggin wrote:
> On Wed, May 09, 2007 at 07:30:27AM +1000, Benjamin Herrenschmidt wrote:
> > On Tue, 2007-05-08 at 13:40 +0200, Nick Piggin wrote:
> > > This patch trades a page flag for a significant improvement in the unlock_page
> > > fastpath. Various problems in the previous version were spotted by Hugh and
> > > Ben (and fixed in this one).
> > >
> > > Comments?
> > >
> > > --
> > >
> > > Speed up unlock_page by introducing a new page flag to signal that there are
> > > page waitqueue waiters for PG_locked. This means a memory barrier and a random
> > > waitqueue hash cacheline load can be avoided in the fastpath when there is no
> > > contention.
> >
> > I'm not 100% familiar with the exclusive vs. non exclusive wait thingy
> > but wake_up_page() does __wake_up_bit() which calls __wake_up() with
> > nr_exclusive set to 1. Doesn't that mean that only one waiter will be
> > woken up ?
> >
> > If that's the case, then we lose because we'll have clear PG_waiters but
> > only wake up one of them.
> >
> > Waking them all would fix it but at the risk of causing other
> > problems... Maybe PG_waiters need to actually be a counter but if that
> > is the case, then it complicates things even more.
> >
> > Any smart idea ?
>
> It will wake up 1 exclusive waiter, but no limit on non exclusive waiters.
> Hmm, but it won't wake up waiters behind the exclusive guy... maybe the
> wake up code can check whether the waitqueue is still active after the
> wakeup, and set PG_waiters again in that case?

Hm, I don't know if we can do that without a race either...

OTOH, waking all non exclusive waiters may not be a really bad idea.
* Re: [rfc] optimise unlock_page 2007-05-08 22:50 ` Nick Piggin @ 2007-05-09 19:33 ` Hugh Dickins 2007-05-09 21:21 ` Benjamin Herrenschmidt 2007-05-10 3:37 ` Nick Piggin 0 siblings, 2 replies; 34+ messages in thread From: Hugh Dickins @ 2007-05-09 19:33 UTC (permalink / raw) To: Nick Piggin Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Wed, 9 May 2007, Nick Piggin wrote: > On Wed, May 09, 2007 at 12:41:24AM +0200, Nick Piggin wrote: > > On Wed, May 09, 2007 at 07:30:27AM +1000, Benjamin Herrenschmidt wrote: > > > > > > Waking them all would fix it but at the risk of causing other > > > problems... Maybe PG_waiters need to actually be a counter but if that > > > is the case, then it complicates things even more. > > > > It will wake up 1 exclusive waiter, but no limit on non exclusive waiters. > > Hmm, but it won't wake up waiters behind the exclusive guy... maybe the > > wake up code can check whether the waitqueue is still active after the > > wakeup, and set PG_waiters again in that case? > > Hm, I don't know if we can do that without a race either... > > OTOH, waking all non exclusive waiters may not be a really bad idea. Not good enough, I'm afraid. It looks like Ben's right and you need a count - and counts in the page struct are a lot harder to add than page flags. I've now played around with the hangs on my three 4CPU machines (all of them in io_schedule below __lock_page, waiting on pages which were neither PG_locked nor PG_waiters when I looked). Seeing Ben's mail, I thought the answer would be just to remove the "_exclusive" from your three prepare_to_wait_exclusive()s. That helped, but it didn't eliminate the hangs. After fiddling around with different ideas for some while, I came to realize that the ClearPageWaiters (in very misleadingly named __unlock_page) is hopeless. It's just so easy for it to clear the PG_waiters that a third task relies upon for wakeup (and which cannot loop around to set it again, because it simply won't be woken by unlock_page/__unlock_page without it already being set). Below is the patch I've applied to see some tests actually running with your patches, but it's just a joke: absurdly racy and presumptuous in itself (the "3" stands for us and the cache and one waiter; I deleted the neighbouring mb and comment, not because I disagree, but because it's ridiculous to pay so much attention to such unlikely races when there's much worse nearby). Though I've not checked: if I've got the counting wrong, then maybe all my pages are left marked PG_waiters by now. (I did imagine we could go back to prepare_to_wait_exclusive once I'd put in the page_count test before ClearPageWaiters; but apparently not, that still hung.) My intention had been to apply the patches to what I tested before with lmbench, to get comparative numbers; but I don't think this is worth the time, it's too far from being a real solution. I was puzzled as to how you came up with any performance numbers yourself, when I could hardly boot. I see you mentioned 2CPU G5, I guess you need a CPU or two more; or maybe it's that you didn't watch what happened as it booted, often those hangs recover later. 
Hugh

--- a/mm/filemap.c	2007-05-08 20:17:31.000000000 +0100
+++ b/mm/filemap.c	2007-05-09 19:14:03.000000000 +0100
@@ -517,13 +517,8 @@ EXPORT_SYMBOL(wait_on_page_bit);
  */
 void fastcall __unlock_page(struct page *page)
 {
-	ClearPageWaiters(page);
-	/*
-	 * The mb is necessary to enforce ordering between the clear_bit and
-	 * the read of the waitqueue (to avoid SMP races with a parallel
-	 * wait_on_page_locked()
-	 */
-	smp_mb__after_clear_bit();
+	if (page_count(page) <= 3 + page_has_buffers(page)+page_mapcount(page))
+		ClearPageWaiters(page);
 	wake_up_page(page, PG_locked);
 }
 EXPORT_SYMBOL(__unlock_page);
@@ -558,7 +553,7 @@ void fastcall __lock_page(struct page *p
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
 	do {
-		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 		SetPageWaiters(page);
 		if (likely(PageLocked(page)))
 			sync_page(page);
@@ -577,7 +572,7 @@ void fastcall __lock_page_nosync(struct
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
 	do {
-		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 		SetPageWaiters(page);
 		if (likely(PageLocked(page)))
 			io_schedule();
@@ -591,7 +586,7 @@ void fastcall __wait_on_page_locked(stru
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
 	do {
-		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 		SetPageWaiters(page);
 		if (likely(PageLocked(page)))
 			sync_page(page);

^ permalink raw reply	[flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page
  2007-05-09 19:33         ` Hugh Dickins
@ 2007-05-09 21:21           ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 34+ messages in thread
From: Benjamin Herrenschmidt @ 2007-05-09 21:21 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Nick Piggin, linux-arch, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

> Not good enough, I'm afraid.  It looks like Ben's right and you need
> a count - and counts in the page struct are a lot harder to add than
> page flags.
>
> I've now played around with the hangs on my three 4CPU machines
> (all of them in io_schedule below __lock_page, waiting on pages
> which were neither PG_locked nor PG_waiters when I looked).
>
> Seeing Ben's mail, I thought the answer would be just to remove
> the "_exclusive" from your three prepare_to_wait_exclusive()s.
> That helped, but it didn't eliminate the hangs.

There might be a way ... by having the flags manipulation always
atomically deal with PG_locked and PG_waiters together. This is possible
but we would need even more weirdo bitops abstractions from the arch I'm
afraid... unless we start using atomic_* rather than bitops in order to
manipulate multiple bits at a time.

Ben.

^ permalink raw reply	[flat|nested] 34+ messages in thread
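[Aside: one literal reading of Ben's "deal with PG_locked and PG_waiters
together" is a cmpxchg loop over the whole flags word, so the unlocker
drops the lock bit and samples the waiters bit in a single atomic
operation. A schematic only; nothing like this is in the posted patches.]

	static inline int unlock_page_and_test_waiters(struct page *page)
	{
		unsigned long old, new;

		do {
			old = page->flags;
			new = old & ~(1UL << PG_locked);
		} while (cmpxchg(&page->flags, old, new) != old);

		/* PG_waiters was sampled atomically with the unlock */
		return (old & (1UL << PG_waiters)) != 0;
	}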
* Re: [rfc] optimise unlock_page
  2007-05-09 19:33         ` Hugh Dickins
  2007-05-09 21:21           ` Benjamin Herrenschmidt
@ 2007-05-10  3:37           ` Nick Piggin
  2007-05-10 19:14             ` Hugh Dickins
  1 sibling, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-05-10 3:37 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Wed, May 09, 2007 at 08:33:15PM +0100, Hugh Dickins wrote:
>
> Not good enough, I'm afraid.  It looks like Ben's right and you need
> a count - and counts in the page struct are a lot harder to add than
> page flags.
>
> I've now played around with the hangs on my three 4CPU machines
> (all of them in io_schedule below __lock_page, waiting on pages
> which were neither PG_locked nor PG_waiters when I looked).
>
> Seeing Ben's mail, I thought the answer would be just to remove
> the "_exclusive" from your three prepare_to_wait_exclusive()s.
> That helped, but it didn't eliminate the hangs.
>
> After fiddling around with different ideas for some while, I came
> to realize that the ClearPageWaiters (in very misleadingly named
> __unlock_page) is hopeless.  It's just so easy for it to clear the
> PG_waiters that a third task relies upon for wakeup (and which
> cannot loop around to set it again, because it simply won't be
> woken by unlock_page/__unlock_page without it already being set).

OK, I found a simple bug after pulling out my hair for a while :)
With this, a 4-way system survives a couple of concurrent make -j250s
quite nicely (whereas they eventually locked up before).

The problem is that the bit wakeup function did not go through with
the wakeup if it found the bit (ie. PG_locked) set. This meant that
waiters would not get a chance to reset PG_waiters.

However you probably weren't referring to that particular problem
when you imagined the need for a full count, or the slippery 3rd
task... I wasn't able to derive any such problems with the basic
logic, so if there was a bug there, it would still be unfixed in
this patch.

---

Speed up unlock_page by introducing a new page flag to signal that there are
page waitqueue waiters for PG_locked. This means a memory barrier and a random
waitqueue hash cacheline load can be avoided in the fastpath when there is no
contention.
Index: linux-2.6/include/linux/page-flags.h =================================================================== --- linux-2.6.orig/include/linux/page-flags.h 2007-05-10 10:22:05.000000000 +1000 +++ linux-2.6/include/linux/page-flags.h 2007-05-10 10:22:06.000000000 +1000 @@ -91,6 +91,8 @@ #define PG_nosave_free 18 /* Used for system suspend/resume */ #define PG_buddy 19 /* Page is free, on buddy lists */ +#define PG_waiters 20 /* Page has PG_locked waiters */ + /* PG_owner_priv_1 users should have descriptive aliases */ #define PG_checked PG_owner_priv_1 /* Used by some filesystems */ @@ -113,6 +115,10 @@ #define SetPageLocked(page) \ set_bit(PG_locked, &(page)->flags) +#define PageWaiters(page) test_bit(PG_waiters, &(page)->flags) +#define SetPageWaiters(page) set_bit(PG_waiters, &(page)->flags) +#define ClearPageWaiters(page) clear_bit(PG_waiters, &(page)->flags) + #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) #define ClearPageError(page) clear_bit(PG_error, &(page)->flags) Index: linux-2.6/include/linux/pagemap.h =================================================================== --- linux-2.6.orig/include/linux/pagemap.h 2007-05-10 10:22:05.000000000 +1000 +++ linux-2.6/include/linux/pagemap.h 2007-05-10 10:22:06.000000000 +1000 @@ -133,7 +133,8 @@ extern void FASTCALL(__lock_page(struct page *page)); extern void FASTCALL(__lock_page_nosync(struct page *page)); -extern void FASTCALL(unlock_page(struct page *page)); +extern void FASTCALL(__wait_on_page_locked(struct page *page)); +extern void FASTCALL(__unlock_page(struct page *page)); static inline int trylock_page(struct page *page) { @@ -160,7 +161,15 @@ if (!trylock_page(page)) __lock_page_nosync(page); } - + +static inline void unlock_page(struct page *page) +{ + VM_BUG_ON(!PageLocked(page)); + clear_bit_unlock(PG_locked, &page->flags); + if (unlikely(PageWaiters(page))) + __unlock_page(page); +} + /* * This is exported only for wait_on_page_locked/wait_on_page_writeback. * Never use this directly! @@ -176,8 +185,9 @@ */ static inline void wait_on_page_locked(struct page *page) { + might_sleep(); if (PageLocked(page)) - wait_on_page_bit(page, PG_locked); + __wait_on_page_locked(page); } /* @@ -185,6 +195,7 @@ */ static inline void wait_on_page_writeback(struct page *page) { + might_sleep(); if (PageWriteback(page)) wait_on_page_bit(page, PG_writeback); } Index: linux-2.6/mm/filemap.c =================================================================== --- linux-2.6.orig/mm/filemap.c 2007-05-10 10:22:05.000000000 +1000 +++ linux-2.6/mm/filemap.c 2007-05-10 11:03:10.000000000 +1000 @@ -165,10 +165,12 @@ mapping = page_mapping(page); if (mapping && mapping->a_ops && mapping->a_ops->sync_page) mapping->a_ops->sync_page(page); - io_schedule(); + if (io_schedule_timeout(20*HZ) == 0) + printk("page->flags = %lx\n", page->flags); return 0; } + /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write @@ -478,12 +480,6 @@ EXPORT_SYMBOL(__page_cache_alloc); #endif -static int __sleep_on_page_lock(void *word) -{ - io_schedule(); - return 0; -} - /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of @@ -516,26 +512,23 @@ } EXPORT_SYMBOL(wait_on_page_bit); -/** - * unlock_page - unlock a locked page - * @page: the page - * - * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). 
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup - * mechananism between PageLocked pages and PageWriteback pages is shared. - * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. - * - * The mb is necessary to enforce ordering between the clear_bit and the read - * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). +/* + * If PageWaiters was found to be set at unlock-time, __unlock_page should + * be called to actually perform the wakeups of waiters. */ -void fastcall unlock_page(struct page *page) +void fastcall __unlock_page(struct page *page) { - VM_BUG_ON(!PageLocked(page)); - clear_bit_unlock(PG_locked, &page->flags); + ClearPageWaiters(page); + /* + * The mb is necessary to enforce ordering between the clear_bit and + * the read of the waitqueue (to avoid SMP races with a parallel + * wait_on_page_locked() + */ smp_mb__after_clear_bit(); + wake_up_page(page, PG_locked); } -EXPORT_SYMBOL(unlock_page); +EXPORT_SYMBOL(__unlock_page); /** * end_page_writeback - end writeback against a page @@ -563,10 +556,16 @@ */ void fastcall __lock_page(struct page *page) { + wait_queue_head_t *wq = page_waitqueue(page); DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, - TASK_UNINTERRUPTIBLE); + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + SetPageWaiters(page); + if (likely(PageLocked(page))) + sync_page(page); + } while (!trylock_page(page)); + finish_wait(wq, &wait.wait); } EXPORT_SYMBOL(__lock_page); @@ -576,10 +575,41 @@ */ void fastcall __lock_page_nosync(struct page *page) { + wait_queue_head_t *wq = page_waitqueue(page); DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, - TASK_UNINTERRUPTIBLE); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + SetPageWaiters(page); + if (likely(PageLocked(page))) { + if (io_schedule_timeout(20*HZ) == 0) + printk("page->flags = %lx\n", page->flags); + } + } while (!trylock_page(page)); + finish_wait(wq, &wait.wait); +} + +void fastcall __wait_on_page_locked(struct page *page) +{ + wait_queue_head_t *wq = page_waitqueue(page); + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + SetPageWaiters(page); + if (likely(PageLocked(page))) + sync_page(page); + } while (PageLocked(page)); + finish_wait(wq, &wait.wait); + + /* + * Could skip this, but that would leave PG_waiters dangling + * for random pages. This keeps it tidy. 
+ */ + if (unlikely(PageWaiters(page))) + __unlock_page(page); } +EXPORT_SYMBOL(__wait_on_page_locked); /** * find_get_page - find and get a page reference Index: linux-2.6/mm/page_alloc.c =================================================================== --- linux-2.6.orig/mm/page_alloc.c 2007-05-10 10:21:32.000000000 +1000 +++ linux-2.6/mm/page_alloc.c 2007-05-10 10:22:06.000000000 +1000 @@ -203,7 +203,8 @@ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_buddy ); + 1 << PG_buddy | + 1 << PG_waiters ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -438,7 +439,8 @@ 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved | - 1 << PG_buddy )))) + 1 << PG_buddy | + 1 << PG_waiters )))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); @@ -588,7 +590,8 @@ 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved | - 1 << PG_buddy )))) + 1 << PG_buddy | + 1 << PG_waiters )))) bad_page(page); /* Index: linux-2.6/kernel/wait.c =================================================================== --- linux-2.6.orig/kernel/wait.c 2007-05-10 11:06:43.000000000 +1000 +++ linux-2.6/kernel/wait.c 2007-05-10 11:17:25.000000000 +1000 @@ -144,8 +144,7 @@ = container_of(wait, struct wait_bit_queue, wait); if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - test_bit(key->bit_nr, key->flags)) + wait_bit->key.bit_nr != key->bit_nr) return 0; else return autoremove_wake_function(wait, mode, sync, key); ^ permalink raw reply [flat|nested] 34+ messages in thread
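[Aside: the crux of the fix is the final kernel/wait.c hunk. With the
test_bit() clause gone, the wake function reads as below (surrounding
lines reconstructed from the 2.6.21-era kernel/wait.c, not part of this
patch):]

	int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
	{
		struct wait_bit_key *key = arg;
		struct wait_bit_queue *wait_bit
			= container_of(wait, struct wait_bit_queue, wait);

		if (wait_bit->key.flags != key->flags ||
		    wait_bit->key.bit_nr != key->bit_nr)
			return 0;	/* sleeping on a different bit or word */
		/*
		 * No test_bit(key->bit_nr, key->flags) check any more: wake
		 * the task even if PG_locked has been taken again, so it can
		 * loop around and re-set PG_waiters before sleeping.
		 */
		return autoremove_wake_function(wait, mode, sync, key);
	}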
* Re: [rfc] optimise unlock_page 2007-05-10 3:37 ` Nick Piggin @ 2007-05-10 19:14 ` Hugh Dickins 2007-05-11 8:54 ` Nick Piggin 0 siblings, 1 reply; 34+ messages in thread From: Hugh Dickins @ 2007-05-10 19:14 UTC (permalink / raw) To: Nick Piggin Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Thu, 10 May 2007, Nick Piggin wrote: > > OK, I found a simple bug after pulling out my hair for a while :) > With this, a 4-way system survives a couple of concurrent make -j250s > quite nicely (wheras they eventually locked up before). > > The problem is that the bit wakeup function did not go through with > the wakeup if it found the bit (ie. PG_locked) set. This meant that > waiters would not get a chance to reset PG_waiters. That makes a lot of sense. And this version seems stable to me, I've found no problems so far: magic! Well, on the x86_64 I have seen a few of your io_schedule_timeout printks under load; but suspect those are no fault of your changes, but reflect some actual misbehaviour down towards the disk end (when kernel default moved from AS to CFQ, I had to stick with AS because CFQ ran my tests very much slower on that one machine: something odd going on that I've occasionally wasted time looking into but never tracked down - certainly long-locked pages are a feature of that). > However you probably weren't referring to that particular problem > when you imagined the need for a full count, or the slippery 3rd > task... I wasn't able to derive any such problems with the basic > logic, so if there was a bug there, it would still be unfixed in this > patch. I've been struggling to conjure up and exorcise the race that seemed so obvious to me yesterday. I was certainly imagining one task on its way between SetPageWaiters and io_schedule, when the unlock_page comes, wakes, and lets another waiter take the lock. Probably I was forgetting the essence of prepare_to_wait, that this task would then fall through io_schedule as if woken as part of that batch. Until demonstrated otherwise, let's assume I was utterly mistaken. In addition to 3 hours of load on the three machines, I've gone back and applied this new patch (and the lock bitops; remembering to shift PG_waiters up) to 2.6.21-rc3-mm2 on which I did the earlier lmbench testing, on those three machines. On the PowerPC G5, these changes pretty much balance out your earlier changes (not just the one fix-fault-vs-invalidate patch, but the whole group which came in with that - it'd take me a while to tell exactly what, easiest to send you a diff if you want it), in those lmbench fork, exec, sh, mmap, fault tests. On the P4 Xeons, they improve the numbers significantly, but only retrieve half the regression. So here it looks like a good change; but not enough to atone ;) Hugh ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-10 19:14 ` Hugh Dickins @ 2007-05-11 8:54 ` Nick Piggin 2007-05-11 13:15 ` Hugh Dickins 0 siblings, 1 reply; 34+ messages in thread From: Nick Piggin @ 2007-05-11 8:54 UTC (permalink / raw) To: Hugh Dickins Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Thu, May 10, 2007 at 08:14:52PM +0100, Hugh Dickins wrote: > On Thu, 10 May 2007, Nick Piggin wrote: > > > > OK, I found a simple bug after pulling out my hair for a while :) > > With this, a 4-way system survives a couple of concurrent make -j250s > > quite nicely (wheras they eventually locked up before). > > > > The problem is that the bit wakeup function did not go through with > > the wakeup if it found the bit (ie. PG_locked) set. This meant that > > waiters would not get a chance to reset PG_waiters. > > That makes a lot of sense. And this version seems stable to me, > I've found no problems so far: magic! > > Well, on the x86_64 I have seen a few of your io_schedule_timeout > printks under load; but suspect those are no fault of your changes, Hmm, I see... well I forgot to remove those from the page I sent, the timeouts will kick things off again if they get stalled, so maybe it just hides a problem? (OTOH, I *think* the logic is pretty sound). > In addition to 3 hours of load on the three machines, I've gone back > and applied this new patch (and the lock bitops; remembering to shift > PG_waiters up) to 2.6.21-rc3-mm2 on which I did the earlier lmbench > testing, on those three machines. > > On the PowerPC G5, these changes pretty much balance out your earlier > changes (not just the one fix-fault-vs-invalidate patch, but the whole > group which came in with that - it'd take me a while to tell exactly > what, easiest to send you a diff if you want it), in those lmbench > fork, exec, sh, mmap, fault tests. On the P4 Xeons, they improve > the numbers significantly, but only retrieve half the regression. > > So here it looks like a good change; but not enough to atone ;) Don't worry, I'm only just beginning ;) Can we then do something crazy like this? (working on x86-64 only, so far. It seems to eliminate lat_pagefault and lat_proc regressions here). What architecture and workloads are you testing with, btw? -- Put PG_locked in its own byte from other PG_bits, so we can use non-atomic stores to unlock it. Index: linux-2.6/include/asm-x86_64/bitops.h =================================================================== --- linux-2.6.orig/include/asm-x86_64/bitops.h +++ linux-2.6/include/asm-x86_64/bitops.h @@ -68,6 +68,38 @@ static __inline__ void clear_bit(int nr, :"dIr" (nr)); } +/** + * clear_bit_unlock - Clears a bit in memory with unlock semantics + * @nr: Bit to clear + * @addr: Address to start counting from + */ +static __inline__ void clear_bit_unlock(int nr, volatile void * addr) +{ + barrier(); + __asm__ __volatile__( LOCK_PREFIX + "btrl %1,%0" + :ADDR + :"dIr" (nr)); +} + +/** + * __clear_bit_unlock_byte - same as clear_bit_unlock but uses a byte sized + * non-atomic store + * @nr: Bit to clear + * @addr: Address to start counting from + * + * __clear_bit_unlock() is non-atomic, however it implements unlock ordering, + * so it cannot be reordered arbitrarily. 
+ */ +static __inline__ void __clear_bit_unlock_byte(int nr, void *addr) +{ + unsigned char mask = 1UL << (nr % BITS_PER_BYTE); + unsigned char *p = addr + nr / BITS_PER_BYTE; + + barrier(); + *p &= ~mask; +} + static __inline__ void __clear_bit(int nr, volatile void * addr) { __asm__ __volatile__( @@ -132,6 +164,26 @@ static __inline__ int test_and_set_bit(i return oldbit; } + +/** + * test_and_set_bit_lock - Set a bit and return its old value for locking + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and has lock barrier semantics. + */ +static __inline__ int test_and_set_bit_lock(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__( LOCK_PREFIX + "btsl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),ADDR + :"dIr" (nr)); + barrier(); + return oldbit; +} + /** * __test_and_set_bit - Set a bit and return its old value * @nr: Bit to set @@ -408,7 +460,6 @@ static __inline__ int fls(int x) #define ARCH_HAS_FAST_MULTIPLIER 1 #include <asm-generic/bitops/hweight.h> -#include <asm-generic/bitops/lock.h> #endif /* __KERNEL__ */ Index: linux-2.6/include/linux/mmzone.h =================================================================== --- linux-2.6.orig/include/linux/mmzone.h +++ linux-2.6/include/linux/mmzone.h @@ -615,13 +615,13 @@ extern struct zone *next_zone(struct zon * with 32 bit page->flags field, we reserve 9 bits for node/zone info. * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes. */ -#define FLAGS_RESERVED 9 +#define FLAGS_RESERVED 7 #elif BITS_PER_LONG == 64 /* * with 64 bit flags field, there's plenty of room. */ -#define FLAGS_RESERVED 32 +#define FLAGS_RESERVED 31 #else Index: linux-2.6/include/linux/page-flags.h =================================================================== --- linux-2.6.orig/include/linux/page-flags.h +++ linux-2.6/include/linux/page-flags.h @@ -67,7 +67,6 @@ * FLAGS_RESERVED which defines the width of the fields section * (see linux/mmzone.h). New flags must _not_ overlap with this area. */ -#define PG_locked 0 /* Page is locked. Don't touch. */ #define PG_error 1 #define PG_referenced 2 #define PG_uptodate 3 @@ -104,6 +103,14 @@ * 63 32 0 */ #define PG_uncached 31 /* Page has been mapped as uncached */ + +/* + * PG_locked sits in a different byte to the rest of the flags. This allows + * optimised implementations to use a non-atomic store to unlock. 
+ */ +#define PG_locked 32 +#else +#define PG_locked 24 #endif /* Index: linux-2.6/include/linux/pagemap.h =================================================================== --- linux-2.6.orig/include/linux/pagemap.h +++ linux-2.6/include/linux/pagemap.h @@ -187,7 +187,11 @@ static inline void lock_page_nosync(stru static inline void unlock_page(struct page *page) { VM_BUG_ON(!PageLocked(page)); - clear_bit_unlock(PG_locked, &page->flags); + /* + * PG_locked sits in its own byte in page->flags, away from normal + * flags, so we can do a non-atomic unlock here + */ + __clear_bit_unlock_byte(PG_locked, &page->flags); if (unlikely(PageWaiters(page))) __unlock_page(page); } Index: linux-2.6/mm/page_alloc.c =================================================================== --- linux-2.6.orig/mm/page_alloc.c +++ linux-2.6/mm/page_alloc.c @@ -192,17 +192,17 @@ static void bad_page(struct page *page) (unsigned long)page->flags, page->mapping, page_mapcount(page), page_count(page)); dump_stack(); - page->flags &= ~(1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_reclaim | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_buddy | - 1 << PG_waiters ); + page->flags &= ~(1UL << PG_lru | + 1UL << PG_private| + 1UL << PG_locked | + 1UL << PG_active | + 1UL << PG_dirty | + 1UL << PG_reclaim| + 1UL << PG_slab | + 1UL << PG_swapcache| + 1UL << PG_writeback| + 1UL << PG_buddy | + 1UL << PG_waiters ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -427,19 +427,19 @@ static inline void __free_one_page(struc static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | - (page->mapping != NULL) | - (page_count(page) != 0) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved | - 1 << PG_buddy | - 1 << PG_waiters )))) + 1UL << PG_lru | + 1UL << PG_private| + 1UL << PG_locked | + 1UL << PG_active | + 1UL << PG_slab | + 1UL << PG_swapcache| + 1UL << PG_writeback| + 1UL << PG_reserved| + 1UL << PG_buddy | + 1UL << PG_waiters )))) bad_page(page); /* * PageReclaim == PageTail. 
It is only an error @@ -582,21 +582,21 @@ static inline void expand(struct zone *z static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | - (page->mapping != NULL) | - (page_count(page) != 0) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_reclaim | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved | - 1 << PG_buddy | - 1 << PG_waiters )))) + 1UL << PG_lru | + 1UL << PG_private| + 1UL << PG_locked | + 1UL << PG_active | + 1UL << PG_dirty | + 1UL << PG_reclaim| + 1UL << PG_slab | + 1UL << PG_swapcache| + 1UL << PG_writeback| + 1UL << PG_reserved| + 1UL << PG_buddy | + 1UL << PG_waiters )))) bad_page(page); /* @@ -606,9 +606,9 @@ static int prep_new_page(struct page *pa if (PageReserved(page)) return 1; - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | - 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); + page->flags &= ~(1UL << PG_uptodate | 1UL << PG_error | + 1UL << PG_referenced | 1UL << PG_arch_1 | + 1UL << PG_owner_priv_1 | 1UL << PG_mappedtodisk); set_page_private(page, 0); set_page_refcounted(page); ^ permalink raw reply [flat|nested] 34+ messages in thread
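[Aside: a quick check of the arithmetic, assuming the little-endian
x86-64 layout the patch targets. With PG_locked moved to bit 32,
__clear_bit_unlock_byte() resolves to]

	mask = 1 << (32 % BITS_PER_BYTE);	/* == 0x01, bit 0 of its byte */
	p    = addr + 32 / BITS_PER_BYTE;	/* == addr + 4, flag bits 32..39 */

[so the plain "*p &= ~mask" store touches only byte four. The rest of
that byte belongs to the node/zone fields section (FLAGS_RESERVED is
trimmed from 32 to 31 precisely to free bit 32), which is written once
when the page is set up, so the non-atomic read-modify-write of the byte
has no concurrent update to lose.]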
* Re: [rfc] optimise unlock_page 2007-05-11 8:54 ` Nick Piggin @ 2007-05-11 13:15 ` Hugh Dickins 2007-05-13 3:32 ` Nick Piggin 0 siblings, 1 reply; 34+ messages in thread From: Hugh Dickins @ 2007-05-11 13:15 UTC (permalink / raw) To: Nick Piggin Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Fri, 11 May 2007, Nick Piggin wrote: > On Thu, May 10, 2007 at 08:14:52PM +0100, Hugh Dickins wrote: > > > > Well, on the x86_64 I have seen a few of your io_schedule_timeout > > printks under load; but suspect those are no fault of your changes, > > Hmm, I see... well I forgot to remove those from the page I sent, > the timeouts will kick things off again if they get stalled, so > maybe it just hides a problem? (OTOH, I *think* the logic is pretty > sound). As I said in what you snipped, I believe your debug there is showing up an existing problem on my machine, not a problem in your changes. > > So here it looks like a good change; but not enough to atone ;) > > Don't worry, I'm only just beginning ;) Can we then do something crazy > like this? (working on x86-64 only, so far. It seems to eliminate > lat_pagefault and lat_proc regressions here). I think Mr __NickPiggin_Lock is squirming ever more desperately. So, in essence, you'd like to expand PG_locked from 1 to 8 bits, despite the fact that page flags are known to be in short supply? Ah, no, you're keeping it near the static mmzone FLAGS_RESERVED. Hmm, well, I think that's fairly horrid, and would it even be guaranteed to work on all architectures? Playing with one char of an unsigned long in one way, while playing with the whole of the unsigned long in another way (bitops) sounds very dodgy to me. I think I'd rather just accept that your changes have slowed some microbenchmarks down: it is not always possible to fix a serious bug without slowing something down. That's probably what you're trying to push me into saying by this patch ;) But again I wonder just what the gain has been, once your double unmap_mapping_range is factored in. When I suggested before that perhaps the double (well, treble including the one in truncate.c) unmap_mapping_range might solve the problem you set out to solve (I've lost sight of that!) without pagelock when faulting, you said: > Well aside from being terribly ugly, it means we can still drop > the dirty bit where we'd otherwise rather not, so I don't think > we can do that. but that didn't give me enough information to agree or disagree. > > What architecture and workloads are you testing with, btw? i386 (2*HT P4 Xeons), x86_64 (2*HT P4 Xeons), PowerPC (G5 Quad). Workloads mostly lmbench and my usual pair of make -j20 kernel builds, one to tmpfs and one to ext2 looped on tmpfs, restricted to 512M RAM plus swap. Which is ever so old but still finds enough to keep me busy. Hugh > -- > > Put PG_locked in its own byte from other PG_bits, so we can use non-atomic > stores to unlock it. ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-11 13:15 ` Hugh Dickins @ 2007-05-13 3:32 ` Nick Piggin 2007-05-13 4:39 ` Hugh Dickins 2007-05-16 17:21 ` Hugh Dickins 0 siblings, 2 replies; 34+ messages in thread From: Nick Piggin @ 2007-05-13 3:32 UTC (permalink / raw) To: Hugh Dickins Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote: > On Fri, 11 May 2007, Nick Piggin wrote: > > > > Don't worry, I'm only just beginning ;) Can we then do something crazy > > like this? (working on x86-64 only, so far. It seems to eliminate > > lat_pagefault and lat_proc regressions here). > > I think Mr __NickPiggin_Lock is squirming ever more desperately. Really? I thought it was pretty cool to be able to shave several hundreds of cycles off our page lock :) > So, in essence, you'd like to expand PG_locked from 1 to 8 bits, > despite the fact that page flags are known to be in short supply? > Ah, no, you're keeping it near the static mmzone FLAGS_RESERVED. Yep, no flags bloating at all. > Hmm, well, I think that's fairly horrid, and would it even be > guaranteed to work on all architectures? Playing with one char > of an unsigned long in one way, while playing with the whole of > the unsigned long in another way (bitops) sounds very dodgy to me. Of course not, but they can just use a regular atomic word sized bitop. The problem with i386 is that its atomic ops also imply memory barriers that you obviously don't need on unlock. So I think getting rid of them is pretty good. A grep of mm/ and fs/ for lock_page tells me we want this to be as fast as possible even if it isn't being used in the nopage fastpath. > I think I'd rather just accept that your changes have slowed some > microbenchmarks down: it is not always possible to fix a serious > bug without slowing something down. That's probably what you're > trying to push me into saying by this patch ;) Well I was resigned to that few % regression in the page fault path until the G5 numbers showed that we needed to improve things. But now it looks like (at least on my 2*HT P4 Xeon) that we don't have to have any regression there. > But again I wonder just what the gain has been, once your double > unmap_mapping_range is factored in. When I suggested before that > perhaps the double (well, treble including the one in truncate.c) > unmap_mapping_range might solve the problem you set out to solve > (I've lost sight of that!) without pagelock when faulting, you said: > > > Well aside from being terribly ugly, it means we can still drop > > the dirty bit where we'd otherwise rather not, so I don't think > > we can do that. > > but that didn't give me enough information to agree or disagree. Oh, well invalidate wants to be able to skip dirty pages or have the filesystem do something special with them first. Once you have taken the page out of the pagecache but still mapped shared, then blowing it away doesn't actually solve the data loss problem... only makes the window of VM inconsistency smaller. > > What architecture and workloads are you testing with, btw? > > i386 (2*HT P4 Xeons), x86_64 (2*HT P4 Xeons), PowerPC (G5 Quad). > > Workloads mostly lmbench and my usual pair of make -j20 kernel builds, > one to tmpfs and one to ext2 looped on tmpfs, restricted to 512M RAM > plus swap. Which is ever so old but still finds enough to keep me busy. Thanks, Nick ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page
  2007-05-13  3:32             ` Nick Piggin
@ 2007-05-13  4:39               ` Hugh Dickins
  2007-05-13  6:52                 ` Nick Piggin
  0 siblings, 1 reply; 34+ messages in thread
From: Hugh Dickins @ 2007-05-13 4:39 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Sun, 13 May 2007, Nick Piggin wrote:
> On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote:
> >
> > Hmm, well, I think that's fairly horrid, and would it even be
> > guaranteed to work on all architectures?  Playing with one char
> > of an unsigned long in one way, while playing with the whole of
> > the unsigned long in another way (bitops) sounds very dodgy to me.
>
> Of course not, but they can just use a regular atomic word sized
> bitop. The problem with i386 is that its atomic ops also imply
> memory barriers that you obviously don't need on unlock.

But is it even a valid procedure on i386?

Hugh

^ permalink raw reply	[flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page
  2007-05-13  4:39               ` Hugh Dickins
@ 2007-05-13  6:52                 ` Nick Piggin
  2007-05-16 17:54                   ` Hugh Dickins
  0 siblings, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-05-13 6:52 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Sun, May 13, 2007 at 05:39:03AM +0100, Hugh Dickins wrote:
> On Sun, 13 May 2007, Nick Piggin wrote:
> > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote:
> > >
> > > Hmm, well, I think that's fairly horrid, and would it even be
> > > guaranteed to work on all architectures?  Playing with one char
> > > of an unsigned long in one way, while playing with the whole of
> > > the unsigned long in another way (bitops) sounds very dodgy to me.
> >
> > Of course not, but they can just use a regular atomic word sized
> > bitop. The problem with i386 is that its atomic ops also imply
> > memory barriers that you obviously don't need on unlock.
>
> But is it even a valid procedure on i386?

Well I think so, but not completely sure. OTOH, I admit this is one
of the more contentious speedups ;) It is likely to vary a lot by
the arch (I think the P4 is infamous for expensive locked ops, others
may prefer not to mix the byte sized ops with word length ones).

But that aside, I'd still like to do the lock page in nopage and get
this bug fixed. Now it is possible to fix some other way, eg we could
use another page flag (I'd say it would be better to use that flag for
PG_waiters and speed up all PG_locked users), however I think it is
fine to lock the page over fault. It gets rid of some complexity of
memory ordering there, and we already have to do the
wait_on_page_locked thing to prevent the page_mkclean data loss thingy.

I haven't seen a non-microbenchmark where it hurts.

^ permalink raw reply	[flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-13 6:52 ` Nick Piggin @ 2007-05-16 17:54 ` Hugh Dickins 2007-05-16 18:18 ` Nick Piggin 0 siblings, 1 reply; 34+ messages in thread From: Hugh Dickins @ 2007-05-16 17:54 UTC (permalink / raw) To: Nick Piggin Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Sun, 13 May 2007, Nick Piggin wrote: > On Sun, May 13, 2007 at 05:39:03AM +0100, Hugh Dickins wrote: > > On Sun, 13 May 2007, Nick Piggin wrote: > > > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote: > > > > > > > Hmm, well, I think that's fairly horrid, and would it even be > > > > guaranteed to work on all architectures? Playing with one char > > > > of an unsigned long in one way, while playing with the whole of > > > > the unsigned long in another way (bitops) sounds very dodgy to me. > > > > > > Of course not, but they can just use a regular atomic word sized > > > bitop. The problem with i386 is that its atomic ops also imply > > > memory barriers that you obviously don't need on unlock. > > > > But is it even a valid procedure on i386? > > Well I think so, but not completely sure. That's not quite enough to convince me! I do retract my "fairly horrid" remark, that was a kneejerk reaction to cleverness; it's quite nice, if it can be guaranteed to work (and if lowering FLAGS_RESERVED from 9 to 7 doesn't upset whoever carefully chose 9). Please seek out those guarantees. Like you, I can't really see how it would go wrong (how could moving in the unlocked char mess with the flag bits in the rest of the long? how could atomically modifying the long have a chance of undoing that move?), but it feels like it might take us into errata territory. Hugh > OTOH, I admit this is one > of the more contentious speedups ;) It is likely to be vary a lot by > the arch (I think the P4 is infamous for expensive locked ops, others > may prefer not to mix the byte sized ops with word length ones). ^ permalink raw reply [flat|nested] 34+ messages in thread
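[Aside: the exception Hugh points at. On CONFIG_X86_OOSTORE /
CONFIG_X86_PPRO_FENCE builds, asm-i386/spinlock.h does not trust a plain
store and unlocks with an exchange instead; quoted from memory of the
2.6 tree, so check the file for the exact text:]

	static inline void __raw_spin_unlock(raw_spinlock_t *lock)
	{
		char oldval = 1;

		asm volatile("xchgb %b0, %1"
			     : "=q" (oldval), "+m" (lock->slock)
			     : "0" (oldval) : "memory");
	}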
* Re: [rfc] optimise unlock_page 2007-05-16 17:54 ` Hugh Dickins @ 2007-05-16 18:18 ` Nick Piggin 2007-05-16 19:28 ` Hugh Dickins 0 siblings, 1 reply; 34+ messages in thread From: Nick Piggin @ 2007-05-16 18:18 UTC (permalink / raw) To: Hugh Dickins Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List, Linus Torvalds On Wed, May 16, 2007 at 06:54:15PM +0100, Hugh Dickins wrote: > On Sun, 13 May 2007, Nick Piggin wrote: > > On Sun, May 13, 2007 at 05:39:03AM +0100, Hugh Dickins wrote: > > > On Sun, 13 May 2007, Nick Piggin wrote: > > > > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote: > > > > > > > > > Hmm, well, I think that's fairly horrid, and would it even be > > > > > guaranteed to work on all architectures? Playing with one char > > > > > of an unsigned long in one way, while playing with the whole of > > > > > the unsigned long in another way (bitops) sounds very dodgy to me. > > > > > > > > Of course not, but they can just use a regular atomic word sized > > > > bitop. The problem with i386 is that its atomic ops also imply > > > > memory barriers that you obviously don't need on unlock. > > > > > > But is it even a valid procedure on i386? > > > > Well I think so, but not completely sure. > > That's not quite enough to convince me! I did ask Linus, and he was very sure it works. > I do retract my "fairly horrid" remark, that was a kneejerk reaction > to cleverness; it's quite nice, if it can be guaranteed to work (and > if lowering FLAGS_RESERVED from 9 to 7 doesn't upset whoever carefully > chose 9). Hmm, it is a _little_ bit horrid ;) Maybe slightly clever, but definitely slightly horrid as well! Not so much from a high level view (although it does put more constraints on the flags layout), but from a CPU level... the way we intermix different sized loads and stores can run into store forwarding issues[*] which might be expensive as well. Not to mention that we can't do the non-atomic unlock on all architectures. OTOH, it did _seem_ to eliminate the pagefault regression on my P4 Xeon here, in one round of tests. The other option of moving the bit into ->mapping hopefully avoids all the issues, and would probably be a little faster again on the P4, at the expense of being a more intrusive (but it doesn't look too bad, at first glance)... [*] I did mention to Linus that we might be able to avoid the store forwarding stall by loading just a single byte to test PG_waiters after the byte store to clear PG_locked. At this point he puked. We decided that using another field altogether might be better. > Please seek out those guarantees. Like you, I can't really see how > it would go wrong (how could moving in the unlocked char mess with > the flag bits in the rest of the long? how could atomically modifying > the long have a chance of undoing that move?), but it feels like it > might take us into errata territory. I think we can just rely on the cache coherency protocol taking care of it for us, on x86. movb would not affect other data other than the dest. A non-atomic op _could_ of course undo the movb, but it could likewise undo any other store to the word or byte. An atomic op on the flags does not modify the movb byte so the movb before/after possibilities should look exactly the same regardless of the atomic operations happening. ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-16 18:18 ` Nick Piggin @ 2007-05-16 19:28 ` Hugh Dickins 2007-05-16 19:47 ` Linus Torvalds 0 siblings, 1 reply; 34+ messages in thread From: Hugh Dickins @ 2007-05-16 19:28 UTC (permalink / raw) To: Nick Piggin Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List, Linus Torvalds On Wed, 16 May 2007, Nick Piggin wrote: > On Wed, May 16, 2007 at 06:54:15PM +0100, Hugh Dickins wrote: > > On Sun, 13 May 2007, Nick Piggin wrote: > > > > > > Well I think so, but not completely sure. > > > > That's not quite enough to convince me! > > I did ask Linus, and he was very sure it works. Good, that's very encouraging. > Not so much from a high level view (although it does put more constraints > on the flags layout), but from a CPU level... the way we intermix different > sized loads and stores can run into store forwarding issues[*] which might > be expensive as well. Not to mention that we can't do the non-atomic > unlock on all architectures. Ah yes, that's easier to envisage than an actual correctness problem. > The other option of moving the bit into ->mapping hopefully avoids all > the issues, and would probably be a little faster again on the P4, at the > expense of being a more intrusive (but it doesn't look too bad, at first > glance)... Hmm, I'm so happy with PG_swapcache in there, that I'm reluctant to cede it to your PG_locked, though I can't deny your use should take precedence. Perhaps we could enforce 8-byte alignment of struct address_space and struct anon_vma to make both bits available (along with the anon bit). But I think you may not be appreciating how intrusive PG_locked will be. There are many references to page->mapping (often ->host) throughout fs/ : when we keep anon/swap flags in page->mapping, we know the filesystems will never see those bits set in their pages, so no page_mapping-like conversion is needed; just a few places in common code need to adapt. And given our deprecation discipline for in-kernel interfaces, wouldn't we have to wait a similar period before making page->mapping unavailable to out-of-tree filesystems? > > Please seek out those guarantees. Like you, I can't really see how > > it would go wrong (how could moving in the unlocked char mess with > > the flag bits in the rest of the long? how could atomically modifying > > the long have a chance of undoing that move?), but it feels like it > > might take us into errata territory. > > I think we can just rely on the cache coherency protocol taking care of > it for us, on x86. movb would not affect other data other than the dest. > A non-atomic op _could_ of course undo the movb, but it could likewise > undo any other store to the word or byte. An atomic op on the flags does > not modify the movb byte so the movb before/after possibilities should > look exactly the same regardless of the atomic operations happening. Yes, I've gone through that same thought process (my questions were intended as rhetorical exclamations of inconceivabilty, rather than actual queries). But if you do go that way, I'd still like you to check with Intel and AMD for errata. See include/asm-i386/spinlock.h for the CONFIG_X86_OOSTORE || CONFIG_X86_PPRO_FENCE __raw_spin_unlock using xchgb: doesn't that hint that exceptions may be needed? Hugh ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-16 19:28 ` Hugh Dickins @ 2007-05-16 19:47 ` Linus Torvalds 2007-05-17 6:27 ` Nick Piggin 0 siblings, 1 reply; 34+ messages in thread From: Linus Torvalds @ 2007-05-16 19:47 UTC (permalink / raw) To: Hugh Dickins Cc: Nick Piggin, Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Wed, 16 May 2007, Hugh Dickins wrote: > On Wed, 16 May 2007, Nick Piggin wrote: > > On Wed, May 16, 2007 at 06:54:15PM +0100, Hugh Dickins wrote: > > > On Sun, 13 May 2007, Nick Piggin wrote: > > > > > > > > Well I think so, but not completely sure. > > > > > > That's not quite enough to convince me! > > > > I did ask Linus, and he was very sure it works. > > Good, that's very encouraging. Note that our default spinlocks _depend_ on a bog-standard store just working as an unlock, so this wouldn't even be anything half-way new: static inline void __raw_spin_unlock(raw_spinlock_t *lock) { asm volatile("movb $1,%0" : "+m" (lock->slock) :: "memory"); } There are some Opteron errata (and a really old P6 bug) wrt this, but they are definitely CPU bugs, and we haven't really worked out what the Opteron solution should be (the bug is apparently pretty close to impossible to trigger in practice, so it's not been a high priority). > > The other option of moving the bit into ->mapping hopefully avoids all > > the issues, and would probably be a little faster again on the P4, at the > > expense of being a more intrusive (but it doesn't look too bad, at first > > glance)... > > Hmm, I'm so happy with PG_swapcache in there, that I'm reluctant to > cede it to your PG_locked, though I can't deny your use should take > precedence. Perhaps we could enforce 8-byte alignment of struct > address_space and struct anon_vma to make both bits available > (along with the anon bit). We probably could. It should be easy enough to mark "struct address_space" to be 8-byte aligned. > But I think you may not be appreciating how intrusive PG_locked > will be. There are many references to page->mapping (often ->host) > throughout fs/ : when we keep anon/swap flags in page->mapping, we > know the filesystems will never see those bits set in their pages, > so no page_mapping-like conversion is needed; just a few places in > common code need to adapt. You're right, it could be really painful. We'd have to rename the field, and use some inline function to access it (which masks off the low bits). Linus ^ permalink raw reply [flat|nested] 34+ messages in thread
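[Aside: the masking accessor Linus describes might look like this,
assuming struct address_space is forced to 8-byte alignment so the
bottom three bits of page->mapping are free; the names and bit
assignments here are hypothetical:]

	#define PAGE_MAPPING_BITS	0x7UL	/* e.g. locked, anon, swapcache */

	static inline struct address_space *page_mapping_of(struct page *page)
	{
		return (struct address_space *)
			((unsigned long)page->mapping & ~PAGE_MAPPING_BITS);
	}

[Stores would go through a set_page_mapping() counterpart, and any flag
that changes while the page is in use - PG_locked above all - would need
atomic ops on the word rather than a plain assignment.]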
* Re: [rfc] optimise unlock_page 2007-05-16 19:47 ` Linus Torvalds @ 2007-05-17 6:27 ` Nick Piggin 0 siblings, 0 replies; 34+ messages in thread From: Nick Piggin @ 2007-05-17 6:27 UTC (permalink / raw) To: Linus Torvalds Cc: Hugh Dickins, Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Wed, May 16, 2007 at 12:47:54PM -0700, Linus Torvalds wrote: > > On Wed, 16 May 2007, Hugh Dickins wrote: > > > > The other option of moving the bit into ->mapping hopefully avoids all > > > the issues, and would probably be a little faster again on the P4, at the > > > expense of being a more intrusive (but it doesn't look too bad, at first > > > glance)... > > > > Hmm, I'm so happy with PG_swapcache in there, that I'm reluctant to > > cede it to your PG_locked, though I can't deny your use should take > > precedence. Perhaps we could enforce 8-byte alignment of struct > > address_space and struct anon_vma to make both bits available > > (along with the anon bit). > > We probably could. It should be easy enough to mark "struct address_space" > to be 8-byte aligned. Yeah, it might be worthwhile, because I agree that PG_swapcache would work nicely there too. > > But I think you may not be appreciating how intrusive PG_locked > > will be. There are many references to page->mapping (often ->host) > > throughout fs/ : when we keep anon/swap flags in page->mapping, we > > know the filesystems will never see those bits set in their pages, > > so no page_mapping-like conversion is needed; just a few places in > > common code need to adapt. > > You're right, it could be really painful. We'd have to rename the field, > and use some inline function to access it (which masks off the low bits). Yeah, I realise that the change is intrusive in terms of lines touched, but AFAIKS, it should not be much more complex than a search/replace... As far as deprecating things goes... I don't think we have to wait too long, its more for features, drivers, or more fundamental APIs isn't it? If we just point out that one must use set_page_mapping/page_mapping rather than page->mapping, it is trivial to fix any out of tree breakage. ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-13 3:32 ` Nick Piggin 2007-05-13 4:39 ` Hugh Dickins @ 2007-05-16 17:21 ` Hugh Dickins 2007-05-16 17:38 ` Nick Piggin 1 sibling, 1 reply; 34+ messages in thread From: Hugh Dickins @ 2007-05-16 17:21 UTC (permalink / raw) To: Nick Piggin Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Sun, 13 May 2007, Nick Piggin wrote: > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote: > > > But again I wonder just what the gain has been, once your double > > unmap_mapping_range is factored in. When I suggested before that > > perhaps the double (well, treble including the one in truncate.c) > > unmap_mapping_range might solve the problem you set out to solve > > (I've lost sight of that!) without pagelock when faulting, you said: > > > > > Well aside from being terribly ugly, it means we can still drop > > > the dirty bit where we'd otherwise rather not, so I don't think > > > we can do that. > > > > but that didn't give me enough information to agree or disagree. > > Oh, well invalidate wants to be able to skip dirty pages or have the > filesystem do something special with them first. Once you have taken > the page out of the pagecache but still mapped shared, then blowing > it away doesn't actually solve the data loss problem... only makes > the window of VM inconsistency smaller. Right, I think I see what you mean now, thanks: userspace must not for a moment be allowed to write to orphaned pages. Whereas it's not an issue for the privately COWed pages you added the second unmap_mapping_range for: because it's only truncation that has to worry about them, so they're heading for SIGBUS anyway. Yes, and the page_mapped tests in mm/truncate.c are just racy heuristics without the page lock you now put into faulting. Hugh ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] optimise unlock_page 2007-05-16 17:21 ` Hugh Dickins @ 2007-05-16 17:38 ` Nick Piggin 0 siblings, 0 replies; 34+ messages in thread From: Nick Piggin @ 2007-05-16 17:38 UTC (permalink / raw) To: Hugh Dickins Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List On Wed, May 16, 2007 at 06:21:09PM +0100, Hugh Dickins wrote: > On Sun, 13 May 2007, Nick Piggin wrote: > > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote: > > > > > But again I wonder just what the gain has been, once your double > > > unmap_mapping_range is factored in. When I suggested before that > > > perhaps the double (well, treble including the one in truncate.c) > > > unmap_mapping_range might solve the problem you set out to solve > > > (I've lost sight of that!) without pagelock when faulting, you said: > > > > > > > Well aside from being terribly ugly, it means we can still drop > > > > the dirty bit where we'd otherwise rather not, so I don't think > > > > we can do that. > > > > > > but that didn't give me enough information to agree or disagree. > > > > Oh, well invalidate wants to be able to skip dirty pages or have the > > filesystem do something special with them first. Once you have taken > > the page out of the pagecache but still mapped shared, then blowing > > it away doesn't actually solve the data loss problem... only makes > > the window of VM inconsistency smaller. > > Right, I think I see what you mean now, thanks: userspace > must not for a moment be allowed to write to orphaned pages. Yep. > Whereas it's not an issue for the privately COWed pages you added > the second unmap_mapping_range for: because it's only truncation > that has to worry about them, so they're heading for SIGBUS anyway. > > Yes, and the page_mapped tests in mm/truncate.c are just racy > heuristics without the page lock you now put into faulting. Yes. ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [rfc] lock bitops
  2007-05-08 11:37 [rfc] lock bitops Nick Piggin
  2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
@ 2007-05-08 12:22 ` David Howells
  2007-05-08 22:33   ` Nick Piggin
  2007-05-09  6:18   ` Nick Piggin
  2007-05-08 15:06 ` Matthew Wilcox
  2007-05-09 12:08 ` Nikita Danilov
  3 siblings, 2 replies; 34+ messages in thread
From: David Howells @ 2007-05-08 12:22 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

Nick Piggin <npiggin@suse.de> wrote:

> This patch (along with the subsequent one to optimise unlock_page) reduces
> the overhead of lock_page/unlock_page (measured with page faults and a patch
> to lock the page in the fault handler) by about 425 cycles on my 2-way G5.

Seems reasonable, though test_and_set_lock_bit() might be a better name.

> +There are two special bitops with lock barrier semantics (acquire/release,
> +same as spinlocks).

You should update Documentation/memory-barriers.txt also.

> #define TestSetPageLocked(page) \
> 	test_and_set_bit(PG_locked, &(page)->flags)
> +#define TestSetPageLocked_Lock(page) \
> +	test_and_set_bit_lock(PG_locked, &(page)->flags)

Can we get away with just moving TestSetPageLocked() to the new function
rather than adding another accessor?  Or how about LockPageLocked() and
UnlockPageLocked() rather than SetPageLocked_Lock()?  That last looks
wrong somehow.

The FRV changes look reasonable, btw.

David

^ permalink raw reply	[flat|nested] 34+ messages in thread
* Re: [rfc] lock bitops 2007-05-08 12:22 ` [rfc] lock bitops David Howells @ 2007-05-08 22:33 ` Nick Piggin 2007-05-09 6:18 ` Nick Piggin 1 sibling, 0 replies; 34+ messages in thread From: Nick Piggin @ 2007-05-08 22:33 UTC (permalink / raw) To: David Howells Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton, Linux Kernel Mailing List On Tue, May 08, 2007 at 01:22:56PM +0100, David Howells wrote: > Nick Piggin <npiggin@suse.de> wrote: > > > This patch (along with the subsequent one to optimise unlock_page) reduces > > the overhead of lock_page/unlock_page (measured with page faults and a patch > > to lock the page in the fault handler) by about 425 cycles on my 2-way G5. > > Seems reasonable, though test_and_set_lock_bit() might be a better name. The postfix attaches lock semantics to the test_and_set_bit operation, so I don't think it is necessarily wrong to have this name. But it doesn't matter too much I guess. > > +There are two special bitops with lock barrier semantics (acquire/release, > > +same as spinlocks). > > You should update Documentation/memory-barriers.txt also. Will do. > > #define TestSetPageLocked(page) \ > > test_and_set_bit(PG_locked, &(page)->flags) > > +#define TestSetPageLocked_Lock(page) \ > > + test_and_set_bit_lock(PG_locked, &(page)->flags) > > Can we get away with just moving TestSetPageLocked() to the new function > rather than adding another accessor? Or how about LockPageLocked() and > UnlockPageLocked() rather than SetPageLocked_Lock() that last looks wrong > somehow. The problem is that it implies some semantics that it may not have. Possibly better is to just remove them all and use only trylock/lock/unlock_page. ^ permalink raw reply [flat|nested] 34+ messages in thread
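[Aside: for architectures that take the asm-generic/bitops/lock.h
fallback listed in the diffstat of the repost below, the two new
operations can be little more than the existing atomics plus release
ordering. A sketch of the idea, not the exact file text:]

	/*
	 * test_and_set_bit() already implies a full barrier, so it is
	 * acquire-safe as it stands.
	 */
	#define test_and_set_bit_lock(nr, addr)	test_and_set_bit(nr, addr)

	/* order all prior stores before the releasing clear */
	#define clear_bit_unlock(nr, addr)		\
	do {						\
		smp_mb__before_clear_bit();		\
		clear_bit(nr, addr);			\
	} while (0)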
* Re: [rfc] lock bitops 2007-05-08 12:22 ` [rfc] lock bitops David Howells 2007-05-08 22:33 ` Nick Piggin @ 2007-05-09 6:18 ` Nick Piggin 1 sibling, 0 replies; 34+ messages in thread From: Nick Piggin @ 2007-05-09 6:18 UTC (permalink / raw) To: David Howells Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton, Linux Kernel Mailing List On Tue, May 08, 2007 at 01:22:56PM +0100, David Howells wrote: > Nick Piggin <npiggin@suse.de> wrote: > > > +There are two special bitops with lock barrier semantics (acquire/release, > > +same as spinlocks). > > You should update Documentation/memory-barriers.txt also. > > > #define TestSetPageLocked(page) \ > > test_and_set_bit(PG_locked, &(page)->flags) > > +#define TestSetPageLocked_Lock(page) \ > > + test_and_set_bit_lock(PG_locked, &(page)->flags) > > Can we get away with just moving TestSetPageLocked() to the new function > rather than adding another accessor? Or how about LockPageLocked() and > UnlockPageLocked() rather than SetPageLocked_Lock() that last looks wrong Updated memory-barriers.txt, made the generic routines #defines because smp_mb isn't always included early enough, and culled some of the page- flags.h macros in favour of coding the lock operations directly. -- Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics. Add non-trivial implementations for powerpc and ia64. Convert page lock, buffer lock, bit_spin_lock, tasklet locks to use the new locks. --- Documentation/atomic_ops.txt | 9 +++++++ Documentation/memory-barriers.txt | 14 ++++++++++-- drivers/scsi/sg.c | 2 - fs/buffer.c | 7 ++---- fs/cifs/file.c | 2 - fs/jbd/commit.c | 4 +-- fs/jbd2/commit.c | 4 +-- fs/ntfs/aops.c | 2 - fs/ntfs/compress.c | 2 - fs/ntfs/mft.c | 4 +-- fs/reiserfs/inode.c | 2 - fs/reiserfs/journal.c | 4 +-- fs/xfs/linux-2.6/xfs_aops.c | 4 +-- include/asm-alpha/bitops.h | 1 include/asm-arm/bitops.h | 1 include/asm-arm26/bitops.h | 1 include/asm-avr32/bitops.h | 1 include/asm-cris/bitops.h | 1 include/asm-frv/bitops.h | 1 include/asm-generic/bitops.h | 1 include/asm-generic/bitops/lock.h | 13 +++++++++++ include/asm-h8300/bitops.h | 1 include/asm-i386/bitops.h | 1 include/asm-ia64/bitops.h | 33 ++++++++++++++++++++++++++++ include/asm-m32r/bitops.h | 1 include/asm-m68k/bitops.h | 1 include/asm-m68knommu/bitops.h | 1 include/asm-mips/bitops.h | 1 include/asm-parisc/bitops.h | 1 include/asm-powerpc/bitops.h | 44 ++++++++++++++++++++++++++++++++++++++ include/asm-s390/bitops.h | 1 include/asm-sh/bitops.h | 1 include/asm-sh64/bitops.h | 1 include/asm-sparc/bitops.h | 1 include/asm-sparc64/bitops.h | 1 include/asm-v850/bitops.h | 1 include/asm-x86_64/bitops.h | 1 include/asm-xtensa/bitops.h | 1 include/linux/bit_spinlock.h | 11 +++++---- include/linux/buffer_head.h | 8 +++++- include/linux/interrupt.h | 5 +--- include/linux/page-flags.h | 6 ----- include/linux/pagemap.h | 9 ++++++- kernel/wait.c | 2 - mm/filemap.c | 17 ++++++-------- mm/memory.c | 2 - mm/migrate.c | 4 +-- mm/rmap.c | 2 - mm/shmem.c | 4 +-- mm/swap.c | 2 - mm/swap_state.c | 2 - mm/swapfile.c | 2 - mm/truncate.c | 4 +-- mm/vmscan.c | 4 +-- 54 files changed, 193 insertions(+), 63 deletions(-) Index: linux-2.6/include/asm-powerpc/bitops.h =================================================================== --- linux-2.6.orig/include/asm-powerpc/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-powerpc/bitops.h 2007-05-09 14:00:42.000000000 +1000 @@ -87,6 +87,24 @@ : "cc" ); } +static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr) +{ + 
unsigned long old; + unsigned long mask = BITOP_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + + __asm__ __volatile__( + LWSYNC_ON_SMP +"1:" PPC_LLARX "%0,0,%3 # clear_bit_unlock\n" + "andc %0,%0,%2\n" + PPC405_ERR77(0,%3) + PPC_STLCX "%0,0,%3\n" + "bne- 1b" + : "=&r" (old), "+m" (*p) + : "r" (mask), "r" (p) + : "cc" ); +} + static __inline__ void change_bit(int nr, volatile unsigned long *addr) { unsigned long old; @@ -126,6 +144,27 @@ return (old & mask) != 0; } +static __inline__ int test_and_set_bit_lock(unsigned long nr, + volatile unsigned long *addr) +{ + unsigned long old, t; + unsigned long mask = BITOP_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + + __asm__ __volatile__( +"1:" PPC_LLARX "%0,0,%3 # test_and_set_bit_lock\n" + "or %1,%0,%2 \n" + PPC405_ERR77(0,%3) + PPC_STLCX "%1,0,%3 \n" + "bne- 1b" + ISYNC_ON_SMP + : "=&r" (old), "=&r" (t) + : "r" (mask), "r" (p) + : "cc", "memory"); + + return (old & mask) != 0; +} + static __inline__ int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { Index: linux-2.6/include/linux/pagemap.h =================================================================== --- linux-2.6.orig/include/linux/pagemap.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/linux/pagemap.h 2007-05-09 13:57:33.000000000 +1000 @@ -135,13 +135,18 @@ extern void FASTCALL(__lock_page_nosync(struct page *page)); extern void FASTCALL(unlock_page(struct page *page)); +static inline int trylock_page(struct page *page) +{ + return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); +} + /* * lock_page may only be called if we have the page's inode pinned. */ static inline void lock_page(struct page *page) { might_sleep(); - if (TestSetPageLocked(page)) + if (!trylock_page(page)) __lock_page(page); } @@ -152,7 +157,7 @@ static inline void lock_page_nosync(struct page *page) { might_sleep(); - if (TestSetPageLocked(page)) + if (!trylock_page(page)) __lock_page_nosync(page); } Index: linux-2.6/drivers/scsi/sg.c =================================================================== --- linux-2.6.orig/drivers/scsi/sg.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/drivers/scsi/sg.c 2007-05-09 13:57:33.000000000 +1000 @@ -1734,7 +1734,7 @@ */ flush_dcache_page(pages[i]); /* ?? Is locking needed? I don't think so */ - /* if (TestSetPageLocked(pages[i])) + /* if (!trylock_page(pages[i])) goto out_unlock; */ } Index: linux-2.6/fs/cifs/file.c =================================================================== --- linux-2.6.orig/fs/cifs/file.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/cifs/file.c 2007-05-09 13:57:33.000000000 +1000 @@ -1229,7 +1229,7 @@ if (first < 0) lock_page(page); - else if (TestSetPageLocked(page)) + else if (!trylock_page(page)) break; if (unlikely(page->mapping != mapping)) { Index: linux-2.6/fs/jbd/commit.c =================================================================== --- linux-2.6.orig/fs/jbd/commit.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/jbd/commit.c 2007-05-09 13:57:33.000000000 +1000 @@ -64,7 +64,7 @@ goto nope; /* OK, it's a truncated page */ - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto nope; page_cache_get(page); @@ -209,7 +209,7 @@ * blocking lock_buffer(). 
*/ if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); /* Write out all data to prevent deadlocks */ Index: linux-2.6/fs/jbd2/commit.c =================================================================== --- linux-2.6.orig/fs/jbd2/commit.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/jbd2/commit.c 2007-05-09 13:57:33.000000000 +1000 @@ -64,7 +64,7 @@ goto nope; /* OK, it's a truncated page */ - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto nope; page_cache_get(page); @@ -209,7 +209,7 @@ * blocking lock_buffer(). */ if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); /* Write out all data to prevent deadlocks */ Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c =================================================================== --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c 2007-05-09 13:57:33.000000000 +1000 @@ -601,7 +601,7 @@ } else pg_offset = PAGE_CACHE_SIZE; - if (page->index == tindex && !TestSetPageLocked(page)) { + if (page->index == tindex && trylock_page(page)) { len = xfs_probe_page(page, pg_offset, mapped); unlock_page(page); } @@ -685,7 +685,7 @@ if (page->index != tindex) goto fail; - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto fail; if (PageWriteback(page)) goto fail_unlock_page; Index: linux-2.6/include/linux/page-flags.h =================================================================== --- linux-2.6.orig/include/linux/page-flags.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/linux/page-flags.h 2007-05-09 13:57:33.000000000 +1000 @@ -112,12 +112,6 @@ test_bit(PG_locked, &(page)->flags) #define SetPageLocked(page) \ set_bit(PG_locked, &(page)->flags) -#define TestSetPageLocked(page) \ - test_and_set_bit(PG_locked, &(page)->flags) -#define ClearPageLocked(page) \ - clear_bit(PG_locked, &(page)->flags) -#define TestClearPageLocked(page) \ - test_and_clear_bit(PG_locked, &(page)->flags) #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) Index: linux-2.6/mm/memory.c =================================================================== --- linux-2.6.orig/mm/memory.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/memory.c 2007-05-09 13:57:33.000000000 +1000 @@ -1550,7 +1550,7 @@ * not dirty accountable. */ if (PageAnon(old_page)) { - if (!TestSetPageLocked(old_page)) { + if (trylock_page(old_page)) { reuse = can_share_swap_page(old_page); unlock_page(old_page); } Index: linux-2.6/mm/migrate.c =================================================================== --- linux-2.6.orig/mm/migrate.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/migrate.c 2007-05-09 13:57:33.000000000 +1000 @@ -569,7 +569,7 @@ * establishing additional references. We are the only one * holding a reference to the new page at this point. 
*/ - if (TestSetPageLocked(newpage)) + if (!trylock_page(newpage)) BUG(); /* Prepare mapping for the new page.*/ @@ -621,7 +621,7 @@ goto move_newpage; rc = -EAGAIN; - if (TestSetPageLocked(page)) { + if (!trylock_page(page)) { if (!force) goto move_newpage; lock_page(page); Index: linux-2.6/mm/rmap.c =================================================================== --- linux-2.6.orig/mm/rmap.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/rmap.c 2007-05-09 13:57:33.000000000 +1000 @@ -426,7 +426,7 @@ referenced += page_referenced_anon(page); else if (is_locked) referenced += page_referenced_file(page); - else if (TestSetPageLocked(page)) + else if (!trylock_page(page)) referenced++; else { if (page->mapping) Index: linux-2.6/mm/shmem.c =================================================================== --- linux-2.6.orig/mm/shmem.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/shmem.c 2007-05-09 13:57:33.000000000 +1000 @@ -1154,7 +1154,7 @@ } /* We have to do this with page locked to prevent races */ - if (TestSetPageLocked(swappage)) { + if (!trylock_page(swappage)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); wait_on_page_locked(swappage); @@ -1213,7 +1213,7 @@ shmem_swp_unmap(entry); filepage = find_get_page(mapping, idx); if (filepage && - (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { + (!PageUptodate(filepage) || !trylock_page(filepage))) { spin_unlock(&info->lock); wait_on_page_locked(filepage); page_cache_release(filepage); Index: linux-2.6/mm/swap.c =================================================================== --- linux-2.6.orig/mm/swap.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/swap.c 2007-05-09 13:57:33.000000000 +1000 @@ -412,7 +412,7 @@ for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (PagePrivate(page) && !TestSetPageLocked(page)) { + if (PagePrivate(page) && trylock_page(page)) { if (PagePrivate(page)) try_to_release_page(page, 0); unlock_page(page); Index: linux-2.6/mm/swap_state.c =================================================================== --- linux-2.6.orig/mm/swap_state.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/swap_state.c 2007-05-09 13:57:33.000000000 +1000 @@ -252,7 +252,7 @@ */ static inline void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && !TestSetPageLocked(page)) { + if (PageSwapCache(page) && trylock_page(page)) { remove_exclusive_swap_page(page); unlock_page(page); } Index: linux-2.6/mm/swapfile.c =================================================================== --- linux-2.6.orig/mm/swapfile.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/swapfile.c 2007-05-09 13:57:33.000000000 +1000 @@ -401,7 +401,7 @@ if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { page = find_get_page(&swapper_space, entry.val); - if (page && unlikely(TestSetPageLocked(page))) { + if (page && unlikely(!trylock_page(page))) { page_cache_release(page); page = NULL; } Index: linux-2.6/mm/truncate.c =================================================================== --- linux-2.6.orig/mm/truncate.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/truncate.c 2007-05-09 13:57:33.000000000 +1000 @@ -185,7 +185,7 @@ if (page_index > next) next = page_index; next++; - if (TestSetPageLocked(page)) + if (!trylock_page(page)) continue; if (PageWriteback(page)) { unlock_page(page); @@ -281,7 +281,7 @@ pgoff_t index; int lock_failed; - lock_failed = TestSetPageLocked(page); + lock_failed = !trylock_page(page); /* * We really shouldn't be looking at 
the ->index of an Index: linux-2.6/mm/vmscan.c =================================================================== --- linux-2.6.orig/mm/vmscan.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/vmscan.c 2007-05-09 13:57:33.000000000 +1000 @@ -466,7 +466,7 @@ page = lru_to_page(page_list); list_del(&page->lru); - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto keep; VM_BUG_ON(PageActive(page)); @@ -538,7 +538,7 @@ * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. */ - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto keep; if (PageDirty(page) || PageWriteback(page)) goto keep_locked; Index: linux-2.6/include/linux/bit_spinlock.h =================================================================== --- linux-2.6.orig/include/linux/bit_spinlock.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/linux/bit_spinlock.h 2007-05-09 13:57:33.000000000 +1000 @@ -18,7 +18,7 @@ */ preempt_disable(); #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) - while (test_and_set_bit(bitnum, addr)) { + while (unlikely(test_and_set_bit_lock(bitnum, addr))) { while (test_bit(bitnum, addr)) { preempt_enable(); cpu_relax(); @@ -36,7 +36,7 @@ { preempt_disable(); #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) - if (test_and_set_bit(bitnum, addr)) { + if (test_and_set_bit_lock(bitnum, addr)) { preempt_enable(); return 0; } @@ -50,10 +50,11 @@ */ static inline void bit_spin_unlock(int bitnum, unsigned long *addr) { -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +#ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); - smp_mb__before_clear_bit(); - clear_bit(bitnum, addr); +#endif +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + clear_bit_unlock(bitnum, addr); #endif preempt_enable(); __release(bitlock); Index: linux-2.6/mm/filemap.c =================================================================== --- linux-2.6.orig/mm/filemap.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/mm/filemap.c 2007-05-09 13:57:33.000000000 +1000 @@ -525,17 +525,14 @@ * mechananism between PageLocked pages and PageWriteback pages is shared. * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. * - * The first mb is necessary to safely close the critical section opened by the - * TestSetPageLocked(), the second mb is necessary to enforce ordering between - * the clear_bit and the read of the waitqueue (to avoid SMP races with a - * parallel wait_on_page_locked()). + * The mb is necessary to enforce ordering between the clear_bit and the read + * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). 
*/ void fastcall unlock_page(struct page *page) { - smp_mb__before_clear_bit(); - if (!TestClearPageLocked(page)) - BUG(); - smp_mb__after_clear_bit(); + VM_BUG_ON(!PageLocked(page)); + clear_bit_unlock(PG_locked, &page->flags); + smp_mb__after_clear_bit(); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -625,7 +622,7 @@ page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); - if (TestSetPageLocked(page)) { + if (!trylock_page(page)) { read_unlock_irq(&mapping->tree_lock); __lock_page(page); read_lock_irq(&mapping->tree_lock); @@ -798,7 +795,7 @@ struct page *page = find_get_page(mapping, index); if (page) { - if (!TestSetPageLocked(page)) + if (trylock_page(page)) return page; page_cache_release(page); return NULL; Index: linux-2.6/include/asm-alpha/bitops.h =================================================================== --- linux-2.6.orig/include/asm-alpha/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-alpha/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -359,6 +359,7 @@ #else #include <asm-generic/bitops/hweight.h> #endif +#include <asm-generic/bitops/lock.h> #endif /* __KERNEL__ */ Index: linux-2.6/include/asm-arm/bitops.h =================================================================== --- linux-2.6.orig/include/asm-arm/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-arm/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -286,6 +286,7 @@ #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> /* * Ext2 is defined to use little-endian byte ordering. Index: linux-2.6/include/asm-arm26/bitops.h =================================================================== --- linux-2.6.orig/include/asm-arm26/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-arm26/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -167,6 +167,7 @@ #include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> /* * Ext2 is defined to use little-endian byte ordering. 
Index: linux-2.6/include/asm-avr32/bitops.h =================================================================== --- linux-2.6.orig/include/asm-avr32/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-avr32/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -288,6 +288,7 @@ #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> #include <asm-generic/bitops/ext2-atomic.h> Index: linux-2.6/include/asm-cris/bitops.h =================================================================== --- linux-2.6.orig/include/asm-cris/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-cris/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -154,6 +154,7 @@ #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/hweight.h> #include <asm-generic/bitops/find.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> Index: linux-2.6/include/asm-frv/bitops.h =================================================================== --- linux-2.6.orig/include/asm-frv/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-frv/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -302,6 +302,7 @@ #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> Index: linux-2.6/include/asm-generic/bitops.h =================================================================== --- linux-2.6.orig/include/asm-generic/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-generic/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -22,6 +22,7 @@ #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> #include <asm-generic/bitops/ext2-atomic.h> Index: linux-2.6/include/asm-generic/bitops/lock.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6/include/asm-generic/bitops/lock.h 2007-05-09 13:57:33.000000000 +1000 @@ -0,0 +1,13 @@ +#ifndef _ASM_GENERIC_BITOPS_LOCK_H_ +#define _ASM_GENERIC_BITOPS_LOCK_H_ + +#define test_and_set_bit_lock(nr, addr) test_and_set_bit(nr, addr) + +#define clear_bit_unlock(nr, addr) \ +do { \ + smp_mb__before_clear_bit(); \ + clear_bit(nr, addr); \ +} while (0) + +#endif /* _ASM_GENERIC_BITOPS_LOCK_H_ */ + Index: linux-2.6/include/asm-h8300/bitops.h =================================================================== --- linux-2.6.orig/include/asm-h8300/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-h8300/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -194,6 +194,7 @@ #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> #include <asm-generic/bitops/ext2-atomic.h> #include <asm-generic/bitops/minix.h> Index: linux-2.6/include/asm-i386/bitops.h =================================================================== --- linux-2.6.orig/include/asm-i386/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-i386/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -402,6 +402,7 @@ } #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #endif /* __KERNEL__ 
*/ Index: linux-2.6/include/asm-ia64/bitops.h =================================================================== --- linux-2.6.orig/include/asm-ia64/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-ia64/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -94,6 +94,30 @@ } /** + * clear_bit_unlock - Clears a bit in memory with release + * @nr: Bit to clear + * @addr: Address to start counting from + * + * clear_bit() is atomic and may not be reordered. It does + * contain a memory barrier suitable for unlock type operations. + */ +static __inline__ void +clear_bit_unlock (int nr, volatile void *addr) +{ + __u32 mask, old, new; + volatile __u32 *m; + CMPXCHG_BUGCHECK_DECL + + m = (volatile __u32 *) addr + (nr >> 5); + mask = ~(1 << (nr & 31)); + do { + CMPXCHG_BUGCHECK(m); + old = *m; + new = old & mask; + } while (cmpxchg_rel(m, old, new) != old); +} + +/** * __clear_bit - Clears a bit in memory (non-atomic version) */ static __inline__ void @@ -170,6 +194,15 @@ } /** + * test_and_set_bit_lock - Set a bit and return its old value for lock + * @nr: Bit to set + * @addr: Address to count from + * + * This is the same as test_and_set_bit on ia64 + */ +#define test_and_set_bit_lock test_and_set_bit + +/** * __test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from Index: linux-2.6/include/asm-m32r/bitops.h =================================================================== --- linux-2.6.orig/include/asm-m32r/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-m32r/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -255,6 +255,7 @@ #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #endif /* __KERNEL__ */ Index: linux-2.6/include/asm-m68k/bitops.h =================================================================== --- linux-2.6.orig/include/asm-m68k/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-m68k/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -314,6 +314,7 @@ #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> /* Bitmap functions for the minix filesystem */ Index: linux-2.6/include/asm-m68knommu/bitops.h =================================================================== --- linux-2.6.orig/include/asm-m68knommu/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-m68knommu/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -160,6 +160,7 @@ #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> static __inline__ int ext2_set_bit(int nr, volatile void * addr) { Index: linux-2.6/include/asm-mips/bitops.h =================================================================== --- linux-2.6.orig/include/asm-mips/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-mips/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -569,6 +569,7 @@ #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> #include <asm-generic/bitops/ext2-atomic.h> #include <asm-generic/bitops/minix.h> Index: linux-2.6/include/asm-parisc/bitops.h =================================================================== --- linux-2.6.orig/include/asm-parisc/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ 
linux-2.6/include/asm-parisc/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -208,6 +208,7 @@ #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/sched.h> #endif /* __KERNEL__ */ Index: linux-2.6/include/asm-s390/bitops.h =================================================================== --- linux-2.6.orig/include/asm-s390/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-s390/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -746,6 +746,7 @@ #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> /* * ATTENTION: intel byte ordering convention for ext2 and minix !! Index: linux-2.6/include/asm-sh/bitops.h =================================================================== --- linux-2.6.orig/include/asm-sh/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-sh/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -137,6 +137,7 @@ #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/ext2-non-atomic.h> #include <asm-generic/bitops/ext2-atomic.h> Index: linux-2.6/include/asm-sh64/bitops.h =================================================================== --- linux-2.6.orig/include/asm-sh64/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-sh64/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -136,6 +136,7 @@ #include <asm-generic/bitops/__ffs.h> #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/ext2-non-atomic.h> Index: linux-2.6/include/asm-sparc/bitops.h =================================================================== --- linux-2.6.orig/include/asm-sparc/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-sparc/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -96,6 +96,7 @@ #include <asm-generic/bitops/fls.h> #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/ext2-non-atomic.h> #include <asm-generic/bitops/ext2-atomic.h> Index: linux-2.6/include/asm-sparc64/bitops.h =================================================================== --- linux-2.6.orig/include/asm-sparc64/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-sparc64/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -81,6 +81,7 @@ #include <asm-generic/bitops/hweight.h> #endif +#include <asm-generic/bitops/lock.h> #endif /* __KERNEL__ */ #include <asm-generic/bitops/find.h> Index: linux-2.6/include/asm-v850/bitops.h =================================================================== --- linux-2.6.orig/include/asm-v850/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-v850/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -145,6 +145,7 @@ #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/ext2-non-atomic.h> #define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a) Index: linux-2.6/include/asm-x86_64/bitops.h =================================================================== --- 
linux-2.6.orig/include/asm-x86_64/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-x86_64/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -408,6 +408,7 @@ #define ARCH_HAS_FAST_MULTIPLIER 1 #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #endif /* __KERNEL__ */ Index: linux-2.6/include/asm-xtensa/bitops.h =================================================================== --- linux-2.6.orig/include/asm-xtensa/bitops.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/asm-xtensa/bitops.h 2007-05-09 13:57:33.000000000 +1000 @@ -115,6 +115,7 @@ #endif #include <asm-generic/bitops/hweight.h> +#include <asm-generic/bitops/lock.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/minix.h> Index: linux-2.6/fs/buffer.c =================================================================== --- linux-2.6.orig/fs/buffer.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/buffer.c 2007-05-09 13:57:33.000000000 +1000 @@ -78,8 +78,7 @@ void fastcall unlock_buffer(struct buffer_head *bh) { - smp_mb__before_clear_bit(); - clear_buffer_locked(bh); + clear_bit_unlock(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); wake_up_bit(&bh->b_state, BH_Lock); } @@ -1664,7 +1663,7 @@ */ if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { lock_buffer(bh); - } else if (test_set_buffer_locked(bh)) { + } else if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); continue; } @@ -2719,7 +2718,7 @@ if (rw == SWRITE) lock_buffer(bh); - else if (test_set_buffer_locked(bh)) + else if (!trylock_buffer(bh)) continue; if (rw == WRITE || rw == SWRITE) { Index: linux-2.6/include/linux/buffer_head.h =================================================================== --- linux-2.6.orig/include/linux/buffer_head.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/linux/buffer_head.h 2007-05-09 13:57:33.000000000 +1000 @@ -115,7 +115,6 @@ BUFFER_FNS(Dirty, dirty) TAS_BUFFER_FNS(Dirty, dirty) BUFFER_FNS(Lock, locked) -TAS_BUFFER_FNS(Lock, locked) BUFFER_FNS(Req, req) TAS_BUFFER_FNS(Req, req) BUFFER_FNS(Mapped, mapped) @@ -301,10 +300,15 @@ __wait_on_buffer(bh); } +static inline int trylock_buffer(struct buffer_head *bh) +{ + return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state)); +} + static inline void lock_buffer(struct buffer_head *bh) { might_sleep(); - if (test_set_buffer_locked(bh)) + if (!trylock_buffer(bh)) __lock_buffer(bh); } Index: linux-2.6/include/linux/interrupt.h =================================================================== --- linux-2.6.orig/include/linux/interrupt.h 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/include/linux/interrupt.h 2007-05-09 13:57:33.000000000 +1000 @@ -310,13 +310,12 @@ #ifdef CONFIG_SMP static inline int tasklet_trylock(struct tasklet_struct *t) { - return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); + return !test_and_set_bit_lock(TASKLET_STATE_RUN, &(t)->state); } static inline void tasklet_unlock(struct tasklet_struct *t) { - smp_mb__before_clear_bit(); - clear_bit(TASKLET_STATE_RUN, &(t)->state); + clear_bit_unlock(TASKLET_STATE_RUN, &(t)->state); } static inline void tasklet_unlock_wait(struct tasklet_struct *t) Index: linux-2.6/kernel/wait.c =================================================================== --- linux-2.6.orig/kernel/wait.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/kernel/wait.c 2007-05-09 13:57:33.000000000 +1000 @@ -195,7 +195,7 @@ if ((ret = (*action)(q->key.flags))) break; } - } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); + 
} while (test_and_set_bit_lock(q->key.bit_nr, q->key.flags)); finish_wait(wq, &q->wait); return ret; } Index: linux-2.6/Documentation/atomic_ops.txt =================================================================== --- linux-2.6.orig/Documentation/atomic_ops.txt 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/Documentation/atomic_ops.txt 2007-05-09 13:57:33.000000000 +1000 @@ -369,6 +369,15 @@ */ smp_mb__after_clear_bit(); +There are two special bitops with lock barrier semantics (acquire/release, +same as spinlocks). These operate in the same way as their non-_lock/unlock +postfixed variants, except that they are to provide acquire/release semantics, +respectively. This means they can be used for bit_spin_trylock and +bit_spin_unlock type operations without specifying any more barriers. + + int test_and_set_bit_lock(unsigned long nr, unsigned long *addr); + void clear_bit_unlock(unsigned long nr, unsigned long *addr); + Finally, there are non-atomic versions of the bitmask operations provided. They are used in contexts where some other higher-level SMP locking scheme is being used to protect the bitmask, and thus less Index: linux-2.6/fs/ntfs/aops.c =================================================================== --- linux-2.6.orig/fs/ntfs/aops.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/ntfs/aops.c 2007-05-09 13:57:33.000000000 +1000 @@ -1199,7 +1199,7 @@ tbh = bhs[i]; if (!tbh) continue; - if (unlikely(test_set_buffer_locked(tbh))) + if (!trylock_buffer(tbh)) BUG(); /* The buffer dirty state is now irrelevant, just clean it. */ clear_buffer_dirty(tbh); Index: linux-2.6/fs/ntfs/compress.c =================================================================== --- linux-2.6.orig/fs/ntfs/compress.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/ntfs/compress.c 2007-05-09 13:57:33.000000000 +1000 @@ -655,7 +655,7 @@ for (i = 0; i < nr_bhs; i++) { struct buffer_head *tbh = bhs[i]; - if (unlikely(test_set_buffer_locked(tbh))) + if (!trylock_buffer(tbh)) continue; if (unlikely(buffer_uptodate(tbh))) { unlock_buffer(tbh); Index: linux-2.6/fs/ntfs/mft.c =================================================================== --- linux-2.6.orig/fs/ntfs/mft.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/ntfs/mft.c 2007-05-09 13:57:33.000000000 +1000 @@ -586,7 +586,7 @@ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; - if (unlikely(test_set_buffer_locked(tbh))) + if (!trylock_buffer(tbh)) BUG(); BUG_ON(!buffer_uptodate(tbh)); clear_buffer_dirty(tbh); @@ -779,7 +779,7 @@ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; - if (unlikely(test_set_buffer_locked(tbh))) + if (!trylock_buffer(tbh)) BUG(); BUG_ON(!buffer_uptodate(tbh)); clear_buffer_dirty(tbh); Index: linux-2.6/fs/reiserfs/inode.c =================================================================== --- linux-2.6.orig/fs/reiserfs/inode.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/reiserfs/inode.c 2007-05-09 13:57:33.000000000 +1000 @@ -2448,7 +2448,7 @@ if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { lock_buffer(bh); } else { - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); continue; } Index: linux-2.6/fs/reiserfs/journal.c =================================================================== --- linux-2.6.orig/fs/reiserfs/journal.c 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/fs/reiserfs/journal.c 2007-05-09 13:57:33.000000000 +1000 @@ -830,7 +830,7 @@ jh = JH_ENTRY(list->next); 
bh = jh->bh; get_bh(bh); - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { if (!buffer_dirty(bh)) { list_move(&jh->list, &tmp); goto loop_next; @@ -3838,7 +3838,7 @@ { PROC_INFO_INC(p_s_sb, journal.prepare); - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { if (!wait) return 0; lock_buffer(bh); Index: linux-2.6/Documentation/memory-barriers.txt =================================================================== --- linux-2.6.orig/Documentation/memory-barriers.txt 2007-05-09 13:56:56.000000000 +1000 +++ linux-2.6/Documentation/memory-barriers.txt 2007-05-09 13:57:33.000000000 +1000 @@ -1479,7 +1479,8 @@ Any atomic operation that modifies some state in memory and returns information about the state (old or new) implies an SMP-conditional general memory barrier -(smp_mb()) on each side of the actual operation. These include: +(smp_mb()) on each side of the actual operation (with the exception of +explicit lock operations, described later). These include: xchg(); cmpxchg(); @@ -1536,10 +1537,19 @@ do need memory barriers as a lock primitive generally has to do things in a specific order. - Basically, each usage case has to be carefully considered as to whether memory barriers are needed or not. +The following operations are special locking primitives: + + test_and_set_bit_lock(); + clear_bit_unlock(); + +These implement LOCK-class and UNLOCK-class operations, respectively. These +should be used in preference to other operations when implementing locking +primitives, because their implementations can be optimised on many +architectures. + [!] Note that special memory barrier primitives are available for these situations because on some CPUs the atomic instructions used imply full memory barriers, and so barrier instructions are superfluous in conjunction with them, ^ permalink raw reply [flat|nested] 34+ messages in thread
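As a concrete reading of the memory-barriers.txt text above: because
test_and_set_bit_lock() has acquire semantics and clear_bit_unlock() has
release semantics, a minimal bit lock built on them needs no explicit
smp_mb__before_clear_bit()/smp_mb__after_clear_bit() calls at all. A
sketch only -- the my_bit_lock/my_bit_unlock names are invented, and this
is not code from the patch:

	#define MY_LOCK_BIT	0

	static void my_bit_lock(unsigned long *word)
	{
		/* acquire: critical-section accesses cannot float above this */
		while (test_and_set_bit_lock(MY_LOCK_BIT, word))
			cpu_relax();	/* spin until the holder releases */
	}

	static void my_bit_unlock(unsigned long *word)
	{
		/* release: critical-section accesses cannot float below this */
		clear_bit_unlock(MY_LOCK_BIT, word);
	}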
* Re: [rfc] lock bitops
  2007-05-08 11:37 [rfc] lock bitops Nick Piggin
  2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
  2007-05-08 12:22 ` [rfc] lock bitops David Howells
@ 2007-05-08 15:06 ` Matthew Wilcox
  2007-05-08 21:23   ` Benjamin Herrenschmidt
  2007-05-08 22:29   ` Nick Piggin
  2007-05-09 12:08 ` Nikita Danilov
  3 siblings, 2 replies; 34+ messages in thread
From: Matthew Wilcox @ 2007-05-08 15:06 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

On Tue, May 08, 2007 at 01:37:09PM +0200, Nick Piggin wrote:
> --
> Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics.
> Add non-trivial for powerpc and ia64. Convert page lock, buffer lock,
> bit_spin_lock, tasklet locks to use the new locks.

The names are a bit clumsy.  How about naming them after the effect,
rather than the implementation?  It struck me that really these things
are bit mutexes -- you can sleep while holding the lock.  How about
calling them bit_mutex_trylock() and bit_mutex_unlock()?
* Re: [rfc] lock bitops
  2007-05-08 15:06 ` Matthew Wilcox
@ 2007-05-08 21:23   ` Benjamin Herrenschmidt
  2007-05-08 22:29   ` Nick Piggin
  1 sibling, 0 replies; 34+ messages in thread
From: Benjamin Herrenschmidt @ 2007-05-08 21:23 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Nick Piggin, linux-arch, Andrew Morton, Linux Kernel Mailing List

On Tue, 2007-05-08 at 09:06 -0600, Matthew Wilcox wrote:
> On Tue, May 08, 2007 at 01:37:09PM +0200, Nick Piggin wrote:
> > --
> > Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics.
> > Add non-trivial for powerpc and ia64. Convert page lock, buffer lock,
> > bit_spin_lock, tasklet locks to use the new locks.
>
> The names are a bit clumsy.  How about naming them after the effect,
> rather than the implementation?  It struck me that really these things
> are bit mutexes -- you can sleep while holding the lock.  How about
> calling them bit_mutex_trylock() and bit_mutex_unlock()?

Hrm... spin_trylock vs. mutex_trylock... what difference? :-)

Note that if we're gonna generalize the usage as a mutex, we might want
to extend the unlock semantics to return the flags word atomically, so
that the caller can test other bits without having to read the word
back (which might be a performance hit on CPUs without store
forwarding). That's already what Nick's new unlock_page() does (testing
the flag again right after unlock).

At first, I thought it would make clear_bit_unlock() semantics a bit
too clumsy, but maybe it's worth it.

Ben.
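A primitive with the extended semantics Ben suggests might look something
like the sketch below. The function name and the cmpxchg()-based fallback
are both invented for illustration -- nothing like this is in the patch --
and cmpxchg() implies a full barrier, which is stronger than the release
ordering actually required (a real implementation would use an
architecture's release form, e.g. cmpxchg_rel on ia64):

	/*
	 * Clear the lock bit with (at least) release semantics and return
	 * the word as it was left by the unlock, so the caller can test
	 * other flag bits without a separate load of the word.
	 */
	static inline unsigned long
	clear_bit_unlock_return_word(int nr, volatile unsigned long *addr)
	{
		unsigned long old, mask = 1UL << nr;

		do {
			old = *addr;
		} while (cmpxchg(addr, old, old & ~mask) != old);

		return old & ~mask;
	}

unlock_page() could then test something like a waiters bit in the returned
word, instead of reloading page->flags after the unlock.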
* Re: [rfc] lock bitops
  2007-05-08 15:06 ` Matthew Wilcox
  2007-05-08 21:23   ` Benjamin Herrenschmidt
@ 2007-05-08 22:29   ` Nick Piggin
  2007-05-08 22:40     ` Matthew Wilcox
  1 sibling, 1 reply; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 22:29 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

On Tue, May 08, 2007 at 09:06:32AM -0600, Matthew Wilcox wrote:
> On Tue, May 08, 2007 at 01:37:09PM +0200, Nick Piggin wrote:
> > --
> > Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics.
> > Add non-trivial for powerpc and ia64. Convert page lock, buffer lock,
> > bit_spin_lock, tasklet locks to use the new locks.
>
> The names are a bit clumsy.  How about naming them after the effect,
> rather than the implementation?  It struck me that really these things
> are bit mutexes -- you can sleep while holding the lock.  How about
> calling them bit_mutex_trylock() and bit_mutex_unlock()?

bit_spin_trylock / bit_spin_unlock be OK? ;)
* Re: [rfc] lock bitops
  2007-05-08 22:29   ` Nick Piggin
@ 2007-05-08 22:40     ` Matthew Wilcox
  2007-05-08 22:45       ` Nick Piggin
  0 siblings, 1 reply; 34+ messages in thread
From: Matthew Wilcox @ 2007-05-08 22:40 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

On Wed, May 09, 2007 at 12:29:27AM +0200, Nick Piggin wrote:
> On Tue, May 08, 2007 at 09:06:32AM -0600, Matthew Wilcox wrote:
> > On Tue, May 08, 2007 at 01:37:09PM +0200, Nick Piggin wrote:
> > > --
> > > Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics.
> > > Add non-trivial for powerpc and ia64. Convert page lock, buffer lock,
> > > bit_spin_lock, tasklet locks to use the new locks.
> >
> > The names are a bit clumsy.  How about naming them after the effect,
> > rather than the implementation?  It struck me that really these things
> > are bit mutexes -- you can sleep while holding the lock.  How about
> > calling them bit_mutex_trylock() and bit_mutex_unlock()?
>
> bit_spin_trylock / bit_spin_unlock be OK? ;)

We already have a bit_spin_trylock -- it keeps preempt disabled until
you bit_spin_unlock.  Oh, and it only actually sets a bit if you've got
SMP or lock debugging on.  Nice try though ;-)
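The behaviour Matthew describes can be seen in the shape of
bit_spin_trylock(), roughly as it stands after this patch's conversion
(a sketch; some annotations elided):

	static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
	{
		preempt_disable();	/* stays disabled while the lock is held */
	#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
		if (test_and_set_bit_lock(bitnum, addr)) {
			preempt_enable();	/* busy: back out and fail */
			return 0;
		}
	#endif
		/* on UP without spinlock debugging, no bit is actually set */
		__acquire(bitlock);
		return 1;
	}

Preemption is only re-enabled by the matching bit_spin_unlock(), which is
what makes it a spinning lock rather than a sleepable mutex.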
* Re: [rfc] lock bitops
  2007-05-08 22:40     ` Matthew Wilcox
@ 2007-05-08 22:45       ` Nick Piggin
  0 siblings, 0 replies; 34+ messages in thread
From: Nick Piggin @ 2007-05-08 22:45 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

On Tue, May 08, 2007 at 04:40:36PM -0600, Matthew Wilcox wrote:
> On Wed, May 09, 2007 at 12:29:27AM +0200, Nick Piggin wrote:
> > On Tue, May 08, 2007 at 09:06:32AM -0600, Matthew Wilcox wrote:
> > > On Tue, May 08, 2007 at 01:37:09PM +0200, Nick Piggin wrote:
> > > > --
> > > > Introduce test_and_set_bit_lock / clear_bit_unlock bitops with lock semantics.
> > > > Add non-trivial for powerpc and ia64. Convert page lock, buffer lock,
> > > > bit_spin_lock, tasklet locks to use the new locks.
> > >
> > > The names are a bit clumsy.  How about naming them after the effect,
> > > rather than the implementation?  It struck me that really these things
> > > are bit mutexes -- you can sleep while holding the lock.  How about
> > > calling them bit_mutex_trylock() and bit_mutex_unlock()?
> >
> > bit_spin_trylock / bit_spin_unlock be OK? ;)
>
> We already have a bit_spin_trylock -- it keeps preempt disabled until
> you bit_spin_unlock.  Oh, and it only actually sets a bit if you've got
> SMP or lock debugging on.  Nice try though ;-)

OK, I'll be blunt then.  I think s/test_and_set_bit_lock/bit_mutex_trylock/
is much worse ;)
* Re: [rfc] lock bitops
  2007-05-08 11:37 [rfc] lock bitops Nick Piggin
                   ` (2 preceding siblings ...)
  2007-05-08 15:06 ` Matthew Wilcox
@ 2007-05-09 12:08 ` Nikita Danilov
  2007-05-09 12:20   ` Nick Piggin
  3 siblings, 1 reply; 34+ messages in thread
From: Nikita Danilov @ 2007-05-09 12:08 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

Nick Piggin writes:
 > Hi,

[...]

 >  /**
 > + * clear_bit_unlock - Clears a bit in memory with release
 > + * @nr: Bit to clear
 > + * @addr: Address to start counting from
 > + *
 > + * clear_bit() is atomic and may not be reordered. It does

s/clear_bit/clear_bit_unlock/ ?

 > + * contain a memory barrier suitable for unlock type operations.
 > + */
 > +static __inline__ void
 > +clear_bit_unlock (int nr, volatile void *addr)
 > +{

Nikita.
* Re: [rfc] lock bitops
  2007-05-09 12:08 ` Nikita Danilov
@ 2007-05-09 12:20   ` Nick Piggin
  0 siblings, 0 replies; 34+ messages in thread
From: Nick Piggin @ 2007-05-09 12:20 UTC (permalink / raw)
  To: Nikita Danilov
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List

On Wed, May 09, 2007 at 04:08:41PM +0400, Nikita Danilov wrote:
> Nick Piggin writes:
>  > Hi,
>
> [...]
>
>  >  /**
>  > + * clear_bit_unlock - Clears a bit in memory with release
>  > + * @nr: Bit to clear
>  > + * @addr: Address to start counting from
>  > + *
>  > + * clear_bit() is atomic and may not be reordered. It does
>
> s/clear_bit/clear_bit_unlock/ ?

Yes :)
Thread overview: 34+ messages

2007-05-08 11:37 [rfc] lock bitops Nick Piggin
2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
2007-05-08 12:13 ` David Howells
2007-05-08 22:35 ` Nick Piggin
2007-05-08 20:08 ` Hugh Dickins
2007-05-08 21:30 ` Benjamin Herrenschmidt
2007-05-08 22:41 ` Nick Piggin
2007-05-08 22:50 ` Nick Piggin
2007-05-09 19:33 ` Hugh Dickins
2007-05-09 21:21 ` Benjamin Herrenschmidt
2007-05-10  3:37 ` Nick Piggin
2007-05-10 19:14 ` Hugh Dickins
2007-05-11  8:54 ` Nick Piggin
2007-05-11 13:15 ` Hugh Dickins
2007-05-13  3:32 ` Nick Piggin
2007-05-13  4:39 ` Hugh Dickins
2007-05-13  6:52 ` Nick Piggin
2007-05-16 17:54 ` Hugh Dickins
2007-05-16 18:18 ` Nick Piggin
2007-05-16 19:28 ` Hugh Dickins
2007-05-16 19:47 ` Linus Torvalds
2007-05-17  6:27 ` Nick Piggin
2007-05-16 17:21 ` Hugh Dickins
2007-05-16 17:38 ` Nick Piggin
2007-05-08 12:22 ` [rfc] lock bitops David Howells
2007-05-08 22:33 ` Nick Piggin
2007-05-09  6:18 ` Nick Piggin
2007-05-08 15:06 ` Matthew Wilcox
2007-05-08 21:23 ` Benjamin Herrenschmidt
2007-05-08 22:29 ` Nick Piggin
2007-05-08 22:40 ` Matthew Wilcox
2007-05-08 22:45 ` Nick Piggin
2007-05-09 12:08 ` Nikita Danilov
2007-05-09 12:20 ` Nick Piggin