[PATCH][5/8] Arch agnostic completely out of line locks / ppc64

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
@ 2004-09-03  0:02 Zwane Mwaikambo
  2004-09-09  5:42 ` Paul Mackerras
  0 siblings, 1 reply; 50+ messages in thread
From: Zwane Mwaikambo @ 2004-09-03  0:02 UTC (permalink / raw)
  To: Linux Kernel
  Cc: Andrew Morton, Linus Torvalds, William Lee Irwin III,
	Matt Mackall, Anton Blanchard, Paul Mackerras

 arch/ppc64/kernel/time.c        |   14 ++
 arch/ppc64/kernel/vmlinux.lds.S |    1
 arch/ppc64/lib/locks.c          |  216 +-------------------------------
 include/asm-ppc64/ptrace.h      |    5
 include/asm-ppc64/spinlock.h    |  268 ++++++++++++++++++----------------------
 5 files changed, 150 insertions(+), 354 deletions(-)

oprofile needs some eyeballing here, we need to check the program counter
against the locking function bounds.

Signed-off-by: Zwane Mwaikambo <zwane@fsmlabs.com>

Index: linux-2.6.9-rc1-mm1-stage/arch/ppc64/kernel/time.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-mm1/arch/ppc64/kernel/time.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 time.c
--- linux-2.6.9-rc1-mm1-stage/arch/ppc64/kernel/time.c	26 Aug 2004 13:13:04 -0000	1.1.1.1
+++ linux-2.6.9-rc1-mm1-stage/arch/ppc64/kernel/time.c	2 Sep 2004 13:53:34 -0000
@@ -158,6 +158,20 @@ static __inline__ void timer_sync_xtime(
 	}
 }

+#ifdef CONFIG_SMP
+unsigned long profile_pc(struct pt_regs *regs)
+{
+	unsigned long pc = instruction_pointer(regs);
+
+	if (pc >= (unsigned long)&__lock_text_start &&
+	    pc <= (unsigned long)&__lock_text_end)
+		return regs->link;
+
+	return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+#endif
+
 #ifdef CONFIG_PPC_ISERIES

 /*
Index: linux-2.6.9-rc1-mm1-stage/arch/ppc64/kernel/vmlinux.lds.S
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-mm1/arch/ppc64/kernel/vmlinux.lds.S,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 vmlinux.lds.S
--- linux-2.6.9-rc1-mm1-stage/arch/ppc64/kernel/vmlinux.lds.S	26 Aug 2004 13:13:04 -0000	1.1.1.1
+++ linux-2.6.9-rc1-mm1-stage/arch/ppc64/kernel/vmlinux.lds.S	2 Sep 2004 13:08:15 -0000
@@ -14,6 +14,7 @@ SECTIONS
   .text : {
 	*(.text .text.*)
 	SCHED_TEXT
+	LOCK_TEXT
 	*(.fixup)
 	. = ALIGN(4096);
 	_etext = .;
Index: linux-2.6.9-rc1-mm1-stage/arch/ppc64/lib/locks.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-mm1/arch/ppc64/lib/locks.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 locks.c
--- linux-2.6.9-rc1-mm1-stage/arch/ppc64/lib/locks.c	26 Aug 2004 13:13:04 -0000	1.1.1.1
+++ linux-2.6.9-rc1-mm1-stage/arch/ppc64/lib/locks.c	2 Sep 2004 13:08:15 -0000
@@ -22,26 +22,9 @@

 #ifndef CONFIG_SPINLINE

-/*
- * On a system with shared processors (that is, where a physical
- * processor is multiplexed between several virtual processors),
- * there is no point spinning on a lock if the holder of the lock
- * isn't currently scheduled on a physical processor.  Instead
- * we detect this situation and ask the hypervisor to give the
- * rest of our timeslice to the lock holder.
- *
- * So that we can tell which virtual processor is holding a lock,
- * we put 0x80000000 | smp_processor_id() in the lock when it is
- * held.  Conveniently, we have a word in the paca that holds this
- * value.
- */
-
 /* waiting for a spinlock... */
 #if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES)

-/* We only yield to the hypervisor if we are in shared processor mode */
-#define SHARED_PROCESSOR (get_paca()->lppaca.xSharedProc)
-
 void __spin_yield(spinlock_t *lock)
 {
 	unsigned int lock_value, holder_cpu, yield_count;
@@ -68,96 +51,11 @@ void __spin_yield(spinlock_t *lock)
 #endif
 }

-#else /* SPLPAR || ISERIES */
-#define __spin_yield(x)	barrier()
-#define SHARED_PROCESSOR	0
-#endif
-
-/*
- * This returns the old value in the lock, so we succeeded
- * in getting the lock if the return value is 0.
- */
-static __inline__ unsigned long __spin_trylock(spinlock_t *lock)
-{
-	unsigned long tmp, tmp2;
-
-	__asm__ __volatile__(
-"	lwz		%1,%3(13)		# __spin_trylock\n\
-1:	lwarx		%0,0,%2\n\
-	cmpwi		0,%0,0\n\
-	bne-		2f\n\
-	stwcx.		%1,0,%2\n\
-	bne-		1b\n\
-	isync\n\
-2:"	: "=&r" (tmp), "=&r" (tmp2)
-	: "r" (&lock->lock), "i" (offsetof(struct paca_struct, lock_token))
-	: "cr0", "memory");
-
-	return tmp;
-}
-
-int _raw_spin_trylock(spinlock_t *lock)
-{
-	return __spin_trylock(lock) == 0;
-}
-
-EXPORT_SYMBOL(_raw_spin_trylock);
-
-void _raw_spin_lock(spinlock_t *lock)
-{
-	while (1) {
-		if (likely(__spin_trylock(lock) == 0))
-			break;
-		do {
-			HMT_low();
-			if (SHARED_PROCESSOR)
-				__spin_yield(lock);
-		} while (likely(lock->lock != 0));
-		HMT_medium();
-	}
-}
-
-EXPORT_SYMBOL(_raw_spin_lock);
-
-void _raw_spin_lock_flags(spinlock_t *lock, unsigned long flags)
-{
-	unsigned long flags_dis;
-
-	while (1) {
-		if (likely(__spin_trylock(lock) == 0))
-			break;
-		local_save_flags(flags_dis);
-		local_irq_restore(flags);
-		do {
-			HMT_low();
-			if (SHARED_PROCESSOR)
-				__spin_yield(lock);
-		} while (likely(lock->lock != 0));
-		HMT_medium();
-		local_irq_restore(flags_dis);
-	}
-}
-
-EXPORT_SYMBOL(_raw_spin_lock_flags);
-
-void spin_unlock_wait(spinlock_t *lock)
-{
-	while (lock->lock) {
-		HMT_low();
-		if (SHARED_PROCESSOR)
-			__spin_yield(lock);
-	}
-	HMT_medium();
-}
-
-EXPORT_SYMBOL(spin_unlock_wait);
-
 /*
  * Waiting for a read lock or a write lock on a rwlock...
  * This turns out to be the same for read and write locks, since
  * we only know the holder if it is write-locked.
  */
-#if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES)
 void __rw_yield(rwlock_t *rw)
 {
 	int lock_value;
@@ -184,118 +82,18 @@ void __rw_yield(rwlock_t *rw)
 			   yield_count);
 #endif
 }
-
-#else /* SPLPAR || ISERIES */
-#define __rw_yield(x)	barrier()
 #endif

-/*
- * This returns the old value in the lock + 1,
- * so we got a read lock if the return value is > 0.
- */
-static __inline__ long __read_trylock(rwlock_t *rw)
-{
-	long tmp;
-
-	__asm__ __volatile__(
-"1:	lwarx		%0,0,%1		# read_trylock\n\
-	extsw		%0,%0\n\
-	addic.		%0,%0,1\n\
-	ble-		2f\n\
-	stwcx.		%0,0,%1\n\
-	bne-		1b\n\
-	isync\n\
-2:"	: "=&r" (tmp)
-	: "r" (&rw->lock)
-	: "cr0", "xer", "memory");
-
-	return tmp;
-}
-
-int _raw_read_trylock(rwlock_t *rw)
-{
-	return __read_trylock(rw) > 0;
-}
-
-EXPORT_SYMBOL(_raw_read_trylock);
-
-void _raw_read_lock(rwlock_t *rw)
-{
-	while (1) {
-		if (likely(__read_trylock(rw) > 0))
-			break;
-		do {
-			HMT_low();
-			if (SHARED_PROCESSOR)
-				__rw_yield(rw);
-		} while (likely(rw->lock < 0));
-		HMT_medium();
-	}
-}
-
-EXPORT_SYMBOL(_raw_read_lock);
-
-void _raw_read_unlock(rwlock_t *rw)
-{
-	long tmp;
-
-	__asm__ __volatile__(
-	"eieio				# read_unlock\n\
-1:	lwarx		%0,0,%1\n\
-	addic		%0,%0,-1\n\
-	stwcx.		%0,0,%1\n\
-	bne-		1b"
-	: "=&r"(tmp)
-	: "r"(&rw->lock)
-	: "cr0", "memory");
-}
-
-EXPORT_SYMBOL(_raw_read_unlock);
-
-/*
- * This returns the old value in the lock,
- * so we got the write lock if the return value is 0.
- */
-static __inline__ long __write_trylock(rwlock_t *rw)
-{
-	long tmp, tmp2;
-
-	__asm__ __volatile__(
-"	lwz		%1,%3(13)	# write_trylock\n\
-1:	lwarx		%0,0,%2\n\
-	cmpwi		0,%0,0\n\
-	bne-		2f\n\
-	stwcx.		%1,0,%2\n\
-	bne-		1b\n\
-	isync\n\
-2:"	: "=&r" (tmp), "=&r" (tmp2)
-	: "r" (&rw->lock), "i" (offsetof(struct paca_struct, lock_token))
-	: "cr0", "memory");
-
-	return tmp;
-}
-
-int _raw_write_trylock(rwlock_t *rw)
-{
-	return __write_trylock(rw) == 0;
-}
-
-EXPORT_SYMBOL(_raw_write_trylock);
-
-void _raw_write_lock(rwlock_t *rw)
+void spin_unlock_wait(spinlock_t *lock)
 {
-	while (1) {
-		if (likely(__write_trylock(rw) == 0))
-			break;
-		do {
-			HMT_low();
-			if (SHARED_PROCESSOR)
-				__rw_yield(rw);
-		} while (likely(rw->lock != 0));
-		HMT_medium();
+	while (lock->lock) {
+		HMT_low();
+		if (SHARED_PROCESSOR)
+			__spin_yield(lock);
 	}
+	HMT_medium();
 }

-EXPORT_SYMBOL(_raw_write_lock);
+EXPORT_SYMBOL(spin_unlock_wait);

 #endif /* CONFIG_SPINLINE */
Index: linux-2.6.9-rc1-mm1-stage/include/asm-ppc64/ptrace.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-mm1/include/asm-ppc64/ptrace.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 ptrace.h
--- linux-2.6.9-rc1-mm1-stage/include/asm-ppc64/ptrace.h	26 Aug 2004 13:13:09 -0000	1.1.1.1
+++ linux-2.6.9-rc1-mm1-stage/include/asm-ppc64/ptrace.h	2 Sep 2004 13:08:16 -0000
@@ -69,7 +69,12 @@ struct pt_regs32 {
 #define __SIGNAL_FRAMESIZE32	64

 #define instruction_pointer(regs) ((regs)->nip)
+#ifdef CONFIG_SMP
+extern unsigned long profile_pc(struct pt_regs *regs);
+#else
 #define profile_pc(regs) instruction_pointer(regs)
+#endif
+
 #define user_mode(regs) ((((regs)->msr) >> MSR_PR_LG) & 0x1)

 #define force_successful_syscall_return()   \
Index: linux-2.6.9-rc1-mm1-stage/include/asm-ppc64/spinlock.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-mm1/include/asm-ppc64/spinlock.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 spinlock.h
--- linux-2.6.9-rc1-mm1-stage/include/asm-ppc64/spinlock.h	26 Aug 2004 13:13:09 -0000	1.1.1.1
+++ linux-2.6.9-rc1-mm1-stage/include/asm-ppc64/spinlock.h	2 Sep 2004 13:08:16 -0000
@@ -6,6 +6,8 @@
  *
  * Copyright (C) 2001-2004 Paul Mackerras <paulus@au.ibm.com>, IBM
  * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ * Copyright (C) 2002 Dave Engebretsen <engebret@us.ibm.com>, IBM
+ *	Rework to support virtual processors
  *
  * Type of int is used as a full 64b word is not necessary.
  *
@@ -16,6 +18,8 @@
  */
 #include <linux/config.h>
 #include <asm/paca.h>
+#include <asm/hvcall.h>
+#include <asm/iSeries/HvCall.h>

 typedef struct {
 	volatile unsigned int lock;
@@ -34,101 +38,91 @@ static __inline__ void _raw_spin_unlock(
 }

 /*
- * Normally we use the spinlock functions in arch/ppc64/lib/locks.c.
- * For special applications such as profiling, we can have the
- * spinlock functions inline by defining CONFIG_SPINLINE.
- * This is not recommended on partitioned systems with shared
- * processors, since the inline spinlock functions don't include
- * the code for yielding the CPU to the lock holder.
+ * On a system with shared processors (that is, where a physical
+ * processor is multiplexed between several virtual processors),
+ * there is no point spinning on a lock if the holder of the lock
+ * isn't currently scheduled on a physical processor.  Instead
+ * we detect this situation and ask the hypervisor to give the
+ * rest of our timeslice to the lock holder.
+ *
+ * So that we can tell which virtual processor is holding a lock,
+ * we put 0x80000000 | smp_processor_id() in the lock when it is
+ * held.  Conveniently, we have a word in the paca that holds this
+ * value.
  */

-#ifndef CONFIG_SPINLINE
-extern int _raw_spin_trylock(spinlock_t *lock);
-extern void _raw_spin_lock(spinlock_t *lock);
-extern void _raw_spin_lock_flags(spinlock_t *lock, unsigned long flags);
+#if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES)
+/* We only yield to the hypervisor if we are in shared processor mode */
+#define SHARED_PROCESSOR (get_paca()->lppaca.xSharedProc)
+extern void __spin_yield(spinlock_t *lock);
+extern void __rw_yield(spinlock_t *lock);
+#else /* SPLPAR || ISERIES */
+#define __spin_yield(x)	barrier()
+#define __rw_yield(x)	barrier()
+#define SHARED_PROCESSOR	0
+#endif
 extern void spin_unlock_wait(spinlock_t *lock);

-#else
-
-static __inline__ int _raw_spin_trylock(spinlock_t *lock)
+/*
+ * This returns the old value in the lock, so we succeeded
+ * in getting the lock if the return value is 0.
+ */
+static __inline__ unsigned long __spin_trylock(spinlock_t *lock)
 {
-	unsigned int tmp, tmp2;
+	unsigned long tmp, tmp2;

 	__asm__ __volatile__(
-"1:	lwarx		%0,0,%2		# spin_trylock\n\
+"	lwz		%1,%3(13)		# __spin_trylock\n\
+1:	lwarx		%0,0,%2\n\
 	cmpwi		0,%0,0\n\
 	bne-		2f\n\
-	lwz		%1,%3(13)\n\
 	stwcx.		%1,0,%2\n\
 	bne-		1b\n\
 	isync\n\
-2:"	: "=&r"(tmp), "=&r"(tmp2)
-	: "r"(&lock->lock), "i"(offsetof(struct paca_struct, lock_token))
+2:"	: "=&r" (tmp), "=&r" (tmp2)
+	: "r" (&lock->lock), "i" (offsetof(struct paca_struct, lock_token))
 	: "cr0", "memory");

-	return tmp == 0;
+	return tmp;
 }

-static __inline__ void _raw_spin_lock(spinlock_t *lock)
+static int __inline__ _raw_spin_trylock(spinlock_t *lock)
 {
-	unsigned int tmp;
-
-	__asm__ __volatile__(
-	"b		2f		# spin_lock\n\
-1:"
-	HMT_LOW
-"	lwzx		%0,0,%1\n\
-	cmpwi		0,%0,0\n\
-	bne+		1b\n"
-	HMT_MEDIUM
-"2:	lwarx		%0,0,%1\n\
-	cmpwi		0,%0,0\n\
-	bne-		1b\n\
-	lwz		%0,%2(13)\n\
-	stwcx.		%0,0,%1\n\
-	bne-		2b\n\
-	isync"
-	: "=&r"(tmp)
-	: "r"(&lock->lock), "i"(offsetof(struct paca_struct, lock_token))
-	: "cr0", "memory");
+	return __spin_trylock(lock) == 0;
 }

-/*
- * Note: if we ever want to inline the spinlocks on iSeries,
- * we will have to change the irq enable/disable stuff in here.
- */
-static __inline__ void _raw_spin_lock_flags(spinlock_t *lock,
-					    unsigned long flags)
+static void __inline__ _raw_spin_lock(spinlock_t *lock)
 {
-	unsigned int tmp;
-	unsigned long tmp2;
-
-	__asm__ __volatile__(
-	"b		3f		# spin_lock\n\
-1:	mfmsr		%1\n\
-	mtmsrd		%3,1\n\
-2:"	HMT_LOW
-"	lwzx		%0,0,%2\n\
-	cmpwi		0,%0,0\n\
-	bne+		2b\n"
-	HMT_MEDIUM
-"	mtmsrd		%1,1\n\
-3:	lwarx		%0,0,%2\n\
-	cmpwi		0,%0,0\n\
-	bne-		1b\n\
-	lwz		%1,%4(13)\n\
-	stwcx.		%1,0,%2\n\
-	bne-		3b\n\
-	isync"
-	: "=&r"(tmp), "=&r"(tmp2)
-	: "r"(&lock->lock), "r"(flags),
-	  "i" (offsetof(struct paca_struct, lock_token))
-	: "cr0", "memory");
+	while (1) {
+		if (likely(__spin_trylock(lock) == 0))
+			break;
+		do {
+			HMT_low();
+			if (SHARED_PROCESSOR)
+				__spin_yield(lock);
+		} while (likely(lock->lock != 0));
+		HMT_medium();
+	}
 }

-#define spin_unlock_wait(x)	do { cpu_relax(); } while (spin_is_locked(x))
+static void __inline__ _raw_spin_lock_flags(spinlock_t *lock, unsigned long flags)
+{
+	unsigned long flags_dis;

-#endif /* CONFIG_SPINLINE */
+	while (1) {
+		if (likely(__spin_trylock(lock) == 0))
+			break;
+		local_save_flags(flags_dis);
+		local_irq_restore(flags);
+		do {
+			HMT_low();
+			if (SHARED_PROCESSOR)
+				__spin_yield(lock);
+		} while (likely(lock->lock != 0));
+		HMT_medium();
+		local_irq_restore(flags_dis);
+	}
+}

 /*
  * Read-write spinlocks, allowing multiple readers
@@ -165,67 +159,54 @@ static __inline__ void _raw_write_unlock
 	rw->lock = 0;
 }

-#ifndef CONFIG_SPINLINE
-extern int _raw_read_trylock(rwlock_t *rw);
-extern void _raw_read_lock(rwlock_t *rw);
-extern void _raw_read_unlock(rwlock_t *rw);
-extern int _raw_write_trylock(rwlock_t *rw);
-extern void _raw_write_lock(rwlock_t *rw);
-extern void _raw_write_unlock(rwlock_t *rw);
-
-#else
-static __inline__ int _raw_read_trylock(rwlock_t *rw)
+/*
+ * This returns the old value in the lock + 1,
+ * so we got a read lock if the return value is > 0.
+ */
+static long __inline__ __read_trylock(rwlock_t *rw)
 {
-	unsigned int tmp;
-	unsigned int ret;
+	long tmp;

 	__asm__ __volatile__(
-"1:	lwarx		%0,0,%2		# read_trylock\n\
-	li		%1,0\n\
+"1:	lwarx		%0,0,%1		# read_trylock\n\
 	extsw		%0,%0\n\
 	addic.		%0,%0,1\n\
 	ble-		2f\n\
-	stwcx.		%0,0,%2\n\
+	stwcx.		%0,0,%1\n\
 	bne-		1b\n\
-	li		%1,1\n\
 	isync\n\
-2:"	: "=&r"(tmp), "=&r"(ret)
-	: "r"(&rw->lock)
-	: "cr0", "memory");
+2:"	: "=&r" (tmp)
+	: "r" (&rw->lock)
+	: "cr0", "xer", "memory");

-	return ret;
+	return tmp;
 }

-static __inline__ void _raw_read_lock(rwlock_t *rw)
+static int __inline__ _raw_read_trylock(rwlock_t *rw)
 {
-	unsigned int tmp;
+	return __read_trylock(rw) > 0;
+}

-	__asm__ __volatile__(
-	"b		2f		# read_lock\n\
-1:"
-	HMT_LOW
-"	lwax		%0,0,%1\n\
-	cmpwi		0,%0,0\n\
-	blt+		1b\n"
-	HMT_MEDIUM
-"2:	lwarx		%0,0,%1\n\
-	extsw		%0,%0\n\
-	addic.		%0,%0,1\n\
-	ble-		1b\n\
-	stwcx.		%0,0,%1\n\
-	bne-		2b\n\
-	isync"
-	: "=&r"(tmp)
-	: "r"(&rw->lock)
-	: "cr0", "memory");
+static void __inline__ _raw_read_lock(rwlock_t *rw)
+{
+	while (1) {
+		if (likely(__read_trylock(rw) > 0))
+			break;
+		do {
+			HMT_low();
+			if (SHARED_PROCESSOR)
+				__rw_yield(rw);
+		} while (likely(rw->lock < 0));
+		HMT_medium();
+	}
 }

-static __inline__ void _raw_read_unlock(rwlock_t *rw)
+static void __inline__ _raw_read_unlock(rwlock_t *rw)
 {
-	unsigned int tmp;
+	long tmp;

 	__asm__ __volatile__(
-	"lwsync				# read_unlock\n\
+	"eieio				# read_unlock\n\
 1:	lwarx		%0,0,%1\n\
 	addic		%0,%0,-1\n\
 	stwcx.		%0,0,%1\n\
@@ -235,50 +216,47 @@ static __inline__ void _raw_read_unlock(
 	: "cr0", "memory");
 }

-static __inline__ int _raw_write_trylock(rwlock_t *rw)
+/*
+ * This returns the old value in the lock,
+ * so we got the write lock if the return value is 0.
+ */
+static __inline__ long __write_trylock(rwlock_t *rw)
 {
-	unsigned int tmp;
-	unsigned int ret;
+	long tmp, tmp2;

 	__asm__ __volatile__(
-"1:	lwarx		%0,0,%2		# write_trylock\n\
+"	lwz		%1,%3(13)	# write_trylock\n\
+1:	lwarx		%0,0,%2\n\
 	cmpwi		0,%0,0\n\
-	li		%1,0\n\
 	bne-		2f\n\
-	stwcx.		%3,0,%2\n\
+	stwcx.		%1,0,%2\n\
 	bne-		1b\n\
-	li		%1,1\n\
 	isync\n\
-2:"	: "=&r"(tmp), "=&r"(ret)
-	: "r"(&rw->lock), "r"(-1)
+2:"	: "=&r" (tmp), "=&r" (tmp2)
+	: "r" (&rw->lock), "i" (offsetof(struct paca_struct, lock_token))
 	: "cr0", "memory");

-	return ret;
+	return tmp;
 }

-static __inline__ void _raw_write_lock(rwlock_t *rw)
+static int __inline__ _raw_write_trylock(rwlock_t *rw)
 {
-	unsigned int tmp;
+	return __write_trylock(rw) == 0;
+}

-	__asm__ __volatile__(
-	"b		2f		# write_lock\n\
-1:"
-	HMT_LOW
-	"lwax		%0,0,%1\n\
-	cmpwi		0,%0,0\n\
-	bne+		1b\n"
-	HMT_MEDIUM
-"2:	lwarx		%0,0,%1\n\
-	cmpwi		0,%0,0\n\
-	bne-		1b\n\
-	stwcx.		%2,0,%1\n\
-	bne-		2b\n\
-	isync"
-	: "=&r"(tmp)
-	: "r"(&rw->lock), "r"(-1)
-	: "cr0", "memory");
+static void __inline__ _raw_write_lock(rwlock_t *rw)
+{
+	while (1) {
+		if (likely(__write_trylock(rw) == 0))
+			break;
+		do {
+			HMT_low();
+			if (SHARED_PROCESSOR)
+				__rw_yield(rw);
+		} while (likely(rw->lock != 0));
+		HMT_medium();
+	}
 }
-#endif /* CONFIG_SPINLINE */

 #endif /* __KERNEL__ */
 #endif /* __ASM_SPINLOCK_H */

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-03  0:02 [PATCH][5/8] Arch agnostic completely out of line locks / ppc64 Zwane Mwaikambo
@ 2004-09-09  5:42 ` Paul Mackerras
  2004-09-09 12:41   ` Zwane Mwaikambo
  0 siblings, 1 reply; 50+ messages in thread
From: Paul Mackerras @ 2004-09-09  5:42 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Linux Kernel, Andrew Morton, Linus Torvalds,
	William Lee Irwin III, Matt Mackall, Anton Blanchard

Zwane,

Just got a chance to look at the new out-of-line spinlock stuff
(better late than never :).  I see a couple of problems there.  First,
we now go two levels deep on SMP && PREEMPT: spin_lock is _spin_lock,
which is out of line in kernel/sched.c.  That calls
__preempt_spin_lock, which is out of line in kernel/sched.c, and isn't
in the .text.lock section.  So if we get a timer interrupt in there,
we won't attribute the profile tick to the original caller.

The second problem is that __preempt_spin_lock doesn't do the yield to
the hypervisor which we need to do on shared processor systems.  This
is actually a long-standing problem, not one you have just introduced,
but I have only just noticed it.  I can't make cpu_relax do the yield
because the yield is a directed yield to a specific other virtual cpu
(it says "give the rest of my timeslice to that guy over there") and I
need the value in the lock variable in order to know who is holding
the lock.

Regards,
Paul.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-09  5:42 ` Paul Mackerras
@ 2004-09-09 12:41   ` Zwane Mwaikambo
  2004-09-09 14:54     ` Linus Torvalds
  2004-09-09 15:43     ` [PATCH][5/8] Arch agnostic completely out of line locks / ppc64 Anton Blanchard
  0 siblings, 2 replies; 50+ messages in thread
From: Zwane Mwaikambo @ 2004-09-09 12:41 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Linux Kernel, Andrew Morton, Linus Torvalds,
	William Lee Irwin III, Matt Mackall, Anton Blanchard,
	Nakajima, Jun

Hello Paul,

On Thu, 9 Sep 2004, Paul Mackerras wrote:

> Just got a chance to look at the new out-of-line spinlock stuff
> (better late than never :).  I see a couple of problems there.  First,
> we now go two levels deep on SMP && PREEMPT: spin_lock is _spin_lock,
> which is out of line in kernel/sched.c.  That calls
> __preempt_spin_lock, which is out of line in kernel/sched.c, and isn't
> in the .text.lock section.  So if we get a timer interrupt in there,
> we won't attribute the profile tick to the original caller.

I think that bit is actually intentional since __preempt_spin_lock is also 
marked __sched so that it'll get charged as a scheduling function.

> The second problem is that __preempt_spin_lock doesn't do the yield to
> the hypervisor which we need to do on shared processor systems.  This
> is actually a long-standing problem, not one you have just introduced,
> but I have only just noticed it.  I can't make cpu_relax do the yield
> because the yield is a directed yield to a specific other virtual cpu
> (it says "give the rest of my timeslice to that guy over there") and I
> need the value in the lock variable in order to know who is holding
> the lock.

I think cpu_relax() (or some other primitive) should actually take a 
parameter, this will allow for us to use monitor/mwait on i386 too so 
that in cases where we're spinning waiting on memory modify we could do 
something akin to the following;

while (spin_is_locked(lock))
	cpu_relax(lock);

Although there are wakeup latencies when using monitor/mwait for such, 
some cases such as above should be ok (although there are implementation 
details such as the cost of a monitor operation on things like spin 
unlock paths). I believe such an API modification would be beneficiel for 
you too. What do others think?

Thanks,
	Zwane

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-09 12:41   ` Zwane Mwaikambo
@ 2004-09-09 14:54     ` Linus Torvalds
  2004-09-09 14:55       ` Linus Torvalds
  2004-09-09 15:43     ` [PATCH][5/8] Arch agnostic completely out of line locks / ppc64 Anton Blanchard
  1 sibling, 1 reply; 50+ messages in thread
From: Linus Torvalds @ 2004-09-09 14:54 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Paul Mackerras, Linux Kernel, Andrew Morton,
	William Lee Irwin III, Matt Mackall, Anton Blanchard,
	Nakajima, Jun



On Thu, 9 Sep 2004, Zwane Mwaikambo wrote:
> 
> I think cpu_relax() (or some other primitive) should actually take a 
> parameter, this will allow for us to use monitor/mwait on i386 too so 
> that in cases where we're spinning waiting on memory modify we could do 
> something akin to the following;
> 
> while (spin_is_locked(lock))
> 	cpu_relax(lock);

You can't do it that way. It needs to be arch-specific. When using 
something like monitor/mwait (or a "futex", which something like UML might 
use), you need to load the value you want to wait on _before_. So the 
interface literally would have to be the monitor/mwait interface:

	for (;;) {
		lockval = monitor(lock);
		if (!is_locked(lockval))
			break;
		mwait(lock, lockval);
	}

and the fact is, this is all much better just done in the arch-specific 
spinlock code. 

		Linus

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-09 14:54     ` Linus Torvalds
@ 2004-09-09 14:55       ` Linus Torvalds
  2004-09-09 15:32         ` Zwane Mwaikambo
  0 siblings, 1 reply; 50+ messages in thread
From: Linus Torvalds @ 2004-09-09 14:55 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Paul Mackerras, Linux Kernel, Andrew Morton,
	William Lee Irwin III, Matt Mackall, Anton Blanchard,
	Nakajima, Jun

On Thu, 9 Sep 2004, Linus Torvalds wrote:
> 
> and the fact is, this is all much better just done in the arch-specific 
> spinlock code. 

This is especially true since some architectures may have high overheads 
for this, so you may do normal spinning for a while before you even start 
doing the "fancy" stuff. So there is no ay we should expose this as a 
"generic" interface. It ain't generic. It's very much a low-level 
implementation detail of "spin_lock()".

		Linus

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-09 14:55       ` Linus Torvalds
@ 2004-09-09 15:32         ` Zwane Mwaikambo
  2004-09-12  4:59           ` [PATCH] Yielding processor resources during lock contention Zwane Mwaikambo
  0 siblings, 1 reply; 50+ messages in thread
From: Zwane Mwaikambo @ 2004-09-09 15:32 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Paul Mackerras, Linux Kernel, Andrew Morton,
	William Lee Irwin III, Matt Mackall, Anton Blanchard,
	Nakajima, Jun

On Thu, 9 Sep 2004, Linus Torvalds wrote:

> On Thu, 9 Sep 2004, Linus Torvalds wrote:
> > 
> > and the fact is, this is all much better just done in the arch-specific 
> > spinlock code. 
> 
> This is especially true since some architectures may have high overheads 
> for this, so you may do normal spinning for a while before you even start 
> doing the "fancy" stuff. So there is no ay we should expose this as a 
> "generic" interface. It ain't generic. It's very much a low-level 
> implementation detail of "spin_lock()".

Agreed, Paul we may as well remove the cpu_relax() in __preempt_spin_lock 
and use something like "cpu_yield" (architectures not supporting it would 
just call cpu_relax) i'll have something for you later.

Thanks,
	Zwane

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH] Yielding processor resources during lock contention
  2004-09-09 15:32         ` Zwane Mwaikambo
@ 2004-09-12  4:59           ` Zwane Mwaikambo
  2004-09-12  5:00             ` Andrew Morton
                               ` (3 more replies)
  0 siblings, 4 replies; 50+ messages in thread
From: Zwane Mwaikambo @ 2004-09-12  4:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Paul Mackerras, Linux Kernel, Andrew Morton, Anton Blanchard,
	Nakajima, Jun, Andi Kleen, Ingo Molnar

> > This is especially true since some architectures may have high overheads 
> > for this, so you may do normal spinning for a while before you even start 
> > doing the "fancy" stuff. So there is no ay we should expose this as a 
> > "generic" interface. It ain't generic. It's very much a low-level 
> > implementation detail of "spin_lock()".
> 
> Agreed, Paul we may as well remove the cpu_relax() in __preempt_spin_lock 
> and use something like "cpu_yield" (architectures not supporting it would 
> just call cpu_relax) i'll have something for you later.

The following patch introduces cpu_lock_yield which allows architectures 
to possibly yield processor resources during lock contention. The original 
requirement stems from Paul's requirement on PPC64 LPAR systems to yield 
the processor to the hypervisor instead of spinning. However the general 
concept can be used beneficially on other architectures such as i386 and 
x86_64 with HT to also avoid bouncing cachelines about, utilising 
execution resources and possibly unecessarily consuming power during 
contention. On i386 processors with PNI this is achieved by using the 
monitor/mwait opcodes to halt the processor until a write to the lock is 
done. Paul is the following useable on PPC64 (I'll let you fill in the 
PPC64 bits)?

 include/asm-alpha/processor.h     |    1 +
 include/asm-arm/processor.h       |    1 +
 include/asm-arm26/processor.h     |    1 +
 include/asm-cris/processor.h      |    1 +
 include/asm-h8300/processor.h     |    1 +
 include/asm-i386/processor.h      |   11 +++++++++++
 include/asm-i386/system.h         |    4 ++--
 include/asm-ia64/processor.h      |    1 +
 include/asm-m68k/processor.h      |    1 +
 include/asm-m68knommu/processor.h |    1 +
 include/asm-mips/processor.h      |    1 +
 include/asm-parisc/processor.h    |    1 +
 include/asm-ppc/processor.h       |    1 +
 include/asm-ppc64/processor.h     |    1 +
 include/asm-s390/processor.h      |    2 ++
 include/asm-sh/processor.h        |    1 +
 include/asm-sh64/processor.h      |    1 +
 include/asm-sparc/processor.h     |    1 +
 include/asm-sparc64/processor.h   |    1 +
 include/asm-v850/processor.h      |    1 +
 include/asm-x86_64/processor.h    |   11 +++++++++++
 kernel/spinlock.c                 |    7 +++----
 22 files changed, 46 insertions(+), 6 deletions(-)

Signed-off-by: Zwane Mwaikambo <zwane@fsmlabs.com>

Index: linux-2.6.9-rc1-bk18-stage/include/asm-alpha/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-alpha/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-alpha/processor.h	11 Sep 2004 14:17:53 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-alpha/processor.h	12 Sep 2004 03:53:09 -0000
@@ -73,6 +73,7 @@ unsigned long get_wchan(struct task_stru
   ((tsk) == current ? rdusp() : (tsk)->thread_info->pcb.usp)
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #define ARCH_HAS_PREFETCH
 #define ARCH_HAS_PREFETCHW
Index: linux-2.6.9-rc1-bk18-stage/include/asm-arm/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-arm/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-arm/processor.h	11 Sep 2004 14:17:53 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-arm/processor.h	12 Sep 2004 03:53:42 -0000
@@ -85,6 +85,7 @@ extern void release_thread(struct task_s
 unsigned long get_wchan(struct task_struct *p);
 
 #define cpu_relax()			barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 /*
  * Create a new kernel thread
Index: linux-2.6.9-rc1-bk18-stage/include/asm-arm26/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-arm26/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-arm26/processor.h	11 Sep 2004 14:17:54 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-arm26/processor.h	12 Sep 2004 03:57:27 -0000
@@ -102,6 +102,7 @@ extern void release_thread(struct task_s
 unsigned long get_wchan(struct task_struct *p);
 
 #define cpu_relax()			barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 /* Prepare to copy thread state - unlazy all lazy status */
 #define prepare_to_copy(tsk)    do { } while (0)
Index: linux-2.6.9-rc1-bk18-stage/include/asm-cris/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-cris/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-cris/processor.h	11 Sep 2004 14:17:54 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-cris/processor.h	12 Sep 2004 03:53:31 -0000
@@ -75,5 +75,6 @@ extern inline void release_thread(struct
 #define init_stack      (init_thread_union.stack)
 
 #define cpu_relax()     barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif /* __ASM_CRIS_PROCESSOR_H */
Index: linux-2.6.9-rc1-bk18-stage/include/asm-h8300/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-h8300/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-h8300/processor.h	11 Sep 2004 14:17:54 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-h8300/processor.h	12 Sep 2004 03:57:17 -0000
@@ -136,5 +136,6 @@ unsigned long get_wchan(struct task_stru
 #define	KSTK_ESP(tsk)	((tsk) == current ? rdusp() : (tsk)->thread.usp)
 
 #define cpu_relax()    do { } while (0)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif
Index: linux-2.6.9-rc1-bk18-stage/include/asm-i386/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-i386/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-i386/processor.h	11 Sep 2004 14:17:54 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-i386/processor.h	12 Sep 2004 04:25:06 -0000
@@ -554,6 +554,17 @@ static inline void rep_nop(void)
 
 #define cpu_relax()	rep_nop()
 
+#define __PAUSE8	"rep;nop;rep;nop;rep;nop;rep;nop\n"
+#define __PAUSE16	__PAUSE8 __PAUSE8
+
+#define	cpu_lock_yield(lock, lock_test)	do {	\
+	alternative_input(__PAUSE16, ".byte 0x0f,0x01,0xc8;",		\
+			X86_FEATURE_MWAIT, "a"(lock), "c"(0), "d"(0));	\
+	if (lock_test(lock))			\
+		alternative_input(__PAUSE8, ".byte 0x0f,0x01,0xc9;",	\
+			X86_FEATURE_MWAIT, "a"(0), "c"(0) : "memory"); \
+} while (lock_test(lock))
+
 /* generic versions from gas */
 #define GENERIC_NOP1	".byte 0x90\n"
 #define GENERIC_NOP2    	".byte 0x89,0xf6\n"
Index: linux-2.6.9-rc1-bk18-stage/include/asm-i386/system.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-i386/system.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 system.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-i386/system.h	11 Sep 2004 14:17:54 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-i386/system.h	12 Sep 2004 04:18:35 -0000
@@ -321,7 +321,7 @@ struct alt_instr { 
  * If you use variable sized constraints like "m" or "g" in the 
  * replacement maake sure to pad to the worst case length.
  */
-#define alternative_input(oldinstr, newinstr, feature, input)			\
+#define alternative_input(oldinstr, newinstr, feature, input...)		\
 	asm volatile ("661:\n\t" oldinstr "\n662:\n"				\
 		      ".section .altinstructions,\"a\"\n"			\
 		      "  .align 4\n"						\
@@ -333,7 +333,7 @@ struct alt_instr { 
 		      ".previous\n"						\
 		      ".section .altinstr_replacement,\"ax\"\n"			\
 		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ 	\
-		      ".previous" :: "i" (feature), input)  
+		      ".previous" :: "i" (feature), ##input)  
 
 /*
  * Force strict CPU ordering.
Index: linux-2.6.9-rc1-bk18-stage/include/asm-ia64/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-ia64/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-ia64/processor.h	11 Sep 2004 14:17:54 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-ia64/processor.h	12 Sep 2004 03:54:36 -0000
@@ -576,6 +576,7 @@ ia64_eoi (void)
 }
 
 #define cpu_relax()	ia64_hint(ia64_hint_pause)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 static inline void
 ia64_set_lrr0 (unsigned long val)
Index: linux-2.6.9-rc1-bk18-stage/include/asm-m68k/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-m68k/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-m68k/processor.h	11 Sep 2004 14:17:55 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-m68k/processor.h	12 Sep 2004 03:54:22 -0000
@@ -135,5 +135,6 @@ unsigned long get_wchan(struct task_stru
 #define	KSTK_ESP(tsk)	((tsk) == current ? rdusp() : (tsk)->thread.usp)
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif
Index: linux-2.6.9-rc1-bk18-stage/include/asm-m68knommu/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-m68knommu/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-m68knommu/processor.h	11 Sep 2004 14:17:55 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-m68knommu/processor.h	12 Sep 2004 03:54:29 -0000
@@ -132,5 +132,6 @@ unsigned long get_wchan(struct task_stru
 #define	KSTK_ESP(tsk)	((tsk) == current ? rdusp() : (tsk)->thread.usp)
 
 #define cpu_relax()    do { } while (0)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif
Index: linux-2.6.9-rc1-bk18-stage/include/asm-mips/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-mips/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-mips/processor.h	11 Sep 2004 14:17:55 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-mips/processor.h	12 Sep 2004 03:54:13 -0000
@@ -263,6 +263,7 @@ unsigned long get_wchan(struct task_stru
 #define KSTK_STATUS(tsk) (*(unsigned long *)(__KSTK_TOS(tsk) + __PT_REG(cp0_status)))
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif /* __KERNEL__ */
 
Index: linux-2.6.9-rc1-bk18-stage/include/asm-parisc/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-parisc/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-parisc/processor.h	11 Sep 2004 14:17:55 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-parisc/processor.h	12 Sep 2004 03:54:01 -0000
@@ -341,6 +341,7 @@ extern inline void prefetchw(const void 
 #endif
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif /* __ASSEMBLY__ */
 
Index: linux-2.6.9-rc1-bk18-stage/include/asm-ppc/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-ppc/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-ppc/processor.h	11 Sep 2004 14:17:55 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-ppc/processor.h	12 Sep 2004 03:54:49 -0000
@@ -177,6 +177,7 @@ void _nmask_and_or_msr(unsigned long nma
 #define have_of (_machine == _MACH_chrp || _machine == _MACH_Pmac)
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 /*
  * Prefetch macros.
Index: linux-2.6.9-rc1-bk18-stage/include/asm-ppc64/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-ppc64/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-ppc64/processor.h	11 Sep 2004 14:17:55 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-ppc64/processor.h	12 Sep 2004 03:53:51 -0000
@@ -599,6 +599,7 @@ static inline unsigned long __pack_fe01(
 }
 
 #define cpu_relax()	do { HMT_low(); HMT_medium(); barrier(); } while (0)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 /*
  * Prefetch macros.
Index: linux-2.6.9-rc1-bk18-stage/include/asm-s390/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-s390/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-s390/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-s390/processor.h	12 Sep 2004 03:55:00 -0000
@@ -208,6 +208,8 @@ unsigned long get_wchan(struct task_stru
 	asm volatile ("ex 0,%0" : : "i" (__LC_DIAG44_OPCODE) : "memory")
 #endif /* __s390x__ */
 
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
+
 /*
  * Set PSW mask to specified value, while leaving the
  * PSW addr pointing to the next instruction.
Index: linux-2.6.9-rc1-bk18-stage/include/asm-sh/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-sh/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-sh/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-sh/processor.h	12 Sep 2004 03:55:17 -0000
@@ -273,5 +273,6 @@ extern unsigned long get_wchan(struct ta
 
 #define cpu_sleep()	__asm__ __volatile__ ("sleep" : : : "memory")
 #define cpu_relax()	do { } while (0)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif /* __ASM_SH_PROCESSOR_H */
Index: linux-2.6.9-rc1-bk18-stage/include/asm-sh64/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-sh64/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-sh64/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-sh64/processor.h	12 Sep 2004 03:55:08 -0000
@@ -286,6 +286,7 @@ extern unsigned long get_wchan(struct ta
 #define KSTK_ESP(tsk)  ((tsk)->thread.sp)
 
 #define cpu_relax()	do { } while (0)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif	/* __ASSEMBLY__ */
 #endif /* __ASM_SH64_PROCESSOR_H */
Index: linux-2.6.9-rc1-bk18-stage/include/asm-sparc/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-sparc/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-sparc/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-sparc/processor.h	12 Sep 2004 03:55:36 -0000
@@ -128,6 +128,7 @@ extern unsigned long get_wchan(struct ta
 extern struct task_struct *last_task_used_math;
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif
 
Index: linux-2.6.9-rc1-bk18-stage/include/asm-sparc64/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-sparc64/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-sparc64/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-sparc64/processor.h	12 Sep 2004 03:55:29 -0000
@@ -195,6 +195,7 @@ extern unsigned long get_wchan(struct ta
 #define KSTK_ESP(tsk)  ((tsk)->thread_info->kregs->u_regs[UREG_FP])
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif /* !(__ASSEMBLY__) */
 
Index: linux-2.6.9-rc1-bk18-stage/include/asm-v850/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-v850/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-v850/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-v850/processor.h	12 Sep 2004 03:55:47 -0000
@@ -115,6 +115,7 @@ unsigned long get_wchan (struct task_str
 
 
 #define cpu_relax()    ((void)0)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 
 #else /* __ASSEMBLY__ */
Index: linux-2.6.9-rc1-bk18-stage/include/asm-x86_64/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-x86_64/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-x86_64/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-x86_64/processor.h	12 Sep 2004 04:25:18 -0000
@@ -405,6 +405,17 @@ static inline void prefetchw(void *x) 
 
 #define cpu_relax()   rep_nop()
 
+#define __PAUSE8	"rep;nop;rep;nop;rep;nop;rep;nop\n"
+#define __PAUSE16	__PAUSE8 __PAUSE8
+
+#define	cpu_lock_yield(lock, lock_test)	do {	\
+	alternative_input(__PAUSE16, ".byte 0x0f,0x01,0xc8;",		\
+			X86_FEATURE_MWAIT, "a"(lock), "c"(0), "d"(0));	\
+	if (lock_test(lock))			\
+		alternative_input(__PAUSE8, ".byte 0x0f,0x01,0xc9;",	\
+			X86_FEATURE_MWAIT, "a"(0), "c"(0) : "memory"); \
+} while (lock_test(lock))
+
 /*
  *      NSC/Cyrix CPU configuration register indexes
  */
Index: linux-2.6.9-rc1-bk18-stage/kernel/spinlock.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/kernel/spinlock.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 spinlock.c
--- linux-2.6.9-rc1-bk18-stage/kernel/spinlock.c	11 Sep 2004 14:17:58 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/kernel/spinlock.c	12 Sep 2004 03:47:37 -0000
@@ -10,6 +10,7 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <asm/processor.h>
 
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
@@ -53,8 +54,7 @@ static inline void __preempt_spin_lock(s
 
 	do {
 		preempt_enable();
-		while (spin_is_locked(lock))
-			cpu_relax();
+		cpu_lock_yield(lock, spin_is_locked);
 		preempt_disable();
 	} while (!_raw_spin_trylock(lock));
 }
@@ -75,8 +75,7 @@ static inline void __preempt_write_lock(
 
 	do {
 		preempt_enable();
-		while (rwlock_is_locked(lock))
-			cpu_relax();
+		cpu_lock_yield(lock, rwlock_is_locked);
 		preempt_disable();
 	} while (!_raw_write_trylock(lock));
 }

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  4:59           ` [PATCH] Yielding processor resources during lock contention Zwane Mwaikambo
@ 2004-09-12  5:00             ` Andrew Morton
  2004-09-12  5:10               ` Anton Blanchard
  2004-09-12  5:13               ` Zwane Mwaikambo
  2004-09-12  5:10             ` Linus Torvalds
                               ` (2 subsequent siblings)
  3 siblings, 2 replies; 50+ messages in thread
From: Andrew Morton @ 2004-09-12  5:00 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: torvalds, paulus, linux-kernel, anton, jun.nakajima, ak, mingo

Zwane Mwaikambo <zwane@fsmlabs.com> wrote:
>
> The following patch introduces cpu_lock_yield which allows architectures 
>  to possibly yield processor resources during lock contention.

err.. Haven't you just invented a really sucky semaphore?

> The original 
>  requirement stems from Paul's requirement on PPC64 LPAR systems to yield 
>  the processor to the hypervisor instead of spinning.

Maybe Paul needs to use a semaphore.


Now, maybe Paul has tied himself into sufficiently tangly locking knots
that in some circumstances he needs to spin on the lock and cannot schedule
away.  But he can still use a semaphore and spin on down_trylock.

Confused by all of this.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  5:00             ` Andrew Morton
@ 2004-09-12  5:10               ` Anton Blanchard
  2004-09-12  5:13               ` Zwane Mwaikambo
  1 sibling, 0 replies; 50+ messages in thread
From: Anton Blanchard @ 2004-09-12  5:10 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Zwane Mwaikambo, torvalds, paulus, linux-kernel, jun.nakajima, ak,
	mingo

> Now, maybe Paul has tied himself into sufficiently tangly locking knots
> that in some circumstances he needs to spin on the lock and cannot schedule
> away.  But he can still use a semaphore and spin on down_trylock.
> 
> Confused by all of this.

On a shared processor machine you might have a 4 way box with a number
of 4 way partitions on it. Having them sized as 4 way partitions allows
them to utilise the entire machine when the others are idle.

Now when all partitions start getting busy you have the problem where
one partition may be spinning waiting for a lock but the cpu with the
lock is not currently being scheduled by the hypervisor. Some other
partition is running over there.

At the moment we spin burning cycles until the other cpu finally gets
scheduled by the hypervisor. A slightly better option is for the busy
spinning cpu to call the hypervisor telling it youve got no useful work
to do. 

The best option is to call the hypervisor and tell it that you are are
busy spinning waiting for that other cpu to be scheduled. The hypervisor
then takes whatever action it sees fit (transferring the current cpus
timeslice to the other cpu etc).

Anton

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  5:00             ` Andrew Morton
  2004-09-12  5:10               ` Anton Blanchard
@ 2004-09-12  5:13               ` Zwane Mwaikambo
  2004-09-12  4:32                 ` Nick Piggin
  2004-09-12  5:18                 ` Andrew Morton
  1 sibling, 2 replies; 50+ messages in thread
From: Zwane Mwaikambo @ 2004-09-12  5:13 UTC (permalink / raw)
  To: Andrew Morton
  Cc: torvalds, paulus, linux-kernel, anton, jun.nakajima, ak, mingo

On Sat, 11 Sep 2004, Andrew Morton wrote:

> Zwane Mwaikambo <zwane@fsmlabs.com> wrote:
> >
> > The following patch introduces cpu_lock_yield which allows architectures 
> >  to possibly yield processor resources during lock contention.
> 
> err.. Haven't you just invented a really sucky semaphore?

Wait up.. didn't _you_ create that __preempt_spin_lock thing!? ;)

> > The original 
> >  requirement stems from Paul's requirement on PPC64 LPAR systems to yield 
> >  the processor to the hypervisor instead of spinning.
> 
> Maybe Paul needs to use a semaphore.
> 
> 
> Now, maybe Paul has tied himself into sufficiently tangly locking knots
> that in some circumstances he needs to spin on the lock and cannot schedule
> away.  But he can still use a semaphore and spin on down_trylock.
> 
> Confused by all of this.

Well currently it just enables preempt and spins like a mad man until the 
lock is free. The idea is to allow preempt to get some scheduling done 
during the spin.. But! if you accept this patch today, you get the 
i386 version which will allow your processor to halt until a write to the 
lock occurs whilst allowing interrupts to also trigger the preempt 
scheduling, much easier on the caches.

	Zwane

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  5:13               ` Zwane Mwaikambo
@ 2004-09-12  4:32                 ` Nick Piggin
  2004-09-12  5:33                   ` Zwane Mwaikambo
  2004-09-12  5:18                 ` Andrew Morton
  1 sibling, 1 reply; 50+ messages in thread
From: Nick Piggin @ 2004-09-12  4:32 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Andrew Morton, torvalds, paulus, linux-kernel, anton,
	jun.nakajima, ak, mingo

Zwane Mwaikambo wrote:
> On Sat, 11 Sep 2004, Andrew Morton wrote:

>>Now, maybe Paul has tied himself into sufficiently tangly locking knots
>>that in some circumstances he needs to spin on the lock and cannot schedule
>>away.  But he can still use a semaphore and spin on down_trylock.
>>
>>Confused by all of this.
> 
> 
> Well currently it just enables preempt and spins like a mad man until the 
> lock is free. The idea is to allow preempt to get some scheduling done 
> during the spin.. But! if you accept this patch today, you get the 
> i386 version which will allow your processor to halt until a write to the 
> lock occurs whilst allowing interrupts to also trigger the preempt 
> scheduling, much easier on the caches.
> 

That's the idea though isn't it? If your locks are significantly more
expensive than a context switch and associated cache trashing, use a
semaphore, hypervisor or no.

I presume the hypervisor switch much incur the same sorts of costs as
a context switch?

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  4:32                 ` Nick Piggin
@ 2004-09-12  5:33                   ` Zwane Mwaikambo
  2004-09-12  4:46                     ` Nick Piggin
  0 siblings, 1 reply; 50+ messages in thread
From: Zwane Mwaikambo @ 2004-09-12  5:33 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, torvalds, paulus, linux-kernel, anton,
	jun.nakajima, ak, mingo

On Sun, 12 Sep 2004, Nick Piggin wrote:

> > Well currently it just enables preempt and spins like a mad man until the 
> > lock is free. The idea is to allow preempt to get some scheduling done 
> > during the spin.. But! if you accept this patch today, you get the 
> > i386 version which will allow your processor to halt until a write to the 
> > lock occurs whilst allowing interrupts to also trigger the preempt 
> > scheduling, much easier on the caches.
> > 
> 
> That's the idea though isn't it? If your locks are significantly more
> expensive than a context switch and associated cache trashing, use a
> semaphore, hypervisor or no.
> 
> I presume the hypervisor switch much incur the same sorts of costs as
> a context switch?

In the PPC64 and P4/HT case the spinning on a lock is a bad utilisation of 
the execution resources and that's what we're really trying to avoid, not 
necessarily cache thrashing from a context switch.

	Zwane


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  5:33                   ` Zwane Mwaikambo
@ 2004-09-12  4:46                     ` Nick Piggin
  2004-09-12  5:51                       ` Zwane Mwaikambo
  0 siblings, 1 reply; 50+ messages in thread
From: Nick Piggin @ 2004-09-12  4:46 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Andrew Morton, torvalds, paulus, linux-kernel, anton,
	jun.nakajima, ak, mingo

Zwane Mwaikambo wrote:
> On Sun, 12 Sep 2004, Nick Piggin wrote:

>>I presume the hypervisor switch much incur the same sorts of costs as
>>a context switch?
> 
> 
> In the PPC64 and P4/HT case the spinning on a lock is a bad utilisation of 
> the execution resources and that's what we're really trying to avoid, not 
> necessarily cache thrashing from a context switch.
> 

But isn't yielding to the hypervisor and thus causing it to schedule
something else basically the same as a context switch? (I don't know
anything about POWER).

If yes, then shouldn't your lock be a blocking lock anyway?

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  4:46                     ` Nick Piggin
@ 2004-09-12  5:51                       ` Zwane Mwaikambo
  2004-09-12  5:02                         ` Nick Piggin
  0 siblings, 1 reply; 50+ messages in thread
From: Zwane Mwaikambo @ 2004-09-12  5:51 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, torvalds, paulus, linux-kernel, anton,
	jun.nakajima, ak, mingo

On Sun, 12 Sep 2004, Nick Piggin wrote:

> Zwane Mwaikambo wrote:
> > On Sun, 12 Sep 2004, Nick Piggin wrote:
> 
> >>I presume the hypervisor switch much incur the same sorts of costs as
> >>a context switch?
> > 
> > In the PPC64 and P4/HT case the spinning on a lock is a bad utilisation of 
> > the execution resources and that's what we're really trying to avoid, not 
> > necessarily cache thrashing from a context switch.
> 
> But isn't yielding to the hypervisor and thus causing it to schedule
> something else basically the same as a context switch? (I don't know
> anything about POWER).

Yielding processor to the hypervisor allows it to schedule the physical 
processor for execution time on a different logical partition. The 
scheduling in this case is at the hardware/firmware level.

> If yes, then shouldn't your lock be a blocking lock anyway?

We just happen to allow preemption since we know we're at an upper level 
lock, so we may as well not disable preemption during contention. It 
wouldn't be as straightforward to switch to a blocking lock.

	Zwane

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  5:51                       ` Zwane Mwaikambo
@ 2004-09-12  5:02                         ` Nick Piggin
  2004-09-12  6:09                           ` Anton Blanchard
  0 siblings, 1 reply; 50+ messages in thread
From: Nick Piggin @ 2004-09-12  5:02 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Andrew Morton, torvalds, paulus, linux-kernel, anton,
	jun.nakajima, ak, mingo

Zwane Mwaikambo wrote:

>>If yes, then shouldn't your lock be a blocking lock anyway?
> 
> 
> We just happen to allow preemption since we know we're at an upper level 
> lock, so we may as well not disable preemption during contention. It 
> wouldn't be as straightforward to switch to a blocking lock.
> 

OK that sounds alight.

If it is a "I'm going to be spinning for ages, so schedule me off *now*"
then I think it is wrong... but if it just allows you to keep the hypervisor
preemption turned on, then fine.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  5:02                         ` Nick Piggin
@ 2004-09-12  6:09                           ` Anton Blanchard
  2004-09-12  6:58                             ` William Lee Irwin III
  0 siblings, 1 reply; 50+ messages in thread
From: Anton Blanchard @ 2004-09-12  6:09 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Zwane Mwaikambo, Andrew Morton, torvalds, paulus, linux-kernel,
	jun.nakajima, ak, mingo

> OK that sounds alight.
> 
> If it is a "I'm going to be spinning for ages, so schedule me off *now*"
> then I think it is wrong... but if it just allows you to keep the hypervisor
> preemption turned on, then fine.

The hypervisor can preempt us anywhere but we can give it clues so it
tends to preempt us at better times.

Within a few instructions the hypervisor should be able to answer the
big question: is the cpu Im waiting for currently scheduled on another
cpu. If it is then it might be best for the hypervisor to leave us
alone.

If it isnt then the spin time will most likely be dominated by how
long it takes to get that cpu scheduled again and the hypervisor should
make an effort to schedule it.

Anton

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  6:09                           ` Anton Blanchard
@ 2004-09-12  6:58                             ` William Lee Irwin III
  0 siblings, 0 replies; 50+ messages in thread
From: William Lee Irwin III @ 2004-09-12  6:58 UTC (permalink / raw)
  To: Anton Blanchard
  Cc: Nick Piggin, Zwane Mwaikambo, Andrew Morton, torvalds, paulus,
	linux-kernel, jun.nakajima, ak, mingo

At some point in the past, Nick Piggin wrote:
>> OK that sounds alight.
>> If it is a "I'm going to be spinning for ages, so schedule me off *now*"
>> then I think it is wrong... but if it just allows you to keep the hypervisor
>> preemption turned on, then fine.

On Sun, Sep 12, 2004 at 04:09:04PM +1000, Anton Blanchard wrote:
> The hypervisor can preempt us anywhere but we can give it clues so it
> tends to preempt us at better times.
> Within a few instructions the hypervisor should be able to answer the
> big question: is the cpu Im waiting for currently scheduled on another
> cpu. If it is then it might be best for the hypervisor to leave us
> alone.
> If it isnt then the spin time will most likely be dominated by how
> long it takes to get that cpu scheduled again and the hypervisor should
> make an effort to schedule it.

Preemption of a guest holding a busywait-protected resource is
catastrophic, yes. Voluntarily yielding to the host is one of the
few tactics a guest can use, alongside more widespread use of blocking
locking primitives and using host-provided locking primitives.

Gang scheduling in the host is stronger medicine for this, and has the
further advantage that it does not confuse contention with host
preemption. It may be valuable for Linux likewise to implement gang
scheduling to accommodate its own usage as a host, as futexes are only
usable when the guest issues system calls natively (as is the case in
UML; guests in full CPU emulators may not even be executing native
instructions).

I suspect that the optimal configuration would use 2-tier locking in
addition to gang scheduling the guests, where the outer tier busywaits
for some interval before backing off to an inner tier using a host-
provided locking primitive (obviously the host should not suspend the
entire gang when a guest blocks on the host-provided locking primitive).
It's unclear whether some attempt should be made to avoid the host-
provided wakeup portion of the locking primitives. The 2-tier locking
algorithms are still beneficial in the absence of gang scheduling, as
spurious host context switches likely carry a high cost; the backoff
should be arranged so that busywait is only done as long as the
expected cost of a spurious host context switch remains greater than
the expected cost of busywaiting during a host preemption, where in
the presence of gang scheduling the latter cost is merely lock contention
(i.e. gang scheduling allows guests to busywait as long as they expect
the latency of lock acquisition to be smaller than a host context switch,
where the longer one has busywaited the longer one expectes to busywait).

I suppose the first step is to get a hook for the host primitives into
the locking algorithms, so I approve of all this etc.

-- wli

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  5:13               ` Zwane Mwaikambo
  2004-09-12  4:32                 ` Nick Piggin
@ 2004-09-12  5:18                 ` Andrew Morton
  1 sibling, 0 replies; 50+ messages in thread
From: Andrew Morton @ 2004-09-12  5:18 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: torvalds, paulus, linux-kernel, anton, jun.nakajima, ak, mingo

Zwane Mwaikambo <zwane@linuxpower.ca> wrote:
>
>  > Confused by all of this.
> 
>  Well currently it just enables preempt and spins like a mad man until the 
>  lock is free. The idea is to allow preempt to get some scheduling done 
>  during the spin..

Oh, Ok, I thought you were proposing that "yield" meant an actual yield().

yes, allowing the arch to hook into that busywait loop makes sense.


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  4:59           ` [PATCH] Yielding processor resources during lock contention Zwane Mwaikambo
  2004-09-12  5:00             ` Andrew Morton
@ 2004-09-12  5:10             ` Linus Torvalds
  2004-09-12  5:27               ` Zwane Mwaikambo
  2004-09-12  7:49             ` [PATCH] Yielding processor resources during lock contention Ingo Molnar
  2004-09-12 10:10             ` Arnd Bergmann
  3 siblings, 1 reply; 50+ messages in thread
From: Linus Torvalds @ 2004-09-12  5:10 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Paul Mackerras, Linux Kernel, Andrew Morton, Anton Blanchard,
	Nakajima, Jun, Andi Kleen, Ingo Molnar

On Sun, 12 Sep 2004, Zwane Mwaikambo wrote:
> 
> On i386 processors with PNI this is achieved by using the 
> monitor/mwait opcodes to halt the processor until a write to the lock is 
> done. 

I'd seriously suggest you ask Intel for an official opinion on this. Last
I heard (and that was, I believe, before monitor/mwait had been officially
announced, so it's certainly a bit dated now) it wasn't architecturally
clear that it's a good idea using it for things like spinlocks.

In particular, if the CPU idly waits for a cacheline to be dirtied, it is 
entirely possible that the other CPU that owns the lock and releases it 
won't actually _tell_ the world that the lock has been released for quite 
some time. After all, why should it - if it is the exclusive owner, and it 
sees no memory traffic on the bus, it may have no reason to push out the 
fact that it just released the lock. Just keep it dirty in its caches.

In other words: monitor/mwait on purpose obviously causes fewer bus
cycles. But that very fact may well mean (at least in theory) that you get
very high latencies. It could make spinlock contention very very unfair
(the original CPU keeps getting the lock over and over again, while the
monitor/mwait one never gets to play), and it might also make ping-pong
locking latency be extremely high.

Also, it's entirely possible that monitor/mwait ends up shutting down the
CPU to the point that getting out of a lower-power mode might have a
latency of microseconds or even milliseconds. External (or even internal)
voltage regulators and frequency changes are not instantaneous by any
means..

In other words, I would strongly suggest you _not_ really consider this a
serious thing (feel free to play around with it and try to get some
numbers) with this without getting an answer from Intel about what the
_architected_ behaviour of monitor/mwait is, from a latency standpoint.

Because otherwise you might find that even if it performs like you want it
to on some existing machine, next year it would suck so badly that it's
not even funny.

			Linus

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  5:10             ` Linus Torvalds
@ 2004-09-12  5:27               ` Zwane Mwaikambo
  2004-09-13 12:21                 ` [PATCH] Yielding processor resources during lock contention #2 Zwane Mwaikambo
  0 siblings, 1 reply; 50+ messages in thread
From: Zwane Mwaikambo @ 2004-09-12  5:27 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Paul Mackerras, Linux Kernel, Andrew Morton, Anton Blanchard,
	Nakajima, Jun, Andi Kleen, Ingo Molnar

On Sat, 11 Sep 2004, Linus Torvalds wrote:

> I'd seriously suggest you ask Intel for an official opinion on this. Last
> I heard (and that was, I believe, before monitor/mwait had been officially
> announced, so it's certainly a bit dated now) it wasn't architecturally
> clear that it's a good idea using it for things like spinlocks.

Indeed last i heard, it was unuseable for low latency locking, for this 
case (lock acquisition after relinquish on remote processor) we may be 
able to put up with the higher latency.

> In particular, if the CPU idly waits for a cacheline to be dirtied, it is 
> entirely possible that the other CPU that owns the lock and releases it 
> won't actually _tell_ the world that the lock has been released for quite 
> some time. After all, why should it - if it is the exclusive owner, and it 
> sees no memory traffic on the bus, it may have no reason to push out the 
> fact that it just released the lock. Just keep it dirty in its caches.
> 
> In other words: monitor/mwait on purpose obviously causes fewer bus
> cycles. But that very fact may well mean (at least in theory) that you get
> very high latencies. It could make spinlock contention very very unfair
> (the original CPU keeps getting the lock over and over again, while the
> monitor/mwait one never gets to play), and it might also make ping-pong
> locking latency be extremely high.

Good point, i can see this scenario occuring on current processors.

> Also, it's entirely possible that monitor/mwait ends up shutting down the
> CPU to the point that getting out of a lower-power mode might have a
> latency of microseconds or even milliseconds. External (or even internal)
> voltage regulators and frequency changes are not instantaneous by any
> means..

Yes i noted too, currently it doesn't support any additional options 
to affect how the processor halts during mwait, intel could indeed pull a 
number and make the default ultra high wakeup latency mode. I truly 
hope they don't as it really would make the whole thing useless.

> In other words, I would strongly suggest you _not_ really consider this a
> serious thing (feel free to play around with it and try to get some
> numbers) with this without getting an answer from Intel about what the
> _architected_ behaviour of monitor/mwait is, from a latency standpoint.

Well the i386 and x86_64 versions were purely for testing purposes on my 
part, i'm content with dropping them. The main user was going to be PPC64, 
but i felt compelled to throw in an architecture implementation.

Thanks,
	Zwane

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH] Yielding processor resources during lock contention #2
  2004-09-12  5:27               ` Zwane Mwaikambo
@ 2004-09-13 12:21                 ` Zwane Mwaikambo
  0 siblings, 0 replies; 50+ messages in thread
From: Zwane Mwaikambo @ 2004-09-13 12:21 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Paul Mackerras, Linux Kernel, Andrew Morton, Anton Blanchard,
	Nakajima, Jun, Ingo Molnar

On Sun, 12 Sep 2004, Zwane Mwaikambo wrote:

> Well the i386 and x86_64 versions were purely for testing purposes on my 
> part, i'm content with dropping them. The main user was going to be PPC64, 
> but i felt compelled to throw in an architecture implementation.

The following patch introduces cpu_lock_yield which allows architectures
to possibly yield processor resources during lock contention. The original
requirement stems from Paul's requirement on PPC64 LPAR systems to yield
the processor to the hypervisor instead of spinning. However the general
concept can be used beneficially on other architectures such as i386 and
x86_64 with HT to also avoid bouncing cachelines about, utilising
execution resources and possibly unecessarily consuming power during
contention.

The following patch only provides generic versions for all architectures, 
Paul, Anton could slip in the PPC64 version.

 include/asm-alpha/processor.h     |    1 +
 include/asm-arm/processor.h       |    1 +
 include/asm-arm26/processor.h     |    1 +
 include/asm-cris/processor.h      |    1 +
 include/asm-h8300/processor.h     |    1 +
 include/asm-i386/processor.h      |    1 +
 include/asm-ia64/processor.h      |    1 +
 include/asm-m68k/processor.h      |    1 +
 include/asm-m68knommu/processor.h |    1 +
 include/asm-mips/processor.h      |    1 +
 include/asm-parisc/processor.h    |    1 +
 include/asm-ppc/processor.h       |    1 +
 include/asm-ppc64/processor.h     |    1 +
 include/asm-s390/processor.h      |    2 ++
 include/asm-sh/processor.h        |    1 +
 include/asm-sh64/processor.h      |    1 +
 include/asm-sparc/processor.h     |    1 +
 include/asm-sparc64/processor.h   |    1 +
 include/asm-v850/processor.h      |    1 +
 include/asm-x86_64/processor.h    |    1 +
 kernel/spinlock.c                 |    7 +++----
 21 files changed, 24 insertions(+), 4 deletions(-)

Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca>

Index: linux-2.6.9-rc1-bk18-stage/include/asm-alpha/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-alpha/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-alpha/processor.h	11 Sep 2004 14:17:53 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-alpha/processor.h	12 Sep 2004 03:53:09 -0000
@@ -73,6 +73,7 @@ unsigned long get_wchan(struct task_stru
   ((tsk) == current ? rdusp() : (tsk)->thread_info->pcb.usp)
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #define ARCH_HAS_PREFETCH
 #define ARCH_HAS_PREFETCHW
Index: linux-2.6.9-rc1-bk18-stage/include/asm-arm/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-arm/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-arm/processor.h	11 Sep 2004 14:17:53 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-arm/processor.h	12 Sep 2004 03:53:42 -0000
@@ -85,6 +85,7 @@ extern void release_thread(struct task_s
 unsigned long get_wchan(struct task_struct *p);
 
 #define cpu_relax()			barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 /*
  * Create a new kernel thread
Index: linux-2.6.9-rc1-bk18-stage/include/asm-arm26/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-arm26/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-arm26/processor.h	11 Sep 2004 14:17:54 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-arm26/processor.h	12 Sep 2004 03:57:27 -0000
@@ -102,6 +102,7 @@ extern void release_thread(struct task_s
 unsigned long get_wchan(struct task_struct *p);
 
 #define cpu_relax()			barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 /* Prepare to copy thread state - unlazy all lazy status */
 #define prepare_to_copy(tsk)    do { } while (0)
Index: linux-2.6.9-rc1-bk18-stage/include/asm-cris/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-cris/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-cris/processor.h	11 Sep 2004 14:17:54 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-cris/processor.h	12 Sep 2004 03:53:31 -0000
@@ -75,5 +75,6 @@ extern inline void release_thread(struct
 #define init_stack      (init_thread_union.stack)
 
 #define cpu_relax()     barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif /* __ASM_CRIS_PROCESSOR_H */
Index: linux-2.6.9-rc1-bk18-stage/include/asm-h8300/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-h8300/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-h8300/processor.h	11 Sep 2004 14:17:54 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-h8300/processor.h	12 Sep 2004 03:57:17 -0000
@@ -136,5 +136,6 @@ unsigned long get_wchan(struct task_stru
 #define	KSTK_ESP(tsk)	((tsk) == current ? rdusp() : (tsk)->thread.usp)
 
 #define cpu_relax()    do { } while (0)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif
Index: linux-2.6.9-rc1-bk18-stage/include/asm-i386/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-i386/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-i386/processor.h	11 Sep 2004 14:17:54 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-i386/processor.h	13 Sep 2004 12:16:28 -0000
@@ -553,6 +553,7 @@ static inline void rep_nop(void)
 }
 
 #define cpu_relax()	rep_nop()
+#define	cpu_lock_yield(lock, lock_test)	do { cpu_relax(); } while (lock_test(lock))
 
 /* generic versions from gas */
 #define GENERIC_NOP1	".byte 0x90\n"
Index: linux-2.6.9-rc1-bk18-stage/include/asm-ia64/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-ia64/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-ia64/processor.h	11 Sep 2004 14:17:54 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-ia64/processor.h	12 Sep 2004 03:54:36 -0000
@@ -576,6 +576,7 @@ ia64_eoi (void)
 }
 
 #define cpu_relax()	ia64_hint(ia64_hint_pause)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 static inline void
 ia64_set_lrr0 (unsigned long val)
Index: linux-2.6.9-rc1-bk18-stage/include/asm-m68k/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-m68k/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-m68k/processor.h	11 Sep 2004 14:17:55 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-m68k/processor.h	12 Sep 2004 03:54:22 -0000
@@ -135,5 +135,6 @@ unsigned long get_wchan(struct task_stru
 #define	KSTK_ESP(tsk)	((tsk) == current ? rdusp() : (tsk)->thread.usp)
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif
Index: linux-2.6.9-rc1-bk18-stage/include/asm-m68knommu/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-m68knommu/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-m68knommu/processor.h	11 Sep 2004 14:17:55 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-m68knommu/processor.h	12 Sep 2004 03:54:29 -0000
@@ -132,5 +132,6 @@ unsigned long get_wchan(struct task_stru
 #define	KSTK_ESP(tsk)	((tsk) == current ? rdusp() : (tsk)->thread.usp)
 
 #define cpu_relax()    do { } while (0)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif
Index: linux-2.6.9-rc1-bk18-stage/include/asm-mips/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-mips/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-mips/processor.h	11 Sep 2004 14:17:55 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-mips/processor.h	12 Sep 2004 03:54:13 -0000
@@ -263,6 +263,7 @@ unsigned long get_wchan(struct task_stru
 #define KSTK_STATUS(tsk) (*(unsigned long *)(__KSTK_TOS(tsk) + __PT_REG(cp0_status)))
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif /* __KERNEL__ */
 
Index: linux-2.6.9-rc1-bk18-stage/include/asm-parisc/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-parisc/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-parisc/processor.h	11 Sep 2004 14:17:55 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-parisc/processor.h	12 Sep 2004 03:54:01 -0000
@@ -341,6 +341,7 @@ extern inline void prefetchw(const void 
 #endif
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif /* __ASSEMBLY__ */
 
Index: linux-2.6.9-rc1-bk18-stage/include/asm-ppc/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-ppc/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-ppc/processor.h	11 Sep 2004 14:17:55 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-ppc/processor.h	12 Sep 2004 03:54:49 -0000
@@ -177,6 +177,7 @@ void _nmask_and_or_msr(unsigned long nma
 #define have_of (_machine == _MACH_chrp || _machine == _MACH_Pmac)
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 /*
  * Prefetch macros.
Index: linux-2.6.9-rc1-bk18-stage/include/asm-ppc64/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-ppc64/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-ppc64/processor.h	11 Sep 2004 14:17:55 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-ppc64/processor.h	12 Sep 2004 03:53:51 -0000
@@ -599,6 +599,7 @@ static inline unsigned long __pack_fe01(
 }
 
 #define cpu_relax()	do { HMT_low(); HMT_medium(); barrier(); } while (0)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 /*
  * Prefetch macros.
Index: linux-2.6.9-rc1-bk18-stage/include/asm-s390/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-s390/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-s390/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-s390/processor.h	12 Sep 2004 03:55:00 -0000
@@ -208,6 +208,8 @@ unsigned long get_wchan(struct task_stru
 	asm volatile ("ex 0,%0" : : "i" (__LC_DIAG44_OPCODE) : "memory")
 #endif /* __s390x__ */
 
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
+
 /*
  * Set PSW mask to specified value, while leaving the
  * PSW addr pointing to the next instruction.
Index: linux-2.6.9-rc1-bk18-stage/include/asm-sh/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-sh/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-sh/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-sh/processor.h	12 Sep 2004 03:55:17 -0000
@@ -273,5 +273,6 @@ extern unsigned long get_wchan(struct ta
 
 #define cpu_sleep()	__asm__ __volatile__ ("sleep" : : : "memory")
 #define cpu_relax()	do { } while (0)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif /* __ASM_SH_PROCESSOR_H */
Index: linux-2.6.9-rc1-bk18-stage/include/asm-sh64/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-sh64/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-sh64/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-sh64/processor.h	12 Sep 2004 03:55:08 -0000
@@ -286,6 +286,7 @@ extern unsigned long get_wchan(struct ta
 #define KSTK_ESP(tsk)  ((tsk)->thread.sp)
 
 #define cpu_relax()	do { } while (0)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif	/* __ASSEMBLY__ */
 #endif /* __ASM_SH64_PROCESSOR_H */
Index: linux-2.6.9-rc1-bk18-stage/include/asm-sparc/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-sparc/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-sparc/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-sparc/processor.h	12 Sep 2004 03:55:36 -0000
@@ -128,6 +128,7 @@ extern unsigned long get_wchan(struct ta
 extern struct task_struct *last_task_used_math;
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif
 
Index: linux-2.6.9-rc1-bk18-stage/include/asm-sparc64/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-sparc64/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-sparc64/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-sparc64/processor.h	12 Sep 2004 03:55:29 -0000
@@ -195,6 +195,7 @@ extern unsigned long get_wchan(struct ta
 #define KSTK_ESP(tsk)  ((tsk)->thread_info->kregs->u_regs[UREG_FP])
 
 #define cpu_relax()	barrier()
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 #endif /* !(__ASSEMBLY__) */
 
Index: linux-2.6.9-rc1-bk18-stage/include/asm-v850/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-v850/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-v850/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-v850/processor.h	12 Sep 2004 03:55:47 -0000
@@ -115,6 +115,7 @@ unsigned long get_wchan (struct task_str
 
 
 #define cpu_relax()    ((void)0)
+#define cpu_lock_yield(lock, lock_test) do { cpu_relax(); } while (lock_test(lock))
 
 
 #else /* __ASSEMBLY__ */
Index: linux-2.6.9-rc1-bk18-stage/include/asm-x86_64/processor.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/include/asm-x86_64/processor.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 processor.h
--- linux-2.6.9-rc1-bk18-stage/include/asm-x86_64/processor.h	11 Sep 2004 14:17:56 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/include/asm-x86_64/processor.h	13 Sep 2004 12:16:13 -0000
@@ -404,6 +404,7 @@ static inline void prefetchw(void *x) 
 #define spin_lock_prefetch(x)  prefetchw(x)
 
 #define cpu_relax()   rep_nop()
+#define	cpu_lock_yield(lock, lock_test)	do { cpu_relax(); } while (lock_test(lock))
 
 /*
  *      NSC/Cyrix CPU configuration register indexes
Index: linux-2.6.9-rc1-bk18-stage/kernel/spinlock.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.9-rc1-bk18/kernel/spinlock.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 spinlock.c
--- linux-2.6.9-rc1-bk18-stage/kernel/spinlock.c	11 Sep 2004 14:17:58 -0000	1.1.1.1
+++ linux-2.6.9-rc1-bk18-stage/kernel/spinlock.c	12 Sep 2004 03:47:37 -0000
@@ -10,6 +10,7 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <asm/processor.h>
 
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
@@ -53,8 +54,7 @@ static inline void __preempt_spin_lock(s
 
 	do {
 		preempt_enable();
-		while (spin_is_locked(lock))
-			cpu_relax();
+		cpu_lock_yield(lock, spin_is_locked);
 		preempt_disable();
 	} while (!_raw_spin_trylock(lock));
 }
@@ -75,8 +75,7 @@ static inline void __preempt_write_lock(
 
 	do {
 		preempt_enable();
-		while (rwlock_is_locked(lock))
-			cpu_relax();
+		cpu_lock_yield(lock, rwlock_is_locked);
 		preempt_disable();
 	} while (!_raw_write_trylock(lock));
 }

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  4:59           ` [PATCH] Yielding processor resources during lock contention Zwane Mwaikambo
  2004-09-12  5:00             ` Andrew Morton
  2004-09-12  5:10             ` Linus Torvalds
@ 2004-09-12  7:49             ` Ingo Molnar
  2004-09-12 16:10               ` Zwane Mwaikambo
  2004-09-12 10:10             ` Arnd Bergmann
  3 siblings, 1 reply; 50+ messages in thread
From: Ingo Molnar @ 2004-09-12  7:49 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Linus Torvalds, Paul Mackerras, Linux Kernel, Andrew Morton,
	Anton Blanchard, Nakajima, Jun, Andi Kleen

* Zwane Mwaikambo <zwane@fsmlabs.com> wrote:

> > Agreed, Paul we may as well remove the cpu_relax() in __preempt_spin_lock 
> > and use something like "cpu_yield" (architectures not supporting it would 
> > just call cpu_relax) i'll have something for you later.
> 
> The following patch introduces cpu_lock_yield which allows
> architectures to possibly yield processor resources during lock
> contention. [...]

it is not clear from the Intel documentation how well MONITOR+MWAIT
works on SMP. It seems to be targeted towards hyperthreaded CPUs - where
i suspect it's much easier to monitor the stream of stores done to an
address.

on SMP MESI caches the question is, does MONITOR+MWAIT detect a
cacheline invalidate or even a natural cacheline flush? I doubt it does.
But without having the monitored cacheline in Modified state in the
sleeping CPU for sure it's fundamentally racy and it's not possible to
guarantee latencies: another CPU could have grabbed the cacheline and
could keep it dirty indefinitely. (it could itself go idle forever after
this point!)

the only safe way would be if MONITOR moved the cacheline into Exclusive
state and if it would watch that cacheline possibly going away (i.e. 
another CPU unlocking the spinlock) after this point - in addition to
watching the stores of any HT sibling. But there is no description of
the SMP behavior in the Intel docs - and i truly suspect it would be
documented all over the place if it worked correctly on SMP... So i
think this is an HT-only instruction. (and in that limited context we
could indeed use it!)

one good thing to do would be to test the behavior and count cycles - it
is possible to set up the 'wrong' caching case that can potentially lead
to indefinite delays in mwait. If it turns out to work the expected way
then it would be nice to use. (The deep-sleep worries are not a too big
issue for latency-sensitive users as deep sleep can already occur via
the idle loop so it has to be turned off / tuned anyway.)

but unless the SMP case is guaranteed to work in a time-deterministic
way i dont think this patch can be added :-( It's not just the question
of high latencies, it's the question of fundamental correctness: with
large enough caches there is no guarantee that a CPU will _ever_ flush a
dirty cacheline to RAM.

	Ingo

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  7:49             ` [PATCH] Yielding processor resources during lock contention Ingo Molnar
@ 2004-09-12 16:10               ` Zwane Mwaikambo
  2004-09-12 19:34                 ` Ingo Molnar
  0 siblings, 1 reply; 50+ messages in thread
From: Zwane Mwaikambo @ 2004-09-12 16:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Paul Mackerras, Linux Kernel, Andrew Morton,
	Anton Blanchard, Nakajima, Jun, Andi Kleen

Hi Ingo,

On Sun, 12 Sep 2004, Ingo Molnar wrote:

> it is not clear from the Intel documentation how well MONITOR+MWAIT
> works on SMP. It seems to be targeted towards hyperthreaded CPUs - where
> i suspect it's much easier to monitor the stream of stores done to an
> address.

Indeed, Linus also pointed out how lax the documentation on usage and 
effects of these opcodes.

> one good thing to do would be to test the behavior and count cycles - it
> is possible to set up the 'wrong' caching case that can potentially lead
> to indefinite delays in mwait. If it turns out to work the expected way
> then it would be nice to use. (The deep-sleep worries are not a too big
> issue for latency-sensitive users as deep sleep can already occur via
> the idle loop so it has to be turned off / tuned anyway.)
> 
> but unless the SMP case is guaranteed to work in a time-deterministic
> way i dont think this patch can be added :-( It's not just the question
> of high latencies, it's the question of fundamental correctness: with
> large enough caches there is no guarantee that a CPU will _ever_ flush a
> dirty cacheline to RAM.

The suggested usage of monitor/mwait avoids that scenario, the recommended 
usage being;

while (*addr != condition) {
	monitor(addr)
	if (*addr != condition)
		mwait();
}

This method is used because you can receive spurious wakeups from mwait. 
Like 'hlt' an interrupt, NMI, SMI etc will cause the processor to continue 
executing the next instruction after mwait. The extended sleep state 
however could occur if interrupts are disabled which isn't the case with 
the current users. So there is a sense of time deterministic behaviour 
with an operating system which uses a periodic timer tick for example. But 
regardless of that, i also think this requires further clarification with 
respect to side effects and benchmarking with current hardware.

Thanks,
	Zwane

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12 16:10               ` Zwane Mwaikambo
@ 2004-09-12 19:34                 ` Ingo Molnar
  0 siblings, 0 replies; 50+ messages in thread
From: Ingo Molnar @ 2004-09-12 19:34 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Linus Torvalds, Paul Mackerras, Linux Kernel, Andrew Morton,
	Anton Blanchard, Nakajima, Jun, Andi Kleen


* Zwane Mwaikambo <zwane@linuxpower.ca> wrote:

> > but unless the SMP case is guaranteed to work in a time-deterministic
> > way i dont think this patch can be added :-( It's not just the question
> > of high latencies, it's the question of fundamental correctness: with
> > large enough caches there is no guarantee that a CPU will _ever_ flush a
> > dirty cacheline to RAM.
> 
> The suggested usage of monitor/mwait avoids that scenario, the recommended 
> usage being;
> 
> while (*addr != condition) {
> 	monitor(addr)
> 	if (*addr != condition)
> 		mwait();
> }
> 
> This method is used because you can receive spurious wakeups from
> mwait.  Like 'hlt' an interrupt, NMI, SMI etc will cause the processor
> to continue executing the next instruction after mwait. The extended
> sleep state however could occur if interrupts are disabled which isn't
> the case with the current users. So there is a sense of time
> deterministic behaviour with an operating system which uses a periodic
> timer tick for example. But regardless of that, i also think this
> requires further clarification with respect to side effects and
> benchmarking with current hardware.

right now we might have a periodic timer interrupt, but for the sake of
powersaving (and for virtualization) we want to try variable rate timer
interrupts real soon, which would remove any sort of determinism from
the mwait() loop. So we cannot take this approach unless CPU makers
guarantee the behavior. Right now it's inherently fragile.

Besides, even a 1msec upper cap on spinlock-wait times is totally
unacceptable from a latency and quality of implementation point of view.

	Ingo

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12  4:59           ` [PATCH] Yielding processor resources during lock contention Zwane Mwaikambo
                               ` (2 preceding siblings ...)
  2004-09-12  7:49             ` [PATCH] Yielding processor resources during lock contention Ingo Molnar
@ 2004-09-12 10:10             ` Arnd Bergmann
  2004-09-12 10:43               ` Anton Blanchard
  3 siblings, 1 reply; 50+ messages in thread
From: Arnd Bergmann @ 2004-09-12 10:10 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Linus Torvalds, Paul Mackerras, Linux Kernel, Andrew Morton,
	Anton Blanchard, Nakajima, Jun, Andi Kleen, Ingo Molnar,
	Martin Schwidefsky

[-- Attachment #1: Type: text/plain, Size: 684 bytes --]

On Sonntag, 12. September 2004 06:59, Zwane Mwaikambo wrote:
> The following patch introduces cpu_lock_yield which allows architectures 
> to possibly yield processor resources during lock contention. The original 
> requirement stems from Paul's requirement on PPC64 LPAR systems to yield 
> the processor to the hypervisor instead of spinning. 

For s390, this was solved by simply defining cpu_relax() to the hypervisor
yield operation, because we found that cpu_relax() is used only in busy-wait
situations where it makes sense to continue on another virtual CPU.

What is the benefit of not always doing a full hypervisor yield when
you hit cpu_relax()?

	Arnd <><

[-- Attachment #2: signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12 10:10             ` Arnd Bergmann
@ 2004-09-12 10:43               ` Anton Blanchard
  2004-09-12 11:12                 ` Arnd Bergmann
  0 siblings, 1 reply; 50+ messages in thread
From: Anton Blanchard @ 2004-09-12 10:43 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Zwane Mwaikambo, Linus Torvalds, Paul Mackerras, Linux Kernel,
	Andrew Morton, Nakajima, Jun, Andi Kleen, Ingo Molnar,
	Martin Schwidefsky

 
Hi,

> For s390, this was solved by simply defining cpu_relax() to the hypervisor
> yield operation, because we found that cpu_relax() is used only in busy-wait
> situations where it makes sense to continue on another virtual CPU.
> 
> What is the benefit of not always doing a full hypervisor yield when
> you hit cpu_relax()?

cpu_relax doesnt tell us why we are busy looping. In this particular
case we want to pass to the hypervisor which virtual cpu we are waiting
on so the hypervisor make better scheduling decisions.

Did you manage to see any improvement by yielding to the hypervisor
in cpu_relax?

Anton

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-12 10:43               ` Anton Blanchard
@ 2004-09-12 11:12                 ` Arnd Bergmann
  0 siblings, 0 replies; 50+ messages in thread
From: Arnd Bergmann @ 2004-09-12 11:12 UTC (permalink / raw)
  To: Anton Blanchard
  Cc: Zwane Mwaikambo, Linus Torvalds, Paul Mackerras, Linux Kernel,
	Andrew Morton, Nakajima, Jun, Andi Kleen, Ingo Molnar,
	Martin Schwidefsky, Eberhard Pasch

[-- Attachment #1: Type: text/plain, Size: 675 bytes --]

On Sonntag, 12. September 2004 12:43, Anton Blanchard wrote:
> cpu_relax doesnt tell us why we are busy looping. In this particular
> case we want to pass to the hypervisor which virtual cpu we are waiting
> on so the hypervisor make better scheduling decisions.

Ah, interesting. I think s390 could do the same with the upcoming 5.1 release
of z/VM. However, we decided to ask for an implementation of a futex hypercall
instead, which lets us implement the linux spinlock as a hypervisor semaphore.

> Did you manage to see any improvement by yielding to the hypervisor
> in cpu_relax?
I'm not sure if this has been measured, maybe Martin or Eberhard knows more.

	Arnd <><

[-- Attachment #2: signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-09 12:41   ` Zwane Mwaikambo
  2004-09-09 14:54     ` Linus Torvalds
@ 2004-09-09 15:43     ` Anton Blanchard
  2004-09-09 17:19       ` William Lee Irwin III
  1 sibling, 1 reply; 50+ messages in thread
From: Anton Blanchard @ 2004-09-09 15:43 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Paul Mackerras, Linux Kernel, Andrew Morton, Linus Torvalds,
	William Lee Irwin III, Matt Mackall, Nakajima, Jun

 
> I think that bit is actually intentional since __preempt_spin_lock is also 
> marked __sched so that it'll get charged as a scheduling function.

Yeah, its a bit unfortunate. In profiles with preempt on we end up with
almost all our ticks inside __preempt_spin_lock and never get to use the
nice profile_pc code. I ended up turning preempt off again.

Anton

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-09 15:43     ` [PATCH][5/8] Arch agnostic completely out of line locks / ppc64 Anton Blanchard
@ 2004-09-09 17:19       ` William Lee Irwin III
  2004-09-09 21:38         ` Paul Mackerras
  0 siblings, 1 reply; 50+ messages in thread
From: William Lee Irwin III @ 2004-09-09 17:19 UTC (permalink / raw)
  To: Anton Blanchard
  Cc: Zwane Mwaikambo, Paul Mackerras, Linux Kernel, Andrew Morton,
	Linus Torvalds, Matt Mackall, Nakajima, Jun

At some point in the past, Zwane Mwaikambo wrote:
>> I think that bit is actually intentional since __preempt_spin_lock is also 
>> marked __sched so that it'll get charged as a scheduling function.

On Fri, Sep 10, 2004 at 01:43:00AM +1000, Anton Blanchard wrote:
> Yeah, its a bit unfortunate. In profiles with preempt on we end up with
> almost all our ticks inside __preempt_spin_lock and never get to use the
> nice profile_pc code. I ended up turning preempt off again.

Checking in_lock_functions() (not sure what the name in Zwane's code
was) for non-leaf functions in get_wchan() in addition to
in_sched_functions() should do. e.g.

Index: mm4-2.6.9-rc1/arch/ppc64/kernel/process.c
===================================================================
--- mm4-2.6.9-rc1.orig/arch/ppc64/kernel/process.c	2004-09-08 05:46:09.000000000 -0700
+++ mm4-2.6.9-rc1/arch/ppc64/kernel/process.c	2004-09-09 09:58:47.448326528 -0700
@@ -555,7 +555,8 @@
 			return 0;
 		if (count > 0) {
 			ip = *(unsigned long *)(sp + 16);
-			if (!in_sched_functions(ip))
+			if (!in_sched_functions(ip) &&
+					|| (!count || !in_lock_functions(ip)))
 				return ip;
 		}
 	} while (count++ < 16);
Index: mm4-2.6.9-rc1/include/linux/spinlock.h
===================================================================
--- mm4-2.6.9-rc1.orig/include/linux/spinlock.h	2004-09-08 05:46:18.000000000 -0700
+++ mm4-2.6.9-rc1/include/linux/spinlock.h	2004-09-09 09:57:33.529563888 -0700
@@ -46,6 +46,7 @@
 
 #define __lockfunc fastcall __attribute__((section(".lock.text")))
 
+int in_lock_functions(unsigned long);
 int __lockfunc _spin_trylock(spinlock_t *lock);
 int __lockfunc _write_trylock(rwlock_t *lock);
 void __lockfunc _spin_lock(spinlock_t *lock);
Index: mm4-2.6.9-rc1/kernel/spinlock.c
===================================================================
--- mm4-2.6.9-rc1.orig/kernel/spinlock.c	2004-09-08 05:46:04.000000000 -0700
+++ mm4-2.6.9-rc1/kernel/spinlock.c	2004-09-09 09:57:10.569054416 -0700
@@ -11,6 +11,12 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 
+int in_lock_functions(unsigned long vaddr)
+{
+	return vaddr >= (unsigned long)&__lock_text_start
+			&& vaddr < (unsigned long)&__lock_text_end;
+}
+
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
 	preempt_disable();

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-09 17:19       ` William Lee Irwin III
@ 2004-09-09 21:38         ` Paul Mackerras
  2004-09-09 22:00           ` William Lee Irwin III
  0 siblings, 1 reply; 50+ messages in thread
From: Paul Mackerras @ 2004-09-09 21:38 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Anton Blanchard, Zwane Mwaikambo, Linux Kernel, Andrew Morton,
	Linus Torvalds, Matt Mackall, Nakajima, Jun

William Lee Irwin III writes:

> Checking in_lock_functions() (not sure what the name in Zwane's code
> was) for non-leaf functions in get_wchan() in addition to
> in_sched_functions() should do. e.g.

Well, no, not if we are two levels deep at this point (i.e. something
calls _spin_lock which calls __preempt_spin_lock).  I really don't
want to have to start doing a stack trace in profile_pc().

Paul.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-09 21:38         ` Paul Mackerras
@ 2004-09-09 22:00           ` William Lee Irwin III
  2004-09-09 23:36             ` Paul Mackerras
  0 siblings, 1 reply; 50+ messages in thread
From: William Lee Irwin III @ 2004-09-09 22:00 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Anton Blanchard, Zwane Mwaikambo, Linux Kernel, Andrew Morton,
	Linus Torvalds, Matt Mackall, Nakajima, Jun

William Lee Irwin III writes:
>> Checking in_lock_functions() (not sure what the name in Zwane's code
>> was) for non-leaf functions in get_wchan() in addition to
>> in_sched_functions() should do. e.g.

On Fri, Sep 10, 2004 at 07:38:15AM +1000, Paul Mackerras wrote:
> Well, no, not if we are two levels deep at this point (i.e. something
> calls _spin_lock which calls __preempt_spin_lock).  I really don't
> want to have to start doing a stack trace in profile_pc().

The semantics of profile_pc() have never included backtracing through
scheduling primitives, so I'd say just report __preempt_spin_lock().


-- wli

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-09 22:00           ` William Lee Irwin III
@ 2004-09-09 23:36             ` Paul Mackerras
  2004-09-10  0:09               ` William Lee Irwin III
  0 siblings, 1 reply; 50+ messages in thread
From: Paul Mackerras @ 2004-09-09 23:36 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Anton Blanchard, Zwane Mwaikambo, Linux Kernel, Andrew Morton,
	Linus Torvalds, Matt Mackall, Nakajima, Jun

William Lee Irwin III writes:

> The semantics of profile_pc() have never included backtracing through
> scheduling primitives, so I'd say just report __preempt_spin_lock().

I disagree that __preempt_spin_lock is a scheduling primitive, or at
least I disagree that it is primarily a scheduling primitive.  We
don't spend vast amounts of time spinning in any of the other
scheduling primitives; if we have to wait we call schedule().

Paul.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-09 23:36             ` Paul Mackerras
@ 2004-09-10  0:09               ` William Lee Irwin III
  2004-09-10  0:21                 ` Linus Torvalds
  0 siblings, 1 reply; 50+ messages in thread
From: William Lee Irwin III @ 2004-09-10  0:09 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Anton Blanchard, Zwane Mwaikambo, Linux Kernel, Andrew Morton,
	Linus Torvalds, Matt Mackall, Nakajima, Jun

William Lee Irwin III writes:
>> The semantics of profile_pc() have never included backtracing through
>> scheduling primitives, so I'd say just report __preempt_spin_lock().

On Fri, Sep 10, 2004 at 09:36:52AM +1000, Paul Mackerras wrote:
> I disagree that __preempt_spin_lock is a scheduling primitive, or at
> least I disagree that it is primarily a scheduling primitive.  We
> don't spend vast amounts of time spinning in any of the other
> scheduling primitives; if we have to wait we call schedule().

Unfortunately the alternative appears to be stack unwinding in
profile_pc(), which was why I hoped we could punt. Any other ideas?


-- wli

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10  0:09               ` William Lee Irwin III
@ 2004-09-10  0:21                 ` Linus Torvalds
  2004-09-10  0:35                   ` Anton Blanchard
  0 siblings, 1 reply; 50+ messages in thread
From: Linus Torvalds @ 2004-09-10  0:21 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Paul Mackerras, Anton Blanchard, Zwane Mwaikambo, Linux Kernel,
	Andrew Morton, Matt Mackall, Nakajima, Jun

On Thu, 9 Sep 2004, William Lee Irwin III wrote:
> 
> Unfortunately the alternative appears to be stack unwinding in
> profile_pc(), which was why I hoped we could punt. Any other ideas?

Why do we care about profile_pc() here? It should do the right thing 
as-is.

What you care about is wchan, and stack unwiding _over_ the spinlocks. 
Since a spinlock can never be part of the wchan callchain, I vote we just 
change "in_sched_functions()" to claim that anything in the spinlock 
section is also a scheduler function as far as it's concerned.

That makes wchan happy, and profile_pc() really never should care.

		Linus

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10  0:21                 ` Linus Torvalds
@ 2004-09-10  0:35                   ` Anton Blanchard
  2004-09-10  0:54                     ` Linus Torvalds
  0 siblings, 1 reply; 50+ messages in thread
From: Anton Blanchard @ 2004-09-10  0:35 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: William Lee Irwin III, Paul Mackerras, Zwane Mwaikambo,
	Linux Kernel, Andrew Morton, Matt Mackall, Nakajima, Jun

Hi,

> Why do we care about profile_pc() here? It should do the right thing 
> as-is.

With preempt off profile_pc works as expected, timer ticks in spinlocks
are apportioned to the calling function. Turn preempt on and you end up
with one big bucket called __preempt_spin_lock where most of the
spinlock ticks end up.

> What you care about is wchan, and stack unwiding _over_ the spinlocks. 
> Since a spinlock can never be part of the wchan callchain, I vote we just 
> change "in_sched_functions()" to claim that anything in the spinlock 
> section is also a scheduler function as far as it's concerned.
> 
> That makes wchan happy, and profile_pc() really never should care.

At the moment profile_pc is simple and only looks at the timer interrupt
pt_regs. With 2 levels of locks (_spin_lock -> __preempt_spin_lock) we
now have to walk the stack. Obviously fixable, it just a bit more work
in profile_pc. We would also need to move __preempt_spin_lock into the 
lock section which should be fine if we change wchan backtracing to do
as you suggest.

Anton

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10  0:35                   ` Anton Blanchard
@ 2004-09-10  0:54                     ` Linus Torvalds
  2004-09-10  1:42                       ` Anton Blanchard
  2004-09-10  3:23                       ` Anton Blanchard
  0 siblings, 2 replies; 50+ messages in thread
From: Linus Torvalds @ 2004-09-10  0:54 UTC (permalink / raw)
  To: Anton Blanchard
  Cc: William Lee Irwin III, Paul Mackerras, Zwane Mwaikambo,
	Linux Kernel, Andrew Morton, Matt Mackall, Nakajima, Jun



On Fri, 10 Sep 2004, Anton Blanchard wrote:
> 
> With preempt off profile_pc works as expected, timer ticks in spinlocks
> are apportioned to the calling function. Turn preempt on and you end up
> with one big bucket called __preempt_spin_lock where most of the
> spinlock ticks end up.

But that's because "__preempt_spin_lock" on ppc is in the wrong section, 
no?

Just change it from "__sched" to "__lockfunc", and move it to 
kernel/spinlock.c while you're at it, and everything works right. Do the 
same for __preempt_write_lock() too.

Oh, and you need to do the "is_sched_function()" change too that I 
outlined in the previous email.

All in all, about four lines of code changes (+ some movement to make it 
all saner)

		Linus

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10  0:54                     ` Linus Torvalds
@ 2004-09-10  1:42                       ` Anton Blanchard
  2004-09-10  1:50                         ` Anton Blanchard
  2004-09-10  3:23                       ` Anton Blanchard
  1 sibling, 1 reply; 50+ messages in thread
From: Anton Blanchard @ 2004-09-10  1:42 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: William Lee Irwin III, Paul Mackerras, Zwane Mwaikambo,
	Linux Kernel, Andrew Morton, Matt Mackall, Nakajima, Jun

 
> But that's because "__preempt_spin_lock" on ppc is in the wrong section, 
> no?
> 
> Just change it from "__sched" to "__lockfunc", and move it to 
> kernel/spinlock.c while you're at it, and everything works right. Do the 
> same for __preempt_write_lock() too.
> 
> Oh, and you need to do the "is_sched_function()" change too that I 
> outlined in the previous email.

Yep Im agreeing with you :) But we also need to fix profile_pc() since
it wont handle the 2 deep _spin_lock -> __preempt_spin_lock. Should be
no problems, ill work on this.

Anton

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10  1:42                       ` Anton Blanchard
@ 2004-09-10  1:50                         ` Anton Blanchard
  2004-09-10  2:22                           ` William Lee Irwin III
  0 siblings, 1 reply; 50+ messages in thread
From: Anton Blanchard @ 2004-09-10  1:50 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: William Lee Irwin III, Paul Mackerras, Zwane Mwaikambo,
	Linux Kernel, Andrew Morton, Matt Mackall, Nakajima, Jun

 
> Yep Im agreeing with you :) But we also need to fix profile_pc() since
> it wont handle the 2 deep _spin_lock -> __preempt_spin_lock. Should be
> no problems, ill work on this.

Lets just make __preempt_spin_lock inline, then everything should work
as is.

Anton

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10  1:50                         ` Anton Blanchard
@ 2004-09-10  2:22                           ` William Lee Irwin III
  2004-09-10  2:32                             ` William Lee Irwin III
                                               ` (2 more replies)
  0 siblings, 3 replies; 50+ messages in thread
From: William Lee Irwin III @ 2004-09-10  2:22 UTC (permalink / raw)
  To: Anton Blanchard
  Cc: Linus Torvalds, Paul Mackerras, Zwane Mwaikambo, Linux Kernel,
	Andrew Morton, Matt Mackall, Nakajima, Jun

On Fri, Sep 10, 2004 at 11:50:41AM +1000, Anton Blanchard wrote:
> Lets just make __preempt_spin_lock inline, then everything should work
> as is.

Well, there are patches that do this along with other more useful
things in the works (my spin on this is en route shortly, sorry the
response was delayed due to a power failure).


-- wli

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10  2:22                           ` William Lee Irwin III
@ 2004-09-10  2:32                             ` William Lee Irwin III
  2004-09-10  2:37                               ` William Lee Irwin III
  2004-09-10  3:24                             ` Anton Blanchard
  2004-09-10  7:40                             ` Ingo Molnar
  2 siblings, 1 reply; 50+ messages in thread
From: William Lee Irwin III @ 2004-09-10  2:32 UTC (permalink / raw)
  To: Anton Blanchard
  Cc: Linus Torvalds, Paul Mackerras, Zwane Mwaikambo, Linux Kernel,
	Andrew Morton, Matt Mackall, Nakajima, Jun

On Fri, Sep 10, 2004 at 11:50:41AM +1000, Anton Blanchard wrote:
>> Lets just make __preempt_spin_lock inline, then everything should work
>> as is.

On Thu, Sep 09, 2004 at 07:22:04PM -0700, William Lee Irwin III wrote:
> Well, there are patches that do this along with other more useful
> things in the works (my spin on this is en route shortly, sorry the
> response was delayed due to a power failure).

I ran into instant pain because various read_lock() -related locking
primitives don't exist... first approximation:

This patch folds __preempt_spin_lock() and __preempt_write_lock() into
their callers, and follows Ingo's patch in sweeping various other locking
variants into doing likewise for CONFIG_PREEMPT=y && CONFIG_SMP=y.

Compiletested on ia64.


Index: mm4-2.6.9-rc1/kernel/sched.c
===================================================================
--- mm4-2.6.9-rc1.orig/kernel/sched.c	2004-09-08 06:10:47.000000000 -0700
+++ mm4-2.6.9-rc1/kernel/sched.c	2004-09-09 18:59:53.723177997 -0700
@@ -4572,49 +4572,3 @@
 }
 EXPORT_SYMBOL(__might_sleep);
 #endif
-
-
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
-/*
- * This could be a long-held lock.  If another CPU holds it for a long time,
- * and that CPU is not asked to reschedule then *this* CPU will spin on the
- * lock for a long time, even if *this* CPU is asked to reschedule.
- *
- * So what we do here, in the slow (contended) path is to spin on the lock by
- * hand while permitting preemption.
- *
- * Called inside preempt_disable().
- */
-void __sched __preempt_spin_lock(spinlock_t *lock)
-{
-	if (preempt_count() > 1) {
-		_raw_spin_lock(lock);
-		return;
-	}
-	do {
-		preempt_enable();
-		while (spin_is_locked(lock))
-			cpu_relax();
-		preempt_disable();
-	} while (!_raw_spin_trylock(lock));
-}
-
-EXPORT_SYMBOL(__preempt_spin_lock);
-
-void __sched __preempt_write_lock(rwlock_t *lock)
-{
-	if (preempt_count() > 1) {
-		_raw_write_lock(lock);
-		return;
-	}
-
-	do {
-		preempt_enable();
-		while (rwlock_is_locked(lock))
-			cpu_relax();
-		preempt_disable();
-	} while (!_raw_write_trylock(lock));
-}
-
-EXPORT_SYMBOL(__preempt_write_lock);
-#endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */
Index: mm4-2.6.9-rc1/kernel/spinlock.c
===================================================================
--- mm4-2.6.9-rc1.orig/kernel/spinlock.c	2004-09-08 06:10:36.000000000 -0700
+++ mm4-2.6.9-rc1/kernel/spinlock.c	2004-09-09 19:34:54.890144445 -0700
@@ -33,90 +33,274 @@
 }
 EXPORT_SYMBOL(_write_trylock);
 
+
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+/*
+ * This could be a long-held lock.  If another CPU holds it for a long time,
+ * and that CPU is not asked to reschedule then *this* CPU will spin on the
+ * lock for a long time, even if *this* CPU is asked to reschedule.
+ * So what we do here, in the slow (contended) path is to spin on the lock by
+ * hand while permitting preemption.
+ */
 void __lockfunc _spin_lock(spinlock_t *lock)
 {
 	preempt_disable();
-	if (unlikely(!_raw_spin_trylock(lock)))
-		__preempt_spin_lock(lock);
+	if (unlikely(!_raw_spin_trylock(lock))) {
+		if (preempt_count() > 1)
+			_raw_spin_lock(lock);
+		else {
+			do {
+				preempt_enable();
+				while (spin_is_locked(lock))
+					cpu_relax();
+				preempt_disable();
+			} while (!_raw_spin_trylock(lock));
+		}
+	}
 }
 
-void __lockfunc _write_lock(rwlock_t *lock)
+void __lockfunc _spin_lock_irq(spinlock_t *lock)
 {
+	local_irq_disable();
 	preempt_disable();
-	if (unlikely(!_raw_write_trylock(lock)))
-		__preempt_write_lock(lock);
+	if (unlikely(!_raw_spin_trylock(lock))) {
+		if (preempt_count() > 1)
+			_raw_spin_lock(lock);
+		else {
+			do {
+				preempt_enable();
+				local_irq_enable();
+				while (spin_is_locked(lock))
+					cpu_relax();
+				preempt_disable();
+				local_irq_disable();
+			} while (!_raw_spin_trylock(lock));
+		}
+	}
 }
-#else
-void __lockfunc _spin_lock(spinlock_t *lock)
+
+unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
 {
+	unsigned long flags;
+
+	local_irq_save(flags);
 	preempt_disable();
-	_raw_spin_lock(lock);
+	if (unlikely(!_raw_spin_trylock(lock))) {
+		if (preempt_count() > 1)
+			_raw_spin_lock_flags(lock, flags);
+		else {
+			do {
+				preempt_enable();
+				local_irq_restore(flags);
+				while (spin_is_locked(lock))
+					cpu_relax();
+				preempt_disable();
+				local_irq_save(flags);
+			} while (!_raw_spin_trylock(lock));
+		}
+	}
+	return flags;
+}
+
+void __lockfunc _spin_lock_bh(spinlock_t *lock)
+{
+	local_bh_disable();
+	preempt_disable();
+	if (unlikely(!_raw_spin_trylock(lock))) {
+		if (preempt_count() > 1)
+			_raw_spin_lock(lock);
+		else {
+			do {
+				preempt_enable();
+				local_bh_enable();
+				while (spin_is_locked(lock))
+					cpu_relax();
+				preempt_disable();
+				local_bh_disable();
+			} while (!_raw_spin_trylock(lock));
+		}
+	}
 }
 
 void __lockfunc _write_lock(rwlock_t *lock)
 {
 	preempt_disable();
-	_raw_write_lock(lock);
+	if (unlikely(!_raw_write_trylock(lock))) {
+		if (preempt_count() > 1)
+			_raw_write_lock(lock);
+		else {
+			do {
+				preempt_enable();
+				while (rwlock_is_locked(lock))
+					cpu_relax();
+				preempt_disable();
+			} while (!_raw_write_trylock(lock));
+		}
+	}
 }
-#endif
-EXPORT_SYMBOL(_spin_lock);
-EXPORT_SYMBOL(_write_lock);
 
-void __lockfunc _read_lock(rwlock_t *lock)
+void __lockfunc _write_lock_irq(rwlock_t *lock)
 {
+	local_irq_disable();
 	preempt_disable();
-	_raw_read_lock(lock);
+	if (unlikely(!_raw_write_trylock(lock))) {
+		if (preempt_count() > 1)
+			_raw_write_lock(lock);
+		else {
+			do {
+				preempt_enable();
+				local_irq_enable();
+				while (rwlock_is_locked(lock))
+					cpu_relax();
+				preempt_disable();
+				local_irq_disable();
+			} while (!_raw_write_trylock(lock));
+		}
+	}
 }
-EXPORT_SYMBOL(_read_lock);
 
-void __lockfunc _spin_unlock(spinlock_t *lock)
+unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
 {
-	_raw_spin_unlock(lock);
-	preempt_enable();
+	unsigned long flags;
+
+	local_irq_save(flags);
+	preempt_disable();
+	if (unlikely(!_raw_write_trylock(lock))) {
+		if (preempt_count() > 1)
+			_raw_write_lock(lock);
+		else {
+			do {
+				preempt_enable();
+				local_irq_restore(flags);
+				while (rwlock_is_locked(lock))
+					cpu_relax();
+				local_irq_save(flags);
+				preempt_disable();
+			} while (!_raw_write_trylock(lock));
+		}
+	}
+	return flags;
 }
-EXPORT_SYMBOL(_spin_unlock);
 
-void __lockfunc _write_unlock(rwlock_t *lock)
+void __lockfunc _write_lock_bh(rwlock_t *lock)
 {
-	_raw_write_unlock(lock);
-	preempt_enable();
+	local_bh_disable();
+	preempt_disable();
+	if (unlikely(!_raw_write_trylock(lock))) {
+		if (preempt_count() > 1)
+			_raw_write_lock(lock);
+		else {
+			do {
+				preempt_enable();
+				local_bh_enable();
+				while (rwlock_is_locked(lock))
+					cpu_relax();
+				local_bh_disable();
+				preempt_disable();
+			} while (!_raw_write_trylock(lock));
+		}
+	}
+}
+
+#ifdef NOTYET
+/*
+ * XXX: FIXME architectures don't implement locking primitives used here:
+ * _raw_read_trylock()
+ * _raw_read_lock_flags()
+ * rwlock_is_write_locked()
+ */
+void __lockfunc _read_lock(rwlock_t *lock)
+{
+	preempt_disable();
+	if (unlikely(!_raw_read_trylock(lock))) {
+		if (preempt_count() > 1)
+			_raw_read_lock(lock);
+		else {
+			do {
+				preempt_enable();
+				while (rwlock_is_write_locked(lock))
+					cpu_relax();
+				preempt_disable();
+			} while (!_raw_read_trylock(lock));
+		}
+	}
 }
-EXPORT_SYMBOL(_write_unlock);
 
-void __lockfunc _read_unlock(rwlock_t *lock)
+void __lockfunc _read_lock_irq(rwlock_t *lock)
 {
-	_raw_read_unlock(lock);
-	preempt_enable();
+	local_irq_disable();
+	preempt_disable();
+	if (unlikely(!_raw_read_trylock(lock))) {
+		if (preempt_count() > 1)
+			_raw_read_lock(lock);
+		else {
+			do {
+				preempt_enable();
+				local_irq_enable();
+				while (rwlock_is_write_locked(lock))
+					cpu_relax();
+				local_irq_disable();
+				preempt_disable();
+			} while (!_raw_read_trylock(lock));
+		}
+	}
 }
-EXPORT_SYMBOL(_read_unlock);
 
-unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
+unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
 	preempt_disable();
-	_raw_spin_lock_flags(lock, flags);
+	if (unlikely(!_raw_read_trylock(lock))) {
+		if (preempt_count() > 1)
+			_raw_read_lock_flags(lock, flags);
+		else {
+			do {
+				preempt_enable();
+				local_irq_restore(flags);
+				while (rwlock_is_write_locked(lock))
+					cpu_relax();
+				local_irq_save(flags);
+				preempt_disable();
+			} while (!_raw_read_trylock(lock));
+		}
+	}
 	return flags;
 }
-EXPORT_SYMBOL(_spin_lock_irqsave);
 
-void __lockfunc _spin_lock_irq(spinlock_t *lock)
+void __lockfunc _read_lock_bh(rwlock_t *lock)
 {
-	local_irq_disable();
+	local_bh_disable();
 	preempt_disable();
-	_raw_spin_lock(lock);
+	if (unlikely(_raw_read_trylock(lock))) {
+		if (preempt_count() > 1)
+			_raw_read_lock(lock);
+		else {
+			do {
+				preempt_enable();
+				local_bh_enable();
+				while (rwlock_is_write_locked(lock))
+					cpu_relax();
+				local_bh_disable();
+				preempt_disable();
+			} while (!_raw_read_trylock(lock));
+		}
+	}
+}
+#else /* NOTYET */
+void __lockfunc _read_lock(rwlock_t *lock)
+{
+	preempt_disable();
+	_raw_read_lock(lock);
 }
-EXPORT_SYMBOL(_spin_lock_irq);
 
-void __lockfunc _spin_lock_bh(spinlock_t *lock)
+void __lockfunc _read_lock_irq(rwlock_t *lock)
 {
-	local_bh_disable();
+	local_irq_disable();
 	preempt_disable();
-	_raw_spin_lock(lock);
+	_raw_read_lock(lock);
 }
-EXPORT_SYMBOL(_spin_lock_bh);
 
 unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
 {
@@ -127,34 +311,51 @@
 	_raw_read_lock(lock);
 	return flags;
 }
-EXPORT_SYMBOL(_read_lock_irqsave);
 
-void __lockfunc _read_lock_irq(rwlock_t *lock)
+void __lockfunc _read_lock_bh(rwlock_t *lock)
 {
-	local_irq_disable();
+	local_bh_disable();
 	preempt_disable();
 	_raw_read_lock(lock);
 }
-EXPORT_SYMBOL(_read_lock_irq);
+#endif /* NOTYET */
 
-void __lockfunc _read_lock_bh(rwlock_t *lock)
+#else /* !CONFIG_PREEMPT */
+void __lockfunc _spin_lock(spinlock_t *lock)
 {
-	local_bh_disable();
 	preempt_disable();
-	_raw_read_lock(lock);
+	_raw_spin_lock(lock);
 }
-EXPORT_SYMBOL(_read_lock_bh);
 
-unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
+void __lockfunc _spin_lock_irq(spinlock_t *lock)
+{
+	local_irq_disable();
+	preempt_disable();
+	_raw_spin_lock(lock);
+}
+
+unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
 	preempt_disable();
-	_raw_write_lock(lock);
+	_raw_spin_lock_flags(lock, flags);
 	return flags;
 }
-EXPORT_SYMBOL(_write_lock_irqsave);
+
+void __lockfunc _spin_lock_bh(spinlock_t *lock)
+{
+	local_bh_disable();
+	preempt_disable();
+	_raw_spin_lock(lock);
+}
+
+void __lockfunc _write_lock(rwlock_t *lock)
+{
+	preempt_disable();
+	_raw_write_lock(lock);
+}
 
 void __lockfunc _write_lock_irq(rwlock_t *lock)
 {
@@ -162,7 +363,16 @@
 	preempt_disable();
 	_raw_write_lock(lock);
 }
-EXPORT_SYMBOL(_write_lock_irq);
+
+unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	preempt_disable();
+	_raw_write_lock(lock);
+	return flags;
+}
 
 void __lockfunc _write_lock_bh(rwlock_t *lock)
 {
@@ -170,8 +380,71 @@
 	preempt_disable();
 	_raw_write_lock(lock);
 }
+
+void __lockfunc _read_lock(rwlock_t *lock)
+{
+	preempt_disable();
+	_raw_read_lock(lock);
+}
+
+void __lockfunc _read_lock_irq(rwlock_t *lock)
+{
+	local_irq_disable();
+	preempt_disable();
+	_raw_read_lock(lock);
+}
+
+unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	preempt_disable();
+	_raw_read_lock(lock);
+	return flags;
+}
+
+void __lockfunc _read_lock_bh(rwlock_t *lock)
+{
+	local_bh_disable();
+	preempt_disable();
+	_raw_read_lock(lock);
+}
+#endif
+EXPORT_SYMBOL(_spin_lock);
+EXPORT_SYMBOL(_write_lock);
+EXPORT_SYMBOL(_read_lock);
+EXPORT_SYMBOL(_spin_lock_irq);
+EXPORT_SYMBOL(_spin_lock_irqsave);
+EXPORT_SYMBOL(_spin_lock_bh);
+EXPORT_SYMBOL(_read_lock_irq);
+EXPORT_SYMBOL(_read_lock_irqsave);
+EXPORT_SYMBOL(_read_lock_bh);
+EXPORT_SYMBOL(_write_lock_irq);
+EXPORT_SYMBOL(_write_lock_irqsave);
 EXPORT_SYMBOL(_write_lock_bh);
 
+void __lockfunc _spin_unlock(spinlock_t *lock)
+{
+	_raw_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_spin_unlock);
+
+void __lockfunc _write_unlock(rwlock_t *lock)
+{
+	_raw_write_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_write_unlock);
+
+void __lockfunc _read_unlock(rwlock_t *lock)
+{
+	_raw_read_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_read_unlock);
+
 void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
 	_raw_spin_unlock(lock);

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10  2:32                             ` William Lee Irwin III
@ 2004-09-10  2:37                               ` William Lee Irwin III
  0 siblings, 0 replies; 50+ messages in thread
From: William Lee Irwin III @ 2004-09-10  2:37 UTC (permalink / raw)
  To: Anton Blanchard
  Cc: Linus Torvalds, Paul Mackerras, Zwane Mwaikambo, Linux Kernel,
	Andrew Morton, Matt Mackall, Nakajima, Jun

On Thu, Sep 09, 2004 at 07:32:58PM -0700, William Lee Irwin III wrote:
> I ran into instant pain because various read_lock() -related locking
> primitives don't exist... first approximation:
> This patch folds __preempt_spin_lock() and __preempt_write_lock() into
> their callers, and follows Ingo's patch in sweeping various other locking
> variants into doing likewise for CONFIG_PREEMPT=y && CONFIG_SMP=y.
> Compiletested on ia64.

Don't bother with this, Ingo's preempt-smp.patch at least got
_raw_read_trylock() implemented so just use that.


-- wli

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10  2:22                           ` William Lee Irwin III
  2004-09-10  2:32                             ` William Lee Irwin III
@ 2004-09-10  3:24                             ` Anton Blanchard
  2004-09-10  7:40                             ` Ingo Molnar
  2 siblings, 0 replies; 50+ messages in thread
From: Anton Blanchard @ 2004-09-10  3:24 UTC (permalink / raw)
  To: William Lee Irwin III, Linus Torvalds, Paul Mackerras,
	Zwane Mwaikambo, Linux Kernel, Andrew Morton, Matt Mackall,
	Nakajima, Jun

 
> Well, there are patches that do this along with other more useful
> things in the works (my spin on this is en route shortly, sorry the
> response was delayed due to a power failure).

OK. I just sent out a minimal fix, hopefully there isnt too much overlap :)

Anton

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10  2:22                           ` William Lee Irwin III
  2004-09-10  2:32                             ` William Lee Irwin III
  2004-09-10  3:24                             ` Anton Blanchard
@ 2004-09-10  7:40                             ` Ingo Molnar
  2004-09-10 12:16                               ` William Lee Irwin III
  2 siblings, 1 reply; 50+ messages in thread
From: Ingo Molnar @ 2004-09-10  7:40 UTC (permalink / raw)
  To: William Lee Irwin III, Anton Blanchard, Linus Torvalds,
	Paul Mackerras, Zwane Mwaikambo, Linux Kernel, Andrew Morton,
	Matt Mackall, Nakajima, Jun


* William Lee Irwin III <wli@holomorphy.com> wrote:

> On Fri, Sep 10, 2004 at 11:50:41AM +1000, Anton Blanchard wrote:
> > Lets just make __preempt_spin_lock inline, then everything should work
> > as is.
> 
> Well, there are patches that do this along with other more useful
> things in the works (my spin on this is en route shortly, sorry the
> response was delayed due to a power failure).

i already sent the full solution that primarily solves the SMP &&
PREEMPT latency problems but also solves the section issue, two days
ago:

   http://lkml.org/lkml/2004/9/8/97

	Ingo

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10  7:40                             ` Ingo Molnar
@ 2004-09-10 12:16                               ` William Lee Irwin III
  2004-09-10 12:22                                 ` Ingo Molnar
  0 siblings, 1 reply; 50+ messages in thread
From: William Lee Irwin III @ 2004-09-10 12:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Anton Blanchard, Linus Torvalds, Paul Mackerras, Zwane Mwaikambo,
	Linux Kernel, Andrew Morton, Matt Mackall, Nakajima, Jun

* William Lee Irwin III <wli@holomorphy.com> wrote:
>> Well, there are patches that do this along with other more useful
>> things in the works (my spin on this is en route shortly, sorry the
>> response was delayed due to a power failure).

On Fri, Sep 10, 2004 at 09:40:34AM +0200, Ingo Molnar wrote:
> i already sent the full solution that primarily solves the SMP &&
> PREEMPT latency problems but also solves the section issue, two days
> ago:
>    http://lkml.org/lkml/2004/9/8/97

When I noticed there was work to do along the lines of creating
read_trylock() primitives I dropped the ->break_lock -less variant I
had brewed up and directed Linus to your patch.


-- wli

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10 12:16                               ` William Lee Irwin III
@ 2004-09-10 12:22                                 ` Ingo Molnar
  0 siblings, 0 replies; 50+ messages in thread
From: Ingo Molnar @ 2004-09-10 12:22 UTC (permalink / raw)
  To: William Lee Irwin III, Anton Blanchard, Linus Torvalds,
	Paul Mackerras, Zwane Mwaikambo, Linux Kernel, Andrew Morton,
	Matt Mackall, Nakajima, Jun


* William Lee Irwin III <wli@holomorphy.com> wrote:

> * William Lee Irwin III <wli@holomorphy.com> wrote:
> >> Well, there are patches that do this along with other more useful
> >> things in the works (my spin on this is en route shortly, sorry the
> >> response was delayed due to a power failure).
> 
> On Fri, Sep 10, 2004 at 09:40:34AM +0200, Ingo Molnar wrote:
> > i already sent the full solution that primarily solves the SMP &&
> > PREEMPT latency problems but also solves the section issue, two days
> > ago:
> >    http://lkml.org/lkml/2004/9/8/97
> 
> When I noticed there was work to do along the lines of creating
> read_trylock() primitives I dropped the ->break_lock -less variant I
> had brewed up and directed Linus to your patch.

thanks - i just read that mail. I'd expect the preempt-smp patch to go
the normal route via touching -mm first.

	Ingo

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10  0:54                     ` Linus Torvalds
  2004-09-10  1:42                       ` Anton Blanchard
@ 2004-09-10  3:23                       ` Anton Blanchard
  2004-09-10  7:56                         ` Anton Blanchard
  1 sibling, 1 reply; 50+ messages in thread
From: Anton Blanchard @ 2004-09-10  3:23 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: William Lee Irwin III, Paul Mackerras, Zwane Mwaikambo,
	Linux Kernel, Andrew Morton, Matt Mackall, Nakajima, Jun

 
> All in all, about four lines of code changes (+ some movement to make it 
> all saner)

How does this look?

- create in_lock_functions() to match in_sched_functions(). Export it
  for use in oprofile.
- use char __lock_text_start[] instead of long __lock_text_start when
  declaring linker symbols. Rusty fixed a number of these a while ago
  based on advice from rth.
- Move __preempt_*_lock into kernel/spinlock.c and make it inline. This
  means locks are only one deep.
- Make in_sched_functions() check in_lock_functions()

I apologise if I have broken any architectures.

Anton

Signed-off-by: Anton Blanchard <anton@samba.org>

diff -puN arch/arm/kernel/time.c~spinlockfix arch/arm/kernel/time.c
--- compat_affinity/arch/arm/kernel/time.c~spinlockfix	2004-09-10 13:13:50.266767502 +1000
+++ compat_affinity-anton/arch/arm/kernel/time.c	2004-09-10 13:13:50.347761276 +1000
@@ -57,8 +57,7 @@ unsigned long profile_pc(struct pt_regs 
 {
 	unsigned long fp, pc = instruction_pointer(regs);
 
-	if (pc >= (unsigned long)&__lock_text_start &&
-	    pc <= (unsigned long)&__lock_text_end) {
+	if (in_lock_functions(pc)) {
 		fp = thread_saved_fp(current);
 		pc = pc_pointer(((unsigned long *)fp)[-1]);
 	}
diff -puN arch/i386/kernel/time.c~spinlockfix arch/i386/kernel/time.c
--- compat_affinity/arch/i386/kernel/time.c~spinlockfix	2004-09-10 13:13:50.272767040 +1000
+++ compat_affinity-anton/arch/i386/kernel/time.c	2004-09-10 13:13:50.349761122 +1000
@@ -205,8 +205,7 @@ unsigned long profile_pc(struct pt_regs 
 {
 	unsigned long pc = instruction_pointer(regs);
 
-	if (pc >= (unsigned long)&__lock_text_start &&
-	    pc <= (unsigned long)&__lock_text_end)
+	if (in_lock_functions(pc))
 		return *(unsigned long *)(regs->ebp + 4);
 
 	return pc;
diff -puN arch/ppc/kernel/time.c~spinlockfix arch/ppc/kernel/time.c
--- compat_affinity/arch/ppc/kernel/time.c~spinlockfix	2004-09-10 13:13:50.278766579 +1000
+++ compat_affinity-anton/arch/ppc/kernel/time.c	2004-09-10 13:13:50.350761045 +1000
@@ -113,8 +113,7 @@ unsigned long profile_pc(struct pt_regs 
 {
 	unsigned long pc = instruction_pointer(regs);
 
-	if (pc >= (unsigned long)&__lock_text_start &&
-	    pc <= (unsigned long)&__lock_text_end)
+	if (in_lock_functions(pc))
 		return regs->link;
 
 	return pc;
diff -puN arch/ppc64/kernel/rtasd.c~spinlockfix arch/ppc64/kernel/rtasd.c
diff -puN arch/ppc64/kernel/time.c~spinlockfix arch/ppc64/kernel/time.c
--- compat_affinity/arch/ppc64/kernel/time.c~spinlockfix	2004-09-10 13:13:50.290765657 +1000
+++ compat_affinity-anton/arch/ppc64/kernel/time.c	2004-09-10 13:13:50.353760815 +1000
@@ -163,8 +163,7 @@ unsigned long profile_pc(struct pt_regs 
 {
 	unsigned long pc = instruction_pointer(regs);
 
-	if (pc >= (unsigned long)&__lock_text_start &&
-	    pc <= (unsigned long)&__lock_text_end)
+	if (in_lock_functions(pc))
 		return regs->link;
 
 	return pc;
diff -puN arch/sparc/kernel/time.c~spinlockfix arch/sparc/kernel/time.c
--- compat_affinity/arch/sparc/kernel/time.c~spinlockfix	2004-09-10 13:13:50.296765196 +1000
+++ compat_affinity-anton/arch/sparc/kernel/time.c	2004-09-10 13:13:50.355760661 +1000
@@ -81,20 +81,22 @@ struct intersil *intersil_clock;
 
 unsigned long profile_pc(struct pt_regs *regs)
 {
-	extern int __copy_user_begin, __copy_user_end;
-	extern int __atomic_begin, __atomic_end;
-	extern int __bzero_begin, __bzero_end;
-	extern int __bitops_begin, __bitops_end;
+	extern char __copy_user_begin[], __copy_user_end[];
+	extern char __atomic_begin[], __atomic_end[];
+	extern char __bzero_begin[], __bzero_end[];
+	extern char __bitops_begin[], __bitops_end[];
+
 	unsigned long pc = regs->pc;
 
-	if ((pc >= (unsigned long) &__copy_user_begin &&
-	     pc < (unsigned long) &__copy_user_end) ||
-	    (pc >= (unsigned long) &__atomic_begin &&
-	     pc < (unsigned long) &__atomic_end) ||
-	    (pc >= (unsigned long) &__bzero_begin &&
-	     pc < (unsigned long) &__bzero_end) ||
-	    (pc >= (unsigned long) &__bitops_begin &&
-	     pc < (unsigned long) &__bitops_end))
+	if (in_lock_functions(pc) ||
+	    (pc >= (unsigned long) __copy_user_begin &&
+	     pc < (unsigned long) __copy_user_end) ||
+	    (pc >= (unsigned long) __atomic_begin &&
+	     pc < (unsigned long) __atomic_end) ||
+	    (pc >= (unsigned long) __bzero_begin &&
+	     pc < (unsigned long) __bzero_end) ||
+	    (pc >= (unsigned long) __bitops_begin &&
+	     pc < (unsigned long) __bitops_end))
 		pc = regs->u_regs[UREG_RETPC];
 	return pc;
 }
diff -puN arch/sparc64/kernel/time.c~spinlockfix arch/sparc64/kernel/time.c
--- compat_affinity/arch/sparc64/kernel/time.c~spinlockfix	2004-09-10 13:13:50.302764735 +1000
+++ compat_affinity-anton/arch/sparc64/kernel/time.c	2004-09-10 13:13:50.358760430 +1000
@@ -73,8 +73,7 @@ unsigned long profile_pc(struct pt_regs 
 {
 	unsigned long pc = instruction_pointer(regs);
 
-	if (pc >= (unsigned long)&__lock_text_start &&
-	    pc <= (unsigned long)&__lock_text_end)
+	if (in_lock_functions(pc))
 		return regs->u_regs[UREG_RETPC];
 	return pc;
 }
diff -puN arch/x86_64/kernel/time.c~spinlockfix arch/x86_64/kernel/time.c
--- compat_affinity/arch/x86_64/kernel/time.c~spinlockfix	2004-09-10 13:13:50.308764273 +1000
+++ compat_affinity-anton/arch/x86_64/kernel/time.c	2004-09-10 13:13:50.361760200 +1000
@@ -184,8 +184,7 @@ unsigned long profile_pc(struct pt_regs 
 {
 	unsigned long pc = instruction_pointer(regs);
 
-	if (pc >= (unsigned long)&__lock_text_start &&
-	    pc <= (unsigned long)&__lock_text_end)
+	if (in_lock_functions(pc))
 		return *(unsigned long *)regs->rbp;
 	return pc;
 }
diff -puN include/linux/spinlock.h~spinlockfix include/linux/spinlock.h
--- compat_affinity/include/linux/spinlock.h~spinlockfix	2004-09-10 13:13:50.314763812 +1000
+++ compat_affinity-anton/include/linux/spinlock.h	2004-09-10 13:13:50.363760046 +1000
@@ -68,11 +68,11 @@ void __lockfunc _write_unlock_irqrestore
 void __lockfunc _write_unlock_irq(rwlock_t *lock);
 void __lockfunc _write_unlock_bh(rwlock_t *lock);
 int __lockfunc _spin_trylock_bh(spinlock_t *lock);
-
-extern unsigned long __lock_text_start;
-extern unsigned long __lock_text_end;
+int in_lock_functions(unsigned long addr);
 #else
 
+#define in_lock_functions(ADDR) 0
+
 #if !defined(CONFIG_PREEMPT) && !defined(CONFIG_DEBUG_SPINLOCK)
 # define atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic)
 # define ATOMIC_DEC_AND_LOCK
diff -puN kernel/sched.c~spinlockfix kernel/sched.c
--- compat_affinity/kernel/sched.c~spinlockfix	2004-09-10 13:13:50.321763274 +1000
+++ compat_affinity-anton/kernel/sched.c	2004-09-10 13:13:50.373759277 +1000
@@ -4672,8 +4672,9 @@ int in_sched_functions(unsigned long add
 {
 	/* Linker adds these: start and end of __sched functions */
 	extern char __sched_text_start[], __sched_text_end[];
-	return addr >= (unsigned long)__sched_text_start
-		&& addr < (unsigned long)__sched_text_end;
+	return in_lock_functions(addr) ||
+		(addr >= (unsigned long)__sched_text_start
+		&& addr < (unsigned long)__sched_text_end);
 }
 
 void __init sched_init(void)
@@ -4765,49 +4766,3 @@ void __might_sleep(char *file, int line)
 }
 EXPORT_SYMBOL(__might_sleep);
 #endif
-
-
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
-/*
- * This could be a long-held lock.  If another CPU holds it for a long time,
- * and that CPU is not asked to reschedule then *this* CPU will spin on the
- * lock for a long time, even if *this* CPU is asked to reschedule.
- *
- * So what we do here, in the slow (contended) path is to spin on the lock by
- * hand while permitting preemption.
- *
- * Called inside preempt_disable().
- */
-void __sched __preempt_spin_lock(spinlock_t *lock)
-{
-	if (preempt_count() > 1) {
-		_raw_spin_lock(lock);
-		return;
-	}
-	do {
-		preempt_enable();
-		while (spin_is_locked(lock))
-			cpu_relax();
-		preempt_disable();
-	} while (!_raw_spin_trylock(lock));
-}
-
-EXPORT_SYMBOL(__preempt_spin_lock);
-
-void __sched __preempt_write_lock(rwlock_t *lock)
-{
-	if (preempt_count() > 1) {
-		_raw_write_lock(lock);
-		return;
-	}
-
-	do {
-		preempt_enable();
-		while (rwlock_is_locked(lock))
-			cpu_relax();
-		preempt_disable();
-	} while (!_raw_write_trylock(lock));
-}
-
-EXPORT_SYMBOL(__preempt_write_lock);
-#endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */
diff -puN kernel/spinlock.c~spinlockfix kernel/spinlock.c
--- compat_affinity/kernel/spinlock.c~spinlockfix	2004-09-10 13:13:50.326762890 +1000
+++ compat_affinity-anton/kernel/spinlock.c	2004-09-10 13:19:40.103522704 +1000
@@ -33,7 +33,32 @@ int __lockfunc _write_trylock(rwlock_t *
 }
 EXPORT_SYMBOL(_write_trylock);
 
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+#ifdef CONFIG_PREEMPT
+/*
+ * This could be a long-held lock.  If another CPU holds it for a long time,
+ * and that CPU is not asked to reschedule then *this* CPU will spin on the
+ * lock for a long time, even if *this* CPU is asked to reschedule.
+ *
+ * So what we do here, in the slow (contended) path is to spin on the lock by
+ * hand while permitting preemption.
+ *
+ * Called inside preempt_disable().
+ */
+static inline void __preempt_spin_lock(spinlock_t *lock)
+{
+	if (preempt_count() > 1) {
+		_raw_spin_lock(lock);
+		return;
+	}
+
+	do {
+		preempt_enable();
+		while (spin_is_locked(lock))
+			cpu_relax();
+		preempt_disable();
+	} while (!_raw_spin_trylock(lock));
+}
+
 void __lockfunc _spin_lock(spinlock_t *lock)
 {
 	preempt_disable();
@@ -41,6 +66,21 @@ void __lockfunc _spin_lock(spinlock_t *l
 		__preempt_spin_lock(lock);
 }
 
+static inline void __preempt_write_lock(rwlock_t *lock)
+{
+	if (preempt_count() > 1) {
+		_raw_write_lock(lock);
+		return;
+	}
+
+	do {
+		preempt_enable();
+		while (rwlock_is_locked(lock))
+			cpu_relax();
+		preempt_disable();
+	} while (!_raw_write_trylock(lock));
+}
+
 void __lockfunc _write_lock(rwlock_t *lock)
 {
 	preempt_disable();
@@ -256,3 +296,13 @@ int __lockfunc _spin_trylock_bh(spinlock
 	return 0;
 }
 EXPORT_SYMBOL(_spin_trylock_bh);
+
+int in_lock_functions(unsigned long addr)
+{
+	/* Linker adds these: start and end of __lockfunc functions */
+	extern char __lock_text_start[], __lock_text_end[];
+
+	return addr >= (unsigned long)__lock_text_start
+	&& addr < (unsigned long)__lock_text_end;
+}
+EXPORT_SYMBOL(in_lock_functions);
_

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH][5/8] Arch agnostic completely out of line locks / ppc64
  2004-09-10  3:23                       ` Anton Blanchard
@ 2004-09-10  7:56                         ` Anton Blanchard
  0 siblings, 0 replies; 50+ messages in thread
From: Anton Blanchard @ 2004-09-10  7:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: William Lee Irwin III, Paul Mackerras, Zwane Mwaikambo,
	Linux Kernel, Andrew Morton, Matt Mackall, Nakajima, Jun

 
I forgot to remove the no longer used __preempt_*_lock prototypes.

Signed-off-by: Anton Blanchard <anton@samba.org>

diff -puN include/linux/spinlock.h~fix_preempt include/linux/spinlock.h
--- foobar2/include/linux/spinlock.h~fix_preempt	2004-09-10 17:45:47.600637107 +1000
+++ foobar2-anton/include/linux/spinlock.h	2004-09-10 17:45:54.710204190 +1000
@@ -405,11 +405,6 @@ do { \
 
 /* Where's read_trylock? */
 
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
-void __preempt_spin_lock(spinlock_t *lock);
-void __preempt_write_lock(rwlock_t *lock);
-#endif
-
 #define spin_lock(lock)		_spin_lock(lock)
 #define write_lock(lock)	_write_lock(lock)
 #define read_lock(lock)		_read_lock(lock)
_

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH] Yielding processor resources during lock contention
@ 2004-09-13  0:45 Nakajima, Jun
  0 siblings, 0 replies; 50+ messages in thread
From: Nakajima, Jun @ 2004-09-13  0:45 UTC (permalink / raw)
  To: Linus Torvalds, Zwane Mwaikambo
  Cc: Paul Mackerras, Linux Kernel, Andrew Morton, Anton Blanchard,
	Andi Kleen, Ingo Molnar

>From: Linus Torvalds [mailto:torvalds@osdl.org]
>Sent: Saturday, September 11, 2004 10:10 PM
>To: Zwane Mwaikambo
>Cc: Paul Mackerras; Linux Kernel; Andrew Morton; Anton Blanchard;
Nakajima,
>Jun; Andi Kleen; Ingo Molnar
>Subject: Re: [PATCH] Yielding processor resources during lock
contention
>
>
>
>On Sun, 12 Sep 2004, Zwane Mwaikambo wrote:
>>
>> On i386 processors with PNI this is achieved by using the
>> monitor/mwait opcodes to halt the processor until a write to the lock
is
>> done.
>
>I'd seriously suggest you ask Intel for an official opinion on this.
Last
>I heard (and that was, I believe, before monitor/mwait had been
officially
>announced, so it's certainly a bit dated now) it wasn't architecturally
>clear that it's a good idea using it for things like spinlocks.
>
>In particular, if the CPU idly waits for a cacheline to be dirtied, it
is
>entirely possible that the other CPU that owns the lock and releases it
>won't actually _tell_ the world that the lock has been released for
quite
>some time. After all, why should it - if it is the exclusive owner, and
it
>sees no memory traffic on the bus, it may have no reason to push out
the
>fact that it just released the lock. Just keep it dirty in its caches.
>
>In other words: monitor/mwait on purpose obviously causes fewer bus
>cycles. But that very fact may well mean (at least in theory) that you
get
>very high latencies. It could make spinlock contention very very unfair
>(the original CPU keeps getting the lock over and over again, while the
>monitor/mwait one never gets to play), and it might also make ping-pong
>locking latency be extremely high.
>
This is my personal comment, but the current monitor/mwait
implementation on Prescott is not proper for things like spinlock
because high latency. At this point, the idle loop in the kernel is one
of the intended usage models under that implementation. In the future,
the latency may be lowered, and we'll revisit spinlocks using
monitor/mwait in that case.

Jun

<snip>
>			Linus

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH] Yielding processor resources during lock contention
@ 2004-09-13  1:35 Nakajima, Jun
  2004-09-13  6:35 ` Ingo Molnar
  0 siblings, 1 reply; 50+ messages in thread
From: Nakajima, Jun @ 2004-09-13  1:35 UTC (permalink / raw)
  To: Ingo Molnar, Zwane Mwaikambo
  Cc: Linus Torvalds, Paul Mackerras, Linux Kernel, Andrew Morton,
	Anton Blanchard, Andi Kleen

>From: Ingo Molnar [mailto:mingo@elte.hu]
>Sent: Sunday, September 12, 2004 12:49 AM
>To: Zwane Mwaikambo
>Cc: Linus Torvalds; Paul Mackerras; Linux Kernel; Andrew Morton; Anton
>Blanchard; Nakajima, Jun; Andi Kleen
>Subject: Re: [PATCH] Yielding processor resources during lock
contention
>
>
>* Zwane Mwaikambo <zwane@fsmlabs.com> wrote:
>
>> > Agreed, Paul we may as well remove the cpu_relax() in
>__preempt_spin_lock
>> > and use something like "cpu_yield" (architectures not supporting it
>would
>> > just call cpu_relax) i'll have something for you later.
>>
>> The following patch introduces cpu_lock_yield which allows
>> architectures to possibly yield processor resources during lock
>> contention. [...]
>
>it is not clear from the Intel documentation how well MONITOR+MWAIT
>works on SMP. It seems to be targeted towards hyperthreaded CPUs -
where
>i suspect it's much easier to monitor the stream of stores done to an
>address.

Ingo, Hi

>
>on SMP MESI caches the question is, does MONITOR+MWAIT detect a
>cacheline invalidate or even a natural cacheline flush? I doubt it
does.
>But without having the monitored cacheline in Modified state in the
>sleeping CPU for sure it's fundamentally racy and it's not possible to
>guarantee latencies: another CPU could have grabbed the cacheline and
>could keep it dirty indefinitely. (it could itself go idle forever
after
>this point!)
>
>the only safe way would be if MONITOR moved the cacheline into
Exclusive
>state and if it would watch that cacheline possibly going away (i.e.
>another CPU unlocking the spinlock) after this point - in addition to
>watching the stores of any HT sibling. But there is no description of
>the SMP behavior in the Intel docs - and i truly suspect it would be
>documented all over the place if it worked correctly on SMP... So i
>think this is an HT-only instruction. (and in that limited context we
>could indeed use it!)

MONITOR+MWAIT works on SMP as well.

>
>one good thing to do would be to test the behavior and count cycles -
it
>is possible to set up the 'wrong' caching case that can potentially
lead
>to indefinite delays in mwait. If it turns out to work the expected way
>then it would be nice to use. (The deep-sleep worries are not a too big
>issue for latency-sensitive users as deep sleep can already occur via
>the idle loop so it has to be turned off / tuned anyway.)

The instruction mwait is just a hint for the processor to enter an
(implementation-specific) optimized state, and in general it cannot
cause indefinite delays because of various break events, including
interrupts. If an interrupt happens, then the processor gets out of the
mwait state. The instruction does not support a restart at the mwait
instruction. In other words the processor needs to redo the mwait
instruction to reenter the state after a break event. Event time-outs
may take the processor out of the state, depending on the
implementation.

>
>but unless the SMP case is guaranteed to work in a time-deterministic
>way i dont think this patch can be added :-( It's not just the question
>of high latencies, it's the question of fundamental correctness: with
>large enough caches there is no guarantee that a CPU will _ever_ flush
a
>dirty cacheline to RAM.

As I noted (and Linus suspected), monitor/mwait is not proper for
spinlocks on Prescott/Nocona because of high latencies.

>
>	Ingo

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Yielding processor resources during lock contention
  2004-09-13  1:35 Nakajima, Jun
@ 2004-09-13  6:35 ` Ingo Molnar
  0 siblings, 0 replies; 50+ messages in thread
From: Ingo Molnar @ 2004-09-13  6:35 UTC (permalink / raw)
  To: Nakajima, Jun
  Cc: Zwane Mwaikambo, Linus Torvalds, Paul Mackerras, Linux Kernel,
	Andrew Morton, Anton Blanchard, Andi Kleen

* Nakajima, Jun <jun.nakajima@intel.com> wrote:

> The instruction mwait is just a hint for the processor to enter an
> (implementation-specific) optimized state, and in general it cannot
> cause indefinite delays because of various break events, including
> interrupts. If an interrupt happens, then the processor gets out of
> the mwait state. [...]

the problem is, there is no guarantee that an interrupt may happen on a
CPU from that point on _ever_. Currently we do have a periodic timer
interrupt but this is not cast into stone. So we cannot really use MWAIT
for the idle loop either - at most as a replacement for HLT without any
latency assumptions. We could monitor the idle task's ->need_resched
flag.

> [...] The instruction does not support a restart at the mwait
> instruction. In other words the processor needs to redo the mwait
> instruction to reenter the state after a break event. Event time-outs
> may take the processor out of the state, depending on the
> implementation.

this is not a problem. The proper way to fix MWAIT for spinlocks (as i
suggested in the first email) would be to monitor a given cacheline's
existence in the L2 cache. I.e. to snoop for invalidates of that 
cacheline. That would make it possible to do:

	while (!spin_trylock(lock)) {
		MONITOR(lock);
		if (spin_is_locked(lock))
			MWAIT(lock);
	}

the first spin_trylock() test brings the cacheline into the L2 cache. 
There is no guarantee that by the time MONITOR executes it will still be
there but the likelyhood is high. Then we execute MONITOR which clears
the event flag. The second spin_is_locked() test brings the cacheline
into the L2 for sure - then we enter MWAIT which waits until the
event-flag is clear. If at any point another CPU moves the cacheline
into Exclusive (and then Modified) state then this CPU _must_ invalidate
the lock cacheline - hence the event flag will be set and MWAIT
immediately exits. I know that this is not how MONITOR/MWAIT works right
now but this is how MONITOR/MWAIT could work well for spinlocks.

(if the cache is not MESI but MOESI then the second test has to be
spin_trylock() as well [which dirties the cacheline] - since in that
case there would be no guarantee that the spin_is_locked() read-only
test would invalidate the possibly dirty cacheline on the spinlock owner
CPU.)

	Ingo

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2004-09-13 12:18 UTC | newest]

Thread overview: 50+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-09-03  0:02 [PATCH][5/8] Arch agnostic completely out of line locks / ppc64 Zwane Mwaikambo
2004-09-09  5:42 ` Paul Mackerras
2004-09-09 12:41   ` Zwane Mwaikambo
2004-09-09 14:54     ` Linus Torvalds
2004-09-09 14:55       ` Linus Torvalds
2004-09-09 15:32         ` Zwane Mwaikambo
2004-09-12  4:59           ` [PATCH] Yielding processor resources during lock contention Zwane Mwaikambo
2004-09-12  5:00             ` Andrew Morton
2004-09-12  5:10               ` Anton Blanchard
2004-09-12  5:13               ` Zwane Mwaikambo
2004-09-12  4:32                 ` Nick Piggin
2004-09-12  5:33                   ` Zwane Mwaikambo
2004-09-12  4:46                     ` Nick Piggin
2004-09-12  5:51                       ` Zwane Mwaikambo
2004-09-12  5:02                         ` Nick Piggin
2004-09-12  6:09                           ` Anton Blanchard
2004-09-12  6:58                             ` William Lee Irwin III
2004-09-12  5:18                 ` Andrew Morton
2004-09-12  5:10             ` Linus Torvalds
2004-09-12  5:27               ` Zwane Mwaikambo
2004-09-13 12:21                 ` [PATCH] Yielding processor resources during lock contention #2 Zwane Mwaikambo
2004-09-12  7:49             ` [PATCH] Yielding processor resources during lock contention Ingo Molnar
2004-09-12 16:10               ` Zwane Mwaikambo
2004-09-12 19:34                 ` Ingo Molnar
2004-09-12 10:10             ` Arnd Bergmann
2004-09-12 10:43               ` Anton Blanchard
2004-09-12 11:12                 ` Arnd Bergmann
2004-09-09 15:43     ` [PATCH][5/8] Arch agnostic completely out of line locks / ppc64 Anton Blanchard
2004-09-09 17:19       ` William Lee Irwin III
2004-09-09 21:38         ` Paul Mackerras
2004-09-09 22:00           ` William Lee Irwin III
2004-09-09 23:36             ` Paul Mackerras
2004-09-10  0:09               ` William Lee Irwin III
2004-09-10  0:21                 ` Linus Torvalds
2004-09-10  0:35                   ` Anton Blanchard
2004-09-10  0:54                     ` Linus Torvalds
2004-09-10  1:42                       ` Anton Blanchard
2004-09-10  1:50                         ` Anton Blanchard
2004-09-10  2:22                           ` William Lee Irwin III
2004-09-10  2:32                             ` William Lee Irwin III
2004-09-10  2:37                               ` William Lee Irwin III
2004-09-10  3:24                             ` Anton Blanchard
2004-09-10  7:40                             ` Ingo Molnar
2004-09-10 12:16                               ` William Lee Irwin III
2004-09-10 12:22                                 ` Ingo Molnar
2004-09-10  3:23                       ` Anton Blanchard
2004-09-10  7:56                         ` Anton Blanchard
  -- strict thread matches above, loose matches on Subject: below --
2004-09-13  0:45 [PATCH] Yielding processor resources during lock contention Nakajima, Jun
2004-09-13  1:35 Nakajima, Jun
2004-09-13  6:35 ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox