public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* i386 spinlock fairness: bizarre test results
@ 2005-10-11  4:04 Chuck Ebbert
  2005-10-11  9:42 ` Eric Dumazet
  2005-10-11 13:00 ` Alan Cox
  0 siblings, 2 replies; 25+ messages in thread
From: Chuck Ebbert @ 2005-10-11  4:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux, Linus Torvalds, Kirill Korotaev

  After seeing Kirill's message about spinlocks I decided to do my own
testing with the userspace program below; the results were very strange.

  When using the 'mov' instruction to do the unlock I was able to reproduce
hogging of the spinlock by a single CPU even on Pentium II under some
conditions, while using 'xchg' always allowed the other CPU to get the
lock:


[me@d2 t]$ gcc -O2 -o spintest.o -Wall -DUSE_MOV=1 spintest.c ; ./spintest.o
parent CPU 1, child CPU 0, using mov instruction for unlock
parent did: 34534 of 10000001 iterations
CPU clocks:     2063591864

[me@d2 t]$ gcc -O2 -o spintest.o -Wall -DUSE_MOV=0 spintest.c ; ./spintest.o
parent CPU 1, child CPU 0, using xchg instruction for unlock
parent did: 5169760 of 10000001 iterations
CPU clocks:     2164689784


  The results were dependent on the alignment of the "lock ; decl" at the start
of the spinlock code.  If that 7-byte instruction spanned two cachelines then
the CPU with that code could somehow hog the spinlock.  Optimizing for
Pentium II forced 16-byte alignment and made the spinlock fairer, but still
somewhat biased when using 'mov':


[me@d2 t]$ gcc -O2 -o spintest.o -Wall -DUSE_MOV=1 -mcpu=pentium2 spintest.c ; ./spintest.o
parent CPU 1, child CPU 0, using mov instruction for unlock
parent did: 4181147 of 10000001 iterations
CPU clocks:     2057436825


  That test machine was a dual 350MHz Pentium II Xeon; on a dual 333MHz Pentium II
Overdrive (with very slow Socket 8 bus) I could not reproduce those results.
However, on that machine the 'xchg' instruction made the test run almost 20%
_faster_ than using 'mov'.

  So I think the i386 spinlock code should be changed to always use 'xchg' to do
spin_unlock.

================================================================================
/* spinlock test
 *
 * Tests spinlock fairness on SMP i386 machine.
 */

/* number of tests  */
#ifndef ITERS
#define ITERS 10000000
#endif

/* use "mov" instruction for spin_unlock instead of "xchg"  */
#ifndef USE_MOV
#define USE_MOV 1
#endif

/* cpu to run parent process; child will use !PARENT_CPU  */
#ifndef PARENT_CPU
#define PARENT_CPU 1
#endif

/* change this to match your version of glibc -- "2" means use 2 args */
#ifndef SETAFFINITY
#define SETAFFINITY 2
#endif

#if SETAFFINITY == 2
#define setaffinity(pid, mask) sched_setaffinity((pid), &(mask))
#else /* 3 args  */
#define setaffinity(pid, mask) sched_setaffinity((pid), sizeof(mask), &(mask))
#endif

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sched.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <asm/user.h>

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

#define RDTSCLL(r) asm volatile("rdtsc" : "=A" (r))

unsigned long long tsc1, tsc2;
cpu_set_t cpuset0, cpuset1;
int test = 1; /* for testing -- starts unlocked  */
int strt = 0; /* sync startup  -- starts locked  */
int stk[1024];
int iters, p_iters, c_iters;

static inline void __raw_spin_lock(int *l)
{
	asm volatile(
		"\n1:\t"
		"lock ; decl %0\n\t"
		"jns 3f\n"
		"2:\t"
		"rep;nop\n\t"
		"cmpl $0,%0\n\t"
		"jle 2b\n\t"
		"jmp 1b\n"
		"3:\n\t"
		:"=m" (*l) : : "memory");
}

#if USE_MOV

static inline void __raw_spin_unlock(int *l)
{
	asm volatile("movl $1,%0"
		     : "=m" (*l) : : "memory");
}

#else

static inline void __raw_spin_unlock(int *l)
{
	int oldval = 1;

	asm volatile("xchgl %0,%1"
		     : "=q" (oldval), "=m" (*l)
		     : "0" (oldval) : "memory");
}

#endif /* USE_MOV  */

int do_child(void *vp)
{
	CPU_SET(!PARENT_CPU, &cpuset1);
	if (unlikely(setaffinity(0, cpuset1)))
		perror("child setaffinity");

	/* Add "nop" insns as necessary to make 1st
	 * insn of __raw_spin_lock span cachelines.
	 */
	asm("nop ; nop ; nop");

	__raw_spin_unlock(&strt); /* release parent  */
again:
	__raw_spin_lock(&test);
	c_iters++;
	if (likely(++iters < ITERS)) {
		__raw_spin_unlock(&test);
		goto again;
	}
	__raw_spin_unlock(&test);

	return 0;
}

int main(int argc, char * const argv[])
{
	CPU_SET(PARENT_CPU, &cpuset0);
	if (unlikely(setaffinity(0, cpuset0)))
		perror("parent setaffinity");

	clone(do_child, &stk[1023], CLONE_VM, 0);

	__raw_spin_lock(&strt); /* wait for child to init  */
	RDTSCLL(tsc1);
again:
	__raw_spin_lock(&test);
	p_iters++;
	if (likely(++iters < ITERS)) {
		__raw_spin_unlock(&test);
		goto again;
	}
	__raw_spin_unlock(&test);
	RDTSCLL(tsc2);

	printf("parent CPU %d, child CPU %d, ", PARENT_CPU, !PARENT_CPU);
	printf("using %s instruction for unlock\n", USE_MOV ? "mov" : "xchg");
	printf("parent did: %d of %d iterations\n", p_iters, iters);
	printf("CPU clocks: %14llu\n", tsc2 - tsc1);

	return 0;
}
--
Chuck

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: i386 spinlock fairness: bizarre test results
       [not found] <4WjCM-7Aq-7@gated-at.bofh.it>
@ 2005-10-11  4:32 ` Robert Hancock
  2005-10-11 12:31   ` Joe Seigh
  0 siblings, 1 reply; 25+ messages in thread
From: Robert Hancock @ 2005-10-11  4:32 UTC (permalink / raw)
  To: linux-kernel

Chuck Ebbert wrote:
>   After seeing Kirill's message about spinlocks I decided to do my own
> testing with the userspace program below; the results were very strange.
> 
>   When using the 'mov' instruction to do the unlock I was able to reproduce
> hogging of the spinlock by a single CPU even on Pentium II under some
> conditions, while using 'xchg' always allowed the other CPU to get the
> lock:

This might not necessarily be a win in all situations.  If two CPUs A and
B are trying to get into a spinlock-protected critical section to do 5
operations, it may well be more efficient for them to do AAAAABBBBB as
opposed to ABABABABAB, since the second pattern may result in the cache
lines bouncing between the two CPUs on every handoff.

I don't know that making spinlocks "fairer" is really very worthwhile. 
If some spinlocks are so heavily contended that fairness becomes an 
issue, it would be better to find a way to reduce that contention.

-- 
Robert Hancock      Saskatoon, SK, Canada
To email, remove "nospam" from hancockr@nospamshaw.ca
Home Page: http://www.roberthancock.com/


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: i386 spinlock fairness: bizarre test results
  2005-10-11  4:04 i386 spinlock fairness: bizarre test results Chuck Ebbert
@ 2005-10-11  9:42 ` Eric Dumazet
  2005-10-11 13:00 ` Alan Cox
  1 sibling, 0 replies; 25+ messages in thread
From: Eric Dumazet @ 2005-10-11  9:42 UTC (permalink / raw)
  To: Chuck Ebbert; +Cc: linux-kernel, linux, Linus Torvalds, Kirill Korotaev

Chuck Ebbert a écrit :
>   After seeing Kirill's message about spinlocks I decided to do my own
> testing with the userspace program below; the results were very strange.
> 
>   When using the 'mov' instruction to do the unlock I was able to reproduce
> hogging of the spinlock by a single CPU even on Pentium II under some
> conditions, while using 'xchg' always allowed the other CPU to get the
> lock:
> 
> 
> parent CPU 1, child CPU 0, using mov instruction for unlock
> parent did: 34534 of 10000001 iterations
> CPU clocks:     2063591864
> 
> parent CPU 1, child CPU 0, using xchg instruction for unlock
> parent did: 5169760 of 10000001 iterations
> CPU clocks:     2164689784
> 
Hi Chuck

That's interesting, because on my machines I get the opposite results:

On a Xeon 2.0 GHz with Hyper-Threading, I get better results (in the sense of 
fairness) with the MOV version:

    parent CPU 1, child CPU 0, using mov instruction for unlock
    parent did: 5291346 of 10000001 iterations
    CPU clocks:     3.437.053.244

    parent CPU 1, child CPU 0, using xchg instruction for unlock
    parent did: 7732349 of 10000001 iterations
    CPU clocks:     3.408.285.308


On a dual Opteron 248 (2.2GHz) machine, I get bad fairness results regardless 
of MOV/XCHG (but fewer cycles per iteration).  There is a lot of variation in 
the results (maybe because of NUMA effects?), but XCHG gives slightly better 
fairness:

    parent CPU 1, child CPU 0, using mov instruction for unlock
    parent did: 256810 of 10000001 iterations
    CPU clocks:      772.838.640

    parent CPU 1, child CPU 0, using xchg instruction for unlock
    parent did: 438280 of 10000001 iterations
    CPU clocks:     1.115.653.346

    parent CPU 1, child CPU 0, using xchg instruction for unlock
    parent did: 574501 of 10000001 iterations
    CPU clocks:     1.200.129.428


On a dual-core Opteron 275 (2.2GHz), XCHG is faster but unfair.

(threads running on the same physical CPU)
    parent CPU 1, child CPU 0, using mov instruction for unlock
    parent did: 4822270 of 10000001 iterations
    CPU clocks:      738.438.383

    parent CPU 1, child CPU 0, using xchg instruction for unlock
    parent did: 9702075 of 10000001 iterations
    CPU clocks:      561.457.724

Totally different results if the affinity is changed so that the threads run
on different physical CPUs (XCHG is slower):

    parent CPU 0, child CPU 2, using mov instruction for unlock
    parent did: 1081522 of 10000001 iterations
    CPU clocks:      508.611.273

    parent CPU 0, child CPU 2, using xchg instruction for unlock
    parent did: 4310427 of 10000000 iterations
    CPU clocks:     1.074.170.246



For reference, if only one thread is running, the MOV version is also faster 
on both platforms:

[Xeon 2GHz]
    one thread, using mov instruction for unlock
    10000000 iterations
    CPU clocks:     1.278.879.528

[Xeon 2GHz]
    one thread, using xchg instruction for unlock
    10000000 iterations
    CPU clocks:     2.486.912.752

Of course Opterons are faster :) (fewer cycles per iteration)

[Opteron 248]
    one thread, using mov instruction for unlock
    10000000 iterations
    CPU clocks:      212.514.637

[Opteron 248]
    one thread, using xchg instruction for unlock
    10000000 iterations
    CPU clocks:      383.306.420

[Opteron 275]
    one thread, using mov instruction for unlock
    10000000 iterations
    CPU clocks:      208.472.009

[Opteron 275]
    one thread, using xchg instruction for unlock
    10000000 iterations
    CPU clocks:      417.502.675

In conclusion, I would say that for uncontended locks, MOV is faster.  For 
contended locks, XCHG might be faster on some platforms (dual-core Opterons, 
but only when both threads are on the same physical CPU).

Eric


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: i386 spinlock fairness: bizarre test results
  2005-10-11  4:32 ` i386 spinlock fairness: bizarre test results Robert Hancock
@ 2005-10-11 12:31   ` Joe Seigh
  2005-10-11 21:01     ` Bill Davidsen
  0 siblings, 1 reply; 25+ messages in thread
From: Joe Seigh @ 2005-10-11 12:31 UTC (permalink / raw)
  To: linux-kernel

Robert Hancock wrote:
> Chuck Ebbert wrote:
> 
>>   After seeing Kirill's message about spinlocks I decided to do my own
>> testing with the userspace program below; the results were very strange.
>>
>>   When using the 'mov' instruction to do the unlock I was able to 
>> reproduce
>> hogging of the spinlock by a single CPU even on Pentium II under some
>> conditions, while using 'xchg' always allowed the other CPU to get the
>> lock:
> 
> 
> This might not necessarily be a win in all situations. If two CPUs A and 
>  B are trying to get into a spinlock-protected critical section to do 5 
> operations, it may well be more efficient for them to do AAAAABBBBB as 
> opposed to ABABABABAB, as the second situation may result in cache lines 
> bouncing between the two CPUs each time, etc.
> 
> I don't know that making spinlocks "fairer" is really very worthwhile. 
> If some spinlocks are so heavily contended that fairness becomes an 
> issue, it would be better to find a way to reduce that contention.
> 

You're right that it wouldn't be an issue on a system with relatively few
CPUs, since that amount of contention would cripple the system.  Though
with hundreds of CPUs you could get contention hotspots, with some spinlocks
being concurrently accessed by some subset of the CPUs for periods of time.

The real issue is scalability, or how gracefully a system degrades
when it starts to hit its contention limits.  It's not a good thing when
a system appears to run fine and then catastrophically hangs when it
bumps across its critical limit.  It's better when a system exhibits
some sort of linear degradation.  The former shows bistable behavior,
which requires a drastic, probably impossible, reduction in workload
to regain normal performance; reboots are the normal course of correction.
Linearly degrading systems just require moderation of the workload
to move back into acceptable performance.

Anyway, if you want to build a scalable system, it makes sense to build it
out of scalable components.  Right?

Joe Seigh



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: i386 spinlock fairness: bizarre test results
  2005-10-11  4:04 i386 spinlock fairness: bizarre test results Chuck Ebbert
  2005-10-11  9:42 ` Eric Dumazet
@ 2005-10-11 13:00 ` Alan Cox
  2005-10-11 14:44   ` Linus Torvalds
  1 sibling, 1 reply; 25+ messages in thread
From: Alan Cox @ 2005-10-11 13:00 UTC (permalink / raw)
  To: Chuck Ebbert; +Cc: linux-kernel, linux, Linus Torvalds, Kirill Korotaev

On Maw, 2005-10-11 at 00:04 -0400, Chuck Ebbert wrote:
>   That test machine was a dual 350MHz Pentium II Xeon; on a dual 333MHz Pentium II
> Overdrive (with very slow Socket 8 bus) I could not reproduce those results.
> However, on that machine the 'xchg' instruction made the test run almost 20%
> _faster_ than using 'mov'.
> 
>   So I think the i386 spinlock code should be changed to always use 'xchg' to do
> spin_unlock.


Using xchg on the spin unlock path is expensive, and really expensive on P4
compared to movb.  It also doesn't guarantee anything either way,
especially as you go to four cores or change CPUs (or in some cases quite
likely even chipsets).

Spin lock paths should not be heavily contended.  If they are, then fix
the underlying problem with finer-grained locking, or if you can't do that,
then perhaps by serializing access with a queue.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: i386 spinlock fairness: bizarre test results
  2005-10-11 13:00 ` Alan Cox
@ 2005-10-11 14:44   ` Linus Torvalds
  2005-10-11 15:32     ` [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits Eric Dumazet
  0 siblings, 1 reply; 25+ messages in thread
From: Linus Torvalds @ 2005-10-11 14:44 UTC (permalink / raw)
  To: Alan Cox; +Cc: Chuck Ebbert, linux-kernel, linux, Kirill Korotaev



On Tue, 11 Oct 2005, Alan Cox wrote:

> On Maw, 2005-10-11 at 00:04 -0400, Chuck Ebbert wrote:
> >   That test machine was a dual 350MHz Pentium II Xeon; on a dual 333MHz Pentium II
> > Overdrive (with very slow Socket 8 bus) I could not reproduce those results.
> > However, on that machine the 'xchg' instruction made the test run almost 20%
> > _faster_ than using 'mov'.
> > 
> >   So I think the i386 spinlock code should be changed to always use 'xchg' to do
> > spin_unlock.
> 
> 
> Using xchg on the spin unlock path is expensive. Really expensive on P4
> compared to movb. It also doesn't guarantee anything either way around
> especially as you go to four cores or change CPU (or in some cases quite
> likely even chipset).

Indeed.

I suspect that the behaviour Chuck saw is (a) only present under 
contention and (b) very much dependent on other timing issues.

(a) is the wrong thing to optimize for, and (b) means that Chuck's numbers 
aren't reliable anyway (as shown by the fact that things like instruction 
alignment matter, and by Eric's numbers on other machines).

We want the spinlocks to behave well when they are _not_ under heavy 
contention. If a spinlock gets so much contention that it starts having 
these kinds of issues, then there's something wrong at higher levels, and 
the fix is to use a different algorithm, or use a different kind of lock.

Spinlocks by definition are the _simplest_ locks there are. Not the 
smartest or most fair. Trying to make them anything else is kind of 
missing the whole point of them.

			Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-11 14:44   ` Linus Torvalds
@ 2005-10-11 15:32     ` Eric Dumazet
  2005-10-11 16:03       ` Linus Torvalds
  0 siblings, 1 reply; 25+ messages in thread
From: Eric Dumazet @ 2005-10-11 15:32 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux, Kirill Korotaev

[-- Attachment #1: Type: text/plain, Size: 351 bytes --]

As NR_CPUS might be > 128, and every spinning CPU decrements the lock, we need 
to use more than 8 bits for a spinlock.  The current (i386/x86_64) 
implementations have a (theoretical) bug in this area.
As the allocated space for spinlock_t is 32 bits, let's use the full 32 bits 
and let the CPUs fight :)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>


[-- Attachment #2: i386_spinlock --]
[-- Type: text/plain, Size: 2056 bytes --]

--- linux-2.6.14-rc4/include/asm-i386/spinlock.h	2005-10-11 03:19:19.000000000 +0200
+++ linux-2.6.14-rc4-ed/include/asm-i386/spinlock.h	2005-10-11 19:19:27.000000000 +0200
@@ -18,23 +18,24 @@
  * (the type definitions are in asm/spinlock_types.h)
  */
 
+
 #define __raw_spin_is_locked(x) \
-		(*(volatile signed char *)(&(x)->slock) <= 0)
+		(*(volatile int *)(&(x)->slock) <= 0)
 
 #define __raw_spin_lock_string \
 	"\n1:\t" \
-	"lock ; decb %0\n\t" \
+	"lock ; decl %0\n\t" \
 	"jns 3f\n" \
 	"2:\t" \
 	"rep;nop\n\t" \
-	"cmpb $0,%0\n\t" \
+	"cmpl $0,%0\n\t" \
 	"jle 2b\n\t" \
 	"jmp 1b\n" \
 	"3:\n\t"
 
 #define __raw_spin_lock_string_flags \
 	"\n1:\t" \
-	"lock ; decb %0\n\t" \
+	"lock ; decl %0\n\t" \
 	"jns 4f\n\t" \
 	"2:\t" \
 	"testl $0x200, %1\n\t" \
@@ -42,7 +43,7 @@
 	"sti\n\t" \
 	"3:\t" \
 	"rep;nop\n\t" \
-	"cmpb $0, %0\n\t" \
+	"cmpl $0, %0\n\t" \
 	"jle 3b\n\t" \
 	"cli\n\t" \
 	"jmp 1b\n" \
@@ -64,9 +65,10 @@
 
 static inline int __raw_spin_trylock(raw_spinlock_t *lock)
 {
-	char oldval;
+	int oldval;
+	BUILD_BUG_ON(sizeof(lock->slock) != sizeof(oldval));
 	__asm__ __volatile__(
-		"xchgb %b0,%1"
+		"xchgl %0,%1"
 		:"=q" (oldval), "=m" (lock->slock)
 		:"0" (0) : "memory");
 	return oldval > 0;
@@ -75,14 +77,14 @@
 /*
  * __raw_spin_unlock based on writing $1 to the low byte.
  * This method works. Despite all the confusion.
- * (except on PPro SMP or if we are using OOSTORE, so we use xchgb there)
+ * (except on PPro SMP or if we are using OOSTORE, so we use xchgl there)
  * (PPro errata 66, 92)
  */
 
 #if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
 
 #define __raw_spin_unlock_string \
-	"movb $1,%0" \
+	"movl $1,%0" \
 		:"=m" (lock->slock) : : "memory"
 
 
@@ -96,13 +98,13 @@
 #else
 
 #define __raw_spin_unlock_string \
-	"xchgb %b0, %1" \
+	"xchgl %0, %1" \
 		:"=q" (oldval), "=m" (lock->slock) \
 		:"0" (oldval) : "memory"
 
 static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
-	char oldval = 1;
+	int oldval = 1;
 
 	__asm__ __volatile__(
 		__raw_spin_unlock_string

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-11 15:32     ` [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits Eric Dumazet
@ 2005-10-11 16:03       ` Linus Torvalds
  2005-10-11 16:36         ` Eric Dumazet
  2005-10-11 17:59         ` Andi Kleen
  0 siblings, 2 replies; 25+ messages in thread
From: Linus Torvalds @ 2005-10-11 16:03 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: linux-kernel, linux, Kirill Korotaev



On Tue, 11 Oct 2005, Eric Dumazet wrote:
>
> As NR_CPUS might be > 128, and every spinning CPU decrements the lock, we need
> to use more than 8 bits for a spinlock. The current (i386/x86_64)
> implementations have a (theoretical) bug in this area.

I don't think there are any x86 machines with > 128 CPU's right now.

The advantage of the byte lock is that a "movb $1" is three bytes shorter 
than a "movl $1". And that's the unlock sequence.

		Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-11 16:03       ` Linus Torvalds
@ 2005-10-11 16:36         ` Eric Dumazet
  2005-10-11 16:54           ` Linus Torvalds
  2005-10-17  7:03           ` Andrew Morton
  2005-10-11 17:59         ` Andi Kleen
  1 sibling, 2 replies; 25+ messages in thread
From: Eric Dumazet @ 2005-10-11 16:36 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel

Linus Torvalds a écrit :
> 
> On Tue, 11 Oct 2005, Eric Dumazet wrote:
> 
>>As NR_CPUS might be > 128, and every spinning CPU decrements the lock, we need
>>to use more than 8 bits for a spinlock. The current (i386/x86_64)
>>implementations have a (theoretical) bug in this area.
> 
> 
> I don't think there are any x86 machines with > 128 CPU's right now.
> 
> The advantage of the byte lock is that a "movb $1" is three bytes shorter 
> than a "movl $1". And that's the unlock sequence.

1) Would you prefer to change arch/i386/Kconfig

config NR_CPUS
     int "Maximum number of CPUs (2-255)"
     range 2 255

2) The unlock sequence is no longer inlined.  It appears only two or three 
times in the kernel.

3) The i386 code is often taken as the base when a port is done.  For example, 
x86_64 has the same problem.

Eric

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-11 16:36         ` Eric Dumazet
@ 2005-10-11 16:54           ` Linus Torvalds
  2005-10-11 16:55             ` Linus Torvalds
  2005-10-17  7:03           ` Andrew Morton
  1 sibling, 1 reply; 25+ messages in thread
From: Linus Torvalds @ 2005-10-11 16:54 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: linux-kernel



On Tue, 11 Oct 2005, Eric Dumazet wrote:
> 
> 2) The unlock sequence is no longer inlined. It appears only two or three times
> in the kernel.

Ahh, that (2) is the killer, I'd totally forgotten.

Ok, the patch is valid, no arguments.

		Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-11 16:54           ` Linus Torvalds
@ 2005-10-11 16:55             ` Linus Torvalds
  0 siblings, 0 replies; 25+ messages in thread
From: Linus Torvalds @ 2005-10-11 16:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: linux-kernel



On Tue, 11 Oct 2005, Linus Torvalds wrote:
> 
> Ok, the patch is valid, no arguments.

That said.. It's not like it's critical. So can you please re-send after 
2.6.14 to remind me?

		Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-11 16:03       ` Linus Torvalds
  2005-10-11 16:36         ` Eric Dumazet
@ 2005-10-11 17:59         ` Andi Kleen
  1 sibling, 0 replies; 25+ messages in thread
From: Andi Kleen @ 2005-10-11 17:59 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux, Kirill Korotaev, dada1

Linus Torvalds <torvalds@osdl.org> writes:

> On Tue, 11 Oct 2005, Eric Dumazet wrote:
> >
> > As NR_CPUS might be > 128, and every spinning CPU decrements the lock, we need
> > to use more than 8 bits for a spinlock. The current (i386/x86_64)
> > implementations have a (theoretical) bug in this area.

I already queued a patch for x86-64 for this for post 2.6.14
following earlier complaints from Eric.

IIRC, on 32-bit there are other issues, though, like the limited
number of reader-lock users.

But then I'm not sure why anybody would want to run a 32-bit
kernel on such a big machine ...

> I don't think there are any x86 machines with > 128 CPU's right now.

Someone at Unisys just needs to put dual-core, hyper-threaded CPUs
into their 64-socket systems, and then you'll have a 256-CPU x86 machine
(which is currently the maximum the APICs can take anyway...). 
With multicore being on everybody's roadmap, we'll probably see 
such big systems soon.

-Andi

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: i386 spinlock fairness: bizarre test results
  2005-10-11 12:31   ` Joe Seigh
@ 2005-10-11 21:01     ` Bill Davidsen
  0 siblings, 0 replies; 25+ messages in thread
From: Bill Davidsen @ 2005-10-11 21:01 UTC (permalink / raw)
  To: Joe Seigh, Linux Kernel Mailing List

Joe Seigh wrote:
> Robert Hancock wrote:
> 
>> Chuck Ebbert wrote:
>>
>>>   After seeing Kirill's message about spinlocks I decided to do my own
>>> testing with the userspace program below; the results were very strange.
>>>
>>>   When using the 'mov' instruction to do the unlock I was able to 
>>> reproduce
>>> hogging of the spinlock by a single CPU even on Pentium II under some
>>> conditions, while using 'xchg' always allowed the other CPU to get the
>>> lock:
>>
>>
>>
>> This might not necessarily be a win in all situations. If two CPUs A 
>> and  B are trying to get into a spinlock-protected critical section to 
>> do 5 operations, it may well be more efficient for them to do 
>> AAAAABBBBB as opposed to ABABABABAB, as the second situation may 
>> result in cache lines bouncing between the two CPUs each time, etc.
>>
>> I don't know that making spinlocks "fairer" is really very worthwhile. 
>> If some spinlocks are so heavily contended that fairness becomes an 
>> issue, it would be better to find a way to reduce that contention.
>>
> 
> You're right that it wouldn't be an issue on a system with relatively few
> cpu's since that amount of contention would cripple the system.  Though
> with 100's of cpu's you could get contention hotspots with some spin locks
> being concurrently accessed by some subset of the cpu's for periods of 
> time.
> 
> The real issue is scalability or how gracefully does a system degrade
> when it starts to hit its contention limits.  It's not a good thing when
> a system appears to run fine and then catastrophically hangs when it
> bumps across its critical limit.  It's better when a system exhibits
> some sort of linear degradation.  The former exhibits bistable behavior
> which requires a drastic, probably impossible, reduction in work load
> to regain normal performance.  Reboots are the normal course of correction.
> The linearly degrading systems just require moderation of the workload
> to move back into acceptable performance.
> 
> Anyway, if you want to build a scalable system, it makes sense to build it
> out of scalable components.  Right?
> 
Joe, I totally agree. That type of non-linear performance is sometimes 
called a "jackpot condition," and you see it in various algorithms. The 
comment that depending on fairness is poor design is correct, but 
because loads may grow vastly higher than the original design anticipated, 
it's sometimes not obvious that there could be high contention.

-- 
    -bill davidsen (davidsen@tmr.com)
"The secret to procrastination is to put things off until the
  last possible moment - but no longer"  -me

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-11 16:36         ` Eric Dumazet
  2005-10-11 16:54           ` Linus Torvalds
@ 2005-10-17  7:03           ` Andrew Morton
  2005-10-17  7:20             ` Arjan van de Ven
  1 sibling, 1 reply; 25+ messages in thread
From: Andrew Morton @ 2005-10-17  7:03 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: torvalds, linux-kernel, Ingo Molnar

Eric Dumazet <dada1@cosmosbay.com> wrote:
>
>  2) The unlock sequence is not anymore inlined. It appears twice or three times 
>  in the kernel.

Is that intentional though?  With <random .config> my mm/swapfile.i has an
unreferenced

static inline void __raw_spin_unlock(raw_spinlock_t *lock)
{
	__asm__ __volatile__(
		"movb $1,%0" :"=m" (lock->slock) : : "memory" 
	);
}

which either a) shouldn't be there or b) should be referenced.

Ingo, can you confirm that x86's spin_unlock is never inlined?  If so,
what's my __raw_spin_unlock() doing there?


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-17  7:03           ` Andrew Morton
@ 2005-10-17  7:20             ` Arjan van de Ven
  2005-10-20 21:50               ` Ingo Molnar
  0 siblings, 1 reply; 25+ messages in thread
From: Arjan van de Ven @ 2005-10-17  7:20 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Eric Dumazet, torvalds, linux-kernel, Ingo Molnar

On Mon, 2005-10-17 at 00:03 -0700, Andrew Morton wrote:
> Eric Dumazet <dada1@cosmosbay.com> wrote:
> >
> >  2) The unlock sequence is not anymore inlined. It appears twice or three times 
> >  in the kernel.
> 
> Is that intentional though?  With <random .config> my mm/swapfile.i has an
> unreferenced
> 
> static inline void __raw_spin_unlock(raw_spinlock_t *lock)
> {
> 	__asm__ __volatile__(
> 		"movb $1,%0" :"=m" (lock->slock) : : "memory" 
> 	);
> }
> 
> which either a) shouldn't be there or b) should be referenced.
> 
> Ingo, can you confirm that x86's spin_unlock is never inlined?  If so,
> what's my __raw_spin_unlock() doing there?

I would really want this one inlined! 
A movb is a much shorter code sequence than a call (especially if you factor
in argument setup).  De-inlining to save space is nice and all, but it can
go too far...




^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-17  7:20             ` Arjan van de Ven
@ 2005-10-20 21:50               ` Ingo Molnar
  2005-10-20 21:57                 ` Linus Torvalds
  0 siblings, 1 reply; 25+ messages in thread
From: Ingo Molnar @ 2005-10-20 21:50 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: Andrew Morton, Eric Dumazet, torvalds, linux-kernel


* Arjan van de Ven <arjan@infradead.org> wrote:

> > Is that intentional though?  With <random .config> my mm/swapfile.i has an
> > unreferenced
> > 
> > static inline void __raw_spin_unlock(raw_spinlock_t *lock)
> > {
> > 	__asm__ __volatile__(
> > 		"movb $1,%0" :"=m" (lock->slock) : : "memory" 
> > 	);
> > }
> > 
> > which either a) shouldn't be there or b) should be referenced.
> > 
> > Ingo, can you confirm that x86's spin_unlock is never inlined?  If so,
> > what's my __raw_spin_unlock() doing there?

__raw_spin_unlock is currently only inlined in the kernel/spinlock.c 
code.

> I would really want this one inlined!  A movb is a much shorter code 
> sequence than a call (esp if you factor in argument setup). 
> De-inlining to save space is nice and all, but it can go too far....

yeah, it makes sense to inline the single-instruction unlock operations: 
nondebug spin_unlock(), read_unlock() and write_unlock(). This gives a 
0.2% code-size reduction:

    text    data     bss     dec     hex filename
 4072031  858208  387196 5317435  51233b vmlinux-smp-uninlined
 4060671  858212  387196 5306079  50f6df vmlinux-smp-inlined

patch against -rc5. Boot-tested on a 4-way x86 SMP box.

	Ingo

Signed-off-by: Ingo Molnar <mingo@elte.hu>

 include/linux/spinlock.h |   15 ++++++++++++---
 1 files changed, 12 insertions(+), 3 deletions(-)

Index: linux/include/linux/spinlock.h
===================================================================
--- linux.orig/include/linux/spinlock.h
+++ linux/include/linux/spinlock.h
@@ -171,9 +171,18 @@ extern int __lockfunc generic__raw_read_
 #define write_lock_irq(lock)		_write_lock_irq(lock)
 #define write_lock_bh(lock)		_write_lock_bh(lock)
 
-#define spin_unlock(lock)		_spin_unlock(lock)
-#define write_unlock(lock)		_write_unlock(lock)
-#define read_unlock(lock)		_read_unlock(lock)
+/*
+ * We inline the unlock functions in the nondebug case:
+ */
+#ifdef CONFIG_DEBUG_SPINLOCK
+# define spin_unlock(lock)		_spin_unlock(lock)
+# define read_unlock(lock)		_read_unlock(lock)
+# define write_unlock(lock)		_write_unlock(lock)
+#else
+# define spin_unlock(lock)		__raw_spin_unlock(&(lock)->raw_lock)
+# define read_unlock(lock)		__raw_read_unlock(&(lock)->raw_lock)
+# define write_unlock(lock)		__raw_write_unlock(&(lock)->raw_lock)
+#endif
 
 #define spin_unlock_irqrestore(lock, flags) \
 					_spin_unlock_irqrestore(lock, flags)

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-20 21:50               ` Ingo Molnar
@ 2005-10-20 21:57                 ` Linus Torvalds
  2005-10-20 22:02                   ` Ingo Molnar
  0 siblings, 1 reply; 25+ messages in thread
From: Linus Torvalds @ 2005-10-20 21:57 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Arjan van de Ven, Andrew Morton, Eric Dumazet, linux-kernel



On Thu, 20 Oct 2005, Ingo Molnar wrote:
>
> +/*
> + * We inline the unlock functions in the nondebug case:
> + */
> +#ifdef CONFIG_DEBUG_SPINLOCK

That can't be right. What about preemption etc?

There's a lot more to spin_unlock() than just the debugging stuff.

		Linus


* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-20 21:57                 ` Linus Torvalds
@ 2005-10-20 22:02                   ` Ingo Molnar
  2005-10-20 22:15                     ` Linus Torvalds
  0 siblings, 1 reply; 25+ messages in thread
From: Ingo Molnar @ 2005-10-20 22:02 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Arjan van de Ven, Andrew Morton, Eric Dumazet, linux-kernel


* Linus Torvalds <torvalds@osdl.org> wrote:

> On Thu, 20 Oct 2005, Ingo Molnar wrote:
> >
> > +/*
> > + * We inline the unlock functions in the nondebug case:
> > + */
> > +#ifdef CONFIG_DEBUG_SPINLOCK
> 
> That can't be right. What about preemption etc?
> 
> There's a lot more to spin_unlock() than just the debugging stuff.

the unlock is simple even in the preemption case - it's the _lock that 
gets complicated there. Once there's some attachment to the unlock 
operation (irq restore, or bh enabling) it again makes sense to keep 
things uninlined, but for the specific case of the simple-unlocks, it's 
a 0.2% space win to not inline - mostly from reduced clobbering of %eax, 
%ecx, %edx. Should be less of a win on 64-bit CPUs with enough 
registers.

	Ingo


* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-20 22:02                   ` Ingo Molnar
@ 2005-10-20 22:15                     ` Linus Torvalds
  2005-10-20 22:27                       ` Ingo Molnar
  0 siblings, 1 reply; 25+ messages in thread
From: Linus Torvalds @ 2005-10-20 22:15 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Arjan van de Ven, Andrew Morton, Eric Dumazet, linux-kernel



On Fri, 21 Oct 2005, Ingo Molnar wrote:
> 
> the unlock is simple even in the preemption case

No it's not. It needs to decrement the preemption counter and test it.

See kernel/spinlock.c:

	void __lockfunc _spin_unlock(spinlock_t *lock)
	{
	        _raw_spin_unlock(lock);
	        preempt_enable();
	}
	EXPORT_SYMBOL(_spin_unlock);

and look at what "preempt_enable()" does.

In other words, with CONFIG_PREEMPT, your patch is WRONG. You made 
"spin_unlock()" just skip the preempt_enable.

In fact, with preemption, the _locking_ is the simpler part. Unlock is the 
complex one.


			Linus


* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-20 22:15                     ` Linus Torvalds
@ 2005-10-20 22:27                       ` Ingo Molnar
  2005-10-20 22:44                         ` Andrew Morton
  0 siblings, 1 reply; 25+ messages in thread
From: Ingo Molnar @ 2005-10-20 22:27 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Arjan van de Ven, Andrew Morton, Eric Dumazet, linux-kernel


* Linus Torvalds <torvalds@osdl.org> wrote:

> 
> 
> On Fri, 21 Oct 2005, Ingo Molnar wrote:
> > 
> > the unlock is simple even in the preemption case
> 
> No it's not. It needs to decrement the preemption counter and test it.

arghhh. Right you are. I even had the proper solution coded up initially 
(i had || defined(CONFIG_PREEMPT)) but dropped it due to fatal brain 
error. Been hacking on way too much ktimer code today ... Hopefully 
better patch attached. Booted on SMP+PREEMPT x86. But i'd really advise 
you against taking patches from me tonight :-|

	Ingo

Signed-off-by: Ingo Molnar <mingo@elte.hu>

 include/linux/spinlock.h |   15 ++++++++++++---
 1 files changed, 12 insertions(+), 3 deletions(-)

Index: linux/include/linux/spinlock.h
===================================================================
--- linux.orig/include/linux/spinlock.h
+++ linux/include/linux/spinlock.h
@@ -171,9 +171,18 @@ extern int __lockfunc generic__raw_read_
 #define write_lock_irq(lock)		_write_lock_irq(lock)
 #define write_lock_bh(lock)		_write_lock_bh(lock)
 
-#define spin_unlock(lock)		_spin_unlock(lock)
-#define write_unlock(lock)		_write_unlock(lock)
-#define read_unlock(lock)		_read_unlock(lock)
+/*
+ * We inline the unlock functions in the nondebug case:
+ */
+#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
+# define spin_unlock(lock)		_spin_unlock(lock)
+# define read_unlock(lock)		_read_unlock(lock)
+# define write_unlock(lock)		_write_unlock(lock)
+#else
+# define spin_unlock(lock)		__raw_spin_unlock(&(lock)->raw_lock)
+# define read_unlock(lock)		__raw_read_unlock(&(lock)->raw_lock)
+# define write_unlock(lock)		__raw_write_unlock(&(lock)->raw_lock)
+#endif
 
 #define spin_unlock_irqrestore(lock, flags) \
 					_spin_unlock_irqrestore(lock, flags)


* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-20 22:27                       ` Ingo Molnar
@ 2005-10-20 22:44                         ` Andrew Morton
  2005-10-20 22:53                           ` Ingo Molnar
  0 siblings, 1 reply; 25+ messages in thread
From: Andrew Morton @ 2005-10-20 22:44 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: torvalds, arjan, dada1, linux-kernel

Ingo Molnar <mingo@elte.hu> wrote:
>
> +/*
>  + * We inline the unlock functions in the nondebug case:
>  + */
>  +#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
>  +# define spin_unlock(lock)		_spin_unlock(lock)
>  +# define read_unlock(lock)		_read_unlock(lock)
>  +# define write_unlock(lock)		_write_unlock(lock)
>  +#else
>  +# define spin_unlock(lock)		__raw_spin_unlock(&(lock)->raw_lock)
>  +# define read_unlock(lock)		__raw_read_unlock(&(lock)->raw_lock)
>  +# define write_unlock(lock)		__raw_write_unlock(&(lock)->raw_lock)
>  +#endif
>   

spin_lock is still uninlined.

static inline __attribute__((always_inline)) struct dentry *dget_parent(struct dentry *dentry)
{
 struct dentry *ret;

 _spin_lock(&dentry->d_lock);
 ret = dget(dentry->d_parent);
 __raw_spin_unlock(&(&dentry->d_lock)->raw_lock);
 return ret;
}

as is spin_lock_irqsave() and spin_lock_irq()

uninlining spin_lock will probably increase overall text size, but mainly in
the out-of-line section.

<looks>

we removed the out-of-line section :(


read_lock is out-of-line.   read_unlock is inlined

write_lock is out-of-line.  write_unlock is out-of-line.


Needs work ;)


* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-20 22:44                         ` Andrew Morton
@ 2005-10-20 22:53                           ` Ingo Molnar
  2005-10-20 23:01                             ` Andrew Morton
  0 siblings, 1 reply; 25+ messages in thread
From: Ingo Molnar @ 2005-10-20 22:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: torvalds, arjan, dada1, linux-kernel


* Andrew Morton <akpm@osdl.org> wrote:

> spin_lock is still uninlined.

yes, and that should stay so i believe, for text size reasons. The BTB 
should eliminate most effects of the call+ret itself.

> as is spin_lock_irqsave() and spin_lock_irq()

yes, for them the code length is even higher.

> uninlining spin_lock will probably increase overall text size, but 
> mainly in the out-of-line section.

you mean inlining it again? I don't think we should do it.

> read_lock is out-of-line.  read_unlock is inlined
> 
> write_lock is out-of-line.  write_unlock is out-of-line.

hm, with my patch, write_unlock should be inlined too.

	Ingo


* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-20 22:53                           ` Ingo Molnar
@ 2005-10-20 23:01                             ` Andrew Morton
  2005-10-20 23:26                               ` Ingo Molnar
  0 siblings, 1 reply; 25+ messages in thread
From: Andrew Morton @ 2005-10-20 23:01 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: torvalds, arjan, dada1, linux-kernel

Ingo Molnar <mingo@elte.hu> wrote:
>
> 
> * Andrew Morton <akpm@osdl.org> wrote:
> 
> > spin_lock is still uninlined.
> 
> yes, and that should stay so i believe, for text size reasons. The BTB 
> should eliminate most effects of the call+ret itself.

The old

	lock; decb
	js <different section>
	...

was pretty good.

> > as is spin_lock_irqsave() and spin_lock_irq()
> 
> yes, for them the code length is even higher.
> 
> > uninlining spin_lock will probably increase overall text size, but 
> > mainly in the out-of-line section.
> 
> you mean inlining it again? I dont think we should do it.
> 
> > read_lock is out-of-line.  read_unlock is inlined
> > 
> > write_lock is out-of-line.  write_unlock is out-of-line.
> 
> hm, with my patch, write_unlock should be inlined too.
> 

So it is.  foo_unlock_irq() isn't though.


* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-20 23:01                             ` Andrew Morton
@ 2005-10-20 23:26                               ` Ingo Molnar
  2005-10-27 16:54                                 ` Paul Jackson
  0 siblings, 1 reply; 25+ messages in thread
From: Ingo Molnar @ 2005-10-20 23:26 UTC (permalink / raw)
  To: Andrew Morton; +Cc: torvalds, arjan, dada1, linux-kernel


* Andrew Morton <akpm@osdl.org> wrote:

> > > spin_lock is still uninlined.
> > 
> > yes, and that should stay so i believe, for text size reasons. The BTB 
> > should eliminate most effects of the call+ret itself.
> 
> The old
> 
> 	lock; decb
> 	js <different section>
> 	...
> 
> was pretty good.

yes, but that's 4-7+6==10-13 bytes of inline footprint, compared to 
fixed 5 bytes. That gives quite some icache footprint with thousands of 
call sites.
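
[Spelling out the arithmetic above -- byte counts are approximate and depend on the addressing mode and displacement size:]

```
; inlined lock fast path, repeated at every call site:
lock ; decb <mem>      ; 4-7 bytes (lock prefix + decb with memory operand)
js   slowpath          ; 6 bytes (0F 88 + rel32 near jump)
                       ; => 10-13 bytes per call site

; out-of-line version, per call site:
call _spin_lock        ; 5 bytes (E8 + rel32), fixed
```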

> > hm, with my patch, write_unlock should be inlined too.
> 
> So it is.  foo_unlock_irq() isn't though.

yes, that one should probably be inlined too: it's just 1 byte longer, 
and the knock-on effects on register allocation still give a net win:

    text    data     bss     dec     hex filename
 4072031  858208  387196 5317435  51233b vmlinux-smp-uninlined
 4060671  858212  387196 5306079  50f6df vmlinux-smp-inlined
 4058543  858212  387196 5303951  50ee8f vmlinux-irqop-inlined-too

another 0.05% drop in text size. Add-on patch below, it is against -rc5 
plus prev_spinlock_patch. Boot-tested on 4-way x86 SMP. The box crashed 
and burned. Joking.

	Ingo

 include/linux/spinlock.h |   16 +++++++++++++---
 1 files changed, 13 insertions(+), 3 deletions(-)

Index: linux/include/linux/spinlock.h
===================================================================
--- linux.orig/include/linux/spinlock.h
+++ linux/include/linux/spinlock.h
@@ -184,19 +184,29 @@ extern int __lockfunc generic__raw_read_
 # define write_unlock(lock)		__raw_write_unlock(&(lock)->raw_lock)
 #endif
 
+#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
+# define spin_unlock_irq(lock)		_spin_unlock_irq(lock)
+# define read_unlock_irq(lock)		_read_unlock_irq(lock)
+# define write_unlock_irq(lock)		_write_unlock_irq(lock)
+#else
+# define spin_unlock_irq(lock) \
+    do { __raw_spin_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0)
+# define read_unlock_irq(lock) \
+    do { __raw_read_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0)
+# define write_unlock_irq(lock) \
+    do { __raw_write_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0)
+#endif
+
 #define spin_unlock_irqrestore(lock, flags) \
 					_spin_unlock_irqrestore(lock, flags)
-#define spin_unlock_irq(lock)		_spin_unlock_irq(lock)
 #define spin_unlock_bh(lock)		_spin_unlock_bh(lock)
 
 #define read_unlock_irqrestore(lock, flags) \
 					_read_unlock_irqrestore(lock, flags)
-#define read_unlock_irq(lock)		_read_unlock_irq(lock)
 #define read_unlock_bh(lock)		_read_unlock_bh(lock)
 
 #define write_unlock_irqrestore(lock, flags) \
 					_write_unlock_irqrestore(lock, flags)
-#define write_unlock_irq(lock)		_write_unlock_irq(lock)
 #define write_unlock_bh(lock)		_write_unlock_bh(lock)
 
 #define spin_trylock_bh(lock)		__cond_lock(_spin_trylock_bh(lock))


* Re: [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits
  2005-10-20 23:26                               ` Ingo Molnar
@ 2005-10-27 16:54                                 ` Paul Jackson
  0 siblings, 0 replies; 25+ messages in thread
From: Paul Jackson @ 2005-10-27 16:54 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: akpm, torvalds, arjan, dada1, linux-kernel


  Ingo - I think you broke the sparc defconfig build with this.

I am seeing the sparc defconfig (crosstool) build broken with 2.6.14-rc5-mm1.
It built ok with 2.6.14-rc4-mm1.

This build now fails for me, with:

=========================================================
  CC      net/ipv4/route.o
In file included from include/linux/mroute.h:129,
                 from net/ipv4/route.c:89:
include/net/sock.h: In function `sk_dst_get':
include/net/sock.h:972: warning: implicit declaration of function `__raw_read_unlock'
include/net/sock.h: In function `sk_dst_set':
include/net/sock.h:991: warning: implicit declaration of function `__raw_write_unlock'
net/ipv4/route.c: In function `rt_check_expire':
net/ipv4/route.c:663: warning: dereferencing `void *' pointer
net/ipv4/route.c:663: error: request for member `raw_lock' in something not a structure or union
make[2]: *** [net/ipv4/route.o] Error 1
=========================================================

Your patch added:

> +++ linux/include/linux/spinlock.h
> ...
> +# define write_unlock_irq(lock) \
> +    do { __raw_write_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0)

I see __raw_write_unlock defined in include/asm-sparc/spinlock.h, which
is not included in defconfig sparc builds because such builds are non-
debug UP builds.
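
[One plausible repair -- a sketch only, the fix that actually went into mainline may have differed -- is to fall back to the uninlined wrappers whenever the raw primitives are unavailable, i.e. on non-debug UP builds as well:]

```c
/* hypothetical guard (sketch): also take the uninlined path on !SMP,
 * where asm/spinlock.h -- and with it __raw_*_unlock -- is never included */
#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \
    !defined(CONFIG_SMP)
# define spin_unlock_irq(lock)		_spin_unlock_irq(lock)
# define read_unlock_irq(lock)		_read_unlock_irq(lock)
# define write_unlock_irq(lock)		_write_unlock_irq(lock)
#else
/* SMP-only: the __raw_* primitives are guaranteed to exist here */
# define spin_unlock_irq(lock) \
    do { __raw_spin_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0)
# define read_unlock_irq(lock) \
    do { __raw_read_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0)
# define write_unlock_irq(lock) \
    do { __raw_write_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0)
#endif
```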

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401


Thread overview: 25+ messages
2005-10-11  4:04 i386 spinlock fairness: bizarre test results Chuck Ebbert
2005-10-11  9:42 ` Eric Dumazet
2005-10-11 13:00 ` Alan Cox
2005-10-11 14:44   ` Linus Torvalds
2005-10-11 15:32     ` [PATCH] i386 spinlocks should use the full 32 bits, not only 8 bits Eric Dumazet
2005-10-11 16:03       ` Linus Torvalds
2005-10-11 16:36         ` Eric Dumazet
2005-10-11 16:54           ` Linus Torvalds
2005-10-11 16:55             ` Linus Torvalds
2005-10-17  7:03           ` Andrew Morton
2005-10-17  7:20             ` Arjan van de Ven
2005-10-20 21:50               ` Ingo Molnar
2005-10-20 21:57                 ` Linus Torvalds
2005-10-20 22:02                   ` Ingo Molnar
2005-10-20 22:15                     ` Linus Torvalds
2005-10-20 22:27                       ` Ingo Molnar
2005-10-20 22:44                         ` Andrew Morton
2005-10-20 22:53                           ` Ingo Molnar
2005-10-20 23:01                             ` Andrew Morton
2005-10-20 23:26                               ` Ingo Molnar
2005-10-27 16:54                                 ` Paul Jackson
2005-10-11 17:59         ` Andi Kleen
     [not found] <4WjCM-7Aq-7@gated-at.bofh.it>
2005-10-11  4:32 ` i386 spinlock fairness: bizarre test results Robert Hancock
2005-10-11 12:31   ` Joe Seigh
2005-10-11 21:01     ` Bill Davidsen
