* [PATCH] MIPS: Optimize spinlocks.
@ 2010-02-04 19:31 David Daney
2010-02-24 15:53 ` Ralf Baechle
0 siblings, 1 reply; 6+ messages in thread
From: David Daney @ 2010-02-04 19:31 UTC (permalink / raw)
To: linux-mips, ralf; +Cc: David Daney
The current locking mechanism uses a ll/sc sequence to release a
spinlock. This is slower than a wmb() followed by a store to unlock.
The branching forward to .subsection 2 on sc failure slows down the
contended case. So we get rid of that part too.
Since we are now working on naturally aligned u16 values, we can get
rid of a masking operation as the LHU already does the right thing.
The ANDI are reversed for better scheduling on multi-issue CPUs
On a 12 CPU 750MHz Octeon cn5750 this patch improves ipv4 UDP packet
forwarding rates from 3.58*10^6 PPS to 3.99*10^6 PPS, or about 11%.
Signed-off-by: David Daney <ddaney@caviumnetworks.com>
---
arch/mips/include/asm/barrier.h | 6 ++
arch/mips/include/asm/spinlock.h | 118 ++++++++++++--------------------
arch/mips/include/asm/spinlock_types.h | 24 +++++--
3 files changed, 67 insertions(+), 81 deletions(-)
diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
index a2670a2..c0884f0 100644
--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -168,8 +168,14 @@
#ifdef CONFIG_CPU_CAVIUM_OCTEON
#define smp_mb__before_llsc() smp_wmb()
+/* Cause previous writes to become visible on all CPUs as soon as possible */
+#define nudge_writes() __asm__ __volatile__(".set push\n\t" \
+ ".set arch=octeon\n\t" \
+ "syncw\n\t" \
+ ".set pop" : : : "memory")
#else
#define smp_mb__before_llsc() smp_llsc_mb()
+#define nudge_writes() mb()
#endif
#endif /* __ASM_BARRIER_H */
diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h
index 5f16696..396e402 100644
--- a/arch/mips/include/asm/spinlock.h
+++ b/arch/mips/include/asm/spinlock.h
@@ -36,9 +36,9 @@
static inline int arch_spin_is_locked(arch_spinlock_t *lock)
{
- unsigned int counters = ACCESS_ONCE(lock->lock);
+ u32 counters = ACCESS_ONCE(lock->lock);
- return ((counters >> 14) ^ counters) & 0x1fff;
+ return ((counters >> 16) ^ counters) & 0xffff;
}
#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
@@ -47,9 +47,9 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock)
static inline int arch_spin_is_contended(arch_spinlock_t *lock)
{
- unsigned int counters = ACCESS_ONCE(lock->lock);
+ u32 counters = ACCESS_ONCE(lock->lock);
- return (((counters >> 14) - counters) & 0x1fff) > 1;
+ return (((counters >> 16) - counters) & 0xffff) > 1;
}
#define arch_spin_is_contended arch_spin_is_contended
@@ -57,6 +57,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
{
int my_ticket;
int tmp;
+ int inc = 0x10000;
if (R10000_LLSC_WAR) {
__asm__ __volatile__ (
@@ -64,25 +65,24 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
" .set noreorder \n"
" \n"
"1: ll %[ticket], %[ticket_ptr] \n"
- " addiu %[my_ticket], %[ticket], 0x4000 \n"
+ " addu %[my_ticket], %[ticket], %[inc] \n"
" sc %[my_ticket], %[ticket_ptr] \n"
" beqzl %[my_ticket], 1b \n"
" nop \n"
- " srl %[my_ticket], %[ticket], 14 \n"
- " andi %[my_ticket], %[my_ticket], 0x1fff \n"
- " andi %[ticket], %[ticket], 0x1fff \n"
+ " srl %[my_ticket], %[ticket], 16 \n"
+ " andi %[ticket], %[ticket], 0xffff \n"
+ " andi %[my_ticket], %[my_ticket], 0xffff \n"
" bne %[ticket], %[my_ticket], 4f \n"
" subu %[ticket], %[my_ticket], %[ticket] \n"
"2: \n"
" .subsection 2 \n"
- "4: andi %[ticket], %[ticket], 0x1fff \n"
+ "4: andi %[ticket], %[ticket], 0xffff \n"
" sll %[ticket], 5 \n"
" \n"
"6: bnez %[ticket], 6b \n"
" subu %[ticket], 1 \n"
" \n"
- " lw %[ticket], %[ticket_ptr] \n"
- " andi %[ticket], %[ticket], 0x1fff \n"
+ " lhu %[ticket], %[serving_now_ptr] \n"
" beq %[ticket], %[my_ticket], 2b \n"
" subu %[ticket], %[my_ticket], %[ticket] \n"
" b 4b \n"
@@ -90,36 +90,33 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
" .previous \n"
" .set pop \n"
: [ticket_ptr] "+m" (lock->lock),
+ [serving_now_ptr] "+m" (lock->h.serving_now),
[ticket] "=&r" (tmp),
- [my_ticket] "=&r" (my_ticket));
+ [my_ticket] "=&r" (my_ticket)
+ : [inc] "r" (inc));
} else {
__asm__ __volatile__ (
" .set push # arch_spin_lock \n"
" .set noreorder \n"
" \n"
- " ll %[ticket], %[ticket_ptr] \n"
- "1: addiu %[my_ticket], %[ticket], 0x4000 \n"
+ "1: ll %[ticket], %[ticket_ptr] \n"
+ " addu %[my_ticket], %[ticket], %[inc] \n"
" sc %[my_ticket], %[ticket_ptr] \n"
- " beqz %[my_ticket], 3f \n"
- " nop \n"
- " srl %[my_ticket], %[ticket], 14 \n"
- " andi %[my_ticket], %[my_ticket], 0x1fff \n"
- " andi %[ticket], %[ticket], 0x1fff \n"
+ " beqz %[my_ticket], 1b \n"
+ " srl %[my_ticket], %[ticket], 16 \n"
+ " andi %[ticket], %[ticket], 0xffff \n"
+ " andi %[my_ticket], %[my_ticket], 0xffff \n"
" bne %[ticket], %[my_ticket], 4f \n"
" subu %[ticket], %[my_ticket], %[ticket] \n"
"2: \n"
" .subsection 2 \n"
- "3: b 1b \n"
- " ll %[ticket], %[ticket_ptr] \n"
- " \n"
"4: andi %[ticket], %[ticket], 0x1fff \n"
" sll %[ticket], 5 \n"
" \n"
"6: bnez %[ticket], 6b \n"
" subu %[ticket], 1 \n"
" \n"
- " lw %[ticket], %[ticket_ptr] \n"
- " andi %[ticket], %[ticket], 0x1fff \n"
+ " lhu %[ticket], %[serving_now_ptr] \n"
" beq %[ticket], %[my_ticket], 2b \n"
" subu %[ticket], %[my_ticket], %[ticket] \n"
" b 4b \n"
@@ -127,8 +124,10 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
" .previous \n"
" .set pop \n"
: [ticket_ptr] "+m" (lock->lock),
+ [serving_now_ptr] "+m" (lock->h.serving_now),
[ticket] "=&r" (tmp),
- [my_ticket] "=&r" (my_ticket));
+ [my_ticket] "=&r" (my_ticket)
+ : [inc] "r" (inc));
}
smp_llsc_mb();
@@ -136,47 +135,16 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
static inline void arch_spin_unlock(arch_spinlock_t *lock)
{
- int tmp;
-
- smp_mb__before_llsc();
-
- if (R10000_LLSC_WAR) {
- __asm__ __volatile__ (
- " # arch_spin_unlock \n"
- "1: ll %[ticket], %[ticket_ptr] \n"
- " addiu %[ticket], %[ticket], 1 \n"
- " ori %[ticket], %[ticket], 0x2000 \n"
- " xori %[ticket], %[ticket], 0x2000 \n"
- " sc %[ticket], %[ticket_ptr] \n"
- " beqzl %[ticket], 1b \n"
- : [ticket_ptr] "+m" (lock->lock),
- [ticket] "=&r" (tmp));
- } else {
- __asm__ __volatile__ (
- " .set push # arch_spin_unlock \n"
- " .set noreorder \n"
- " \n"
- " ll %[ticket], %[ticket_ptr] \n"
- "1: addiu %[ticket], %[ticket], 1 \n"
- " ori %[ticket], %[ticket], 0x2000 \n"
- " xori %[ticket], %[ticket], 0x2000 \n"
- " sc %[ticket], %[ticket_ptr] \n"
- " beqz %[ticket], 2f \n"
- " nop \n"
- " \n"
- " .subsection 2 \n"
- "2: b 1b \n"
- " ll %[ticket], %[ticket_ptr] \n"
- " .previous \n"
- " .set pop \n"
- : [ticket_ptr] "+m" (lock->lock),
- [ticket] "=&r" (tmp));
- }
+ unsigned int serving_now = lock->h.serving_now + 1;
+ wmb();
+ lock->h.serving_now = (u16)serving_now;
+ nudge_writes();
}
static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock)
{
int tmp, tmp2, tmp3;
+ int inc = 0x10000;
if (R10000_LLSC_WAR) {
__asm__ __volatile__ (
@@ -184,11 +152,11 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock)
" .set noreorder \n"
" \n"
"1: ll %[ticket], %[ticket_ptr] \n"
- " srl %[my_ticket], %[ticket], 14 \n"
- " andi %[my_ticket], %[my_ticket], 0x1fff \n"
- " andi %[now_serving], %[ticket], 0x1fff \n"
+ " srl %[my_ticket], %[ticket], 16 \n"
+ " andi %[my_ticket], %[my_ticket], 0xffff \n"
+ " andi %[now_serving], %[ticket], 0xffff \n"
" bne %[my_ticket], %[now_serving], 3f \n"
- " addiu %[ticket], %[ticket], 0x4000 \n"
+ " addu %[ticket], %[ticket], %[inc] \n"
" sc %[ticket], %[ticket_ptr] \n"
" beqzl %[ticket], 1b \n"
" li %[ticket], 1 \n"
@@ -201,33 +169,33 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock)
: [ticket_ptr] "+m" (lock->lock),
[ticket] "=&r" (tmp),
[my_ticket] "=&r" (tmp2),
- [now_serving] "=&r" (tmp3));
+ [now_serving] "=&r" (tmp3)
+ : [inc] "r" (inc));
} else {
__asm__ __volatile__ (
" .set push # arch_spin_trylock \n"
" .set noreorder \n"
" \n"
- " ll %[ticket], %[ticket_ptr] \n"
- "1: srl %[my_ticket], %[ticket], 14 \n"
- " andi %[my_ticket], %[my_ticket], 0x1fff \n"
- " andi %[now_serving], %[ticket], 0x1fff \n"
+ "1: ll %[ticket], %[ticket_ptr] \n"
+ " srl %[my_ticket], %[ticket], 16 \n"
+ " andi %[my_ticket], %[my_ticket], 0xffff \n"
+ " andi %[now_serving], %[ticket], 0xffff \n"
" bne %[my_ticket], %[now_serving], 3f \n"
- " addiu %[ticket], %[ticket], 0x4000 \n"
+ " addu %[ticket], %[ticket], %[inc] \n"
" sc %[ticket], %[ticket_ptr] \n"
- " beqz %[ticket], 4f \n"
+ " beqz %[ticket], 1b \n"
" li %[ticket], 1 \n"
"2: \n"
" .subsection 2 \n"
"3: b 2b \n"
" li %[ticket], 0 \n"
- "4: b 1b \n"
- " ll %[ticket], %[ticket_ptr] \n"
" .previous \n"
" .set pop \n"
: [ticket_ptr] "+m" (lock->lock),
[ticket] "=&r" (tmp),
[my_ticket] "=&r" (tmp2),
- [now_serving] "=&r" (tmp3));
+ [now_serving] "=&r" (tmp3)
+ : [inc] "r" (inc));
}
smp_llsc_mb();
diff --git a/arch/mips/include/asm/spinlock_types.h b/arch/mips/include/asm/spinlock_types.h
index ee197c2..c52f360 100644
--- a/arch/mips/include/asm/spinlock_types.h
+++ b/arch/mips/include/asm/spinlock_types.h
@@ -5,16 +5,28 @@
# error "please don't include this file directly"
#endif
-typedef struct {
+#include <linux/types.h>
+
+#include <asm/byteorder.h>
+
+typedef union {
/*
- * bits 0..13: serving_now
- * bits 14 : junk data
- * bits 15..28: ticket
+ * bits 0..15 : serving_now
+ * bits 16..31 : ticket
*/
- unsigned int lock;
+ u32 lock;
+ struct {
+#ifdef __BIG_ENDIAN
+ u16 ticket;
+ u16 serving_now;
+#else
+ u16 serving_now;
+ u16 ticket;
+#endif
+ } h;
} arch_spinlock_t;
-#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
+#define __ARCH_SPIN_LOCK_UNLOCKED { .lock = 0 }
typedef struct {
volatile unsigned int lock;
--
1.6.0.6
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH] MIPS: Optimize spinlocks.
2010-02-04 19:31 [PATCH] MIPS: Optimize spinlocks David Daney
@ 2010-02-24 15:53 ` Ralf Baechle
2010-02-24 15:54 ` Ralf Baechle
2010-02-24 16:55 ` David Daney
0 siblings, 2 replies; 6+ messages in thread
From: Ralf Baechle @ 2010-02-24 15:53 UTC (permalink / raw)
To: David Daney; +Cc: linux-mips
On Thu, Feb 04, 2010 at 11:31:49AM -0800, David Daney wrote:
> The current locking mechanism uses a ll/sc sequence to release a
> spinlock. This is slower than a wmb() followed by a store to unlock.
>
> The branching forward to .subsection 2 on sc failure slows down the
> contended case. So we get rid of that part too.
>
> Since we are now working on naturally aligned u16 values, we can get
> rid of a masking operation as the LHU already does the right thing.
> The ANDI are reversed for better scheduling on multi-issue CPUs
>
> On a 12 CPU 750MHz Octeon cn5750 this patch improves ipv4 UDP packet
> forwarding rates from 3.58*10^6 PPS to 3.99*10^6 PPS, or about 11%.
And in your benchmarking patch you wrote:
> spin_single spin_multi
> base 106885 247941
> spinlock_patch 75194 219465
I did some benchmarking on an IP27 (180MHz, 2 CPU, needs LL/SC workaround):
spin_single spin_multi
base 229341 3505690
spinlock_patch 177847 3615326
So about 22% speedup for spin_single but 3% slowdown for spin_multi.
Disabling the R10k LL/SC workaround btw. gives another 23% speedup for
spin_single and marginal 0.3% for spin_multi; the latter may well be
statistical noise.
Ralf
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] MIPS: Optimize spinlocks.
2010-02-24 15:53 ` Ralf Baechle
@ 2010-02-24 15:54 ` Ralf Baechle
2010-02-24 16:55 ` David Daney
1 sibling, 0 replies; 6+ messages in thread
From: Ralf Baechle @ 2010-02-24 15:54 UTC (permalink / raw)
To: David Daney; +Cc: linux-mips
On Wed, Feb 24, 2010 at 04:53:36PM +0100, Ralf Baechle wrote:
> And in your benchmarking patch you wrote:
>
> > spin_single spin_multi
> > base 106885 247941
> > spinlock_patch 75194 219465
>
> I did some benchmarking on an IP27 (180MHz, 2 CPU, needs LL/SC workaround):
>
> spin_single spin_multi
> base 229341 3505690
> spinlock_patch 177847 3615326
>
> So about 22% speedup for spin_single but 3% slowdown for spin_multi.
>
> Disabling the R10k LL/SC workaround btw. gives another 23% speedup for
> spin_single and marginal 0.3% for spin_multi; the latter may well be
> statistical noise.
And I forgot the most important - patch queued for 2.6.34.
Thanks!
Ralf
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] MIPS: Optimize spinlocks.
2010-02-24 15:53 ` Ralf Baechle
2010-02-24 15:54 ` Ralf Baechle
@ 2010-02-24 16:55 ` David Daney
2010-02-25 14:15 ` Ralf Baechle
1 sibling, 1 reply; 6+ messages in thread
From: David Daney @ 2010-02-24 16:55 UTC (permalink / raw)
To: Ralf Baechle; +Cc: linux-mips
On 02/24/2010 07:53 AM, Ralf Baechle wrote:
> On Thu, Feb 04, 2010 at 11:31:49AM -0800, David Daney wrote:
>
>> The current locking mechanism uses a ll/sc sequence to release a
>> spinlock. This is slower than a wmb() followed by a store to unlock.
>>
>> The branching forward to .subsection 2 on sc failure slows down the
>> contended case. So we get rid of that part too.
>>
>> Since we are now working on naturally aligned u16 values, we can get
>> rid of a masking operation as the LHU already does the right thing.
>> The ANDI are reversed for better scheduling on multi-issue CPUs
>>
>> On a 12 CPU 750MHz Octeon cn5750 this patch improves ipv4 UDP packet
>> forwarding rates from 3.58*10^6 PPS to 3.99*10^6 PPS, or about 11%.
>
> And in your benchmarking patch you wrote:
>
>> spin_single spin_multi
>> base 106885 247941
>> spinlock_patch 75194 219465
>
> I did some benchmarking on an IP27 (180MHz, 2 CPU, needs LL/SC workaround):
>
> spin_single spin_multi
> base 229341 3505690
> spinlock_patch 177847 3615326
>
> So about 22% speedup for spin_single but 3% slowdown for spin_multi.
>
It is possible that by choosing a better nudge_writes() implementation
for R10K, that the 3% degradation could be erased. Perhaps:
#define nudge_writes() do { } while (0)
Basically you want something that is fast, but that also forces the
write to be globally visible as soon as possible. Some processors have
a prefetch instruction that does this. On other processors a NOP is
optimal as they don't combine writes in the write back buffer.
There is a wbflush() function that could potentially be used, but its
implementation is too heavy on Octeon.
> Disabling the R10k LL/SC workaround btw. gives another 23% speedup for
> spin_single and marginal 0.3% for spin_multi; the latter may well be
> statistical noise.
>
> Ralf
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] MIPS: Optimize spinlocks.
2010-02-24 16:55 ` David Daney
@ 2010-02-25 14:15 ` Ralf Baechle
2010-02-25 17:31 ` David Daney
0 siblings, 1 reply; 6+ messages in thread
From: Ralf Baechle @ 2010-02-25 14:15 UTC (permalink / raw)
To: David Daney; +Cc: linux-mips
On Wed, Feb 24, 2010 at 08:55:12AM -0800, David Daney wrote:
> It is possible that by choosing a better nudge_writes()
> implementation for R10K, that the 3% degradation could be erased.
> Perhaps:
>
> #define nudge_writes() do { } while (0)
raw_spin_unlock must provide a barrier so this wouldn't be a valid
implementation for nudge_writes(). Implementing it as barrier() this
is a pure compiler barrier is the most liberal valid implementation.
> Basically you want something that is fast, but that also forces the
> write to be globally visible as soon as possible. Some processors
> have a prefetch instruction that does this. On other processors a
> NOP is optimal as they don't combine writes in the write back
> buffer.
>
> There is a wbflush() function that could potentially be used, but
> its implementation is too heavy on Octeon.
For IP27 which is a strongly ordered system nudge_writes() is implemented
as barrier().
Another experiment I did was alignment. A branch on an R10000 has a
significant execution time penalty if it's delay slot is overlapping a
128 byte S-cache boundary. Suitable alignment however didn't not seem
to make any difference at all on R10000.
Ralf
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] MIPS: Optimize spinlocks.
2010-02-25 14:15 ` Ralf Baechle
@ 2010-02-25 17:31 ` David Daney
0 siblings, 0 replies; 6+ messages in thread
From: David Daney @ 2010-02-25 17:31 UTC (permalink / raw)
To: Ralf Baechle; +Cc: linux-mips
On 02/25/2010 06:15 AM, Ralf Baechle wrote:
> On Wed, Feb 24, 2010 at 08:55:12AM -0800, David Daney wrote:
>
>> It is possible that by choosing a better nudge_writes()
>> implementation for R10K, that the 3% degradation could be erased.
>> Perhaps:
>>
>> #define nudge_writes() do { } while (0)
>
> raw_spin_unlock must provide a barrier so this wouldn't be a valid
> implementation for nudge_writes().
That barrier is separate (and present). The sole purpose of
nudge_writes() is to make speed up the global visibility of the
releasing write, it does not have anything to do with locking semantics.
> Implementing it as barrier() this
> is a pure compiler barrier is the most liberal valid implementation.
No, the most liberal would be a true NOP: 'do { } while (0)'.
>
>> Basically you want something that is fast, but that also forces the
>> write to be globally visible as soon as possible. Some processors
>> have a prefetch instruction that does this. On other processors a
>> NOP is optimal as they don't combine writes in the write back
>> buffer.
>>
>> There is a wbflush() function that could potentially be used, but
>> its implementation is too heavy on Octeon.
>
> For IP27 which is a strongly ordered system nudge_writes() is implemented
> as barrier().
>
> Another experiment I did was alignment. A branch on an R10000 has a
> significant execution time penalty if it's delay slot is overlapping a
> 128 byte S-cache boundary. Suitable alignment however didn't not seem
> to make any difference at all on R10000.
>
> Ralf
>
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2010-02-25 17:32 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-02-04 19:31 [PATCH] MIPS: Optimize spinlocks David Daney
2010-02-24 15:53 ` Ralf Baechle
2010-02-24 15:54 ` Ralf Baechle
2010-02-24 16:55 ` David Daney
2010-02-25 14:15 ` Ralf Baechle
2010-02-25 17:31 ` David Daney
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.