* [PATCH 0/2] Rearrange MIPS barriers and optimize for Octeon.
From: David Daney @ 2010-01-09 1:16 UTC
To: Ralf Baechle, linux-mips
Locking/atomic performance on Octeon can be improved by using
optimized barrier instructions. The first patch in this set
rearranges and simplifies (at least in my mind) the existing barriers.
The second patch then adds Octeon-specific barriers.
I will reply with the two patches.
David Daney (2):
MIPS: New macro smp_mb__before_llsc.
MIPS: Octeon: Use optimized memory barrier primitives.
arch/mips/Kconfig | 1 -
arch/mips/include/asm/atomic.h | 16 ++++++------
arch/mips/include/asm/barrier.h | 52 +++++++++++++++++++++++++++-----------
arch/mips/include/asm/bitops.h | 8 +++---
arch/mips/include/asm/cmpxchg.h | 10 +++---
arch/mips/include/asm/spinlock.h | 4 +-
arch/mips/include/asm/system.h | 4 +++
7 files changed, 60 insertions(+), 35 deletions(-)
* [PATCH 1/2] MIPS: New macro smp_mb__before_llsc.
From: David Daney @ 2010-01-09 1:17 UTC
To: linux-mips, ralf; +Cc: David Daney
Replace some instances of smp_llsc_mb() with a new macro
smp_mb__before_llsc(). It is used before ll/sc sequences that are
documented as needing write barrier semantics.
The default implementation of smp_mb__before_llsc() is just
smp_llsc_mb(), so there are no changes in semantics.
Also simplify definition of smp_mb(), smp_rmb(), and smp_wmb() to be
just barrier() in the non-SMP case.
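For reference, a sketch of what a converted call site looks like after this
change (simplified from the existing atomic_add_return() implementation; the
R10000_LLSC_WAR and non-LL/SC fallback paths are omitted here):

	static __inline__ int atomic_add_return(int i, atomic_t * v)
	{
		int result;

		smp_mb__before_llsc();	/* previously smp_llsc_mb() */

		if (kernel_uses_llsc) {
			int temp;

			__asm__ __volatile__(
			"	.set	mips3				\n"
			"1:	ll	%1, %2	# atomic_add_return	\n"
			"	addu	%0, %1, %3			\n"
			"	sc	%0, %2				\n"
			"	beqz	%0, 1b				\n"
			"	addu	%0, %1, %3			\n"
			"	.set	mips0				\n"
			: "=&r" (result), "=&r" (temp), "=m" (v->counter)
			: "Ir" (i), "m" (v->counter)
			: "memory");
		} /* else: non-LL/SC fallback omitted in this sketch */

		smp_llsc_mb();		/* unchanged */

		return result;
	}

Since smp_mb__before_llsc() currently expands to smp_llsc_mb(), the generated
code does not change until an architecture overrides the new macro.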
Signed-off-by: David Daney <ddaney@caviumnetworks.com>
---
arch/mips/include/asm/atomic.h | 16 ++++++++--------
arch/mips/include/asm/barrier.h | 15 +++++++++------
arch/mips/include/asm/bitops.h | 8 ++++----
arch/mips/include/asm/cmpxchg.h | 10 +++++-----
arch/mips/include/asm/spinlock.h | 4 ++--
arch/mips/include/asm/system.h | 4 ++++
6 files changed, 32 insertions(+), 25 deletions(-)
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index dd75d67..519197e 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -137,7 +137,7 @@ static __inline__ int atomic_add_return(int i, atomic_t * v)
{
int result;
- smp_llsc_mb();
+ smp_mb__before_llsc();
if (kernel_uses_llsc && R10000_LLSC_WAR) {
int temp;
@@ -189,7 +189,7 @@ static __inline__ int atomic_sub_return(int i, atomic_t * v)
{
int result;
- smp_llsc_mb();
+ smp_mb__before_llsc();
if (kernel_uses_llsc && R10000_LLSC_WAR) {
int temp;
@@ -249,7 +249,7 @@ static __inline__ int atomic_sub_if_positive(int i, atomic_t * v)
{
int result;
- smp_llsc_mb();
+ smp_mb__before_llsc();
if (kernel_uses_llsc && R10000_LLSC_WAR) {
int temp;
@@ -516,7 +516,7 @@ static __inline__ long atomic64_add_return(long i, atomic64_t * v)
{
long result;
- smp_llsc_mb();
+ smp_mb__before_llsc();
if (kernel_uses_llsc && R10000_LLSC_WAR) {
long temp;
@@ -568,7 +568,7 @@ static __inline__ long atomic64_sub_return(long i, atomic64_t * v)
{
long result;
- smp_llsc_mb();
+ smp_mb__before_llsc();
if (kernel_uses_llsc && R10000_LLSC_WAR) {
long temp;
@@ -628,7 +628,7 @@ static __inline__ long atomic64_sub_if_positive(long i, atomic64_t * v)
{
long result;
- smp_llsc_mb();
+ smp_mb__before_llsc();
if (kernel_uses_llsc && R10000_LLSC_WAR) {
long temp;
@@ -788,9 +788,9 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u)
* atomic*_return operations are serializing but not the non-*_return
* versions.
*/
-#define smp_mb__before_atomic_dec() smp_llsc_mb()
+#define smp_mb__before_atomic_dec() smp_mb__before_llsc()
#define smp_mb__after_atomic_dec() smp_llsc_mb()
-#define smp_mb__before_atomic_inc() smp_llsc_mb()
+#define smp_mb__before_atomic_inc() smp_mb__before_llsc()
#define smp_mb__after_atomic_inc() smp_llsc_mb()
#include <asm-generic/atomic-long.h>
diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
index 91785dc..1a5a51c 100644
--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -131,23 +131,26 @@
#endif /* !CONFIG_CPU_HAS_WB */
#if defined(CONFIG_WEAK_ORDERING) && defined(CONFIG_SMP)
-#define __WEAK_ORDERING_MB " sync \n"
+#define smp_mb() __asm__ __volatile__("sync" : : :"memory")
+#define smp_rmb() __asm__ __volatile__("sync" : : :"memory")
+#define smp_wmb() __asm__ __volatile__("sync" : : :"memory")
#else
-#define __WEAK_ORDERING_MB " \n"
+#define smp_mb() barrier()
+#define smp_rmb() barrier()
+#define smp_wmb() barrier()
#endif
+
#if defined(CONFIG_WEAK_REORDERING_BEYOND_LLSC) && defined(CONFIG_SMP)
#define __WEAK_LLSC_MB " sync \n"
#else
#define __WEAK_LLSC_MB " \n"
#endif
-#define smp_mb() __asm__ __volatile__(__WEAK_ORDERING_MB : : :"memory")
-#define smp_rmb() __asm__ __volatile__(__WEAK_ORDERING_MB : : :"memory")
-#define smp_wmb() __asm__ __volatile__(__WEAK_ORDERING_MB : : :"memory")
-
#define set_mb(var, value) \
do { var = value; smp_mb(); } while (0)
#define smp_llsc_mb() __asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
+#define smp_mb__before_llsc() smp_llsc_mb()
+
#endif /* __ASM_BARRIER_H */
diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h
index 84a3838..9255cfb 100644
--- a/arch/mips/include/asm/bitops.h
+++ b/arch/mips/include/asm/bitops.h
@@ -42,7 +42,7 @@
/*
* clear_bit() doesn't provide any barrier for the compiler.
*/
-#define smp_mb__before_clear_bit() smp_llsc_mb()
+#define smp_mb__before_clear_bit() smp_mb__before_llsc()
#define smp_mb__after_clear_bit() smp_llsc_mb()
/*
@@ -258,7 +258,7 @@ static inline int test_and_set_bit(unsigned long nr,
unsigned short bit = nr & SZLONG_MASK;
unsigned long res;
- smp_llsc_mb();
+ smp_mb__before_llsc();
if (kernel_uses_llsc && R10000_LLSC_WAR) {
unsigned long *m = ((unsigned long *) addr) + (nr >> SZLONG_LOG);
@@ -395,7 +395,7 @@ static inline int test_and_clear_bit(unsigned long nr,
unsigned short bit = nr & SZLONG_MASK;
unsigned long res;
- smp_llsc_mb();
+ smp_mb__before_llsc();
if (kernel_uses_llsc && R10000_LLSC_WAR) {
unsigned long *m = ((unsigned long *) addr) + (nr >> SZLONG_LOG);
@@ -485,7 +485,7 @@ static inline int test_and_change_bit(unsigned long nr,
unsigned short bit = nr & SZLONG_MASK;
unsigned long res;
- smp_llsc_mb();
+ smp_mb__before_llsc();
if (kernel_uses_llsc && R10000_LLSC_WAR) {
unsigned long *m = ((unsigned long *) addr) + (nr >> SZLONG_LOG);
diff --git a/arch/mips/include/asm/cmpxchg.h b/arch/mips/include/asm/cmpxchg.h
index 815a438..ed9aaaa 100644
--- a/arch/mips/include/asm/cmpxchg.h
+++ b/arch/mips/include/asm/cmpxchg.h
@@ -72,14 +72,14 @@
*/
extern void __cmpxchg_called_with_bad_pointer(void);
-#define __cmpxchg(ptr, old, new, barrier) \
+#define __cmpxchg(ptr, old, new, pre_barrier, post_barrier) \
({ \
__typeof__(ptr) __ptr = (ptr); \
__typeof__(*(ptr)) __old = (old); \
__typeof__(*(ptr)) __new = (new); \
__typeof__(*(ptr)) __res = 0; \
\
- barrier; \
+ pre_barrier; \
\
switch (sizeof(*(__ptr))) { \
case 4: \
@@ -96,13 +96,13 @@ extern void __cmpxchg_called_with_bad_pointer(void);
break; \
} \
\
- barrier; \
+ post_barrier; \
\
__res; \
})
-#define cmpxchg(ptr, old, new) __cmpxchg(ptr, old, new, smp_llsc_mb())
-#define cmpxchg_local(ptr, old, new) __cmpxchg(ptr, old, new, )
+#define cmpxchg(ptr, old, new) __cmpxchg(ptr, old, new, smp_mb__before_llsc(), smp_llsc_mb())
+#define cmpxchg_local(ptr, old, new) __cmpxchg(ptr, old, new, , )
#define cmpxchg64(ptr, o, n) \
({ \
diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h
index 21ef9ef..5f16696 100644
--- a/arch/mips/include/asm/spinlock.h
+++ b/arch/mips/include/asm/spinlock.h
@@ -138,7 +138,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
{
int tmp;
- smp_llsc_mb();
+ smp_mb__before_llsc();
if (R10000_LLSC_WAR) {
__asm__ __volatile__ (
@@ -305,7 +305,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw)
{
unsigned int tmp;
- smp_llsc_mb();
+ smp_mb__before_llsc();
if (R10000_LLSC_WAR) {
__asm__ __volatile__(
diff --git a/arch/mips/include/asm/system.h b/arch/mips/include/asm/system.h
index 83b5509..bb937cc 100644
--- a/arch/mips/include/asm/system.h
+++ b/arch/mips/include/asm/system.h
@@ -95,6 +95,8 @@ static inline unsigned long __xchg_u32(volatile int * m, unsigned int val)
{
__u32 retval;
+ smp_mb__before_llsc();
+
if (kernel_uses_llsc && R10000_LLSC_WAR) {
unsigned long dummy;
@@ -147,6 +149,8 @@ static inline __u64 __xchg_u64(volatile __u64 * m, __u64 val)
{
__u64 retval;
+ smp_mb__before_llsc();
+
if (kernel_uses_llsc && R10000_LLSC_WAR) {
unsigned long dummy;
--
1.6.0.6
* [PATCH 2/2] MIPS: Octeon: Use optimized memory barrier primitives.
From: David Daney @ 2010-01-09 1:17 UTC
To: linux-mips, ralf; +Cc: David Daney
In order to achieve correct synchronization semantics, the Octeon port
had defined CONFIG_WEAK_REORDERING_BEYOND_LLSC. This resulted in code
that looks like:
sync
ll ...
.
.
.
sc ...
.
.
sync
The second SYNC was redundant, but harmless.
Octeon has a SYNCW instruction that acts as a write-memory-barrier
(due to an erratum in some parts, two SYNCW instructions are used). It is
much faster than SYNC because it only imposes ordering on the writes and
doesn't otherwise stall the execution pipeline. On Octeon, SYNC stalls
execution until all preceding writes are committed to the coherent
memory system.
Using:
syncw;syncw
ll
.
.
.
sc
.
.
has identical semantics to the first sequence, but is much faster.
The SYNCW orders the writes, and the SC will not complete successfully
until the write is committed to the coherent memory system. So at the
end all preceding writes have been committed. Since Octeon does not
do speculative reads, this functions as a full barrier.
The patch removes CONFIG_WEAK_REORDERING_BEYOND_LLSC, and substitutes
SYNCW for SYNC in write-memory-barriers.
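As an illustration of the resulting barrier mapping on Octeon (a sketch, not
part of the patch; buf, val and consume() are made-up names):

	/* Producer */
	buf->data = val;	/* ordinary store			*/
	smp_wmb();		/* Octeon: syncw;syncw - orders the two	*/
				/* stores without stalling		*/
	buf->ready = 1;

	/* Consumer */
	while (!ACCESS_ONCE(buf->ready))
		cpu_relax();
	smp_rmb();		/* Octeon: compiler barrier only, since	*/
				/* reads are not speculated		*/
	consume(buf->data);

smp_mb() remains a full SYNC on Octeon; smp_mb__before_llsc() becomes
smp_wmb(), which, combined with the following SC, still acts as a full
barrier as described above.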
Signed-off-by: David Daney <ddaney@caviumnetworks.com>
---
arch/mips/Kconfig | 1 -
arch/mips/include/asm/barrier.h | 43 ++++++++++++++++++++++++++++----------
2 files changed, 31 insertions(+), 13 deletions(-)
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 9dde8d1..cd04d36 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1295,7 +1295,6 @@ config CPU_CAVIUM_OCTEON
select SYS_SUPPORTS_SMP
select NR_CPUS_DEFAULT_16
select WEAK_ORDERING
- select WEAK_REORDERING_BEYOND_LLSC
select CPU_SUPPORTS_HIGHMEM
select CPU_SUPPORTS_HUGEPAGES
help
diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
index 1a5a51c..a2670a2 100644
--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -88,12 +88,20 @@
: /* no output */ \
: "m" (*(int *)CKSEG1) \
: "memory")
-
-#define fast_wmb() __sync()
-#define fast_rmb() __sync()
-#define fast_mb() __sync()
-#ifdef CONFIG_SGI_IP28
-#define fast_iob() \
+#ifdef CONFIG_CPU_CAVIUM_OCTEON
+# define OCTEON_SYNCW_STR ".set push\n.set arch=octeon\nsyncw\nsyncw\n.set pop\n"
+# define __syncw() __asm__ __volatile__(OCTEON_SYNCW_STR : : : "memory")
+
+# define fast_wmb() __syncw()
+# define fast_rmb() barrier()
+# define fast_mb() __sync()
+# define fast_iob() do { } while (0)
+#else /* ! CONFIG_CPU_CAVIUM_OCTEON */
+# define fast_wmb() __sync()
+# define fast_rmb() __sync()
+# define fast_mb() __sync()
+# ifdef CONFIG_SGI_IP28
+# define fast_iob() \
__asm__ __volatile__( \
".set push\n\t" \
".set noreorder\n\t" \
@@ -104,13 +112,14 @@
: /* no output */ \
: "m" (*(int *)CKSEG1ADDR(0x1fa00004)) \
: "memory")
-#else
-#define fast_iob() \
+# else
+# define fast_iob() \
do { \
__sync(); \
__fast_iob(); \
} while (0)
-#endif
+# endif
+#endif /* CONFIG_CPU_CAVIUM_OCTEON */
#ifdef CONFIG_CPU_HAS_WB
@@ -131,9 +140,15 @@
#endif /* !CONFIG_CPU_HAS_WB */
#if defined(CONFIG_WEAK_ORDERING) && defined(CONFIG_SMP)
-#define smp_mb() __asm__ __volatile__("sync" : : :"memory")
-#define smp_rmb() __asm__ __volatile__("sync" : : :"memory")
-#define smp_wmb() __asm__ __volatile__("sync" : : :"memory")
+# ifdef CONFIG_CPU_CAVIUM_OCTEON
+# define smp_mb() __sync()
+# define smp_rmb() barrier()
+# define smp_wmb() __syncw()
+# else
+# define smp_mb() __asm__ __volatile__("sync" : : :"memory")
+# define smp_rmb() __asm__ __volatile__("sync" : : :"memory")
+# define smp_wmb() __asm__ __volatile__("sync" : : :"memory")
+# endif
#else
#define smp_mb() barrier()
#define smp_rmb() barrier()
@@ -151,6 +166,10 @@
#define smp_llsc_mb() __asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
+#ifdef CONFIG_CPU_CAVIUM_OCTEON
+#define smp_mb__before_llsc() smp_wmb()
+#else
#define smp_mb__before_llsc() smp_llsc_mb()
+#endif
#endif /* __ASM_BARRIER_H */
--
1.6.0.6
* Re: [PATCH 1/2] MIPS: New macro smp_mb__before_llsc.
From: Ralf Baechle @ 2010-01-13 12:32 UTC
To: David Daney; +Cc: linux-mips
On Fri, Jan 08, 2010 at 05:17:43PM -0800, David Daney wrote:
> Replace some instances of smp_llsc_mb() with a new macro
> smp_mb__before_llsc(). It is used before ll/sc sequences that are
> documented as needing write barrier semantics.
>
> The default implementation of smp_mb__before_llsc() is just
> smp_llsc_mb(), so there are no changes in semantics.
>
> Also simplify definition of smp_mb(), smp_rmb(), and smp_wmb() to be
> just barrier() in the non-SMP case.
Queued for 2.6.34. Thanks!
Ralf
* Re: [PATCH 2/2] MIPS: Octeon: Use optimized memory barrier primitives.
From: Ralf Baechle @ 2010-01-13 12:33 UTC
To: David Daney; +Cc: linux-mips
On Fri, Jan 08, 2010 at 05:17:44PM -0800, David Daney wrote:
> In order to achieve correct synchronization semantics, the Octeon port
> had defined CONFIG_WEAK_REORDERING_BEYOND_LLSC. This resulted in code
> that looks like:
>
> sync
> ll ...
> .
> .
> .
> sc ...
> .
> .
> sync
>
> The second SYNC was redundant, but harmless.
>
> Octeon has a SYNCW instruction that acts as a write-memory-barrier
> (due to an erratum in some parts, two SYNCW instructions are used). It is
> much faster than SYNC because it only imposes ordering on the writes and
> doesn't otherwise stall the execution pipeline. On Octeon, SYNC stalls
> execution until all preceding writes are committed to the coherent
> memory system.
>
> Using:
>
> syncw;syncw
> ll
> .
> .
> .
> sc
> .
> .
>
> has identical semantics to the first sequence, but is much faster.
> The SYNCW orders the writes, and the SC will not complete successfully
> until the write is committed to the coherent memory system. So at the
> end all preceding writes have been committed. Since Octeon does not
> do speculative reads, this functions as a full barrier.
>
> The patch removes CONFIG_WEAK_REORDERING_BEYOND_LLSC, and substitutes
> SYNCW for SYNC in write-memory-barriers.
Queued for 2.6.34. Thanks!
Ralf