* [PATCH 2/3] MIPS: enforce LL-SC loop enclosing with SYNC (ACQUIRE and RELEASE)
  2015-06-02  0:09 [PATCH 0/3] MIPS: SMP memory barriers: lightweight sync, acquire-release Leonid Yegoshin
@ 2015-06-02  0:09 ` Leonid Yegoshin
  2015-06-02 11:39   ` James Hogan
  0 siblings, 1 reply; 3+ messages in thread
From: Leonid Yegoshin @ 2015-06-02  0:09 UTC (permalink / raw)
  To: linux-mips, benh, will.deacon, linux-kernel, ralf,
	markos.chandras, macro, Steven.Hill, alexander.h.duyck, davem

Many MIPS32 R2 and all MIPS R6 CPUs are out of order execution, so it
needs memory barriers in SMP environment. However, past cores may have
a pipeline short enough to ignore that requirements and problem may
never occurs until recently.

This patch gives an option to enclose LL-SC loops by SYNC barriers in spinlocks,
atomics, futexes, cmpxchg and bitops.

So, this option is defined for MIPS32 R2 only, because that recent
CPUs may occasionally have problems in accordance with HW team.
And most of MIPS64 R2 vendor processors already have some kind of memory
barrier and the only one generic 5KEs has a pretty short pipeline.

Using memory barriers in MIPS R6 is mandatory, all that
processors have a speculative memory read which can inflict a trouble
without a correct use of barriers in LL-SC loop cycles.
The same is actually for MIPS32 R5 I5600 processor.
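
As a rough illustration, the barrier placement this option enables looks like
the sketch below (the helper name is made up; the real call sites are the
existing smp_mb__before_llsc()/smp_llsc_mb() users in atomics, spinlocks,
cmpxchg and bitops):

	static inline void llsc_add_ordered_sketch(int i, atomic_t *v)
	{
		int temp;

		smp_mb__before_llsc();	/* becomes "sync 0x12" with LIGHTWEIGHT_SYNC */
		__asm__ __volatile__(
		"1:	ll	%0, %1		\n"	/* load-linked v->counter */
		"	addu	%0, %2		\n"
		"	sc	%0, %1		\n"	/* store-conditional      */
		"	beqz	%0, 1b		\n"	/* retry if the SC failed */
		: "=&r" (temp), "+m" (v->counter)
		: "Ir" (i));
		smp_llsc_mb();		/* becomes "sync 0x11" with LIGHTWEIGHT_SYNC */
	}

When WEAK_REORDERING_BEYOND_LLSC is not set, both calls still compile to
nothing, as before.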

Signed-off-by: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
---
 arch/mips/Kconfig               |   25 +++++++++++++++++++++++++
 arch/mips/include/asm/barrier.h |   26 ++++++++++++++++++++++----
 2 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index c7d0cacece3d..676eb64f5545 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1896,6 +1896,30 @@ config MIPS_LIGHTWEIGHT_SYNC
 	  converted to generic "SYNC 0".
 
 	  If you unsure, say N here, it may slightly decrease your performance
+
+config MIPS_ENFORCE_WEAK_REORDERING_BEYOND_LLSC
+	bool "Enforce memory barriers at LLSC loops - atomics, spinlocks etc"
+	depends on CPU_MIPS32_R2
+	default y if CPU_MIPSR6
+	select WEAK_REORDERING_BEYOND_LLSC
+	help
+	  Many MIPS32 R2 and all MIPS R6 CPUs are out of order execution, so it
+	  needs memory barriers in SMP environment. However, past cores may have
+	  a pipeline short enough to ignore that requirements and problem may
+	  never occurs until recently.
+
+	  So, this option is defined for MIPS32 R2 only, because that recent
+	  CPUs may occasionally have problems in accordance with HW team.
+	  And MIPS64 R2 vendor processors already have some kind of memory
+	  barrier and the only one generic 5KEs has a pretty short pipeline.
+
+	  Using memory barriers in MIPS R6 is mandatory, all that
+	  processors have a speculative memory read which can inflict a trouble
+	  without a correct use of barriers in LL-SC loop cycles.
+	  The same is actually for MIPS32 R5 I5600 processor.
+
+	  If you unsure, say Y here, it may slightly decrease your performance
+	  but increase a reliability.
 endmenu
 
 #
@@ -1924,6 +1948,7 @@ config CPU_MIPSR2
 config CPU_MIPSR6
 	bool
 	default y if CPU_MIPS32_R6 || CPU_MIPS64_R6
+	select MIPS_ENFORCE_WEAK_REORDERING_BEYOND_LLSC
 	select MIPS_SPRAM
 
 config EVA
diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
index d2a63abfc7c6..f3cc7a91ac0d 100644
--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -95,33 +95,51 @@
 #  define smp_mb()	__sync()
 #  define smp_rmb()	barrier()
 #  define smp_wmb()	__syncw()
+#  define smp_acquire() __sync()
+#  define smp_release() __sync()
 # else
 #  ifdef CONFIG_MIPS_LIGHTWEIGHT_SYNC
 #  define smp_mb()      __asm__ __volatile__("sync 0x10" : : :"memory")
 #  define smp_rmb()     __asm__ __volatile__("sync 0x13" : : :"memory")
 #  define smp_wmb()     __asm__ __volatile__("sync 0x4" : : :"memory")
+#  define smp_acquire() __asm__ __volatile__("sync 0x11" : : :"memory")
+#  define smp_release() __asm__ __volatile__("sync 0x12" : : :"memory")
 #  else
 #  define smp_mb()	__asm__ __volatile__("sync" : : :"memory")
 #  define smp_rmb()	__asm__ __volatile__("sync" : : :"memory")
 #  define smp_wmb()	__asm__ __volatile__("sync" : : :"memory")
+#  define smp_acquire() __asm__ __volatile__("sync" : : :"memory")
+#  define smp_release() __asm__ __volatile__("sync" : : :"memory")
 #  endif
 # endif
 #else
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
+#define smp_acquire()   barrier()
+#define smp_release()   barrier()
 #endif
 
 #if defined(CONFIG_WEAK_REORDERING_BEYOND_LLSC) && defined(CONFIG_SMP)
+#ifdef CONFIG_MIPS_LIGHTWEIGHT_SYNC
+#define __WEAK_LLSC_MB          "       sync    0x10    \n"
+#define __WEAK_ACQUIRE          "       sync    0x11    \n"
+#define __WEAK_RELEASE          "       sync    0x12    \n"
+#else
 #define __WEAK_LLSC_MB		"	sync	\n"
+#define __WEAK_ACQUIRE          __WEAK_LLSC_MB
+#define __WEAK_RELEASE          __WEAK_LLSC_MB
+#endif
 #else
 #define __WEAK_LLSC_MB		"		\n"
+#define __WEAK_ACQUIRE          __WEAK_LLSC_MB
+#define __WEAK_RELEASE          __WEAK_LLSC_MB
 #endif
 
 #define set_mb(var, value) \
 	do { var = value; smp_mb(); } while (0)
 
-#define smp_llsc_mb()	__asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
+#define smp_llsc_mb()           __asm__ __volatile__(__WEAK_ACQUIRE : : :"memory")
 
 #ifdef CONFIG_CPU_CAVIUM_OCTEON
 #define smp_mb__before_llsc() smp_wmb()
@@ -131,14 +149,14 @@
 					    "syncw\n\t"			\
 					    ".set pop" : : : "memory")
 #else
-#define smp_mb__before_llsc() smp_llsc_mb()
+#define smp_mb__before_llsc()   __asm__ __volatile__(__WEAK_RELEASE : : :"memory")
 #define nudge_writes() mb()
 #endif
 
 #define smp_store_release(p, v)						\
 do {									\
 	compiletime_assert_atomic_type(*p);				\
-	smp_mb();							\
+	smp_release();                                                       \
 	ACCESS_ONCE(*p) = (v);						\
 } while (0)
 
@@ -146,7 +164,7 @@ do {									\
 ({									\
 	typeof(*p) ___p1 = ACCESS_ONCE(*p);				\
 	compiletime_assert_atomic_type(*p);				\
-	smp_mb();							\
+	smp_acquire();                                                       \
 	___p1;								\
 })
 


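For reference, the pairing the last two hunks have to keep working is the
usual message-passing pattern, sketched here with illustrative variable names;
the weaker "sync 0x12"/"sync 0x11" must still be enough to order it:

	int data;
	int flag;

	/* CPU 0 (producer) */
	data = 42;
	smp_store_release(&flag, 1);		/* "sync 0x12" before the store */

	/* CPU 1 (consumer) */
	while (!smp_load_acquire(&flag))	/* "sync 0x11" after the load   */
		cpu_relax();
	BUG_ON(data != 42);			/* must observe the producer's data */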

* Re: [PATCH 2/3] MIPS: enforce LL-SC loop enclosing with SYNC (ACQUIRE and RELEASE)
  2015-06-02  0:09 ` [PATCH 2/3] MIPS: enforce LL-SC loop enclosing with SYNC (ACQUIRE and RELEASE) Leonid Yegoshin
@ 2015-06-02 11:39   ` James Hogan
  0 siblings, 0 replies; 3+ messages in thread
From: James Hogan @ 2015-06-02 11:39 UTC (permalink / raw)
  To: Leonid Yegoshin, linux-mips, benh, will.deacon, linux-kernel,
	ralf, markos.chandras, macro, Steven.Hill, alexander.h.duyck,
	davem


Hi Leonid,

On 02/06/15 01:09, Leonid Yegoshin wrote:
> Many MIPS32 R2 and all MIPS R6 CPUs are out of order execution, so it
> needs memory barriers in SMP environment. However, past cores may have
> a pipeline short enough to ignore that requirements and problem may
> never occurs until recently.
> 
> This patch gives an option to enclose LL-SC loops by SYNC barriers in spinlocks,
> atomics, futexes, cmpxchg and bitops.

Please reflow text.

> 
> So, this option is defined for MIPS32 R2 only, because that recent

s/that/those/

> CPUs may occasionally have problems in accordance with HW team.

"have problems in accordance with HW team" is a bit confusing. What do
you mean?

> And most of MIPS64 R2 vendor processors already have some kind of memory
> barrier and the only one generic 5KEs has a pretty short pipeline.
> 
> Using memory barriers in MIPS R6 is mandatory, all that

s/that/those/

> processors have a speculative memory read which can inflict a trouble

s/a //

> without a correct use of barriers in LL-SC loop cycles.
> The same is actually for MIPS32 R5 I5600 processor.

Actually *true*?
P5600 you mean?

Same in Kconfig help text.


> 
> Signed-off-by: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
> ---
>  arch/mips/Kconfig               |   25 +++++++++++++++++++++++++
>  arch/mips/include/asm/barrier.h |   26 ++++++++++++++++++++++----
>  2 files changed, 47 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
> index c7d0cacece3d..676eb64f5545 100644
> --- a/arch/mips/Kconfig
> +++ b/arch/mips/Kconfig
> @@ -1896,6 +1896,30 @@ config MIPS_LIGHTWEIGHT_SYNC
>  	  converted to generic "SYNC 0".
>  
>  	  If you unsure, say N here, it may slightly decrease your performance
> +
> +config MIPS_ENFORCE_WEAK_REORDERING_BEYOND_LLSC
> +	bool "Enforce memory barriers at LLSC loops - atomics, spinlocks etc"
> +	depends on CPU_MIPS32_R2
> +	default y if CPU_MIPSR6
> +	select WEAK_REORDERING_BEYOND_LLSC
> +	help
> +	  Many MIPS32 R2 and all MIPS R6 CPUs are out of order execution, so it
> +	  needs memory barriers in SMP environment. However, past cores may have
> +	  a pipeline short enough to ignore that requirements and problem may
> +	  never occurs until recently.
> +
> +	  So, this option is defined for MIPS32 R2 only, because that recent
> +	  CPUs may occasionally have problems in accordance with HW team.
> +	  And MIPS64 R2 vendor processors already have some kind of memory
> +	  barrier and the only one generic 5KEs has a pretty short pipeline.
> +
> +	  Using memory barriers in MIPS R6 is mandatory, all that
> +	  processors have a speculative memory read which can inflict a trouble
> +	  without a correct use of barriers in LL-SC loop cycles.
> +	  The same is actually for MIPS32 R5 I5600 processor.
> +
> +	  If you unsure, say Y here, it may slightly decrease your performance

If you *are* unsure (same in patch 1). Same comment as patch 1 too about
ambiguity of "it".

> +	  but increase a reliability.

well only if the hardware has weak ordering, and only because it would
be technically wrong in that case to say N here.

It feels wrong to be giving the user this option. Can't we just select
WEAK_REORDERING_BEYOND_LLSC automatically based on the hardware that
needs to be supported by the kernel configuration (e.g. CPU_MIPSR6 or
CPU_MIPS32_R2)? Those who care about mips r2 performance on hardware
which doesn't strictly need it can always speak up / add an exception.

Out of interest, are futex operations safe with weak llsc ordering, on
the premise that they're mainly used by userland so ordering with normal
kernel accesses just doesn't matter in practice?

>  endmenu
>  
>  #
> @@ -1924,6 +1948,7 @@ config CPU_MIPSR2
>  config CPU_MIPSR6
>  	bool
>  	default y if CPU_MIPS32_R6 || CPU_MIPS64_R6
> +	select MIPS_ENFORCE_WEAK_REORDERING_BEYOND_LLSC
>  	select MIPS_SPRAM
>  
>  config EVA
> diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
> index d2a63abfc7c6..f3cc7a91ac0d 100644
> --- a/arch/mips/include/asm/barrier.h
> +++ b/arch/mips/include/asm/barrier.h
> @@ -95,33 +95,51 @@
>  #  define smp_mb()	__sync()
>  #  define smp_rmb()	barrier()
>  #  define smp_wmb()	__syncw()
> +#  define smp_acquire() __sync()
> +#  define smp_release() __sync()
>  # else
>  #  ifdef CONFIG_MIPS_LIGHTWEIGHT_SYNC
>  #  define smp_mb()      __asm__ __volatile__("sync 0x10" : : :"memory")
>  #  define smp_rmb()     __asm__ __volatile__("sync 0x13" : : :"memory")
>  #  define smp_wmb()     __asm__ __volatile__("sync 0x4" : : :"memory")
> +#  define smp_acquire() __asm__ __volatile__("sync 0x11" : : :"memory")
> +#  define smp_release() __asm__ __volatile__("sync 0x12" : : :"memory")

same question about use of sync_acquire/sync_release instructions which
binutils understands since 2.21.
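
For comparison, the named-mnemonic spelling of those two defines would look
something like this (untested sketch, assuming the toolchain accepts the
aliases; they encode the same SYNC stype values):

	#  define smp_acquire() __asm__ __volatile__("sync_acquire" : : : "memory")
	#  define smp_release() __asm__ __volatile__("sync_release" : : : "memory")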

>  #  else
>  #  define smp_mb()	__asm__ __volatile__("sync" : : :"memory")
>  #  define smp_rmb()	__asm__ __volatile__("sync" : : :"memory")
>  #  define smp_wmb()	__asm__ __volatile__("sync" : : :"memory")
> +#  define smp_acquire() __asm__ __volatile__("sync" : : :"memory")
> +#  define smp_release() __asm__ __volatile__("sync" : : :"memory")
>  #  endif
>  # endif
>  #else
>  #define smp_mb()	barrier()
>  #define smp_rmb()	barrier()
>  #define smp_wmb()	barrier()
> +#define smp_acquire()   barrier()
> +#define smp_release()   barrier()
>  #endif
>  
>  #if defined(CONFIG_WEAK_REORDERING_BEYOND_LLSC) && defined(CONFIG_SMP)
> +#ifdef CONFIG_MIPS_LIGHTWEIGHT_SYNC
> +#define __WEAK_LLSC_MB          "       sync    0x10    \n"
> +#define __WEAK_ACQUIRE          "       sync    0x11    \n"
> +#define __WEAK_RELEASE          "       sync    0x12    \n"

tabs

__WEAK_ACQUIRE and __WEAK_RELEASE are specific to llsc, maybe call them
__WEAK_LLSC_ACQUIRE and __WEAK_LLSC_RELEASE instead to avoid confusion.


> +#else
>  #define __WEAK_LLSC_MB		"	sync	\n"
> +#define __WEAK_ACQUIRE          __WEAK_LLSC_MB
> +#define __WEAK_RELEASE          __WEAK_LLSC_MB

tabs

> +#endif
>  #else
>  #define __WEAK_LLSC_MB		"		\n"
> +#define __WEAK_ACQUIRE          __WEAK_LLSC_MB
> +#define __WEAK_RELEASE          __WEAK_LLSC_MB

tabs

>  #endif
>  
>  #define set_mb(var, value) \
>  	do { var = value; smp_mb(); } while (0)
>  
> -#define smp_llsc_mb()	__asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
> +#define smp_llsc_mb()           __asm__ __volatile__(__WEAK_ACQUIRE : : :"memory")

tabs

>  
>  #ifdef CONFIG_CPU_CAVIUM_OCTEON
>  #define smp_mb__before_llsc() smp_wmb()
> @@ -131,14 +149,14 @@
>  					    "syncw\n\t"			\
>  					    ".set pop" : : : "memory")
>  #else
> -#define smp_mb__before_llsc() smp_llsc_mb()
> +#define smp_mb__before_llsc()   __asm__ __volatile__(__WEAK_RELEASE : : :"memory")
>  #define nudge_writes() mb()
>  #endif
>  
>  #define smp_store_release(p, v)						\
>  do {									\
>  	compiletime_assert_atomic_type(*p);				\
> -	smp_mb();							\
> +	smp_release();                                                       \
>  	ACCESS_ONCE(*p) = (v);						\
>  } while (0)
>  
> @@ -146,7 +164,7 @@ do {									\
>  ({									\
>  	typeof(*p) ___p1 = ACCESS_ONCE(*p);				\
>  	compiletime_assert_atomic_type(*p);				\
> -	smp_mb();							\
> +	smp_acquire();                                                       \
>  	___p1;								\
>  })
>  
> 
> 

This patch does 3 logically separable things:
1) add smp_release/smp_acquire based on MIPS_LIGHTWEIGHT_SYNC and weaken
smp_store_release()/smp_load_acquire() to use them.
2) weaken llsc barriers when MIPS_LIGHTWEIGHT_SYNC.
3) the MIPS_ENFORCE_WEAK_REORDERING_BEYOND_LLSC Kconfig stuff (or
whatever method to select WEAK_REORDERING_BEYOND_LLSC more often).

Any reason not to split them, and give a clear description of each?

Cheers
James




* Re: [PATCH 2/3] MIPS: enforce LL-SC loop enclosing with SYNC (ACQUIRE and RELEASE)
       [not found] <556DF959.1010704@imgtec.com>
@ 2015-06-02 18:53 ` Leonid Yegoshin
  0 siblings, 0 replies; 3+ messages in thread
From: Leonid Yegoshin @ 2015-06-02 18:53 UTC (permalink / raw)
  To: linux-kernel@vger.kernel.org >> LKML,
	linux-mips@linux-mips.org

On 06/02/2015 04:39 AM, James Hogan wrote:
> Hi Leonid,
>
> On 02/06/15 01:09, Leonid Yegoshin wrote:
>
>> CPUs may occasionally have problems in accordance with HW team.
> "have problems in accordance with HW team" is a bit confusing. What do
> you mean?

I was writing about memory barriers and the problems that may happen without them.
Some details from internal e-mail exchange:

-------------------------------------------

Yes, it is possible for the stores to be observed out of order.
The SC to B can complete if it has ownership while the SW to A is still
waiting for ownership. This scenario was also possible on older
cores.

(snip)

-----Original Message-----
From: Leonid Yegoshin
Subject: more SYNC issues

...

I want to know: does XXXX order stores without SYNC?

Let's consider a scenario:

SW    $0, A (cache miss)
...
LL    $1, B (cache hit)
..
SC    $1, B (cache hit again)
B        (is not taken!)
..
SYNC  0x10

Is it possible for another core to see the new value of B before the new value
of A, in the window between the SC to B and the SYNC?
-------------------------------------------------------------
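
In C-level terms, the question is the usual store-store publication hazard; a
rough sketch reusing A and B from the scenario above (new_B is just a
placeholder for the value the SC writes):

	int A, B, r;

	/* CPU 0 */
	ACCESS_ONCE(A) = 1;		/* SW A, cache miss                      */
	/* ... LL/SC sequence updating B; the SC (cache hit) may complete     */
	/* while the store to A is still waiting for ownership ...            */
	smp_mb();			/* SYNC 0x10 only *after* the sequence   */

	/* CPU 1 */
	if (ACCESS_ONCE(B) == new_B)	/* can already observe the SC's store  */
		r = ACCESS_ONCE(A);	/* ... and still read the old A (0)    */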

Another mail, another processor:

==========================================

Hi Leonid

I looked into the LSU RTL and I do not see speculation being blocked for
younger loads following LL.

I also ran a testcase to confirm my observation:

LL  (cacheable)Miss

LW (cacheable)Miss, different cacheline

The miss request for the LW goes out before the miss request for the LL; the
GPR update for the LW also happens before the LL's GPR update.
==========================================
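
That observation is exactly why the ACQUIRE barrier after the loop matters for
locks; roughly, in C (lock and data names are only illustrative):

	static DEFINE_SPINLOCK(lock);
	static int shared_data;
	int val;

	spin_lock(&lock);	/* LL/SC loop on the lock word; without an acquire */
				/* SYNC after the SC, the load below can be        */
				/* satisfied before the LL gets its cache line     */
	val = shared_data;	/* and so may read data from before the previous   */
				/* lock holder's critical section                   */
	spin_unlock(&lock);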


> Actually *true*? P5600 you mean? Same in Kconfig help text.

Yes, thank you, it is P5600; an I5600 doesn't exist.


> It feels wrong to be giving the user this option. Can't we just select
> WEAK_REORDERING_BEYOND_LLSC automatically based on the hardware that
> needs to be supported by the kernel configuration (e.g. CPU_MIPSR6 or
> CPU_MIPS32_R2)?

No, we can't: a lot of older processors are in-order, and those are still MIPS
R2 as well.

>   Those who care about mips r2 performance on hardware
> which doesn't strictly need it can always speak up / add an exception.
>
> Out of interest, are futex operations safe with weak llsc ordering, on
> the premise that they're mainly used by userland so ordering with normal
> kernel accesses just doesn't matter in practice?

I think futexes are used to communicate between user threads, so the problem is
theoretically still there.

> This patch does 3 logically separable things:
> 1) add smp_release/smp_acquire based on MIPS_LIGHTWEIGHT_SYNC and weaken
> smp_store_release()/smp_load_acquire() to use them.
> 2) weaken llsc barriers when MIPS_LIGHTWEIGHT_SYNC.
> 3) the MIPS_ENFORCE_WEAK_REORDERING_BEYOND_LLSC Kconfig stuff (or
> whatever method to select WEAK_REORDERING_BEYOND_LLSC more often).
>
> Any reason not to split them, and give a clear description of each?
>
>

I don't see a reason to split it.

- Leonid.



