* [PATCH] Improve atomic.h implementation robustness
@ 2004-12-01 7:00 Thiemo Seufer
2004-12-01 10:20 ` Dominic Sweetman
0 siblings, 1 reply; 21+ messages in thread
From: Thiemo Seufer @ 2004-12-01 7:00 UTC (permalink / raw)
To: linux-mips; +Cc: ralf
Hello All,
the atomic functions use so far memory references for the inline
assembler to access the semaphore. This can lead to additional
instructions in the ll/sc loop, because newer compilers don't
expand the memory reference any more but leave it to the assembler.
The appended patch uses registers instead, and makes the ll/sc
arguments more explicit. In some cases it will lead also to better
register scheduling because the register isn't bound to an output
any more.
Thiemo
Index: include/asm-mips/atomic.h
===================================================================
RCS file: /home/cvs/linux/include/asm-mips/atomic.h,v
retrieving revision 1.36
diff -u -p -r1.36 atomic.h
--- include/asm-mips/atomic.h 19 Aug 2004 09:54:23 -0000 1.36
+++ include/asm-mips/atomic.h 1 Dec 2004 06:45:27 -0000
@@ -62,22 +62,24 @@ static __inline__ void atomic_add(int i,
unsigned long temp;
__asm__ __volatile__(
- "1: ll %0, %1 # atomic_add \n"
+ "1: ll %0, (%1) # atomic_add \n"
" addu %0, %2 \n"
- " sc %0, %1 \n"
+ " sc %0, (%1) \n"
" beqzl %0, 1b \n"
- : "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter));
+ : "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
+ : "memory");
} else if (cpu_has_llsc) {
unsigned long temp;
__asm__ __volatile__(
- "1: ll %0, %1 # atomic_add \n"
+ "1: ll %0, (%1) # atomic_add \n"
" addu %0, %2 \n"
- " sc %0, %1 \n"
+ " sc %0, (%1) \n"
" beqz %0, 1b \n"
- : "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter));
+ : "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
+ : "memory");
} else {
unsigned long flags;
@@ -100,22 +102,24 @@ static __inline__ void atomic_sub(int i,
unsigned long temp;
__asm__ __volatile__(
- "1: ll %0, %1 # atomic_sub \n"
+ "1: ll %0, (%1) # atomic_sub \n"
" subu %0, %2 \n"
- " sc %0, %1 \n"
+ " sc %0, (%1) \n"
" beqzl %0, 1b \n"
- : "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter));
+ : "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
+ : "memory");
} else if (cpu_has_llsc) {
unsigned long temp;
__asm__ __volatile__(
- "1: ll %0, %1 # atomic_sub \n"
+ "1: ll %0, (%1) # atomic_sub \n"
" subu %0, %2 \n"
- " sc %0, %1 \n"
+ " sc %0, (%1) \n"
" beqz %0, 1b \n"
- : "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter));
+ : "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
+ : "memory");
} else {
unsigned long flags;
@@ -136,27 +140,27 @@ static __inline__ int atomic_add_return(
unsigned long temp;
__asm__ __volatile__(
- "1: ll %1, %2 # atomic_add_return \n"
+ "1: ll %1, (%2) # atomic_add_return \n"
" addu %0, %1, %3 \n"
- " sc %0, %2 \n"
+ " sc %0, (%2) \n"
" beqzl %0, 1b \n"
" addu %0, %1, %3 \n"
" sync \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
+ : "=&r" (result), "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
: "memory");
} else if (cpu_has_llsc) {
unsigned long temp;
__asm__ __volatile__(
- "1: ll %1, %2 # atomic_add_return \n"
+ "1: ll %1, (%2) # atomic_add_return \n"
" addu %0, %1, %3 \n"
- " sc %0, %2 \n"
+ " sc %0, (%2) \n"
" beqz %0, 1b \n"
" addu %0, %1, %3 \n"
" sync \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
+ : "=&r" (result), "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
: "memory");
} else {
unsigned long flags;
@@ -179,27 +183,27 @@ static __inline__ int atomic_sub_return(
unsigned long temp;
__asm__ __volatile__(
- "1: ll %1, %2 # atomic_sub_return \n"
+ "1: ll %1, (%2) # atomic_sub_return \n"
" subu %0, %1, %3 \n"
- " sc %0, %2 \n"
+ " sc %0, (%2) \n"
" beqzl %0, 1b \n"
" subu %0, %1, %3 \n"
" sync \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
+ : "=&r" (result), "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
: "memory");
} else if (cpu_has_llsc) {
unsigned long temp;
__asm__ __volatile__(
- "1: ll %1, %2 # atomic_sub_return \n"
+ "1: ll %1, (%2) # atomic_sub_return \n"
" subu %0, %1, %3 \n"
- " sc %0, %2 \n"
+ " sc %0, (%2) \n"
" beqz %0, 1b \n"
" subu %0, %1, %3 \n"
" sync \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
+ : "=&r" (result), "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
: "memory");
} else {
unsigned long flags;
@@ -229,29 +233,29 @@ static __inline__ int atomic_sub_if_posi
unsigned long temp;
__asm__ __volatile__(
- "1: ll %1, %2 # atomic_sub_if_positive\n"
+ "1: ll %1, (%2) # atomic_sub_if_positive\n"
" subu %0, %1, %3 \n"
" bltz %0, 1f \n"
- " sc %0, %2 \n"
+ " sc %0, (%2) \n"
" beqzl %0, 1b \n"
" sync \n"
"1: \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
+ : "=&r" (result), "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
: "memory");
} else if (cpu_has_llsc) {
unsigned long temp;
__asm__ __volatile__(
- "1: ll %1, %2 # atomic_sub_if_positive\n"
+ "1: ll %1, (%2) # atomic_sub_if_positive\n"
" subu %0, %1, %3 \n"
" bltz %0, 1f \n"
- " sc %0, %2 \n"
+ " sc %0, (%2) \n"
" beqz %0, 1b \n"
" sync \n"
"1: \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
+ : "=&r" (result), "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
: "memory");
} else {
unsigned long flags;
@@ -367,22 +371,24 @@ static __inline__ void atomic64_add(long
unsigned long temp;
__asm__ __volatile__(
- "1: lld %0, %1 # atomic64_add \n"
+ "1: lld %0, (%1) # atomic64_add \n"
" addu %0, %2 \n"
- " scd %0, %1 \n"
+ " scd %0, (%1) \n"
" beqzl %0, 1b \n"
- : "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter));
+ : "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
+ : "memory");
} else if (cpu_has_llsc) {
unsigned long temp;
__asm__ __volatile__(
- "1: lld %0, %1 # atomic64_add \n"
+ "1: lld %0, (%1) # atomic64_add \n"
" addu %0, %2 \n"
- " scd %0, %1 \n"
+ " scd %0, (%1) \n"
" beqz %0, 1b \n"
- : "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter));
+ : "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
+ : "memory");
} else {
unsigned long flags;
@@ -405,22 +411,24 @@ static __inline__ void atomic64_sub(long
unsigned long temp;
__asm__ __volatile__(
- "1: lld %0, %1 # atomic64_sub \n"
+ "1: lld %0, (%1) # atomic64_sub \n"
" subu %0, %2 \n"
- " scd %0, %1 \n"
+ " scd %0, (%1) \n"
" beqzl %0, 1b \n"
- : "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter));
+ : "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
+ : "memory");
} else if (cpu_has_llsc) {
unsigned long temp;
__asm__ __volatile__(
- "1: lld %0, %1 # atomic64_sub \n"
+ "1: lld %0, (%1) # atomic64_sub \n"
" subu %0, %2 \n"
- " scd %0, %1 \n"
+ " scd %0, (%1) \n"
" beqz %0, 1b \n"
- : "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter));
+ : "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
+ : "memory");
} else {
unsigned long flags;
@@ -441,27 +449,27 @@ static __inline__ long atomic64_add_retu
unsigned long temp;
__asm__ __volatile__(
- "1: lld %1, %2 # atomic64_add_return \n"
+ "1: lld %1, (%2) # atomic64_add_return \n"
" addu %0, %1, %3 \n"
- " scd %0, %2 \n"
+ " scd %0, (%2) \n"
" beqzl %0, 1b \n"
" addu %0, %1, %3 \n"
" sync \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
+ : "=&r" (result), "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
: "memory");
} else if (cpu_has_llsc) {
unsigned long temp;
__asm__ __volatile__(
- "1: lld %1, %2 # atomic64_add_return \n"
+ "1: lld %1, (%2) # atomic64_add_return \n"
" addu %0, %1, %3 \n"
- " scd %0, %2 \n"
+ " scd %0, (%2) \n"
" beqz %0, 1b \n"
" addu %0, %1, %3 \n"
" sync \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
+ : "=&r" (result), "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
: "memory");
} else {
unsigned long flags;
@@ -484,27 +492,27 @@ static __inline__ long atomic64_sub_retu
unsigned long temp;
__asm__ __volatile__(
- "1: lld %1, %2 # atomic64_sub_return \n"
+ "1: lld %1, (%2) # atomic64_sub_return \n"
" subu %0, %1, %3 \n"
- " scd %0, %2 \n"
+ " scd %0, (%2) \n"
" beqzl %0, 1b \n"
" subu %0, %1, %3 \n"
" sync \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
+ : "=&r" (result), "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
: "memory");
} else if (cpu_has_llsc) {
unsigned long temp;
__asm__ __volatile__(
- "1: lld %1, %2 # atomic64_sub_return \n"
+ "1: lld %1, (%2) # atomic64_sub_return \n"
" subu %0, %1, %3 \n"
- " scd %0, %2 \n"
+ " scd %0, (%2) \n"
" beqz %0, 1b \n"
" subu %0, %1, %3 \n"
" sync \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
+ : "=&r" (result), "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
: "memory");
} else {
unsigned long flags;
@@ -534,29 +542,29 @@ static __inline__ long atomic64_sub_if_p
unsigned long temp;
__asm__ __volatile__(
- "1: lld %1, %2 # atomic64_sub_if_positive\n"
+ "1: lld %1, (%2) # atomic64_sub_if_positive\n"
" dsubu %0, %1, %3 \n"
" bltz %0, 1f \n"
- " scd %0, %2 \n"
+ " scd %0, (%2) \n"
" beqzl %0, 1b \n"
" sync \n"
"1: \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
+ : "=&r" (result), "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
: "memory");
} else if (cpu_has_llsc) {
unsigned long temp;
__asm__ __volatile__(
- "1: lld %1, %2 # atomic64_sub_if_positive\n"
+ "1: lld %1, (%2) # atomic64_sub_if_positive\n"
" dsubu %0, %1, %3 \n"
" bltz %0, 1f \n"
- " scd %0, %2 \n"
+ " scd %0, (%2) \n"
" beqz %0, 1b \n"
" sync \n"
"1: \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
+ : "=&r" (result), "=&r" (temp)
+ : "r" (&v->counter), "Ir" (i)
: "memory");
} else {
unsigned long flags;
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [PATCH] Improve atomic.h implementation robustness
@ 2004-12-01 10:20 ` Dominic Sweetman
0 siblings, 0 replies; 21+ messages in thread
From: Dominic Sweetman @ 2004-12-01 10:20 UTC (permalink / raw)
To: Thiemo Seufer; +Cc: linux-mips, ralf, Nigel Stephens, David Ung
Thiemo writes:
> the atomic functions use so far memory references for the inline
> assembler to access the semaphore. This can lead to additional
> instructions in the ll/sc loop, because newer compilers don't
> expand the memory reference any more but leave it to the assembler.
>
> The appended patch...
I thought it was an explicit aim of the substantial rewrite of the
MIPS backend for 3.x to get the compiler to generate only "real"
instructions, not macros which expand to multiple instructions inside
the assembler. So it's disappointing if newer compilers got worse.
Can one of our compiler-knowledgable people follow this up?
--
Dominic Sweetman
MIPS Technologies
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
@ 2004-12-01 10:20 ` Dominic Sweetman
0 siblings, 0 replies; 21+ messages in thread
From: Dominic Sweetman @ 2004-12-01 10:20 UTC (permalink / raw)
To: Thiemo Seufer; +Cc: linux-mips, ralf, Nigel Stephens, David Ung
Thiemo writes:
> the atomic functions use so far memory references for the inline
> assembler to access the semaphore. This can lead to additional
> instructions in the ll/sc loop, because newer compilers don't
> expand the memory reference any more but leave it to the assembler.
>
> The appended patch...
I thought it was an explicit aim of the substantial rewrite of the
MIPS backend for 3.x to get the compiler to generate only "real"
instructions, not macros which expand to multiple instructions inside
the assembler. So it's disappointing if newer compilers got worse.
Can one of our compiler-knowledgable people follow this up?
--
Dominic Sweetman
MIPS Technologies
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-01 10:20 ` Dominic Sweetman
(?)
@ 2004-12-01 12:33 ` Ralf Baechle
2004-12-01 21:50 ` Maciej W. Rozycki
-1 siblings, 1 reply; 21+ messages in thread
From: Ralf Baechle @ 2004-12-01 12:33 UTC (permalink / raw)
To: Dominic Sweetman; +Cc: Thiemo Seufer, linux-mips, Nigel Stephens, David Ung
On Wed, Dec 01, 2004 at 10:20:28AM +0000, Dominic Sweetman wrote:
> > the atomic functions use so far memory references for the inline
> > assembler to access the semaphore. This can lead to additional
> > instructions in the ll/sc loop, because newer compilers don't
> > expand the memory reference any more but leave it to the assembler.
> >
> > The appended patch...
>
> I thought it was an explicit aim of the substantial rewrite of the
> MIPS backend for 3.x to get the compiler to generate only "real"
> instructions, not macros which expand to multiple instructions inside
> the assembler. So it's disappointing if newer compilers got worse.
>
> Can one of our compiler-knowledgable people follow this up?
Dominik,
this problem here is specific to inline assembler. The splitlock code for
a reasonable CPU is:
static __inline__ void atomic_add(int i, atomic_t * v)
{
unsigned long temp;
__asm__ __volatile__(
"1: ll %0, %1 # atomic_add \n"
" addu %0, %2 \n"
" sc %0, %1 \n"
" beqz %0, 1b \n"
: "=&r" (temp), "=m" (v->counter)
: "Ir" (i), "m" (v->counter));
}
For the average atomic op generated code is going to look about like:
80100634: lui a0,0x802c
80100638: ll a0,-24160(a0)
8010063c: addu a0,a0,v0
80100640: lui at,0x802c
80100644: addu at,at,v1
80100648: sc a0,-24160(at)
8010064c: beqz a0,80100634 <init+0x194>
80100650: nop
It's significantly worse for 64-bit due to the excessive code sequence
generated for loading a 64-bit address. One outside CKSEGx that is.
On 32-bit Thiemo's patch would cut that down to something like:
80100630: lui t0,0x802c
80100634: addiu t0,t0,-24160
80100638: ll a0,0(t0)
8010063c: addu a0,a0,v0
80100648: sc a0,0(to)
8010064c: beqz a0,80100638 <init+0x194>
80100650: nop
On 64-bit the savings would be even more significant. But what we actually
want would be using the "o" constraint. Which just at least on the
compilers where I've tried it, didn't produce code any different from "m".
The expected code would be something like:
80100634: lui t0,0x802c
80100638: ll a0,-24160(t0)
8010063c: addu a0,a0,v0
80100648: sc a0,-24160(to)
8010064c: beqz a0,80100634 <init+0x194>
80100650: nop
So another instruction less.
Ralf
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-01 12:33 ` Ralf Baechle
@ 2004-12-01 21:50 ` Maciej W. Rozycki
2004-12-01 23:39 ` Ralf Baechle
2004-12-07 11:56 ` Richard Sandiford
0 siblings, 2 replies; 21+ messages in thread
From: Maciej W. Rozycki @ 2004-12-01 21:50 UTC (permalink / raw)
To: Ralf Baechle
Cc: Dominic Sweetman, Thiemo Seufer, linux-mips, Nigel Stephens,
David Ung
On Wed, 1 Dec 2004, Ralf Baechle wrote:
> this problem here is specific to inline assembler. The splitlock code for
> a reasonable CPU is:
>
> static __inline__ void atomic_add(int i, atomic_t * v)
> {
> unsigned long temp;
>
> __asm__ __volatile__(
> "1: ll %0, %1 # atomic_add \n"
> " addu %0, %2 \n"
> " sc %0, %1 \n"
> " beqz %0, 1b \n"
> : "=&r" (temp), "=m" (v->counter)
> : "Ir" (i), "m" (v->counter));
> }
>
> For the average atomic op generated code is going to look about like:
>
> 80100634: lui a0,0x802c
> 80100638: ll a0,-24160(a0)
> 8010063c: addu a0,a0,v0
> 80100640: lui at,0x802c
> 80100644: addu at,at,v1
> 80100648: sc a0,-24160(at)
> 8010064c: beqz a0,80100634 <init+0x194>
> 80100650: nop
>
> It's significantly worse for 64-bit due to the excessive code sequence
> generated for loading a 64-bit address. One outside CKSEGx that is.
Only for old compilers. For current (>= 3.4) ones you can use the "R"
constraint and get exactly what you need. Rewriting inline asms to use
"R" for GCC >= 3.4 has actually been on my to-do list for some time;
predating the current working implementation even.
> On 32-bit Thiemo's patch would cut that down to something like:
>
> 80100630: lui t0,0x802c
> 80100634: addiu t0,t0,-24160
> 80100638: ll a0,0(t0)
> 8010063c: addu a0,a0,v0
> 80100648: sc a0,0(to)
> 8010064c: beqz a0,80100638 <init+0x194>
> 80100650: nop
Plus it clobbers memory requiring a writeback and a refetch of all
unrelated variables that have happened to be cached in registers.
> On 64-bit the savings would be even more significant. But what we actually
> want would be using the "o" constraint. Which just at least on the
> compilers where I've tried it, didn't produce code any different from "m".
No surprise as the "o" constraint doesn't mean anything particular for
MIPS. All addresses are offsettable -- there is no addressing mode that
would preclude it, so "o" is exactly the same as "m".
> The expected code would be something like:
>
> 80100634: lui t0,0x802c
> 80100638: ll a0,-24160(t0)
> 8010063c: addu a0,a0,v0
> 80100648: sc a0,-24160(to)
> 8010064c: beqz a0,80100634 <init+0x194>
> 80100650: nop
>
> So another instruction less.
That's exactly what's emitted with "R". Should I accelerate my work on
it? It's nothing that would require a lot of effort -- it's more boring
than challenging.
Maciej
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-01 21:50 ` Maciej W. Rozycki
@ 2004-12-01 23:39 ` Ralf Baechle
2004-12-02 0:01 ` Maciej W. Rozycki
2004-12-07 11:56 ` Richard Sandiford
1 sibling, 1 reply; 21+ messages in thread
From: Ralf Baechle @ 2004-12-01 23:39 UTC (permalink / raw)
To: Maciej W. Rozycki
Cc: Dominic Sweetman, Thiemo Seufer, linux-mips, Nigel Stephens,
David Ung
On Wed, Dec 01, 2004 at 09:50:45PM +0000, Maciej W. Rozycki wrote:
> No surprise as the "o" constraint doesn't mean anything particular for
> MIPS. All addresses are offsettable -- there is no addressing mode that
> would preclude it, so "o" is exactly the same as "m".
This is what the gcc docs say:
[...]
`o'
A memory operand is allowed, but only if the address is
"offsettable". This means that adding a small integer (actually,
the width in bytes of the operand, as determined by its machine
mode) may be added to the address and the result is also a valid
memory address.
For example, an address which is constant is offsettable; so is an
address that is the sum of a register and a constant (as long as a
slightly larger constant is also within the range of
address-offsets supported by the machine); but an autoincrement or
autodecrement address is not offsettable. More complicated
indirect/indexed addresses may or may not be offsettable depending
on the other addressing modes that the machine supports.
[...]
So it is not the same as "m".
Ralf
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-01 23:39 ` Ralf Baechle
@ 2004-12-02 0:01 ` Maciej W. Rozycki
0 siblings, 0 replies; 21+ messages in thread
From: Maciej W. Rozycki @ 2004-12-02 0:01 UTC (permalink / raw)
To: Ralf Baechle
Cc: Dominic Sweetman, Thiemo Seufer, linux-mips, Nigel Stephens,
David Ung
On Thu, 2 Dec 2004, Ralf Baechle wrote:
> > No surprise as the "o" constraint doesn't mean anything particular for
> > MIPS. All addresses are offsettable -- there is no addressing mode that
> > would preclude it, so "o" is exactly the same as "m".
>
> This is what the gcc docs say:
[...]
> So it is not the same as "m".
It is the same *for* MIPS. Not in general.
Maciej
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-01 21:50 ` Maciej W. Rozycki
2004-12-01 23:39 ` Ralf Baechle
@ 2004-12-07 11:56 ` Richard Sandiford
2004-12-07 12:56 ` Thiemo Seufer
1 sibling, 1 reply; 21+ messages in thread
From: Richard Sandiford @ 2004-12-07 11:56 UTC (permalink / raw)
To: Maciej W. Rozycki
Cc: Ralf Baechle, Dominic Sweetman, Thiemo Seufer, linux-mips,
Nigel Stephens, David Ung
I've only been reading this list in batches recently, so this reply
has quotes from several separate messages, sorry.
First some general comments:
- Explicit relocs for n64 non-PIC weren't implemented before the gcc 3.4
feature freeze. gcc 3.4.x therefore still uses symbolic addresses when
compiling a 64-bit kernel.
- gcc 4.0 knows how to generate explicit relocs for n64 non-PIC and
(by default) will use them instead of symbolic addresses.
- As Thiemo says, there as talk of a -msym32 option (more below), but it
hasn't been implemented yet. This means that if you want to the use
the 2-instruction dla hack, you'll need to use -mno-explicit-relocs
when compiling with 4.0. Don't count on that option being around
forever though!
"Maciej W. Rozycki" <macro@linux-mips.org> writes:
> On Wed, 1 Dec 2004, Ralf Baechle wrote:
>> this problem here is specific to inline assembler. The splitlock code for
>> a reasonable CPU is:
>>
>> static __inline__ void atomic_add(int i, atomic_t * v)
>> {
>> unsigned long temp;
>>
>> __asm__ __volatile__(
>> "1: ll %0, %1 # atomic_add \n"
>> " addu %0, %2 \n"
>> " sc %0, %1 \n"
>> " beqz %0, 1b \n"
>> : "=&r" (temp), "=m" (v->counter)
>> : "Ir" (i), "m" (v->counter));
>> }
>>
>> For the average atomic op generated code is going to look about like:
>>
>> 80100634: lui a0,0x802c
>> 80100638: ll a0,-24160(a0)
>> 8010063c: addu a0,a0,v0
>> 80100640: lui at,0x802c
>> 80100644: addu at,at,v1
>> 80100648: sc a0,-24160(at)
>> 8010064c: beqz a0,80100634 <init+0x194>
>> 80100650: nop
>>
>> It's significantly worse for 64-bit due to the excessive code sequence
>> generated for loading a 64-bit address. One outside CKSEGx that is.
>
> Only for old compilers. For current (>= 3.4) ones you can use the "R"
> constraint and get exactly what you need.
Right. IMO, this is exactly the right fix. It should be backward
compatible with old toolchains too.
FYI, the 'R' constraint has been kept around specifically for inline asms.
gcc itself no longer uses it.
"Maciej W. Rozycki" <macro@linux-mips.org> writes:
> On Wed, 1 Dec 2004, Ralf Baechle wrote:
>> On 64-bit the savings would be even more significant. But what we actually
>> want would be using the "o" constraint. Which just at least on the
>> compilers where I've tried it, didn't produce code any different from "m".
>
> No surprise as the "o" constraint doesn't mean anything particular for
> MIPS. All addresses are offsettable -- there is no addressing mode that
> would preclude it, so "o" is exactly the same as "m".
Right!
Thiemo Seufer <ica2_ts@csv.ica.uni-stuttgart.de> writes:
> Current 64bit MIPS kernels run in (C)KSEG0, and exploit sign-extension
> to optimize symbol loads (2 instead of 6/7 instructions, the same as in
> 32bit kernels). This optimization relies on an assembler macro
> expansion mode which was hacked in gas for exactly this purpose. Gcc
> currently doesn't have something similiar, and would try to do a regular
> 64bit load with explicit relocs.
>
> I discussed this with Richard Sandiford a while ago, and the conclusion
> was to implement an explicit --msym32 option for both gcc and gas to
> improve register scheduling and get rid of the gas hack. So far, nobody
> came around to actually do the work for it.
True. FWIW, it's trivial to add this option to gcc. As far as I remember,
the stumbling block was whether we should mark the objects in some way,
and whether the linker ought to check for overflow.
Richard
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-07 11:56 ` Richard Sandiford
@ 2004-12-07 12:56 ` Thiemo Seufer
2004-12-07 19:28 ` Richard Sandiford
0 siblings, 1 reply; 21+ messages in thread
From: Thiemo Seufer @ 2004-12-07 12:56 UTC (permalink / raw)
To: Richard Sandiford
Cc: Maciej W. Rozycki, Ralf Baechle, Dominic Sweetman, linux-mips,
Nigel Stephens, David Ung
Richard Sandiford wrote:
[snip]
> > Only for old compilers. For current (>= 3.4) ones you can use the "R"
> > constraint and get exactly what you need.
>
> Right. IMO, this is exactly the right fix. It should be backward
> compatible with old toolchains too.
>
> FYI, the 'R' constraint has been kept around specifically for inline asms.
> gcc itself no longer uses it.
I tried to use "R" in atomic.h but this failed in some (but not all)
cases with
include/asm/atomic.h:64: error: inconsistent operand constraints in an asm'
where the argument happens to be a member of a global struct.
Simple testcases work, however, as well as PIC code.
[snip]
> > I discussed this with Richard Sandiford a while ago, and the conclusion
> > was to implement an explicit --msym32 option for both gcc and gas to
> > improve register scheduling and get rid of the gas hack. So far, nobody
> > came around to actually do the work for it.
>
> True. FWIW, it's trivial to add this option to gcc. As far as I remember,
> the stumbling block was whether we should mark the objects in some way,
> and whether the linker ought to check for overflow.
Both might be nice but isn't exactly reqired. The use of --msym32 will
be limited to ELF64 non-PIC code, which is only used in kernels or
other stand-alone programs with limited exposure to other binaries with
incompatible code models.
Thiemo
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-07 12:56 ` Thiemo Seufer
@ 2004-12-07 19:28 ` Richard Sandiford
2004-12-08 0:09 ` Thiemo Seufer
0 siblings, 1 reply; 21+ messages in thread
From: Richard Sandiford @ 2004-12-07 19:28 UTC (permalink / raw)
To: Thiemo Seufer
Cc: Maciej W. Rozycki, Ralf Baechle, Dominic Sweetman, linux-mips,
Nigel Stephens, David Ung
Thiemo Seufer <ica2_ts@csv.ica.uni-stuttgart.de> writes:
> I tried to use "R" in atomic.h but this failed in some (but not all)
> cases with
>
> include/asm/atomic.h:64: error: inconsistent operand constraints in an asm'
>
> where the argument happens to be a member of a global struct.
Doh! Do you have any testcases handy? Was it failing with >= 3.4,
or with older toolchains? 3.4 and above should know that 'R' is a
memory-type constraint.
Richard
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-07 19:28 ` Richard Sandiford
@ 2004-12-08 0:09 ` Thiemo Seufer
0 siblings, 0 replies; 21+ messages in thread
From: Thiemo Seufer @ 2004-12-08 0:09 UTC (permalink / raw)
To: Richard Sandiford
Cc: Maciej W. Rozycki, Ralf Baechle, Dominic Sweetman, linux-mips,
Nigel Stephens, David Ung
Richard Sandiford wrote:
> Thiemo Seufer <ica2_ts@csv.ica.uni-stuttgart.de> writes:
> > I tried to use "R" in atomic.h but this failed in some (but not all)
> > cases with
> >
> > include/asm/atomic.h:64: error: inconsistent operand constraints in an asm'
> >
> > where the argument happens to be a member of a global struct.
>
> Doh! Do you have any testcases handy?
So far only the whole kernel, which isn't exactly handy.
> Was it failing with >= 3.4,
> or with older toolchains? 3.4 and above should know that 'R' is a
> memory-type constraint.
I saw the same failure for 3.3 and 3.4, I haven't tried 4.0.
Thiemo
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-01 10:20 ` Dominic Sweetman
(?)
(?)
@ 2004-12-01 20:45 ` Thiemo Seufer
2004-12-01 21:53 ` Maciej W. Rozycki
-1 siblings, 1 reply; 21+ messages in thread
From: Thiemo Seufer @ 2004-12-01 20:45 UTC (permalink / raw)
To: Dominic Sweetman; +Cc: linux-mips, ralf, Nigel Stephens, David Ung
Dominic Sweetman wrote:
>
> Thiemo writes:
>
> > the atomic functions use so far memory references for the inline
> > assembler to access the semaphore. This can lead to additional
> > instructions in the ll/sc loop, because newer compilers don't
> > expand the memory reference any more but leave it to the assembler.
> >
> > The appended patch...
>
> I thought it was an explicit aim of the substantial rewrite of the
> MIPS backend for 3.x to get the compiler to generate only "real"
> instructions, not macros which expand to multiple instructions inside
> the assembler. So it's disappointing if newer compilers got worse.
The compiler was improved with PIC code in mind. The kernel is
non-PIC, and can't allow explicit relocs by the compiler because
of the weird code model used for 64bit kernels. This led to some
degradation and even subtle failures for inline assembly code which
relies on assumptions about earlier compiler's behaviour.
Thiemo
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-01 20:45 ` Thiemo Seufer
@ 2004-12-01 21:53 ` Maciej W. Rozycki
2004-12-01 23:03 ` Thiemo Seufer
0 siblings, 1 reply; 21+ messages in thread
From: Maciej W. Rozycki @ 2004-12-01 21:53 UTC (permalink / raw)
To: Thiemo Seufer
Cc: Dominic Sweetman, linux-mips, ralf, Nigel Stephens, David Ung
On Wed, 1 Dec 2004, Thiemo Seufer wrote:
> The compiler was improved with PIC code in mind. The kernel is
> non-PIC, and can't allow explicit relocs by the compiler because
> of the weird code model used for 64bit kernels. This led to some
> degradation and even subtle failures for inline assembly code which
> relies on assumptions about earlier compiler's behaviour.
What do you mean by "the weird code model" and what failures have you
observed? I think the bits are worth being done correctly, so I'd like
to know what problems to address.
Maciej
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-01 21:53 ` Maciej W. Rozycki
@ 2004-12-01 23:03 ` Thiemo Seufer
2004-12-02 0:17 ` Maciej W. Rozycki
2004-12-02 8:01 ` Dominic Sweetman
0 siblings, 2 replies; 21+ messages in thread
From: Thiemo Seufer @ 2004-12-01 23:03 UTC (permalink / raw)
To: Maciej W. Rozycki
Cc: Dominic Sweetman, linux-mips, ralf, Nigel Stephens, David Ung
Maciej W. Rozycki wrote:
> On Wed, 1 Dec 2004, Thiemo Seufer wrote:
>
> > The compiler was improved with PIC code in mind. The kernel is
> > non-PIC, and can't allow explicit relocs by the compiler because
> > of the weird code model used for 64bit kernels. This led to some
> > degradation and even subtle failures for inline assembly code which
> > relies on assumptions about earlier compiler's behaviour.
>
> What do you mean by "the weird code model" and what failures have you
> observed? I think the bits are worth being done correctly, so I'd like
> to know what problems to address.
I had guessed you already know what i mean. :-)
Current 64bit MIPS kernels run in (C)KSEG0, and exploit sign-extension
to optimize symbol loads (2 instead of 6/7 instructions, the same as in
32bit kernels). This optimization relies on an assembler macro
expansion mode which was hacked in gas for exactly this purpose. Gcc
currently doesn't have something similiar, and would try to do a regular
64bit load with explicit relocs.
I discussed this with Richard Sandiford a while ago, and the conclusion
was to implement an explicit --msym32 option for both gcc and gas to
improve register scheduling and get rid of the gas hack. So far, nobody
came around to actually do the work for it.
For the "subtle failures" part, we had some gas failures to handle dla
because of the changed arguments. For userland (PIC) code, I've also
seen additional load/store insn creeping in ll/sc loops. I believe
there's a large amount of inline assembly code (not necessarily in the
kernel) which relies on similiar assumptions.
Thiemo
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-01 23:03 ` Thiemo Seufer
@ 2004-12-02 0:17 ` Maciej W. Rozycki
2004-12-02 1:02 ` Thiemo Seufer
2004-12-02 8:01 ` Dominic Sweetman
1 sibling, 1 reply; 21+ messages in thread
From: Maciej W. Rozycki @ 2004-12-02 0:17 UTC (permalink / raw)
To: Thiemo Seufer
Cc: Dominic Sweetman, linux-mips, ralf, Nigel Stephens, David Ung
On Thu, 2 Dec 2004, Thiemo Seufer wrote:
> > What do you mean by "the weird code model" and what failures have you
> > observed? I think the bits are worth being done correctly, so I'd like
> > to know what problems to address.
>
> I had guessed you already know what i mean. :-)
>
> Current 64bit MIPS kernels run in (C)KSEG0, and exploit sign-extension
> to optimize symbol loads (2 instead of 6/7 instructions, the same as in
> 32bit kernels). This optimization relies on an assembler macro
> expansion mode which was hacked in gas for exactly this purpose. Gcc
> currently doesn't have something similiar, and would try to do a regular
> 64bit load with explicit relocs.
Ah *that*. Well, sorry -- I tend to forget about the hack as I've never
used it. I think a valid solution is either to use CONFIG_BUILD_ELF64
(now that it is there) or to modify tools to implement it correctly...
> I discussed this with Richard Sandiford a while ago, and the conclusion
> was to implement an explicit --msym32 option for both gcc and gas to
> improve register scheduling and get rid of the gas hack. So far, nobody
> came around to actually do the work for it.
... like this, for example. But if nobody has implemented it yet, then
perhaps nobody is really interested in it? ;-)
> For the "subtle failures" part, we had some gas failures to handle dla
> because of the changed arguments. For userland (PIC) code, I've also
I recall this -- I've thought the more or less agreed consensus was to
forbid or at least strongly discourage "dla" and "la" macros expanded
within inline asms and referring to an address operand to be provided by
GCC based on a constraint.
Otherwise there is still my patch to gas available as an alternative. ;-)
> seen additional load/store insn creeping in ll/sc loops. I believe
> there's a large amount of inline assembly code (not necessarily in the
> kernel) which relies on similiar assumptions.
With explicit relocs you have no problem with any instructions appearing
inside inline asms unexpectedly. That is if you use the "R" constraint --
the "m" one never guaranteed that.
Maciej
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-02 0:17 ` Maciej W. Rozycki
@ 2004-12-02 1:02 ` Thiemo Seufer
0 siblings, 0 replies; 21+ messages in thread
From: Thiemo Seufer @ 2004-12-02 1:02 UTC (permalink / raw)
To: Maciej W. Rozycki
Cc: Dominic Sweetman, linux-mips, ralf, Nigel Stephens, David Ung
Maciej W. Rozycki wrote:
[snip]
> > I discussed this with Richard Sandiford a while ago, and the conclusion
> > was to implement an explicit --msym32 option for both gcc and gas to
> > improve register scheduling and get rid of the gas hack. So far, nobody
> > came around to actually do the work for it.
>
> ... like this, for example. But if nobody has implemented it yet, then
> perhaps nobody is really interested in it? ;-)
The old solution works, and kernel developers tend to use old toolchains.
> > seen additional load/store insn creeping in ll/sc loops. I believe
> > there's a large amount of inline assembly code (not necessarily in the
> > kernel) which relies on similiar assumptions.
>
> With explicit relocs you have no problem with any instructions appearing
> inside inline asms unexpectedly. That is if you use the "R" constraint --
> the "m" one never guaranteed that.
But it happened to work, and is in widespread use.
Thiemo
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-01 23:03 ` Thiemo Seufer
2004-12-02 0:17 ` Maciej W. Rozycki
@ 2004-12-02 8:01 ` Dominic Sweetman
2004-12-02 8:38 ` Thiemo Seufer
1 sibling, 1 reply; 21+ messages in thread
From: Dominic Sweetman @ 2004-12-02 8:01 UTC (permalink / raw)
To: Thiemo Seufer
Cc: Maciej W. Rozycki, Dominic Sweetman, linux-mips, ralf,
Nigel Stephens, David Ung
Thiemo,
> I had guessed you already know what i mean. :-)
Generally a bad guess, of course...
> Current 64bit MIPS kernels run in (C)KSEG0, and exploit sign-extension
> to optimize symbol loads (2 instead of 6/7 instructions, the same as in
> 32bit kernels).
Gross... yet ingenious. I see the problem. You want to use a
32-bit-pointer "la" for the addresses of static variables in the
kernel build (which works, because the kernel is in the
32-bit-pointer-accessible 'kseg0').
The assembler macro was a very Linux solution, if you don't mind my
saying so... A more native compiler fix would probably be a good
idea.
What ABI are you using for the 64-bit kernel build (n64? how do you
get it to build non-PIC?) And what constraints are there on
your choice of gcc version? - it would be easier if 3.4 was OK.
--
Dominic Sweetman
MIPS Technologies
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-02 8:01 ` Dominic Sweetman
@ 2004-12-02 8:38 ` Thiemo Seufer
2004-12-02 11:31 ` Stephen P. Becker
0 siblings, 1 reply; 21+ messages in thread
From: Thiemo Seufer @ 2004-12-02 8:38 UTC (permalink / raw)
To: Dominic Sweetman
Cc: Maciej W. Rozycki, linux-mips, ralf, Nigel Stephens, David Ung
Dominic Sweetman wrote:
>
> Thiemo,
>
> > I had guessed you already know what i mean. :-)
>
> Generally a bad guess, of course...
>
> > Current 64bit MIPS kernels run in (C)KSEG0, and exploit sign-extension
> > to optimize symbol loads (2 instead of 6/7 instructions, the same as in
> > 32bit kernels).
>
> Gross... yet ingenious. I see the problem. You want to use a
> 32-bit-pointer "la" for the addresses of static variables in the
> kernel build (which works, because the kernel is in the
> 32-bit-pointer-accessible 'kseg0').
Except that the compiler emits a "dla", but the assembler expands
it like "la". :-)
> The assembler macro was a very Linux solution, if you don't mind my
> saying so... A more native compiler fix would probably be a good
> idea.
>
> What ABI are you using for the 64-bit kernel build (n64? how do you
> get it to build non-PIC?)
The ABIs are generally only defined for PIC code, the 64bit kernel uses
the n64 non-PIC alike. Building as non-PIC is simple, via
-mabi=64 -fno-PIC -mno-abicalls
> And what constraints are there on
> your choice of gcc version? - it would be easier if 3.4 was OK.
3.2/3.3 are known to work. 3.4 fails for yet unknown reason, I guess
either due to inline assembler changes or more agressive dead code
elimination.
Thiemo
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-02 8:38 ` Thiemo Seufer
@ 2004-12-02 11:31 ` Stephen P. Becker
2004-12-02 11:54 ` Maciej W. Rozycki
0 siblings, 1 reply; 21+ messages in thread
From: Stephen P. Becker @ 2004-12-02 11:31 UTC (permalink / raw)
To: Thiemo Seufer
Cc: Dominic Sweetman, Maciej W. Rozycki, linux-mips, ralf,
Nigel Stephens, David Ung
>>And what constraints are there on
>>your choice of gcc version? - it would be easier if 3.4 was OK.
>
>
> 3.2/3.3 are known to work. 3.4 fails for yet unknown reason, I guess
> either due to inline assembler changes or more agressive dead code
> elimination.
>
>
> Thiemo
For what it's worth, I'm running 64-bit kernels on my O2 and Indy that
were compiled with gcc 3.4.3, and I have had no problems.
Steve
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-02 11:31 ` Stephen P. Becker
@ 2004-12-02 11:54 ` Maciej W. Rozycki
2004-12-02 11:57 ` Stephen P. Becker
0 siblings, 1 reply; 21+ messages in thread
From: Maciej W. Rozycki @ 2004-12-02 11:54 UTC (permalink / raw)
To: Stephen P. Becker
Cc: Thiemo Seufer, Dominic Sweetman, linux-mips, ralf, Nigel Stephens,
David Ung
On Thu, 2 Dec 2004, Stephen P. Becker wrote:
> For what it's worth, I'm running 64-bit kernels on my O2 and Indy that
> were compiled with gcc 3.4.3, and I have had no problems.
Have you used CONFIG_BUILD_ELF64 or not? The former obviously works for
me; it's the latter that may cause troubles.
Maciej
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Improve atomic.h implementation robustness
2004-12-02 11:54 ` Maciej W. Rozycki
@ 2004-12-02 11:57 ` Stephen P. Becker
0 siblings, 0 replies; 21+ messages in thread
From: Stephen P. Becker @ 2004-12-02 11:57 UTC (permalink / raw)
To: Maciej W. Rozycki
Cc: Thiemo Seufer, Dominic Sweetman, linux-mips, ralf, Nigel Stephens,
David Ung
Maciej W. Rozycki wrote:
> On Thu, 2 Dec 2004, Stephen P. Becker wrote:
>
>
>>For what it's worth, I'm running 64-bit kernels on my O2 and Indy that
>>were compiled with gcc 3.4.3, and I have had no problems.
>
>
> Have you used CONFIG_BUILD_ELF64 or not? The former obviously works for
> me; it's the latter that may cause troubles.
>
> Maciej
I have used CONFIG_BUILD_ELF64 for an O2 kernel that booted, but
typically I just use the -mabi=o64 hack.
Steve
^ permalink raw reply [flat|nested] 21+ messages in thread
end of thread, other threads:[~2004-12-08 0:10 UTC | newest]
Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-12-01 7:00 [PATCH] Improve atomic.h implementation robustness Thiemo Seufer
2004-12-01 10:20 ` Dominic Sweetman
2004-12-01 10:20 ` Dominic Sweetman
2004-12-01 12:33 ` Ralf Baechle
2004-12-01 21:50 ` Maciej W. Rozycki
2004-12-01 23:39 ` Ralf Baechle
2004-12-02 0:01 ` Maciej W. Rozycki
2004-12-07 11:56 ` Richard Sandiford
2004-12-07 12:56 ` Thiemo Seufer
2004-12-07 19:28 ` Richard Sandiford
2004-12-08 0:09 ` Thiemo Seufer
2004-12-01 20:45 ` Thiemo Seufer
2004-12-01 21:53 ` Maciej W. Rozycki
2004-12-01 23:03 ` Thiemo Seufer
2004-12-02 0:17 ` Maciej W. Rozycki
2004-12-02 1:02 ` Thiemo Seufer
2004-12-02 8:01 ` Dominic Sweetman
2004-12-02 8:38 ` Thiemo Seufer
2004-12-02 11:31 ` Stephen P. Becker
2004-12-02 11:54 ` Maciej W. Rozycki
2004-12-02 11:57 ` Stephen P. Becker
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.