* [PATCH 2/2] ix86: atomic64 assembly improvements
@ 2012-01-18 14:24 Jan Beulich
2012-01-18 16:36 ` H. Peter Anvin
0 siblings, 1 reply; 10+ messages in thread
From: Jan Beulich @ 2012-01-18 14:24 UTC (permalink / raw)
To: mingo, tglx, hpa; +Cc: eric.dumazet@gmail.com, luca, linux-kernel
The cmpxchg8b variants of "set" and "xchg" are really identical, and
hence don't need to be repeated: %ebx and %ecx don't need to be copied
into %eax and %edx respectively (this is only necessary when desiring
to only read the stored value), and the LOCK prefix should also be used
in "set" (other than the comment that is now being removed was saying,
there is - to my knowledge - no *architectural* guarantee that aligned
64-bit writes would always be carried out atomically).
In the "add_unless" implementation, swapping the use of %ecx and %esi
for passing arguments allows %esi to become an input only (i.e.
permitting the register to be re-used to address the same object
without reload).
In "{add,sub}_return", doing the initial read64 through the passed in
%ecx decreases a register dependency.
In "inc_not_zero", a branch can be eliminated by or-ing together the
two halves of the current (64-bit) value, and code size can be further
reduced by adjusting the arithmetic slightly.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Luca Barbieri <luca@luca-barbieri.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
---
arch/x86/include/asm/atomic64_32.h | 13 +++++------
arch/x86/lib/atomic64_386_32.S | 6 ++---
arch/x86/lib/atomic64_cx8_32.S | 42 +++++++++----------------------------
3 files changed, 20 insertions(+), 41 deletions(-)
--- tip-i386-atomic64.orig/arch/x86/include/asm/atomic64_32.h
+++ tip-i386-atomic64/arch/x86/include/asm/atomic64_32.h
@@ -36,6 +36,7 @@ typedef struct {
#define ATOMIC64_EXPORT(sym) __ATOMIC64_EXPORT(sym##_cx8); \
__ATOMIC64_EXPORT(sym##_386)
+__ATOMIC64_EXPORT(set_386);
__ATOMIC64_EXPORT(add_386);
__ATOMIC64_EXPORT(sub_386);
__ATOMIC64_EXPORT(inc_386);
@@ -46,7 +47,6 @@ __ATOMIC64_EXPORT(dec_386);
__alternative_atomic64(f, f, ASM_OUTPUT2(out), ## in)
ATOMIC64_EXPORT(read);
-ATOMIC64_EXPORT(set);
ATOMIC64_EXPORT(xchg);
ATOMIC64_EXPORT(add_return);
ATOMIC64_EXPORT(sub_return);
@@ -104,9 +104,9 @@ static inline void atomic64_set(atomic64
{
unsigned high = (unsigned)(i >> 32);
unsigned low = (unsigned)i;
- alternative_atomic64(set, /* no output */,
- "S" (v), "b" (low), "c" (high)
- : "eax", "edx", "memory");
+ __alternative_atomic64(set, xchg, /* no output */,
+ "S" (v), "b" (low), "c" (high)
+ : "eax", "edx", "memory");
}
/**
@@ -286,9 +286,8 @@ static inline int atomic64_add_unless(at
unsigned low = (unsigned)u;
unsigned high = (unsigned)(u >> 32);
alternative_atomic64(add_unless,
- ASM_OUTPUT2("+A" (a), "+c" (v),
- "+S" (low), "+D" (high)),
- ASM_NO_INPUT_CLOBBER("memory"));
+ ASM_OUTPUT2("+A" (a), "+c" (low), "+D" (high)),
+ "S" (v) : "memory");
return (int)a;
}
--- tip-i386-atomic64.orig/arch/x86/lib/atomic64_386_32.S
+++ tip-i386-atomic64/arch/x86/lib/atomic64_386_32.S
@@ -137,13 +137,13 @@ BEGIN(dec_return)
RET_ENDP
#undef v
-#define v %ecx
+#define v %esi
BEGIN(add_unless)
- addl %eax, %esi
+ addl %eax, %ecx
adcl %edx, %edi
addl (v), %eax
adcl 4(v), %edx
- cmpl %eax, %esi
+ cmpl %eax, %ecx
je 3f
1:
movl %eax, (v)
--- tip-i386-atomic64.orig/arch/x86/lib/atomic64_cx8_32.S
+++ tip-i386-atomic64/arch/x86/lib/atomic64_cx8_32.S
@@ -39,24 +39,9 @@ ENTRY(atomic64_read_cx8)
CFI_ENDPROC
ENDPROC(atomic64_read_cx8)
-ENTRY(atomic64_set_cx8)
- CFI_STARTPROC
-
-1:
-/* we don't need LOCK_PREFIX since aligned 64-bit writes
- * are atomic on 586 and newer */
- cmpxchg8b (%esi)
- jne 1b
-
- ret
- CFI_ENDPROC
-ENDPROC(atomic64_set_cx8)
-
ENTRY(atomic64_xchg_cx8)
CFI_STARTPROC
- movl %ebx, %eax
- movl %ecx, %edx
1:
LOCK_PREFIX
cmpxchg8b (%esi)
@@ -78,7 +63,7 @@ ENTRY(atomic64_\func\()_return_cx8)
movl %edx, %edi
movl %ecx, %ebp
- read64 %ebp
+ read64 %ecx
1:
movl %eax, %ebx
movl %edx, %ecx
@@ -159,23 +144,22 @@ ENTRY(atomic64_add_unless_cx8)
SAVE ebx
/* these just push these two parameters on the stack */
SAVE edi
- SAVE esi
+ SAVE ecx
- movl %ecx, %ebp
- movl %eax, %esi
+ movl %eax, %ebp
movl %edx, %edi
- read64 %ebp
+ read64 %esi
1:
cmpl %eax, 0(%esp)
je 4f
2:
movl %eax, %ebx
movl %edx, %ecx
- addl %esi, %ebx
+ addl %ebp, %ebx
adcl %edi, %ecx
LOCK_PREFIX
- cmpxchg8b (%ebp)
+ cmpxchg8b (%esi)
jne 1b
movl $1, %eax
@@ -199,13 +183,13 @@ ENTRY(atomic64_inc_not_zero_cx8)
read64 %esi
1:
- testl %eax, %eax
- je 4f
-2:
+ movl %eax, %ecx
+ orl %edx, %ecx
+ jz 3f
movl %eax, %ebx
- movl %edx, %ecx
+ xorl %ecx, %ecx
addl $1, %ebx
- adcl $0, %ecx
+ adcl %edx, %ecx
LOCK_PREFIX
cmpxchg8b (%esi)
jne 1b
@@ -214,9 +198,5 @@ ENTRY(atomic64_inc_not_zero_cx8)
3:
RESTORE ebx
ret
-4:
- testl %edx, %edx
- jne 2b
- jmp 3b
CFI_ENDPROC
ENDPROC(atomic64_inc_not_zero_cx8)
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
2012-01-18 14:24 [PATCH 2/2] ix86: atomic64 assembly improvements Jan Beulich
@ 2012-01-18 16:36 ` H. Peter Anvin
2012-01-18 16:50 ` Jan Beulich
0 siblings, 1 reply; 10+ messages in thread
From: H. Peter Anvin @ 2012-01-18 16:36 UTC (permalink / raw)
To: Jan Beulich; +Cc: mingo, tglx, eric.dumazet@gmail.com, luca, linux-kernel
On 01/18/2012 06:24 AM, Jan Beulich wrote:
> The cmpxchg8b variants of "set" and "xchg" are really identical, and
> hence don't need to be repeated: %ebx and %ecx don't need to be copied
> into %eax and %edx respectively (this is only necessary when desiring
> to only read the stored value), and the LOCK prefix should also be used
> in "set" (other than the comment that is now being removed was saying,
> there is - to my knowledge - no *architectural* guarantee that aligned
> 64-bit writes would always be carried out atomically).
EWHAT?
It's atomic in the same way a MOV is atomic.
The CPU could, in fact, execute the locked version at all if the
unlocked version didn't behave like that.
Unless you have a specific instance where you think this might be
violated, please let me know.
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
2012-01-18 16:36 ` H. Peter Anvin
@ 2012-01-18 16:50 ` Jan Beulich
2012-01-18 17:47 ` H. Peter Anvin
0 siblings, 1 reply; 10+ messages in thread
From: Jan Beulich @ 2012-01-18 16:50 UTC (permalink / raw)
To: H. Peter Anvin; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel
>>> On 18.01.12 at 17:36, "H. Peter Anvin" <hpa@zytor.com> wrote:
> On 01/18/2012 06:24 AM, Jan Beulich wrote:
>> The cmpxchg8b variants of "set" and "xchg" are really identical, and
>> hence don't need to be repeated: %ebx and %ecx don't need to be copied
>> into %eax and %edx respectively (this is only necessary when desiring
>> to only read the stored value), and the LOCK prefix should also be used
>> in "set" (other than the comment that is now being removed was saying,
>> there is - to my knowledge - no *architectural* guarantee that aligned
>> 64-bit writes would always be carried out atomically).
>
> EWHAT?
>
> It's atomic in the same way a MOV is atomic.
Then please point me to where this is documented.
As I understand it, there is nothing keeping the CPU (or something
down the bus) from executing the unlocked version as two 32-bit
reads followed by two 32-bit writes.
> The CPU could, in fact, execute the locked version at all if the
> unlocked version didn't behave like that.
Assuming you meant "could not", that's not true: As long as the
external world has a way to know that both items are locked (think
of the old bus lock mechanism when there were no caches yet),
it can very well do so.
I do not question that in practice all CPUs behave as described,
but without an architectural guarantee (and with the code in
question not being used in hot paths afaik) I see no reason why
it should depend on undefined behavior.
Jan
> Unless you have a specific instance where you think this might be
> violated, please let me know.
>
> -hpa
>
> --
> H. Peter Anvin, Intel Open Source Technology Center
> I work for Intel. I don't speak on their behalf.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
2012-01-18 16:50 ` Jan Beulich
@ 2012-01-18 17:47 ` H. Peter Anvin
2012-01-19 9:18 ` Jan Beulich
0 siblings, 1 reply; 10+ messages in thread
From: H. Peter Anvin @ 2012-01-18 17:47 UTC (permalink / raw)
To: Jan Beulich; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel
On 01/18/2012 08:50 AM, Jan Beulich wrote:
>>
>> It's atomic in the same way a MOV is atomic.
>
> Then please point me to where this is documented.
>
> As I understand it, there is nothing keeping the CPU (or something
> down the bus) from executing the unlocked version as two 32-bit
> reads followed by two 32-bit writes.
>
>> The CPU could, in fact, execute the locked version at all if the
>> unlocked version didn't behave like that.
>
> Assuming you meant "could not", that's not true: As long as the
> external world has a way to know that both items are locked (think
> of the old bus lock mechanism when there were no caches yet),
> it can very well do so.
>
> I do not question that in practice all CPUs behave as described,
> but without an architectural guarantee (and with the code in
> question not being used in hot paths afaik) I see no reason why
> it should depend on undefined behavior.
>
Sorry, Jan, if you want to play the documentation lawyer game then there
is very little that will get done. The code is increasingly being used
in warm/hot paths and that's actually fair, so I'd like to avoid
crapping it up.
There are, to my knowledge, only three companies which have brought
SMP-capable x86 processors to market: Intel, AMD, and VIA and the above
holds for them. A new vendor realistically can't bring a new processor
to market which violates a constraint that the existing processor
vendors have preserved.
It is somewhat implied in the SDM section 8.1.1 of volume 3A, but as
with many other things it's not written out specifically... I suspect
largely because the question hasn't been raised before.
-hpa
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
2012-01-18 17:47 ` H. Peter Anvin
@ 2012-01-19 9:18 ` Jan Beulich
2012-01-19 14:44 ` H. Peter Anvin
0 siblings, 1 reply; 10+ messages in thread
From: Jan Beulich @ 2012-01-19 9:18 UTC (permalink / raw)
To: H. Peter Anvin; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel
>>> On 18.01.12 at 18:47, "H. Peter Anvin" <hpa@zytor.com> wrote:
> On 01/18/2012 08:50 AM, Jan Beulich wrote:
>>>
>>> It's atomic in the same way a MOV is atomic.
>>
>> Then please point me to where this is documented.
>>
>> As I understand it, there is nothing keeping the CPU (or something
>> down the bus) from executing the unlocked version as two 32-bit
>> reads followed by two 32-bit writes.
>>
>>> The CPU could, in fact, execute the locked version at all if the
>>> unlocked version didn't behave like that.
>>
>> Assuming you meant "could not", that's not true: As long as the
>> external world has a way to know that both items are locked (think
>> of the old bus lock mechanism when there were no caches yet),
>> it can very well do so.
>>
>> I do not question that in practice all CPUs behave as described,
>> but without an architectural guarantee (and with the code in
>> question not being used in hot paths afaik) I see no reason why
>> it should depend on undefined behavior.
>>
>
> Sorry, Jan, if you want to play the documentation lawyer game then there
> is very little that will get done. The code is increasingly being used
> in warm/hot paths and that's actually fair, so I'd like to avoid
> crapping it up.
>
> There are, to my knowledge, only three companies which have brought
> SMP-capable x86 processors to market: Intel, AMD, and VIA and the above
> holds for them. A new vendor realistically can't bring a new processor
> to market which violates a constraint that the existing processor
> vendors have preserved.
>
> It is somewhat implied in the SDM section 8.1.1 of volume 3A, but as
> with many other things it's not written out specifically... I suspect
> largely because the question hasn't been raised before.
But the code is supposed to be correct even when caches are disabled
(in which case LOCK# will continue to be used even on modern CPUs),
and this case clearly isn't covered by the current implementation. It
may be a good idea to adjust the patch description accordingly, but I
see no reason to change the patch itself.
Jan
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
2012-01-19 9:18 ` Jan Beulich
@ 2012-01-19 14:44 ` H. Peter Anvin
2012-01-19 14:50 ` Jan Beulich
0 siblings, 1 reply; 10+ messages in thread
From: H. Peter Anvin @ 2012-01-19 14:44 UTC (permalink / raw)
To: Jan Beulich; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel
On 01/19/2012 01:18 AM, Jan Beulich wrote:
>
> But the code is supposed to be correct even when caches are disabled
> (in which case LOCK# will continue to be used even on modern CPUs),
> and this case clearly isn't covered by the current implementation. It
> may be a good idea to adjust the patch description accordingly, but I
> see no reason to change the patch itself.
>
It doesn't have anything to do with caches on or off.
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
2012-01-19 14:44 ` H. Peter Anvin
@ 2012-01-19 14:50 ` Jan Beulich
2012-01-19 14:55 ` H. Peter Anvin
2012-01-19 14:59 ` H. Peter Anvin
0 siblings, 2 replies; 10+ messages in thread
From: Jan Beulich @ 2012-01-19 14:50 UTC (permalink / raw)
To: H. Peter Anvin; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel
>>> On 19.01.12 at 15:44, "H. Peter Anvin" <hpa@zytor.com> wrote:
> On 01/19/2012 01:18 AM, Jan Beulich wrote:
>>
>> But the code is supposed to be correct even when caches are disabled
>> (in which case LOCK# will continue to be used even on modern CPUs),
>> and this case clearly isn't covered by the current implementation. It
>> may be a good idea to adjust the patch description accordingly, but I
>> see no reason to change the patch itself.
>>
>
> It doesn't have anything to do with caches on or off.
How does it not? If any part of the bus topology is only 32 bits wide,
a 64-bit read or write simply can't be executed atomically without
asserting LOCK#.
Jan
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
2012-01-19 14:50 ` Jan Beulich
@ 2012-01-19 14:55 ` H. Peter Anvin
2012-01-19 14:59 ` H. Peter Anvin
1 sibling, 0 replies; 10+ messages in thread
From: H. Peter Anvin @ 2012-01-19 14:55 UTC (permalink / raw)
To: Jan Beulich; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel
On 01/19/2012 06:50 AM, Jan Beulich wrote:
>>
>> It doesn't have anything to do with caches on or off.
>
> How does it not? If any part of the bus topology is only 32 bits wide,
> a 64-bit read or write simply can't be executed atomically without
> asserting LOCK#.
>
If any part of the bus topology is only 32 bits wide, you're not talking
to memory on anything newer than a 486.
LOCK to I/O devices is unreliable on any machine (a lot of northbridges
simply drop LOCK on the floor).
-hpa
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
2012-01-19 14:50 ` Jan Beulich
2012-01-19 14:55 ` H. Peter Anvin
@ 2012-01-19 14:59 ` H. Peter Anvin
2012-01-19 15:11 ` Jan Beulich
1 sibling, 1 reply; 10+ messages in thread
From: H. Peter Anvin @ 2012-01-19 14:59 UTC (permalink / raw)
To: Jan Beulich; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel
On 01/19/2012 06:50 AM, Jan Beulich wrote:
>>
>> It doesn't have anything to do with caches on or off.
>
> How does it not? If any part of the bus topology is only 32 bits wide,
> a 64-bit read or write simply can't be executed atomically without
> asserting LOCK#.
>
Furthermore, if you're going down that rathole then we can't even trust
MOV (nor, for that matter, can you trust LOCK in a lot of
circumstances.) In short, you need to be extremely careful about what
you do to uncached memory *at all* (and you need to know exactly what is
behind this bus) but pessimizing these kinds of construct for that
reason is wrong in the extreme.
-hpa
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
2012-01-19 14:59 ` H. Peter Anvin
@ 2012-01-19 15:11 ` Jan Beulich
0 siblings, 0 replies; 10+ messages in thread
From: Jan Beulich @ 2012-01-19 15:11 UTC (permalink / raw)
To: H. Peter Anvin; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel
>>> On 19.01.12 at 15:59, "H. Peter Anvin" <hpa@zytor.com> wrote:
> On 01/19/2012 06:50 AM, Jan Beulich wrote:
>>>
>>> It doesn't have anything to do with caches on or off.
>>
>> How does it not? If any part of the bus topology is only 32 bits wide,
>> a 64-bit read or write simply can't be executed atomically without
>> asserting LOCK#.
>>
>
> Furthermore, if you're going down that rathole then we can't even trust
> MOV (nor, for that matter, can you trust LOCK in a lot of
No - (general register) MOV is at most 32 bits wide.
> circumstances.) In short, you need to be extremely careful about what
> you do to uncached memory *at all* (and you need to know exactly what is
> behind this bus) but pessimizing these kinds of construct for that
> reason is wrong in the extreme.
Okay, despite not being fully convinced I'll undo the folding of set and
xchg then, and re-submit.
Jan
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2012-01-19 15:10 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-01-18 14:24 [PATCH 2/2] ix86: atomic64 assembly improvements Jan Beulich
2012-01-18 16:36 ` H. Peter Anvin
2012-01-18 16:50 ` Jan Beulich
2012-01-18 17:47 ` H. Peter Anvin
2012-01-19 9:18 ` Jan Beulich
2012-01-19 14:44 ` H. Peter Anvin
2012-01-19 14:50 ` Jan Beulich
2012-01-19 14:55 ` H. Peter Anvin
2012-01-19 14:59 ` H. Peter Anvin
2012-01-19 15:11 ` Jan Beulich
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox