public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/2] ix86: atomic64 assembly improvements
@ 2012-01-18 14:24 Jan Beulich
  2012-01-18 16:36 ` H. Peter Anvin
  0 siblings, 1 reply; 10+ messages in thread
From: Jan Beulich @ 2012-01-18 14:24 UTC (permalink / raw)
  To: mingo, tglx, hpa; +Cc: eric.dumazet@gmail.com, luca, linux-kernel

The cmpxchg8b variants of "set" and "xchg" are really identical, and
hence don't need to be repeated: %ebx and %ecx don't need to be copied
into %eax and %edx respectively (this is only necessary when desiring
to only read the stored value), and the LOCK prefix should also be used
in "set" (other than the comment that is now being removed was saying,
there is - to my knowledge - no *architectural* guarantee that aligned
64-bit writes would always be carried out atomically).

In the "add_unless" implementation, swapping the use of %ecx and %esi
for passing arguments allows %esi to become an input only (i.e.
permitting the register to be re-used to address the same object
without reload).

In "{add,sub}_return", doing the initial read64 through the passed in
%ecx decreases a register dependency.

In "inc_not_zero", a branch can be eliminated by or-ing together the
two halves of the current (64-bit) value, and code size can be further
reduced by adjusting the arithmetic slightly.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Luca Barbieri <luca@luca-barbieri.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>

---
 arch/x86/include/asm/atomic64_32.h |   13 +++++------
 arch/x86/lib/atomic64_386_32.S     |    6 ++---
 arch/x86/lib/atomic64_cx8_32.S     |   42 +++++++++----------------------------
 3 files changed, 20 insertions(+), 41 deletions(-)

--- tip-i386-atomic64.orig/arch/x86/include/asm/atomic64_32.h
+++ tip-i386-atomic64/arch/x86/include/asm/atomic64_32.h
@@ -36,6 +36,7 @@ typedef struct {
 #define ATOMIC64_EXPORT(sym) __ATOMIC64_EXPORT(sym##_cx8); \
 		__ATOMIC64_EXPORT(sym##_386)
 
+__ATOMIC64_EXPORT(set_386);
 __ATOMIC64_EXPORT(add_386);
 __ATOMIC64_EXPORT(sub_386);
 __ATOMIC64_EXPORT(inc_386);
@@ -46,7 +47,6 @@ __ATOMIC64_EXPORT(dec_386);
 	__alternative_atomic64(f, f, ASM_OUTPUT2(out), ## in)
 
 ATOMIC64_EXPORT(read);
-ATOMIC64_EXPORT(set);
 ATOMIC64_EXPORT(xchg);
 ATOMIC64_EXPORT(add_return);
 ATOMIC64_EXPORT(sub_return);
@@ -104,9 +104,9 @@ static inline void atomic64_set(atomic64
 {
 	unsigned high = (unsigned)(i >> 32);
 	unsigned low = (unsigned)i;
-	alternative_atomic64(set, /* no output */,
-			     "S" (v), "b" (low), "c" (high)
-			     : "eax", "edx", "memory");
+	__alternative_atomic64(set, xchg, /* no output */,
+			       "S" (v), "b" (low), "c" (high)
+			       : "eax", "edx", "memory");
 }
 
 /**
@@ -286,9 +286,8 @@ static inline int atomic64_add_unless(at
 	unsigned low = (unsigned)u;
 	unsigned high = (unsigned)(u >> 32);
 	alternative_atomic64(add_unless,
-			     ASM_OUTPUT2("+A" (a), "+c" (v),
-					 "+S" (low), "+D" (high)),
-			     ASM_NO_INPUT_CLOBBER("memory"));
+			     ASM_OUTPUT2("+A" (a), "+c" (low), "+D" (high)),
+			     "S" (v) : "memory");
 	return (int)a;
 }
 
--- tip-i386-atomic64.orig/arch/x86/lib/atomic64_386_32.S
+++ tip-i386-atomic64/arch/x86/lib/atomic64_386_32.S
@@ -137,13 +137,13 @@ BEGIN(dec_return)
 RET_ENDP
 #undef v
 
-#define v %ecx
+#define v %esi
 BEGIN(add_unless)
-	addl %eax, %esi
+	addl %eax, %ecx
 	adcl %edx, %edi
 	addl  (v), %eax
 	adcl 4(v), %edx
-	cmpl %eax, %esi
+	cmpl %eax, %ecx
 	je 3f
 1:
 	movl %eax,  (v)
--- tip-i386-atomic64.orig/arch/x86/lib/atomic64_cx8_32.S
+++ tip-i386-atomic64/arch/x86/lib/atomic64_cx8_32.S
@@ -39,24 +39,9 @@ ENTRY(atomic64_read_cx8)
 	CFI_ENDPROC
 ENDPROC(atomic64_read_cx8)
 
-ENTRY(atomic64_set_cx8)
-	CFI_STARTPROC
-
-1:
-/* we don't need LOCK_PREFIX since aligned 64-bit writes
- * are atomic on 586 and newer */
-	cmpxchg8b (%esi)
-	jne 1b
-
-	ret
-	CFI_ENDPROC
-ENDPROC(atomic64_set_cx8)
-
 ENTRY(atomic64_xchg_cx8)
 	CFI_STARTPROC
 
-	movl %ebx, %eax
-	movl %ecx, %edx
 1:
 	LOCK_PREFIX
 	cmpxchg8b (%esi)
@@ -78,7 +63,7 @@ ENTRY(atomic64_\func\()_return_cx8)
 	movl %edx, %edi
 	movl %ecx, %ebp
 
-	read64 %ebp
+	read64 %ecx
 1:
 	movl %eax, %ebx
 	movl %edx, %ecx
@@ -159,23 +144,22 @@ ENTRY(atomic64_add_unless_cx8)
 	SAVE ebx
 /* these just push these two parameters on the stack */
 	SAVE edi
-	SAVE esi
+	SAVE ecx
 
-	movl %ecx, %ebp
-	movl %eax, %esi
+	movl %eax, %ebp
 	movl %edx, %edi
 
-	read64 %ebp
+	read64 %esi
 1:
 	cmpl %eax, 0(%esp)
 	je 4f
 2:
 	movl %eax, %ebx
 	movl %edx, %ecx
-	addl %esi, %ebx
+	addl %ebp, %ebx
 	adcl %edi, %ecx
 	LOCK_PREFIX
-	cmpxchg8b (%ebp)
+	cmpxchg8b (%esi)
 	jne 1b
 
 	movl $1, %eax
@@ -199,13 +183,13 @@ ENTRY(atomic64_inc_not_zero_cx8)
 
 	read64 %esi
 1:
-	testl %eax, %eax
-	je 4f
-2:
+	movl %eax, %ecx
+	orl %edx, %ecx
+	jz 3f
 	movl %eax, %ebx
-	movl %edx, %ecx
+	xorl %ecx, %ecx
 	addl $1, %ebx
-	adcl $0, %ecx
+	adcl %edx, %ecx
 	LOCK_PREFIX
 	cmpxchg8b (%esi)
 	jne 1b
@@ -214,9 +198,5 @@ ENTRY(atomic64_inc_not_zero_cx8)
 3:
 	RESTORE ebx
 	ret
-4:
-	testl %edx, %edx
-	jne 2b
-	jmp 3b
 	CFI_ENDPROC
 ENDPROC(atomic64_inc_not_zero_cx8)



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
  2012-01-18 14:24 [PATCH 2/2] ix86: atomic64 assembly improvements Jan Beulich
@ 2012-01-18 16:36 ` H. Peter Anvin
  2012-01-18 16:50   ` Jan Beulich
  0 siblings, 1 reply; 10+ messages in thread
From: H. Peter Anvin @ 2012-01-18 16:36 UTC (permalink / raw)
  To: Jan Beulich; +Cc: mingo, tglx, eric.dumazet@gmail.com, luca, linux-kernel

On 01/18/2012 06:24 AM, Jan Beulich wrote:
> The cmpxchg8b variants of "set" and "xchg" are really identical, and
> hence don't need to be repeated: %ebx and %ecx don't need to be copied
> into %eax and %edx respectively (this is only necessary when desiring
> to only read the stored value), and the LOCK prefix should also be used
> in "set" (other than the comment that is now being removed was saying,
> there is - to my knowledge - no *architectural* guarantee that aligned
> 64-bit writes would always be carried out atomically).

EWHAT?

It's atomic in the same way a MOV is atomic.

The CPU could, in fact, execute the locked version at all if the
unlocked version didn't behave like that.

Unless you have a specific instance where you think this might be
violated, please let me know.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
  2012-01-18 16:36 ` H. Peter Anvin
@ 2012-01-18 16:50   ` Jan Beulich
  2012-01-18 17:47     ` H. Peter Anvin
  0 siblings, 1 reply; 10+ messages in thread
From: Jan Beulich @ 2012-01-18 16:50 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel

>>> On 18.01.12 at 17:36, "H. Peter Anvin" <hpa@zytor.com> wrote:
> On 01/18/2012 06:24 AM, Jan Beulich wrote:
>> The cmpxchg8b variants of "set" and "xchg" are really identical, and
>> hence don't need to be repeated: %ebx and %ecx don't need to be copied
>> into %eax and %edx respectively (this is only necessary when desiring
>> to only read the stored value), and the LOCK prefix should also be used
>> in "set" (other than the comment that is now being removed was saying,
>> there is - to my knowledge - no *architectural* guarantee that aligned
>> 64-bit writes would always be carried out atomically).
> 
> EWHAT?
> 
> It's atomic in the same way a MOV is atomic.

Then please point me to where this is documented.

As I understand it, there is nothing keeping the CPU (or something
down the bus) from executing the unlocked version as two 32-bit
reads followed by two 32-bit writes.

> The CPU could, in fact, execute the locked version at all if the
> unlocked version didn't behave like that.

Assuming you meant "could not", that's not true: As long as the
external world has a way to know that both items are locked (think
of the old bus lock mechanism when there were no caches yet),
it can very well do so.

I do not question that in practice all CPUs behave as described,
but without an architectural guarantee (and with the code in
question not being used in hot paths afaik) I see no reason why
it should depend on undefined behavior.

Jan

> Unless you have a specific instance where you think this might be
> violated, please let me know.
> 
> 	-hpa
> 
> -- 
> H. Peter Anvin, Intel Open Source Technology Center
> I work for Intel.  I don't speak on their behalf.




^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
  2012-01-18 16:50   ` Jan Beulich
@ 2012-01-18 17:47     ` H. Peter Anvin
  2012-01-19  9:18       ` Jan Beulich
  0 siblings, 1 reply; 10+ messages in thread
From: H. Peter Anvin @ 2012-01-18 17:47 UTC (permalink / raw)
  To: Jan Beulich; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel

On 01/18/2012 08:50 AM, Jan Beulich wrote:
>>
>> It's atomic in the same way a MOV is atomic.
>
> Then please point me to where this is documented.
>
> As I understand it, there is nothing keeping the CPU (or something
> down the bus) from executing the unlocked version as two 32-bit
> reads followed by two 32-bit writes.
>
>> The CPU could, in fact, execute the locked version at all if the
>> unlocked version didn't behave like that.
>
> Assuming you meant "could not", that's not true: As long as the
> external world has a way to know that both items are locked (think
> of the old bus lock mechanism when there were no caches yet),
> it can very well do so.
>
> I do not question that in practice all CPUs behave as described,
> but without an architectural guarantee (and with the code in
> question not being used in hot paths afaik) I see no reason why
> it should depend on undefined behavior.
>

Sorry, Jan, if you want to play the documentation lawyer game then there 
is very little that will get done.  The code is increasingly being used 
in warm/hot paths and that's actually fair, so I'd like to avoid 
crapping it up.

There are, to my knowledge, only three companies which have brought 
SMP-capable x86 processors to market: Intel, AMD, and VIA and the above 
holds for them.  A new vendor realistically can't bring a new processor 
to market which violates a constraint that the existing processor 
vendors have preserved.

It is somewhat implied in the SDM section 8.1.1 of volume 3A, but as 
with many other things it's not written out specifically... I suspect 
largely because the question hasn't been raised before.

	-hpa

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
  2012-01-18 17:47     ` H. Peter Anvin
@ 2012-01-19  9:18       ` Jan Beulich
  2012-01-19 14:44         ` H. Peter Anvin
  0 siblings, 1 reply; 10+ messages in thread
From: Jan Beulich @ 2012-01-19  9:18 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel

>>> On 18.01.12 at 18:47, "H. Peter Anvin" <hpa@zytor.com> wrote:
> On 01/18/2012 08:50 AM, Jan Beulich wrote:
>>>
>>> It's atomic in the same way a MOV is atomic.
>>
>> Then please point me to where this is documented.
>>
>> As I understand it, there is nothing keeping the CPU (or something
>> down the bus) from executing the unlocked version as two 32-bit
>> reads followed by two 32-bit writes.
>>
>>> The CPU could, in fact, execute the locked version at all if the
>>> unlocked version didn't behave like that.
>>
>> Assuming you meant "could not", that's not true: As long as the
>> external world has a way to know that both items are locked (think
>> of the old bus lock mechanism when there were no caches yet),
>> it can very well do so.
>>
>> I do not question that in practice all CPUs behave as described,
>> but without an architectural guarantee (and with the code in
>> question not being used in hot paths afaik) I see no reason why
>> it should depend on undefined behavior.
>>
> 
> Sorry, Jan, if you want to play the documentation lawyer game then there 
> is very little that will get done.  The code is increasingly being used 
> in warm/hot paths and that's actually fair, so I'd like to avoid 
> crapping it up.
> 
> There are, to my knowledge, only three companies which have brought 
> SMP-capable x86 processors to market: Intel, AMD, and VIA and the above 
> holds for them.  A new vendor realistically can't bring a new processor 
> to market which violates a constraint that the existing processor 
> vendors have preserved.
> 
> It is somewhat implied in the SDM section 8.1.1 of volume 3A, but as 
> with many other things it's not written out specifically... I suspect 
> largely because the question hasn't been raised before.

But the code is supposed to be correct even when caches are disabled
(in which case LOCK# will continue to be used even on modern CPUs),
and this case clearly isn't covered by the current implementation. It
may be a good idea to adjust the patch description accordingly, but I
see no reason to change the patch itself.

Jan


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
  2012-01-19  9:18       ` Jan Beulich
@ 2012-01-19 14:44         ` H. Peter Anvin
  2012-01-19 14:50           ` Jan Beulich
  0 siblings, 1 reply; 10+ messages in thread
From: H. Peter Anvin @ 2012-01-19 14:44 UTC (permalink / raw)
  To: Jan Beulich; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel

On 01/19/2012 01:18 AM, Jan Beulich wrote:
> 
> But the code is supposed to be correct even when caches are disabled
> (in which case LOCK# will continue to be used even on modern CPUs),
> and this case clearly isn't covered by the current implementation. It
> may be a good idea to adjust the patch description accordingly, but I
> see no reason to change the patch itself.
> 

It doesn't have anything to do with caches on or off.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
  2012-01-19 14:44         ` H. Peter Anvin
@ 2012-01-19 14:50           ` Jan Beulich
  2012-01-19 14:55             ` H. Peter Anvin
  2012-01-19 14:59             ` H. Peter Anvin
  0 siblings, 2 replies; 10+ messages in thread
From: Jan Beulich @ 2012-01-19 14:50 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel

>>> On 19.01.12 at 15:44, "H. Peter Anvin" <hpa@zytor.com> wrote:
> On 01/19/2012 01:18 AM, Jan Beulich wrote:
>> 
>> But the code is supposed to be correct even when caches are disabled
>> (in which case LOCK# will continue to be used even on modern CPUs),
>> and this case clearly isn't covered by the current implementation. It
>> may be a good idea to adjust the patch description accordingly, but I
>> see no reason to change the patch itself.
>> 
> 
> It doesn't have anything to do with caches on or off.

How does it not? If any part of the bus topology is only 32 bits wide,
a 64-bit read or write simply can't be executed atomically without
asserting LOCK#.

Jan


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
  2012-01-19 14:50           ` Jan Beulich
@ 2012-01-19 14:55             ` H. Peter Anvin
  2012-01-19 14:59             ` H. Peter Anvin
  1 sibling, 0 replies; 10+ messages in thread
From: H. Peter Anvin @ 2012-01-19 14:55 UTC (permalink / raw)
  To: Jan Beulich; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel

On 01/19/2012 06:50 AM, Jan Beulich wrote:
>>
>> It doesn't have anything to do with caches on or off.
>
> How does it not? If any part of the bus topology is only 32 bits wide,
> a 64-bit read or write simply can't be executed atomically without
> asserting LOCK#.
>

If any part of the bus topology is only 32 bits wide, you're not talking 
to memory on anything newer than a 486.

LOCK to I/O devices is unreliable on any machine (a lot of northbridges 
simply drop LOCK on the floor).

	-hpa


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
  2012-01-19 14:50           ` Jan Beulich
  2012-01-19 14:55             ` H. Peter Anvin
@ 2012-01-19 14:59             ` H. Peter Anvin
  2012-01-19 15:11               ` Jan Beulich
  1 sibling, 1 reply; 10+ messages in thread
From: H. Peter Anvin @ 2012-01-19 14:59 UTC (permalink / raw)
  To: Jan Beulich; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel

On 01/19/2012 06:50 AM, Jan Beulich wrote:
>>
>> It doesn't have anything to do with caches on or off.
>
> How does it not? If any part of the bus topology is only 32 bits wide,
> a 64-bit read or write simply can't be executed atomically without
> asserting LOCK#.
>

Furthermore, if you're going down that rathole then we can't even trust 
MOV (nor, for that matter, can you trust LOCK in a lot of 
circumstances.)  In short, you need to be extremely careful about what 
you do to uncached memory *at all* (and you need to know exactly what is 
behind this bus) but pessimizing these kinds of construct for that 
reason is wrong in the extreme.

	-hpa


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ix86: atomic64 assembly improvements
  2012-01-19 14:59             ` H. Peter Anvin
@ 2012-01-19 15:11               ` Jan Beulich
  0 siblings, 0 replies; 10+ messages in thread
From: Jan Beulich @ 2012-01-19 15:11 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: mingo, eric.dumazet@gmail.com, tglx, luca, linux-kernel

>>> On 19.01.12 at 15:59, "H. Peter Anvin" <hpa@zytor.com> wrote:
> On 01/19/2012 06:50 AM, Jan Beulich wrote:
>>>
>>> It doesn't have anything to do with caches on or off.
>>
>> How does it not? If any part of the bus topology is only 32 bits wide,
>> a 64-bit read or write simply can't be executed atomically without
>> asserting LOCK#.
>>
> 
> Furthermore, if you're going down that rathole then we can't even trust 
> MOV (nor, for that matter, can you trust LOCK in a lot of 

No - (general register) MOV is at most 32 bits wide.

> circumstances.)  In short, you need to be extremely careful about what 
> you do to uncached memory *at all* (and you need to know exactly what is 
> behind this bus) but pessimizing these kinds of construct for that 
> reason is wrong in the extreme.

Okay, despite not being fully convinced I'll undo the folding of set and
xchg then, and re-submit.

Jan


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2012-01-19 15:10 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-01-18 14:24 [PATCH 2/2] ix86: atomic64 assembly improvements Jan Beulich
2012-01-18 16:36 ` H. Peter Anvin
2012-01-18 16:50   ` Jan Beulich
2012-01-18 17:47     ` H. Peter Anvin
2012-01-19  9:18       ` Jan Beulich
2012-01-19 14:44         ` H. Peter Anvin
2012-01-19 14:50           ` Jan Beulich
2012-01-19 14:55             ` H. Peter Anvin
2012-01-19 14:59             ` H. Peter Anvin
2012-01-19 15:11               ` Jan Beulich

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox