updates for be8 patch series

All of lore.kernel.org
 help / color / mirror / Atom feed

From: ben.dooks@codethink.co.uk (Ben Dooks)
To: linux-arm-kernel@lists.infradead.org
Subject: updates for be8 patch series
Date: Wed, 24 Jul 2013 11:46:44 +0100	[thread overview]
Message-ID: <51EFB094.3050002@codethink.co.uk> (raw)
In-Reply-To: <CAA3XUr31LAJ3+iWq5gXb+JZO-1G96waOFy-RBVZQEW-djj3dYA@mail.gmail.com>

On 24/07/13 02:06, Victor Kamensky wrote:
> Hi Ben,
>
> Let me split off atomic64 from other issues, so we have focused topic here.
>
> Please see atomic64 issue test case below. Please try it in your setup. I've
> run on Pandaboard ES with my set of patches, but I believe it should be the
> same in your case.
>
> <snip>
>
>>> Index: linux-linaro-tracking/arch/arm/include/asm/atomic.h
>>> ===================================================================
>>> --- linux-linaro-tracking.orig/arch/arm/include/asm/atomic.h
>>> +++ linux-linaro-tracking/arch/arm/include/asm/atomic.h
>>> @@ -301,8 +301,13 @@ static inline void atomic64_add(u64 i, a
>>>
>>>        __asm__ __volatile__("@ atomic64_add\n"
>>>    "1:    ldrexd    %0, %H0, [%3]\n"
>>> +#ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
>>>    "    adds    %0, %0, %4\n"
>>>    "    adc    %H0, %H0, %H4\n"
>>> +#else
>>> +"    adds    %H0, %H0, %H4\n"
>>> +"    adc    %0, %0, %4\n"
>>> +#endif
>>>    "    strexd    %1, %0, %H0, [%3]\n"
>>>    "    teq    %1, #0\n"
>>>    "    bne    1b"
>>> @@ -320,8 +325,13 @@ static inline u64 atomic64_add_return(u6
>>>
>>>        __asm__ __volatile__("@ atomic64_add_return\n"
>>>    "1:    ldrexd    %0, %H0, [%3]\n"
>>> +#ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
>>>    "    adds    %0, %0, %4\n"
>>>    "    adc    %H0, %H0, %H4\n"
>>> +#else
>>> +"    adds    %H0, %H0, %H4\n"
>>> +"    adc    %0, %0, %4\n"
>>> +#endif
>>>    "    strexd    %1, %0, %H0, [%3]\n"
>>>    "    teq    %1, #0\n"
>>>    "    bne    1b"
>>> @@ -341,8 +351,13 @@ static inline void atomic64_sub(u64 i, a
>>>
>>>        __asm__ __volatile__("@ atomic64_sub\n"
>>>    "1:    ldrexd    %0, %H0, [%3]\n"
>>> +#ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
>>>    "    subs    %0, %0, %4\n"
>>>    "    sbc    %H0, %H0, %H4\n"
>>> +#else
>>> +"    subs    %H0, %H0, %H4\n"
>>> +"    sbc    %0, %0, %4\n"
>>> +#endif
>>>    "    strexd    %1, %0, %H0, [%3]\n"
>>>    "    teq    %1, #0\n"
>>>    "    bne    1b"
>>> @@ -360,8 +375,13 @@ static inline u64 atomic64_sub_return(u6
>>>
>>>        __asm__ __volatile__("@ atomic64_sub_return\n"
>>>    "1:    ldrexd    %0, %H0, [%3]\n"
>>> +#ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
>>>    "    subs    %0, %0, %4\n"
>>>    "    sbc    %H0, %H0, %H4\n"
>>> +#else
>>> +"    subs    %H0, %H0, %H4\n"
>>> +"    sbc    %0, %0, %4\n"
>>> +#endif
>>>    "    strexd    %1, %0, %H0, [%3]\n"
>>>    "    teq    %1, #0\n"
>>>    "    bne    1b"
>>> @@ -428,9 +448,15 @@ static inline u64 atomic64_dec_if_positi
>>>
>>>        __asm__ __volatile__("@ atomic64_dec_if_positive\n"
>>>    "1:    ldrexd    %0, %H0, [%3]\n"
>>> +#ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
>>>    "    subs    %0, %0, #1\n"
>>>    "    sbc    %H0, %H0, #0\n"
>>>    "    teq    %H0, #0\n"
>>> +#else
>>> +"    subs    %H0, %H0, #1\n"
>>> +"    sbc    %0, %0, #0\n"
>>> +"    teq    %0, #0\n"
>>> +#endif
>>>    "    bmi    2f\n"
>>>    "    strexd    %1, %0, %H0, [%3]\n"
>>>    "    teq    %1, #0\n"
>>> @@ -459,8 +485,13 @@ static inline int atomic64_add_unless(at
>>>    "    teqeq    %H0, %H5\n"
>>>    "    moveq    %1, #0\n"
>>>    "    beq    2f\n"
>>> +#ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
>>>    "    adds    %0, %0, %6\n"
>>>    "    adc    %H0, %H0, %H6\n"
>>> +#else
>>> +"    adds    %H0, %H0, %H6\n"
>>> +"    adc    %0, %0, %6\n"
>>> +#endif
>>>    "    strexd    %2, %0, %H0, [%4]\n"
>>>    "    teq    %2, #0\n"
>>>    "    bne    1b\n"
>>
>>
>> I still believe that you're doing the wrong thing here
>> and that these can be left well enough alone. Please give
>> a concrete piece of code that can show they're actually
>> doing the wrong thing.
>
> Disable CONFIG_GENERIC_ATOMIC64
> -------------------------------
>
> Make sure that CONFIG_GENERIC_ATOMIC64 is not set. Otherwise atomic64_add from
> lib/atomic64.c will be used and that one works correctly but this case does
> not use atomic64_XXX inline functions from arch/arm/include/asm/atomic.h
>
> Personally with my current kernel close to 3.10 I failed to do that and manage
> to get live kernel, so to illustrate the issue I will use my own copy of
> inline atomic64_add function. But effectively it will show the same problem
> that I tried to report.

the kconfig has:
select GENERIC_ATOMIC64 if (CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI)

which means that it is possible that the generic one is being used.


> Compiler used
> -------------
>
> gcc-linaro-arm-linux-gnueabihf-4.7-2013.01-20130125_linux
>
> Test module source
> ------------------
>
> #include<linux/module.h>
> #include<linux/moduleparam.h>
> #include<linux/atomic.h>
>
> static inline void my_atomic64_add_original(u64 i, atomic64_t *v)
> {
>      u64 result;
>      unsigned long tmp;
>
>      __asm__ __volatile__("@ atomic64_add\n"
> "1:    ldrexd    %0, %H0, [%3]\n"
> "    adds    %0, %0, %4\n"
> "    adc    %H0, %H0, %H4\n"
> "    strexd    %1, %0, %H0, [%3]\n"
> "    teq    %1, #0\n"
> "    bne    1b"
>      : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
>      : "r" (&v->counter), "r" (i)
>      : "cc");
> }
>
> static inline void my_atomic64_add_fixed(u64 i, atomic64_t *v)
> {
>      u64 result;
>      unsigned long tmp;
>
>      __asm__ __volatile__("@ atomic64_add\n"
> "1:    ldrexd    %0, %H0, [%3]\n"
> #ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
> "    adds    %0, %0, %4\n"
> "    adc    %H0, %H0, %H4\n"
> #else
> "    adds    %H0, %H0, %H4\n"
> "    adc    %0, %0, %4\n"
> #endif
> "    strexd    %1, %0, %H0, [%3]\n"
> "    teq    %1, #0\n"
> "    bne    1b"
>      : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
>      : "r" (&v->counter), "r" (i)
>      : "cc");
> }
>
>
> atomic64_t a = {0};
>
> #define INCREMENT 0x40000000
>
> void atomic_update_original (void)
> {
>      my_atomic64_add_original(INCREMENT,&a);
> }
>
> void atomic_update_fixed (void)
> {
>      my_atomic64_add_fixed(INCREMENT,&a);
> }
>
> void test_atomic64_original(void)
> {
>      int i;
>
>      printk("original atomic64_add\n");
>
>      atomic64_set(&a, 0);
>
>      printk("a = %llx\n", a.counter);
>
>      for (i = 0; i<  10; i++) {
>          atomic_update_original();
>          printk("i = %2d: a =  %llx\n", i, a.counter);
>      }
> }
>
> void test_atomic64_fixed(void)
> {
>      int i;
>
>      printk("fixed atomic64_add\n");
>
>      atomic64_set(&a, 0);
>
>      printk("a = %llx\n", a.counter);
>
>      for (i = 0; i<  10; i++) {
>          atomic_update_fixed();
>          printk("i = %2d: a =  %llx\n", i, a.counter);
>      }
> }
>
> int
> init_module (void)
> {
>      test_atomic64_original();
>      test_atomic64_fixed();
>      return 0;
> }
>
> void
> cleanup_module(void)
> {
> }
>
> MODULE_LICENSE("GPL");
>
>
> Test run
> --------
>
> Compile and run module, observe in original code counter is not incremented
> correctly.
>
> root at genericarmv7ab:~# insmod atomic64.ko
> [   73.041107] original atomic64_add
> [   73.044647] a = 0
> [   73.046691] i =  0: a =  40000000
> [   73.050659] i =  1: a =  80000000
> [   73.054199] i =  2: a =  c0000000
> [   73.057983] i =  3: a =  0
> [   73.060852] i =  4: a =  40000000
> [   73.064361] i =  5: a =  80000000
> [   73.067932] i =  6: a =  c0000000
> [   73.071441] i =  7: a =  0
> [   73.074310] i =  8: a =  40000000
> [   73.077850] i =  9: a =  80000000
> [   73.081390] fixed atomic64_add
> [   73.084625] a = 0
> [   73.086669] i =  0: a =  40000000
> [   73.090209] i =  1: a =  80000000
> [   73.093749] i =  2: a =  c0000000
> [   73.097290] i =  3: a =  100000000
> [   73.100891] i =  4: a =  140000000
> [   73.104522] i =  5: a =  180000000
> [   73.108154] i =  6: a =  1c0000000
> [   73.111755] i =  7: a =  200000000
> [   73.115386] i =  8: a =  240000000
> [   73.119018] i =  9: a =  280000000
>
>
> Disassemble of original code
> ----------------------------
>
> (gdb) disassemble atomic_update_original
> Dump of assembler code for function atomic_update_original:
>     0x00000000<+0>:    push    {r4}        ; (str r4, [sp, #-4]!)
>     0x00000004<+4>:    mov    r3, #1073741824    ; 0x40000000
>     0x00000008<+8>:    ldr    r12, [pc, #32]    ; 0x30
> <atomic_update_original+48>
>     0x0000000c<+12>:    mov    r2, #0
>     0x00000010<+16>:    ldrexd    r0, [r12]
>     0x00000014<+20>:    adds    r0, r0, r2
>     0x00000018<+24>:    adc    r1, r1, r3
>     0x0000001c<+28>:    strexd    r4, r0, [r12]
>     0x00000020<+32>:    teq    r4, #0
>     0x00000024<+36>:    bne    0x10<atomic_update_original+16>
>     0x00000028<+40>:    ldmfd    sp!, {r4}
>     0x0000002c<+44>:    bx    lr
>     0x00000030<+48>:    andeq    r0, r0, r0
> End of assembler dump.
>
> Disassemble of fixed code
> -------------------------
>
> (gdb) disassemble atomic_update_fixed
> Dump of assembler code for function atomic_update_fixed:
>     0x00000034<+0>:    push    {r4}        ; (str r4, [sp, #-4]!)
>     0x00000038<+4>:    mov    r3, #1073741824    ; 0x40000000
>     0x0000003c<+8>:    ldr    r12, [pc, #32]    ; 0x64<atomic_update_fixed+48>
>     0x00000040<+12>:    mov    r2, #0
>     0x00000044<+16>:    ldrexd    r0, [r12]
>     0x00000048<+20>:    adds    r1, r1, r3
>     0x0000004c<+24>:    adc    r0, r0, r2
>     0x00000050<+28>:    strexd    r4, r0, [r12]
>     0x00000054<+32>:    teq    r4, #0
>     0x00000058<+36>:    bne    0x44<atomic_update_fixed+16>
>     0x0000005c<+40>:    ldmfd    sp!, {r4}
>     0x00000060<+44>:    bx    lr
>     0x00000064<+48>:    andeq    r0, r0, r0
> End of assembler dump.
>
> Fixed code explanation
> ----------------------
>
> - $r2 has 4 most significant bytes of increement (0)
> - $r3 has 4 least significant bytes of increement (0x40000000)
>
> - Load from address at $r12 'long long' into r0 and r1
>    $r0 will get 4 most significant bytes (i.e 4 bytes at $r12)
>    $r1 will get 4 least significan bytes (i.e 4 bytes at $r12+4)
>    loads are big endian, so $r0 and $r0 will be read with proper be swap
>
>     0x00000044<+16>:    ldrexd    r0, [r12]
>
> - add $r3 to $r1 store result into $r1, carry flag could be set
> - adding 4 least sginificant bytes of 'long long'
>
>     0x00000048<+20>:    adds    r1, r1, r3
>
> - adding with carry most siginificant parts of increment and counter
>
>     0x0000004c<+24>:    adc    r0, r0, r2
>
> - Store result back
>    $r0 will to $r12 address
>    $r1 will to $r12+4 address
>
>     0x00000050<+28>:    strexd    r4, r0, [r12]
>
> Ldrd/strd
> ---------
>
> Issue boils down to the fact that ldrexd/ldrd and strexd/strd instrucitons
> in big endian mode will do byteswap only within 4 bytes boundary, but it will
> not swap registers numbers itself. I.e ldrexd rN,<addr>  puts swapped 4 bytes
> at<addr>  address into rN, and it puts swapped 4 bytes at<addr+4>  into rN+1

Looking at the ARM ARM, the following is LDRD.

if ConditionPassed() then
     EncodingSpecificOperations(); NullCheckIfThumbEE(15);
     address = if add then (Align(PC,4) + imm32) else (Align(PC,4) - imm32);
     if HaveLPAE() && address<2:0> == '000' then
         data = MemA[Address,8];
         if BigEndian() then
              R[t] = data<63:32>;
              R[t2] = data<31:0>;
         else
              R[t] = data<31:0>;
              R[t2] = data<63:32>;
     else
         R[t] = MemA[address,4];
         R[t2] = MemA[address+4,4];

I thought it always had R[t] as lowest and R[t2] as the highest.

> Compiler?
> ---------
>
> One may thing whether we have compiler bug here. And after all, what is the
> meaning of %0 vs %H0? I asked my local gcc expert for the help. Here what
> I got. Please look at
>
> https://github.com/mirrors/gcc/blob/master/gcc/config/arm/arm.c
>
> and search for>'Q', 'R' and 'H'<
>
> according to this page '%HN' is always reg+1 operand regardless whether it is
> big endian system or not. It is opposite to 'Q'.
>
> Better fix
> ----------
>
> In fact now I believe correct fix should use 'Q' for least siginificant 4
> bytes, 'R' for most siginificant 4 bytes and 'H' is used in load/store
> instructions.
>
> It should look something like this:
>
> static inline void my_atomic64_add_fixed1(u64 i, atomic64_t *v)
> {
>      u64 result;
>      unsigned long tmp;
>
>      __asm__ __volatile__("@ atomic64_add\n"
> "1:    ldrexd    %0, %H0, [%3]\n"
> "    adds    %Q0, %Q0, %Q4\n"
> "    adc    %R0, %R0, %R4\n"
> "    strexd    %1, %0, %H0, [%3]\n"
> "    teq    %1, #0\n"
> "    bne    1b"
>      : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
>      : "r" (&v->counter), "r" (i)
>      : "cc");
> }
>
> It is better that my original fix because it does not have conditional
> compilation. I've tested it in the module, it works on both LE and BE kernel
> variants.

If you want to make a patch with this in, I can add it to my series
ready to get merged.

-- 
Ben Dooks				http://www.codethink.co.uk/
Senior Engineer				Codethink - Providing Genius

next prev parent reply	other threads:[~2013-07-24 10:46 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CAA3XUr3h5C0oJVrN5iXsM=NFkLMz_Tssp28CysZfwMNS5KvO-Q@mail.gmail.com>
2013-07-23 17:40 ` updates for be8 patch series Ben Dooks
2013-07-23 17:53   ` Victor Kamensky
2013-07-23 18:02     ` Ben Dooks
2013-07-23 18:03     ` Victor Kamensky
2013-07-23 18:30       ` Ben Dooks
2013-07-24  1:06         ` Victor Kamensky
2013-07-24  9:36           ` Will Deacon
2013-07-24 10:46           ` Ben Dooks [this message]
2013-07-24  7:31         ` Victor Kamensky
2013-07-24 21:24           ` Victor Kamensky
     [not found] <CAA3XUr1o0cOCybSR=pCUT-WYP4xus5VrES6Hyzo-Wy4tcutTvQ@mail.gmail.com>
2013-07-23 18:02 ` Ben Dooks
2013-07-22 16:33 Ben Dooks
2013-07-22 16:49 ` Ben Dooks

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=51EFB094.3050002@codethink.co.uk \
    --to=ben.dooks@codethink.co.uk \
    --cc=linux-arm-kernel@lists.infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.