From mboxrd@z Thu Jan 1 00:00:00 1970 From: linux@arm.linux.org.uk (Russell King - ARM Linux) Date: Thu, 12 Dec 2013 16:47:48 +0000 Subject: gcc miscompiles csum_tcpudp_magic() on ARMv5 In-Reply-To: <20131212160426.GD31816@1wt.eu> References: <1386850444.22947.46.camel@sakura.staff.proxad.net> <20131212124015.GL4360@n2100.arm.linux.org.uk> <1386855390.22947.68.camel@sakura.staff.proxad.net> <20131212144853.GO4360@n2100.arm.linux.org.uk> <1386860657.25449.3.camel@sakura.staff.proxad.net> <20131212154110.GQ4360@n2100.arm.linux.org.uk> <20131212160426.GD31816@1wt.eu> Message-ID: <20131212164748.GS4360@n2100.arm.linux.org.uk> To: linux-arm-kernel@lists.infradead.org List-Id: linux-arm-kernel.lists.infradead.org On Thu, Dec 12, 2013 at 05:04:26PM +0100, Willy Tarreau wrote: > On Thu, Dec 12, 2013 at 03:41:10PM +0000, Russell King - ARM Linux wrote: > > One of the issues here is that internally, GCC tracks the "machine mode" > > of a register, where "mode" basically means the type of register it is. > > In this case, the register will be in "half integer" mode when it is > > passed into the assembler, and gcc does _not_ promote it. > > > > We can see this in the RTL: > > > > (insn 11 10 12 3 ../t.c:9 (set (reg:SI 143 [ sum ]) > > (asm_operands:SI ("mov %0, %1") ("=&r") 0 [ > > (subreg:HI (reg:SI 148) 0) > > ] > > [ > > (asm_input:HI ("r") (null):0) > > ] > > [] ../t.c:12)) -1 (nil)) > > > > Note that "HI" for the input register 148. What this means is that the > > compiler "knows" that it's supposed to be a 16-bit quantity, and has > > recorded that fact, and _will_ truncate it down to 16-bit when it needs > > to be promoted. > > > > Since the asm() only asks for a "register", that's what we get - a > > register which _happens_ to be in HI mode containing the value. However, > > there's no way to tell that from the assembly code itself (GCC doesn't > > have the facility to do conditional assembly generation depending on > > the argument type passed into it.) > > OK so that means that it's just like having true 16-bit registers except > that it's just a temporary representation and not a feature of the CPU. > The compiler has the right to expect the asm instructions to only use > the lower 16 bits and has no way to verify that. Quite. > Then changing the type of the function argument would probably be safer! Actually, I think we can do a bit better with this code. We really don't need much of this messing around here, we can combine some of these steps. We have: 16-bit protocol in host endian 16-bit length in host endian and we need to combine them into a 32-bit checksum which is then subsequently folded down to 16-bits by adding the top and bottom halves. Now, what we can do is this: 1. Construct a combined 32-bit protocol and length: unsigned lenproto = len | proto << 16; 2. Pass this into the assembly thusly: __asm__( "adds %0, %1, %2 @ csum_tcpudp_nofold \n\t" "adcs %0, %0, %3 \n\t" #ifdef __ARMEB__ "adcs %0, %0, %4 \n\t" #else "adcs %0, %0, %4, ror #8 \n\t" #endif "adc %0, %0, #0" : "=&r"(sum) : "r" (sum), "r" (daddr), "r" (saddr), "r" (lenprot) : "cc"); with no swabbing@this stage. Well, where do we get the endian conversion? See that ror #8 - that a 32 bit rotate by 8 bits. As these are two 16-bit quantities, we end up with this: original: 31..24 23..16 15..8 7..0 len_h len_l pro_h pro_l accumulated: 31..24 23..16 15..8 7..0 pro_l len_h len_l pro_h And now when we fold it down to 16-bit: 15..8 7..0 len_l pro_h pro_l len_h There we go - the low and high bytes end up correctly placed. Now, the advantage to having "len" and "proto" combined by the compiler is that the compiler itself can make the decision about how to combine these two 16-bit HI-mode quantities in the most efficient way. Here's some examples from udp.c - I have another optimisation in here too which makes this code sequence slightly shorter. Some intermediary instructions have been omitted. mov r6, r6, asl #16 @ tmp204, len, mov r6, r6, lsr #16 @ tmp203, tmp204, orr r6, r6, #1114112 @ tmp205, tmp203, @ 92 "/home/rmk/git/linux-rmk/arch/arm/include/asm/checksum.h" 1 adds r3, r8, r7 @ csum_tcpudp_nofold0 @ sum, dst, src adcs r3, r3, r6, ror #8 @ sum, tmp205 adc r3, r3, #0 @ sum mov r6, r6, asl #16 @ tmp220, len, mov r6, r6, lsr #16 @ tmp219, tmp220, orr r6, r6, #1114112 @ tmp221, tmp219, @ 104 "/home/rmk/git/linux-rmk/arch/arm/include/asm/checksum.h" 1 adds r3, r0, r8 @ csum_tcpudp_nofold @ sum,, dst adcs r3, r3, r7 @ sum, src adcs r3, r3, r6, ror #8 @ sum, tmp221 adc r3, r3, #0 @ sum Note this one sourcing an 8-bit protocol value. ldrb r3, [r7, #269] @ zero_extendqisi2 @ tmp321, sk_5->sk_protocol orr sl, sl, r3, asl #16 @, tmp324, D.48540, tmp321, @ 104 "/home/rmk/git/linux-rmk/arch/arm/include/asm/checksum.h" 1 adds r3, r0, r2 @ csum_tcpudp_nofold @ sum, csum, fl4_19(D)->daddr adcs r3, r3, r1 @ sum, fl4_19(D)->saddr adcs r3, r3, sl, ror #8 @ sum, tmp324 adc r3, r3, #0 @ sum ldrh lr, [r4, #76] @ tmp503, skb_10(D)->len orr lr, lr, r7, asl #16 @, tmp505, tmp503, proto, @ 104 "/home/rmk/git/linux-rmk/arch/arm/include/asm/checksum.h" 1 adds r1, sl, r0 @ csum_tcpudp_nofold @ sum, skb_10(D)->D.22802.csum, iph_354->daddr adcs r1, r1, ip @ sum, iph_354->saddr adcs r1, r1, lr, ror #8 @ sum, tmp505 adc r1, r1, #0 @ sum ldrh r0, [r4, #76] @ tmp529, skb_10(D)->len orr r0, r0, r7, asl #16 @, tmp531, tmp529, proto, @ 92 "/home/rmk/git/linux-rmk/arch/arm/include/asm/checksum.h" 1 adds r3, r2, r1 @ csum_tcpudp_nofold0 @ sum, iph_354->daddr, iph_354->saddr adcs r3, r3, r0, ror #8 @ sum, tmp531 adc r3, r3, #0 @ sum This one is a little more complicated because it uses skb->len - we can see the narrowing cast being performed: ldr r0, [r4, #76] @ skb_1->len, skb_1->len rsb r0, r9, r0 @ tmp335, D.53494, skb_1->len mov r0, r0, asl #16 @ tmp337, tmp335, mov r0, r0, lsr #16 @ tmp336, tmp337, orr r0, r0, #1114112 @ tmp338, tmp336, @ 92 "/home/rmk/git/linux-rmk/arch/arm/include/asm/checksum.h" 1 adds r3, r2, r1 @ csum_tcpudp_nofold0 @ sum, iph_252->daddr, iph_252->saddr adcs r3, r3, r0, ror #8 @ sum, tmp338 adc r3, r3, #0 @ sum Looking at whether it's better to shift len or proto to the upper 16 bits reveals no real advantage to either: we end up with the same overall number of instructions between the UDP and TCP code combined, individually there's only one line between them. So here's a patch for Maxime to try - I've not run it yet but it seems to do the right thing by code inspection. diff --git a/arch/arm/include/asm/checksum.h b/arch/arm/include/asm/checksum.h index 6dcc16430868..523315115478 100644 --- a/arch/arm/include/asm/checksum.h +++ b/arch/arm/include/asm/checksum.h @@ -87,19 +87,34 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, unsigned short proto, __wsum sum) { - __asm__( - "adds %0, %1, %2 @ csum_tcpudp_nofold \n\ - adcs %0, %0, %3 \n" + u32 lenprot = len | proto << 16; + + if (__builtin_constant_p(sum) && sum == 0) { + __asm__( + "adds %0, %1, %2 @ csum_tcpudp_nofold0 \n\t" #ifdef __ARMEB__ - "adcs %0, %0, %4 \n" + "adcs %0, %0, %3 \n\t" #else - "adcs %0, %0, %4, lsl #8 \n" + "adcs %0, %0, %3, ror #8 \n\t" #endif - "adcs %0, %0, %5 \n\ - adc %0, %0, #0" - : "=&r"(sum) - : "r" (sum), "r" (daddr), "r" (saddr), "r" (len), "Ir" (htons(proto)) - : "cc"); + "adc %0, %0, #0" + : "=&r" (sum) + : "r" (daddr), "r" (saddr), "r" (lenprot) + : "cc"); + } else { + __asm__( + "adds %0, %1, %2 @ csum_tcpudp_nofold \n\t" + "adcs %0, %0, %3 \n\t" +#ifdef __ARMEB__ + "adcs %0, %0, %4 \n\t" +#else + "adcs %0, %0, %4, ror #8 \n\t" +#endif + "adc %0, %0, #0" + : "=&r"(sum) + : "r" (sum), "r" (daddr), "r" (saddr), "r" (lenprot) + : "cc"); + } return sum; } /*