public inbox for linux-bluetooth@vger.kernel.org
 help / color / mirror / Atom feed
* [Bluez-devel] ARM optimization
@ 2008-01-29 17:46 Cidorvan Leite
  2008-01-29 19:07 ` Brad Midgley
  0 siblings, 1 reply; 9+ messages in thread
From: Cidorvan Leite @ 2008-01-29 17:46 UTC (permalink / raw)
  To: bluez-devel

[-- Attachment #1: Type: text/plain, Size: 1545 bytes --]

Hi!

We've been profiling the sbc encoder code for the past 2 weeks. The
results show that most of the CPU cycles are spent in
_sbc_analyze_four and _sbc_analyze_eight. Those functions consist
mainly of multiplication and add operations. The ARM assembly code
that gcc generates for the current C version of those functions
contains a lot of unnecessary code, including needless store ops
(stmdb). Besides, gcc does not take advantage of the 'smlal'
instruction, which multiplies and accumulates in a single op.

For example - The following C code from _sbc_analyze_eight function:

MUL(res,  _sbc_proto_8[11], in[1]);
MULA(res, _sbc_proto_8[12], in[17]);
MULA(res, _sbc_proto_8[13], in[33]);
MULA(res, _sbc_proto_8[14], in[49]);
MULA(res, _sbc_proto_8[15], in[65]);
MULA(res, _sbc_proto_8[16], in[3]);
MULA(res, _sbc_proto_8[17], in[19]);
MULA(res, _sbc_proto_8[18], in[35]);
MULA(res, _sbc_proto_8[19], in[51]);
MULA(res, _sbc_proto_8[20], in[67]);
t[2] = SCALE8_STAGE1(res);

compiles to the assembly shown in gcc-no-arm-optimization.txt

With the attached patch (arm-optimization.patch), gcc generates the
code shown in gcc-arm-optimization.txt, which is ~20% faster on encode.
The MULA macro forces gcc to use the 'smlal' instruction, and declaring
the 'res' variable as 'register' (with the functions no longer
'static inline') removes the store operations.

The problem is code maintenance.
To make this patch work, it is necessary to declare the 'res' variable
as 'register', which means changing the _sbc_analyze_four and
_sbc_analyze_eight functions with #ifdef __arm__.

How can we do this without an #ifdef in the C file?

[-- Attachment #2: gcc-no-arm-optimization.txt --]
[-- Type: text/plain, Size: 4938 bytes --]

        ldr     r5, [r0, #4]
        mov     lr, #41
        mov     r6, r5, asr #31
        umull   fp, ip, r5, lr
        ldr     r2, [r0, #196]
        ldr     r3, [r0, #68]
        mov     r8, r2
        mov     r9, r8, asr #31
        mov     r4, r3, asr #31
        add     sl, sp, #672
        add     lr, sp, #704
        stmia   sl, {r8-r9}
        adr     r9, .L197
        ldmia   r9, {r9-sl}
        stmdb   lr, {fp-ip}
        umull   fp, ip, r3, r9
        add     lr, sp, #688
        stmia   lr, {fp-ip}
        ldr     fp, [r0, #260]
        add     lr, sp, #656
        mov     ip, fp, asr #31
        stmia   lr, {fp-ip}
        ldr     r2, .L197+40
        ldr     lr, [sp, #672]
        ldr     r7, [r0, #132]
        umull   fp, ip, lr, r2
        add     lr, sp, #672
        stmdb   lr, {fp-ip}
        ldr     lr, .L197+44
        mov     r8, r7, asr #31
        umull   fp, ip, r7, lr
        add     lr, sp, #688
        stmdb   lr, {fp-ip}
        ldr     lr, [sp, #656]
        mvn     r2, #916
        umull   fp, ip, lr, r2
        add     lr, sp, #656
        stmdb   lr, {fp-ip}
        ldr     fp, [r0, #12]
        add     lr, sp, #640
        mov     ip, fp, asr #31
        stmia   lr, {fp-ip}
        ldr     ip, [sp, #692]
        ldr     r3, [r0, #76]
        mla     ip, r4, r9, ip
        mov     r4, r3, asr #31
        add     r9, sp, #624
        stmia   r9, {r3-r4}
        str     ip, [sp, #692]
        ldr     r4, [sp, #700]
        ldr     ip, [sp, #640]
        mov     r9, #41
        mov     r3, #145
        umull   sl, fp, ip, r3
        mla     r4, r9, r6, r4
        ldr     lr, [r0, #140]
        ldr     r2, [r0, #204]
        add     ip, sp, #640
        ldr     r5, [sp, #684]
        stmdb   ip, {sl-fp}
        str     r4, [sp, #700]
        mov     r9, lr
        mov     sl, r9, asr #31
        mov     r6, r2
        mov     r7, r6, asr #31
        ldr     lr, [sp, #676]
        ldr     r2, [sp, #668]
        ldr     r4, .L197+44
        ldr     ip, .L197+40
        mla     r4, r8, r4, r5
        mla     ip, lr, ip, r2
        ldr     r3, [r0, #268]
        add     r8, sp, #608
        add     fp, sp, #624
        stmdb   fp, {r9-sl}
        stmia   r8, {r6-r7}
        add     sl, sp, #608
        mov     r6, r3
        mov     r7, r6, asr #31
        str     r4, [sp, #684]
        ldr     r5, .L197+48
        str     ip, [sp, #36]
        stmdb   sl, {r6-r7}
        ldr     r4, [sp, #624]
        ldr     r6, [sp, #616]
        ldr     r7, .L197+52
        ldr     ip, [sp, #652]
        umull   r8, r9, r4, r5
        ldr     fp, [sp, #660]
        umull   r4, r5, r6, r7
        ldr     r2, [sp, #608]
        ldr     r3, .L197+56
        mvn     lr, #916
        mla     lr, fp, lr, ip
        ldr     ip, [sp, #600]
        umull   sl, fp, r2, r3
        add     r6, sp, #32
        mvn     r2, #46
        stmdb   r6, {r4-r5}
        add     r3, sp, #688
        umull   r6, r7, ip, r2
        ldmia   r3, {r4-r5}
        add     r2, sp, #704
        ldmdb   r2, {r2-r3}
        adds    r4, r4, r2
        adc     r5, r5, r3
        ldr     r2, [sp, #636]
        ldr     r3, [sp, #644]
        mov     ip, #145
        mla     r3, ip, r3, r2
        str     r3, [sp, #636]
        add     r3, sp, #688
        ldmdb   r3, {r2-r3}
        ldr     ip, [sp, #36]
        adds    r4, r4, r2
        ldr     r2, [sp, #672]
        adc     r5, r5, r3
        rsb     ip, r2, ip
        str     ip, [sp, #668]
        ldr     r3, .L197+48
        ldr     ip, [sp, #628]
        mla     r9, ip, r3, r9
        ldr     ip, [sp, #656]
        add     r3, sp, #672
        ldmdb   r3, {r2-r3}
        rsb     lr, ip, lr
        adds    r4, r4, r2
        str     lr, [sp, #652]
        ldr     r2, [sp, #620]
        adc     r5, r5, r3
        ldr     lr, .L197+52
        ldr     r3, [sp, #28]
        ldr     ip, .L197+56
        mla     lr, r2, lr, r3
        add     r3, sp, #656
        str     lr, [sp, #28]
        ldr     lr, [sp, #612]
        ldmdb   r3, {r2-r3}
        adds    r4, r4, r2
        ldr     r2, [sp, #604]
        mla     ip, lr, ip, fp
        adc     r5, r5, r3
        mvn     lr, #46
        add     r3, sp, #640
        mla     lr, r2, lr, r7
        ldmdb   r3, {r2-r3}
        adds    r4, r4, r2
        adc     r5, r5, r3
        adds    r4, r4, r8
        adc     r5, r5, r9
        ldr     r2, [sp, #608]
        add     r9, sp, #32
        ldmdb   r9, {r8-r9}
        ldr     r3, [sp, #600]
        adds    r4, r4, r8
        adc     r5, r5, r9
        rsb     fp, r2, ip
        adds    r4, r4, sl
        adc     r5, r5, fp
        rsb     r7, r3, lr
        adds    r4, r4, r6
        adc     r5, r5, r7
        mov     r4, r4, lsr #8
        orr     r4, r4, r5, asl #24
        mov     r5, r5, asr #8
        str     r4, [sp, #592]
        str     r5, [sp, #596]
        str     r4, [sp, #920]
        str     r4, [sp, #588]

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: arm-optimization.patch --]
[-- Type: text/x-patch; name=arm-optimization.patch, Size: 1737 bytes --]

Index: sbc.c
===================================================================
RCS file: /cvsroot/bluez/utils/sbc/sbc.c,v
retrieving revision 1.55
diff -u -p -r1.55 sbc.c
--- sbc.c	28 Jan 2008 18:00:51 -0000	1.55
+++ sbc.c	29 Jan 2008 14:42:29 -0000
@@ -708,10 +708,15 @@ static void sbc_encoder_init(struct sbc_
 	state->position[0] = state->position[1] = 9 * frame->subbands;
 }
 
-
+#ifdef __arm__
+void _sbc_analyze_four(const int16_t *in, int32_t *out)
+{
+	sbc_extended_t register res asm("r4");
+#else
 static inline void _sbc_analyze_four(const int16_t *in, int32_t *out)
 {
 	sbc_extended_t res;
+#endif
 	sbc_fixed_t t[8];
 	sbc_extended_t s[5];
 
@@ -794,9 +799,15 @@ static inline void sbc_analyze_four(stru
 		state->position[ch] = 36;
 }
 
+#ifdef __arm__
+void _sbc_analyze_eight(const int16_t *in, int32_t *out)
+{
+	sbc_extended_t register res asm("r4");
+#else
 static inline void _sbc_analyze_eight(const int16_t *in, int32_t *out)
 {
 	sbc_extended_t res;
+#endif
 	sbc_fixed_t t[8];
 	sbc_extended_t s[8];
 
Index: sbc_math.h
===================================================================
RCS file: /cvsroot/bluez/utils/sbc/sbc_math.h,v
retrieving revision 1.12
diff -u -p -r1.12 sbc_math.h
--- sbc_math.h	28 Jan 2008 18:00:51 -0000	1.12
+++ sbc_math.h	29 Jan 2008 14:42:29 -0000
@@ -66,4 +66,12 @@ typedef long long sbc_extended_t;
 #define SUB(dst, src)    { dst -= src; }
 #define MUL(dst, a, b)   { dst = (sbc_extended_t) (a) * (b); }
 #define DIV2(dst, src)   { dst = ASR(src, 1); }
+
+#ifdef __arm__
+#define MULA(res, a, b) __asm__(		\
+		"smlal %Q0, %R0, %2, %3"	\
+		: "=&r" (res)			\
+		: "0" (res), "r" (a), "r" (b));
+#else
 #define MULA(dst, a, b)  { dst += (sbc_extended_t) (a) * (b); }
+#endif

[-- Attachment #4: gcc-arm-optimization.txt --]
[-- Type: text/plain, Size: 3508 bytes --]

        ldr     r2, [r0, #4]                                                  
        mov     sl, #41                                                       
        mov     r3, r2, asr #31                                               
        umull   r4, r5, r2, sl                                                
        ldr     ip, .L198+32                                                  
        mla     r5, sl, r3, r5                                                
        ldr     lr, .L198+36                                                  
        ldr     r3, [r0, #68]                                                 
        ldr     sl, .L198+40                                                  
#APP                                                                          
        smlal r4, r5, ip, r3                                                  
        mov     r2, #145        @ movhi                                       
        ldr     r3, [r0, #132]                                                
        mvn     ip, #916                                                      
#APP                                                                          
        smlal r4, r5, sl, r3                                                  
        ldr     r8, .L198+44                                                  
        ldr     r3, [r0, #196]                                                
        ldr     r9, .L198+48                                                  
#APP                                                                          
        smlal r4, r5, lr, r3                                                  
        ldr     r3, [r0, #260]                                                
        ldr     lr, .L198+52                                                  
#APP                                                                          
        smlal r4, r5, ip, r3                                                  
        ldr     r3, [r0, #12]                                                 
#APP                                                                          
        smlal r4, r5, r2, r3                                                  
        ldr     r3, [r0, #76]                                                 
        mvn     r2, #46 @ movhi                                               
#APP                                                                          
        smlal r4, r5, r8, r3                                                  
        ldr     r3, [r0, #140]                                                
#APP                                                                          
        smlal r4, r5, r9, r3                                                  
        ldr     r3, [r0, #204]                                                
#APP                                                                          
        smlal r4, r5, lr, r3                                                  
        ldr     r3, [r0, #268]                                                
#APP                                                                          
        smlal r4, r5, r2, r3                                                  
        mov     r4, r4, lsr #8                                                
        orr     r6, r4, r5, asl #24                                           
        str     r6, [sp, #528]                                                
        str     r6, [sp, #424]


[-- Attachment #5: Type: text/plain, Size: 228 bytes --]

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

[-- Attachment #6: Type: text/plain, Size: 164 bytes --]

_______________________________________________
Bluez-devel mailing list
Bluez-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bluez-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2008-01-30 20:21 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-01-29 17:46 [Bluez-devel] ARM optimization Cidorvan Leite
2008-01-29 19:07 ` Brad Midgley
2008-01-30 16:19   ` Brad Midgley
2008-01-30 17:47     ` Marcel Holtmann
2008-01-30 19:26       ` Brad Midgley
2008-01-30 19:56         ` Cidorvan Leite
2008-01-30 20:21           ` Marcel Holtmann
2008-01-30 17:07   ` Cidorvan Leite
2008-01-30 17:41     ` Brad Midgley

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox