From: "Cidorvan Leite" <cidorvan@gmail.com>
To: bluez-devel@lists.sourceforge.net
Subject: [Bluez-devel] ARM optimization
Date: Tue, 29 Jan 2008 14:46:37 -0300 [thread overview]
Message-ID: <50282bd30801290946l359dc7a6j29bb3b891ab35f9a@mail.gmail.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 1545 bytes --]
Hi!
We've done some profiling of the sbc encoder code over the past 2
weeks. The results show that most of the CPU cycles are spent in
_sbc_analyze_four and _sbc_analyze_eight. Those functions consist
mainly of multiplication and add operations. The ARM assembly code
that gcc generates for the current C version of those functions contains
a lot of unnecessary code, including needless store ops (stmdb). Besides,
gcc does not take advantage of the 'smlal' instruction, which multiplies
and accumulates in a single op.
For example, the following C code from the _sbc_analyze_eight function:
MUL(res, _sbc_proto_8[11], in[1]);
MULA(res, _sbc_proto_8[12], in[17]);
MULA(res, _sbc_proto_8[13], in[33]);
MULA(res, _sbc_proto_8[14], in[49]);
MULA(res, _sbc_proto_8[15], in[65]);
MULA(res, _sbc_proto_8[16], in[3]);
MULA(res, _sbc_proto_8[17], in[19]);
MULA(res, _sbc_proto_8[18], in[35]);
MULA(res, _sbc_proto_8[19], in[51]);
MULA(res, _sbc_proto_8[20], in[67]);
t[2] = SCALE8_STAGE1(res);
generates the output shown in gcc-no-arm-optimization.txt.
With the attached patch (arm-optimization.patch), gcc generates the output
shown in gcc-arm-optimization.txt, which is ~20% faster on encode.
The MULA macro forces gcc to use the 'smlal' instruction, and declaring
the 'res' variable with 'register' (with the functions no longer declared
'static inline') removes the store operations.
The problem is code maintenance.
To make this patch work, it is necessary to declare the 'res' variable
with 'register', which means changing the _sbc_analyze_four and
_sbc_analyze_eight functions with #ifdef __arm__.
How can we do this without an #ifdef in the C file?
[-- Attachment #2: gcc-no-arm-optimization.txt --]
[-- Type: text/plain, Size: 4938 bytes --]
ldr r5, [r0, #4]
mov lr, #41
mov r6, r5, asr #31
umull fp, ip, r5, lr
ldr r2, [r0, #196]
ldr r3, [r0, #68]
mov r8, r2
mov r9, r8, asr #31
mov r4, r3, asr #31
add sl, sp, #672
add lr, sp, #704
stmia sl, {r8-r9}
adr r9, .L197
ldmia r9, {r9-sl}
stmdb lr, {fp-ip}
umull fp, ip, r3, r9
add lr, sp, #688
stmia lr, {fp-ip}
ldr fp, [r0, #260]
add lr, sp, #656
mov ip, fp, asr #31
stmia lr, {fp-ip}
ldr r2, .L197+40
ldr lr, [sp, #672]
ldr r7, [r0, #132]
umull fp, ip, lr, r2
add lr, sp, #672
stmdb lr, {fp-ip}
ldr lr, .L197+44
mov r8, r7, asr #31
umull fp, ip, r7, lr
add lr, sp, #688
stmdb lr, {fp-ip}
ldr lr, [sp, #656]
mvn r2, #916
umull fp, ip, lr, r2
add lr, sp, #656
stmdb lr, {fp-ip}
ldr fp, [r0, #12]
add lr, sp, #640
mov ip, fp, asr #31
stmia lr, {fp-ip}
ldr ip, [sp, #692]
ldr r3, [r0, #76]
mla ip, r4, r9, ip
mov r4, r3, asr #31
add r9, sp, #624
stmia r9, {r3-r4}
str ip, [sp, #692]
ldr r4, [sp, #700]
ldr ip, [sp, #640]
mov r9, #41
mov r3, #145
umull sl, fp, ip, r3
mla r4, r9, r6, r4
ldr lr, [r0, #140]
ldr r2, [r0, #204]
add ip, sp, #640
ldr r5, [sp, #684]
stmdb ip, {sl-fp}
str r4, [sp, #700]
mov r9, lr
mov sl, r9, asr #31
mov r6, r2
mov r7, r6, asr #31
ldr lr, [sp, #676]
ldr r2, [sp, #668]
ldr r4, .L197+44
ldr ip, .L197+40
mla r4, r8, r4, r5
mla ip, lr, ip, r2
ldr r3, [r0, #268]
add r8, sp, #608
add fp, sp, #624
stmdb fp, {r9-sl}
stmia r8, {r6-r7}
add sl, sp, #608
mov r6, r3
mov r7, r6, asr #31
str r4, [sp, #684]
ldr r5, .L197+48
str ip, [sp, #36]
stmdb sl, {r6-r7}
ldr r4, [sp, #624]
ldr r6, [sp, #616]
ldr r7, .L197+52
ldr ip, [sp, #652]
umull r8, r9, r4, r5
ldr fp, [sp, #660]
umull r4, r5, r6, r7
ldr r2, [sp, #608]
ldr r3, .L197+56
mvn lr, #916
mla lr, fp, lr, ip
ldr ip, [sp, #600]
umull sl, fp, r2, r3
add r6, sp, #32
mvn r2, #46
stmdb r6, {r4-r5}
add r3, sp, #688
umull r6, r7, ip, r2
ldmia r3, {r4-r5}
add r2, sp, #704
ldmdb r2, {r2-r3}
adds r4, r4, r2
adc r5, r5, r3
ldr r2, [sp, #636]
ldr r3, [sp, #644]
mov ip, #145
mla r3, ip, r3, r2
str r3, [sp, #636]
add r3, sp, #688
ldmdb r3, {r2-r3}
ldr ip, [sp, #36]
adds r4, r4, r2
ldr r2, [sp, #672]
adc r5, r5, r3
rsb ip, r2, ip
str ip, [sp, #668]
ldr r3, .L197+48
ldr ip, [sp, #628]
mla r9, ip, r3, r9
ldr ip, [sp, #656]
add r3, sp, #672
ldmdb r3, {r2-r3}
rsb lr, ip, lr
adds r4, r4, r2
str lr, [sp, #652]
ldr r2, [sp, #620]
adc r5, r5, r3
ldr lr, .L197+52
ldr r3, [sp, #28]
ldr ip, .L197+56
mla lr, r2, lr, r3
add r3, sp, #656
str lr, [sp, #28]
ldr lr, [sp, #612]
ldmdb r3, {r2-r3}
adds r4, r4, r2
ldr r2, [sp, #604]
mla ip, lr, ip, fp
adc r5, r5, r3
mvn lr, #46
add r3, sp, #640
mla lr, r2, lr, r7
ldmdb r3, {r2-r3}
adds r4, r4, r2
adc r5, r5, r3
adds r4, r4, r8
adc r5, r5, r9
ldr r2, [sp, #608]
add r9, sp, #32
ldmdb r9, {r8-r9}
ldr r3, [sp, #600]
adds r4, r4, r8
adc r5, r5, r9
rsb fp, r2, ip
adds r4, r4, sl
adc r5, r5, fp
rsb r7, r3, lr
adds r4, r4, r6
adc r5, r5, r7
mov r4, r4, lsr #8
orr r4, r4, r5, asl #24
mov r5, r5, asr #8
str r4, [sp, #592]
str r5, [sp, #596]
str r4, [sp, #920]
str r4, [sp, #588]
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: arm-optimization.patch --]
[-- Type: text/x-patch; name=arm-optimization.patch, Size: 1737 bytes --]
Index: sbc.c
===================================================================
RCS file: /cvsroot/bluez/utils/sbc/sbc.c,v
retrieving revision 1.55
diff -u -p -r1.55 sbc.c
--- sbc.c 28 Jan 2008 18:00:51 -0000 1.55
+++ sbc.c 29 Jan 2008 14:42:29 -0000
@@ -708,10 +708,15 @@ static void sbc_encoder_init(struct sbc_
state->position[0] = state->position[1] = 9 * frame->subbands;
}
-
+#ifdef __arm__
+void _sbc_analyze_four(const int16_t *in, int32_t *out)
+{
+ sbc_extended_t register res asm("r4");
+#else
static inline void _sbc_analyze_four(const int16_t *in, int32_t *out)
{
sbc_extended_t res;
+#endif
sbc_fixed_t t[8];
sbc_extended_t s[5];
@@ -794,9 +799,15 @@ static inline void sbc_analyze_four(stru
state->position[ch] = 36;
}
+#ifdef __arm__
+void _sbc_analyze_eight(const int16_t *in, int32_t *out)
+{
+ sbc_extended_t register res asm("r4");
+#else
static inline void _sbc_analyze_eight(const int16_t *in, int32_t *out)
{
sbc_extended_t res;
+#endif
sbc_fixed_t t[8];
sbc_extended_t s[8];
Index: sbc_math.h
===================================================================
RCS file: /cvsroot/bluez/utils/sbc/sbc_math.h,v
retrieving revision 1.12
diff -u -p -r1.12 sbc_math.h
--- sbc_math.h 28 Jan 2008 18:00:51 -0000 1.12
+++ sbc_math.h 29 Jan 2008 14:42:29 -0000
@@ -66,4 +66,12 @@ typedef long long sbc_extended_t;
#define SUB(dst, src) { dst -= src; }
#define MUL(dst, a, b) { dst = (sbc_extended_t) (a) * (b); }
#define DIV2(dst, src) { dst = ASR(src, 1); }
+
+#ifdef __arm__
+#define MULA(res, a, b) __asm__( \
+ "smlal %Q0, %R0, %2, %3" \
+ : "=&r" (res) \
+ : "0" (res), "r" (a), "r" (b));
+#else
#define MULA(dst, a, b) { dst += (sbc_extended_t) (a) * (b); }
+#endif
[-- Attachment #4: gcc-arm-optimization.txt --]
[-- Type: text/plain, Size: 3508 bytes --]
ldr r2, [r0, #4]
mov sl, #41
mov r3, r2, asr #31
umull r4, r5, r2, sl
ldr ip, .L198+32
mla r5, sl, r3, r5
ldr lr, .L198+36
ldr r3, [r0, #68]
ldr sl, .L198+40
#APP
smlal r4, r5, ip, r3
mov r2, #145 @ movhi
ldr r3, [r0, #132]
mvn ip, #916
#APP
smlal r4, r5, sl, r3
ldr r8, .L198+44
ldr r3, [r0, #196]
ldr r9, .L198+48
#APP
smlal r4, r5, lr, r3
ldr r3, [r0, #260]
ldr lr, .L198+52
#APP
smlal r4, r5, ip, r3
ldr r3, [r0, #12]
#APP
smlal r4, r5, r2, r3
ldr r3, [r0, #76]
mvn r2, #46 @ movhi
#APP
smlal r4, r5, r8, r3
ldr r3, [r0, #140]
#APP
smlal r4, r5, r9, r3
ldr r3, [r0, #204]
#APP
smlal r4, r5, lr, r3
ldr r3, [r0, #268]
#APP
smlal r4, r5, r2, r3
mov r4, r4, lsr #8
orr r6, r4, r5, asl #24
str r6, [sp, #528]
str r6, [sp, #424]
[-- Attachment #5: Type: text/plain, Size: 228 bytes --]
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
[-- Attachment #6: Type: text/plain, Size: 164 bytes --]
_______________________________________________
Bluez-devel mailing list
Bluez-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bluez-devel
next reply other threads:[~2008-01-29 17:46 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-01-29 17:46 Cidorvan Leite [this message]
2008-01-29 19:07 ` [Bluez-devel] ARM optimization Brad Midgley
2008-01-30 16:19 ` Brad Midgley
2008-01-30 17:47 ` Marcel Holtmann
2008-01-30 19:26 ` Brad Midgley
2008-01-30 19:56 ` Cidorvan Leite
2008-01-30 20:21 ` Marcel Holtmann
2008-01-30 17:07 ` Cidorvan Leite
2008-01-30 17:41 ` Brad Midgley
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=50282bd30801290946l359dc7a6j29bb3b891ab35f9a@mail.gmail.com \
--to=cidorvan@gmail.com \
--cc=bluez-devel@lists.sourceforge.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox