* [Bluez-devel] ARM optimization
@ 2008-01-29 17:46 Cidorvan Leite
2008-01-29 19:07 ` Brad Midgley
0 siblings, 1 reply; 9+ messages in thread
From: Cidorvan Leite @ 2008-01-29 17:46 UTC (permalink / raw)
To: bluez-devel
[-- Attachment #1: Type: text/plain, Size: 1545 bytes --]
Hi!
We've been profiling the sbc encoder code for the past 2
weeks. The results show that most of the CPU cycles are spent in
_sbc_analyze_four and _sbc_analyze_eight. Those functions consist
mainly of multiplication and add operations. The ARM assembly code
generated by gcc for the current C version of those functions contains
a lot of junk code, including unnecessary store ops (stmdb). Besides,
gcc does not take advantage of the 'smlal' instruction, which multiplies
and accumulates in a single op.
For example - The following C code from _sbc_analyze_eight function:
MUL(res, _sbc_proto_8[11], in[1]);
MULA(res, _sbc_proto_8[12], in[17]);
MULA(res, _sbc_proto_8[13], in[33]);
MULA(res, _sbc_proto_8[14], in[49]);
MULA(res, _sbc_proto_8[15], in[65]);
MULA(res, _sbc_proto_8[16], in[3]);
MULA(res, _sbc_proto_8[17], in[19]);
MULA(res, _sbc_proto_8[18], in[35]);
MULA(res, _sbc_proto_8[19], in[51]);
MULA(res, _sbc_proto_8[20], in[67]);
t[2] = SCALE8_STAGE1(res);
currently compiles to the assembly shown in gcc-no-arm-optimization.txt.
With the attached arm-optimization.patch, gcc instead generates the code in
gcc-arm-optimization.txt, which is ~20% faster on encode.
The MULA macro forces gcc to use the 'smlal' instruction, and declaring
the 'res' variable as a register variable (in a function that is not
static inline) removes the store operations.
The problem is code maintenance.
So, for this patch to work, it is necessary to declare the 'res' variable
with 'register', changing the _sbc_analyze_four and _sbc_analyze_eight
functions with #ifdef __arm__.
How can we do this without an #ifdef in the C file?
[-- Attachment #2: gcc-no-arm-optimization.txt --]
[-- Type: text/plain, Size: 4938 bytes --]
ldr r5, [r0, #4]
mov lr, #41
mov r6, r5, asr #31
umull fp, ip, r5, lr
ldr r2, [r0, #196]
ldr r3, [r0, #68]
mov r8, r2
mov r9, r8, asr #31
mov r4, r3, asr #31
add sl, sp, #672
add lr, sp, #704
stmia sl, {r8-r9}
adr r9, .L197
ldmia r9, {r9-sl}
stmdb lr, {fp-ip}
umull fp, ip, r3, r9
add lr, sp, #688
stmia lr, {fp-ip}
ldr fp, [r0, #260]
add lr, sp, #656
mov ip, fp, asr #31
stmia lr, {fp-ip}
ldr r2, .L197+40
ldr lr, [sp, #672]
ldr r7, [r0, #132]
umull fp, ip, lr, r2
add lr, sp, #672
stmdb lr, {fp-ip}
ldr lr, .L197+44
mov r8, r7, asr #31
umull fp, ip, r7, lr
add lr, sp, #688
stmdb lr, {fp-ip}
ldr lr, [sp, #656]
mvn r2, #916
umull fp, ip, lr, r2
add lr, sp, #656
stmdb lr, {fp-ip}
ldr fp, [r0, #12]
add lr, sp, #640
mov ip, fp, asr #31
stmia lr, {fp-ip}
ldr ip, [sp, #692]
ldr r3, [r0, #76]
mla ip, r4, r9, ip
mov r4, r3, asr #31
add r9, sp, #624
stmia r9, {r3-r4}
str ip, [sp, #692]
ldr r4, [sp, #700]
ldr ip, [sp, #640]
mov r9, #41
mov r3, #145
umull sl, fp, ip, r3
mla r4, r9, r6, r4
ldr lr, [r0, #140]
ldr r2, [r0, #204]
add ip, sp, #640
ldr r5, [sp, #684]
stmdb ip, {sl-fp}
str r4, [sp, #700]
mov r9, lr
mov sl, r9, asr #31
mov r6, r2
mov r7, r6, asr #31
ldr lr, [sp, #676]
ldr r2, [sp, #668]
ldr r4, .L197+44
ldr ip, .L197+40
mla r4, r8, r4, r5
mla ip, lr, ip, r2
ldr r3, [r0, #268]
add r8, sp, #608
add fp, sp, #624
stmdb fp, {r9-sl}
stmia r8, {r6-r7}
add sl, sp, #608
mov r6, r3
mov r7, r6, asr #31
str r4, [sp, #684]
ldr r5, .L197+48
str ip, [sp, #36]
stmdb sl, {r6-r7}
ldr r4, [sp, #624]
ldr r6, [sp, #616]
ldr r7, .L197+52
ldr ip, [sp, #652]
umull r8, r9, r4, r5
ldr fp, [sp, #660]
umull r4, r5, r6, r7
ldr r2, [sp, #608]
ldr r3, .L197+56
mvn lr, #916
mla lr, fp, lr, ip
ldr ip, [sp, #600]
umull sl, fp, r2, r3
add r6, sp, #32
mvn r2, #46
stmdb r6, {r4-r5}
add r3, sp, #688
umull r6, r7, ip, r2
ldmia r3, {r4-r5}
add r2, sp, #704
ldmdb r2, {r2-r3}
adds r4, r4, r2
adc r5, r5, r3
ldr r2, [sp, #636]
ldr r3, [sp, #644]
mov ip, #145
mla r3, ip, r3, r2
str r3, [sp, #636]
add r3, sp, #688
ldmdb r3, {r2-r3}
ldr ip, [sp, #36]
adds r4, r4, r2
ldr r2, [sp, #672]
adc r5, r5, r3
rsb ip, r2, ip
str ip, [sp, #668]
ldr r3, .L197+48
ldr ip, [sp, #628]
mla r9, ip, r3, r9
ldr ip, [sp, #656]
add r3, sp, #672
ldmdb r3, {r2-r3}
rsb lr, ip, lr
adds r4, r4, r2
str lr, [sp, #652]
ldr r2, [sp, #620]
adc r5, r5, r3
ldr lr, .L197+52
ldr r3, [sp, #28]
ldr ip, .L197+56
mla lr, r2, lr, r3
add r3, sp, #656
str lr, [sp, #28]
ldr lr, [sp, #612]
ldmdb r3, {r2-r3}
adds r4, r4, r2
ldr r2, [sp, #604]
mla ip, lr, ip, fp
adc r5, r5, r3
mvn lr, #46
add r3, sp, #640
mla lr, r2, lr, r7
ldmdb r3, {r2-r3}
adds r4, r4, r2
adc r5, r5, r3
adds r4, r4, r8
adc r5, r5, r9
ldr r2, [sp, #608]
add r9, sp, #32
ldmdb r9, {r8-r9}
ldr r3, [sp, #600]
adds r4, r4, r8
adc r5, r5, r9
rsb fp, r2, ip
adds r4, r4, sl
adc r5, r5, fp
rsb r7, r3, lr
adds r4, r4, r6
adc r5, r5, r7
mov r4, r4, lsr #8
orr r4, r4, r5, asl #24
mov r5, r5, asr #8
str r4, [sp, #592]
str r5, [sp, #596]
str r4, [sp, #920]
str r4, [sp, #588]
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: arm-optimization.patch --]
[-- Type: text/x-patch; name=arm-optimization.patch, Size: 1737 bytes --]
Index: sbc.c
===================================================================
RCS file: /cvsroot/bluez/utils/sbc/sbc.c,v
retrieving revision 1.55
diff -u -p -r1.55 sbc.c
--- sbc.c 28 Jan 2008 18:00:51 -0000 1.55
+++ sbc.c 29 Jan 2008 14:42:29 -0000
@@ -708,10 +708,15 @@ static void sbc_encoder_init(struct sbc_
state->position[0] = state->position[1] = 9 * frame->subbands;
}
-
+#ifdef __arm__
+void _sbc_analyze_four(const int16_t *in, int32_t *out)
+{
+ sbc_extended_t register res asm("r4");
+#else
static inline void _sbc_analyze_four(const int16_t *in, int32_t *out)
{
sbc_extended_t res;
+#endif
sbc_fixed_t t[8];
sbc_extended_t s[5];
@@ -794,9 +799,15 @@ static inline void sbc_analyze_four(stru
state->position[ch] = 36;
}
+#ifdef __arm__
+void _sbc_analyze_eight(const int16_t *in, int32_t *out)
+{
+ sbc_extended_t register res asm("r4");
+#else
static inline void _sbc_analyze_eight(const int16_t *in, int32_t *out)
{
sbc_extended_t res;
+#endif
sbc_fixed_t t[8];
sbc_extended_t s[8];
Index: sbc_math.h
===================================================================
RCS file: /cvsroot/bluez/utils/sbc/sbc_math.h,v
retrieving revision 1.12
diff -u -p -r1.12 sbc_math.h
--- sbc_math.h 28 Jan 2008 18:00:51 -0000 1.12
+++ sbc_math.h 29 Jan 2008 14:42:29 -0000
@@ -66,4 +66,12 @@ typedef long long sbc_extended_t;
#define SUB(dst, src) { dst -= src; }
#define MUL(dst, a, b) { dst = (sbc_extended_t) (a) * (b); }
#define DIV2(dst, src) { dst = ASR(src, 1); }
+
+#ifdef __arm__
+#define MULA(res, a, b) __asm__( \
+ "smlal %Q0, %R0, %2, %3" \
+ : "=&r" (res) \
+ : "0" (res), "r" (a), "r" (b));
+#else
#define MULA(dst, a, b) { dst += (sbc_extended_t) (a) * (b); }
+#endif
[-- Attachment #4: gcc-arm-optimization.txt --]
[-- Type: text/plain, Size: 3508 bytes --]
ldr r2, [r0, #4]
mov sl, #41
mov r3, r2, asr #31
umull r4, r5, r2, sl
ldr ip, .L198+32
mla r5, sl, r3, r5
ldr lr, .L198+36
ldr r3, [r0, #68]
ldr sl, .L198+40
#APP
smlal r4, r5, ip, r3
mov r2, #145 @ movhi
ldr r3, [r0, #132]
mvn ip, #916
#APP
smlal r4, r5, sl, r3
ldr r8, .L198+44
ldr r3, [r0, #196]
ldr r9, .L198+48
#APP
smlal r4, r5, lr, r3
ldr r3, [r0, #260]
ldr lr, .L198+52
#APP
smlal r4, r5, ip, r3
ldr r3, [r0, #12]
#APP
smlal r4, r5, r2, r3
ldr r3, [r0, #76]
mvn r2, #46 @ movhi
#APP
smlal r4, r5, r8, r3
ldr r3, [r0, #140]
#APP
smlal r4, r5, r9, r3
ldr r3, [r0, #204]
#APP
smlal r4, r5, lr, r3
ldr r3, [r0, #268]
#APP
smlal r4, r5, r2, r3
mov r4, r4, lsr #8
orr r6, r4, r5, asl #24
str r6, [sp, #528]
str r6, [sp, #424]
[-- Attachment #5: Type: text/plain, Size: 228 bytes --]
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
[-- Attachment #6: Type: text/plain, Size: 164 bytes --]
_______________________________________________
Bluez-devel mailing list
Bluez-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bluez-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [Bluez-devel] ARM optimization
2008-01-29 17:46 [Bluez-devel] ARM optimization Cidorvan Leite
@ 2008-01-29 19:07 ` Brad Midgley
2008-01-30 16:19 ` Brad Midgley
2008-01-30 17:07 ` Cidorvan Leite
0 siblings, 2 replies; 9+ messages in thread
From: Brad Midgley @ 2008-01-29 19:07 UTC (permalink / raw)
To: BlueZ development
Hey
The only thing I'd add here is that the ugliness of putting the function
declaration inside the ifdef can be mitigated a bit by instead accepting
the ugliness of making the function non-static on all
platforms.
The thing to note here is that the optimization will not take effect if
gcc sees that the function is static *or* inline.
It isn't perfect, but replacing the whole function with assembly will
be even harder to maintain. We're really hurting on omap so we need to
take some steps to get performance up.
diff -u -r1.56 sbc.c
--- sbc/sbc.c 29 Jan 2008 18:56:13 -0000 1.56
+++ sbc/sbc.c 29 Jan 2008 19:03:06 -0000
@@ -709,9 +709,13 @@
}
-static inline void _sbc_analyze_four(const int32_t *in, int32_t *out)
+void _sbc_analyze_four(const int32_t *in, int32_t *out)
{
+#ifdef __arm__
+ sbc_extended_t register res asm("r4");
+#else
sbc_extended_t res;
+#endif
sbc_fixed_t t[8];
sbc_extended_t s[5];
@@ -794,9 +798,13 @@
state->position[ch] = 36;
}
-static inline void _sbc_analyze_eight(const int32_t *in, int32_t *out)
+void _sbc_analyze_eight(const int32_t *in, int32_t *out)
{
+#ifdef __arm__
+ sbc_extended_t register res asm("r4");
+#else
sbc_extended_t res;
+#endif
sbc_fixed_t t[8];
sbc_extended_t s[8];
--
Brad
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
Bluez-devel mailing list
Bluez-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bluez-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [Bluez-devel] ARM optimization
2008-01-29 19:07 ` Brad Midgley
@ 2008-01-30 16:19 ` Brad Midgley
2008-01-30 17:47 ` Marcel Holtmann
2008-01-30 17:07 ` Cidorvan Leite
1 sibling, 1 reply; 9+ messages in thread
From: Brad Midgley @ 2008-01-30 16:19 UTC (permalink / raw)
To: BlueZ development
Marcel
Cidorvan has a new proposal. It shifts MUL/MULA around so the
assignment happens outside the macro. The good news is the assembly is
now limited to a single ifdef around the macro definition. If we had
to define eg MIPS code or something specialized it would be relatively
quick & clean.
The tricky part is he nests the use of the macro. I'll show you how he
formatted it. Formatting it in a traditional way will get messy
quickly unfortunately.
Index: sbc.c
===================================================================
RCS file: /cvsroot/bluez/utils/sbc/sbc.c,v
retrieving revision 1.57
diff -u -p -r1.57 sbc.c
--- sbc.c 29 Jan 2008 19:47:49 -0000 1.57
+++ sbc.c 30 Jan 2008 13:22:53 -0000
@@ -614,7 +614,8 @@ static inline void sbc_synthesize_four(s
/* Distribute the new matrix value to the shifted position */
SBC_FIXED_0(res);
for (j = 0; j < 4; j++)
- MULA(res, synmatrix4[i][j], frame->sb_sample[blk][ch][j]);
+ res = MULA(synmatrix4[i][j],
+ frame->sb_sample[blk][ch][j], res);
state->V[ch][state->offset[ch][i]] = SCALE4_STAGED1(res);
}
@@ -623,10 +624,10 @@ static inline void sbc_synthesize_four(s
k = (i + 4) & 0xf;
SBC_FIXED_0(res);
for (j = 0; j < 10; idx++) {
- MULA(res, state->V[ch][state->offset[ch][i]+j++],
- sbc_proto_4_40m0[idx]);
- MULA(res, state->V[ch][state->offset[ch][k]+j++],
- sbc_proto_4_40m1[idx]);
+ res = MULA(state->V[ch][state->offset[ch][i]+j++],
+ sbc_proto_4_40m0[idx], res);
+ res = MULA(state->V[ch][state->offset[ch][k]+j++],
+ sbc_proto_4_40m1[idx], res);
}
/* Store in output, Q0 */
@@ -655,7 +656,8 @@ static inline void sbc_synthesize_eight(
SBC_FIXED_0(res);
for (j = 0; j < 8; j++) {
/* Q28 = Q15 * Q13 */
- MULA(res, synmatrix8[i][j], frame->sb_sample[blk][ch][j]);
+ res = MULA(synmatrix8[i][j],
+ frame->sb_sample[blk][ch][j], res);
}
/* Q10 */
state->V[ch][state->offset[ch][i]] = SCALE8_STAGED1(res);
@@ -666,8 +668,10 @@ static inline void sbc_synthesize_eight(
k = (i + 8) & 0xf;
SBC_FIXED_0(res);
for (j = 0; j < 10; idx++) {
- MULA(res, state->V[ch][state->offset[ch][i]+j++], sbc_proto_8_80m0[idx]);
- MULA(res, state->V[ch][state->offset[ch][k]+j++], sbc_proto_8_80m1[idx]);
+ res = MULA(state->V[ch][state->offset[ch][i]+j++],
+ sbc_proto_8_80m0[idx], res);
+ res = MULA(state->V[ch][state->offset[ch][k]+j++],
+ sbc_proto_8_80m1[idx], res);
}
/* Store in output */
frame->pcm_sample[ch][blk * 8 + i] = SCALE8_STAGED2(res); // Q0
@@ -708,67 +712,66 @@ static void sbc_encoder_init(struct sbc_
state->position[0] = state->position[1] = 9 * frame->subbands;
}
-
static inline void _sbc_analyze_four(const int32_t *in, int32_t *out)
{
- sbc_extended_t res;
sbc_fixed_t t[8];
sbc_extended_t s[5];
- MUL(res, _sbc_proto_4[0], in[8] - in[32]); /* Q18 */
- MULA(res, _sbc_proto_4[1], in[16] - in[24]);
- t[0] = SCALE4_STAGE1(res); /* Q8 */
-
- MUL(res, _sbc_proto_4[2], in[1]);
- MULA(res, _sbc_proto_4[3], in[9]);
- MULA(res, _sbc_proto_4[4], in[17]);
- MULA(res, _sbc_proto_4[5], in[25]);
- MULA(res, _sbc_proto_4[6], in[33]);
- t[1] = SCALE4_STAGE1(res);
-
- MUL(res, _sbc_proto_4[7], in[2]);
- MULA(res, _sbc_proto_4[8], in[10]);
- MULA(res, _sbc_proto_4[9], in[18]);
- MULA(res, _sbc_proto_4[10], in[26]);
- MULA(res, _sbc_proto_4[11], in[34]);
- t[2] = SCALE4_STAGE1(res);
-
- MUL(res, _sbc_proto_4[12], in[3]);
- MULA(res, _sbc_proto_4[13], in[11]);
- MULA(res, _sbc_proto_4[14], in[19]);
- MULA(res, _sbc_proto_4[15], in[27]);
- MULA(res, _sbc_proto_4[16], in[35]);
- t[3] = SCALE4_STAGE1(res);
-
- MUL(res, _sbc_proto_4[17], in[4] + in[36]);
- MULA(res, _sbc_proto_4[18], in[12] + in[28]);
- MULA(res, _sbc_proto_4[19], in[20]);
- t[4] = SCALE4_STAGE1(res);
-
- MUL(res, _sbc_proto_4[16], in[5]);
- MULA(res, _sbc_proto_4[15], in[13]);
- MULA(res, _sbc_proto_4[14], in[21]);
- MULA(res, _sbc_proto_4[13], in[29]);
- MULA(res, _sbc_proto_4[12], in[37]);
- t[5] = SCALE4_STAGE1(res);
+ t[0] = SCALE4_STAGE1( /* Q8 */
+ MULA(_sbc_proto_4[0], in[8] - in[32], /* Q18 */
+ MUL( _sbc_proto_4[1], in[16] - in[24])));
+
+ t[1] = SCALE4_STAGE1(
+ MULA(_sbc_proto_4[2], in[1],
+ MULA(_sbc_proto_4[3], in[9],
+ MULA(_sbc_proto_4[4], in[17],
+ MULA(_sbc_proto_4[5], in[25],
+ MUL( _sbc_proto_4[6], in[33]))))));
+
+ t[2] = SCALE4_STAGE1(
+ MULA(_sbc_proto_4[7], in[2],
+ MULA(_sbc_proto_4[8], in[10],
+ MULA(_sbc_proto_4[9], in[18],
+ MULA(_sbc_proto_4[10], in[26],
+ MUL( _sbc_proto_4[11], in[34]))))));
+
+ t[3] = SCALE4_STAGE1(
+ MULA(_sbc_proto_4[12], in[3],
+ MULA(_sbc_proto_4[13], in[11],
+ MULA(_sbc_proto_4[14], in[19],
+ MULA(_sbc_proto_4[15], in[27],
+ MUL( _sbc_proto_4[16], in[35]))))));
+
+ t[4] = SCALE4_STAGE1(
+ MULA(_sbc_proto_4[17], in[4] + in[36],
+ MULA(_sbc_proto_4[18], in[12] + in[28],
+ MUL( _sbc_proto_4[19], in[20]))));
+
+ t[5] = SCALE4_STAGE1(
+ MULA(_sbc_proto_4[16], in[5],
+ MULA(_sbc_proto_4[15], in[13],
+ MULA(_sbc_proto_4[14], in[21],
+ MULA(_sbc_proto_4[13], in[29],
+ MUL( _sbc_proto_4[12], in[37]))))));
/* don't compute t[6]... this term always multiplies
* with cos(pi/2) = 0 */
- MUL(res, _sbc_proto_4[6], in[7]);
- MULA(res, _sbc_proto_4[5], in[15]);
- MULA(res, _sbc_proto_4[4], in[23]);
- MULA(res, _sbc_proto_4[3], in[31]);
- MULA(res, _sbc_proto_4[2], in[39]);
- t[7] = SCALE4_STAGE1(res);
-
- MUL(s[0], _anamatrix4[0], t[0] + t[4]);
- MUL(s[1], _anamatrix4[2], t[2]);
- MUL(s[2], _anamatrix4[1], t[1] + t[3]);
- MULA(s[2], _anamatrix4[3], t[5]);
- MUL(s[3], _anamatrix4[3], t[1] + t[3]);
- MULA(s[3], _anamatrix4[1], - t[5] + t[7]);
- MUL(s[4], _anamatrix4[3], t[7]);
+ t[7] = SCALE4_STAGE1(
+ MULA(_sbc_proto_4[6], in[7],
+ MULA(_sbc_proto_4[5], in[15],
+ MULA(_sbc_proto_4[4], in[23],
+ MULA(_sbc_proto_4[3], in[31],
+ MUL( _sbc_proto_4[2], in[39]))))));
+
+ s[0] = MUL( _anamatrix4[0], t[0] + t[4]);
+ s[1] = MUL( _anamatrix4[2], t[2]);
+ s[2] = MULA(_anamatrix4[1], t[1] + t[3],
+ MUL(_anamatrix4[3], t[5]));
+ s[3] = MULA(_anamatrix4[3], t[1] + t[3],
+ MUL(_anamatrix4[1], -t[5] + t[7]));
+ s[4] = MUL( _anamatrix4[3], t[7]);
+
out[0] = SCALE4_STAGE2( s[0] + s[1] + s[2] + s[4]); /* Q0 */
out[1] = SCALE4_STAGE2(-s[0] + s[1] + s[3]);
out[2] = SCALE4_STAGE2(-s[0] + s[1] - s[3]);
@@ -796,117 +799,117 @@ static inline void sbc_analyze_four(stru
static inline void _sbc_analyze_eight(const int32_t *in, int32_t *out)
{
- sbc_extended_t res;
sbc_fixed_t t[8];
sbc_extended_t s[8];
- MUL(res, _sbc_proto_8[0], (in[16] - in[64])); /* Q18 = Q18 * Q0 */
- MULA(res, _sbc_proto_8[1], (in[32] - in[48]));
- MULA(res, _sbc_proto_8[2], in[4]);
- MULA(res, _sbc_proto_8[3], in[20]);
- MULA(res, _sbc_proto_8[4], in[36]);
- MULA(res, _sbc_proto_8[5], in[52]);
- t[0] = SCALE8_STAGE1(res); /* Q10 */
-
- MUL(res, _sbc_proto_8[6], in[2]);
- MULA(res, _sbc_proto_8[7], in[18]);
- MULA(res, _sbc_proto_8[8], in[34]);
- MULA(res, _sbc_proto_8[9], in[50]);
- MULA(res, _sbc_proto_8[10], in[66]);
- t[1] = SCALE8_STAGE1(res);
-
- MUL(res, _sbc_proto_8[11], in[1]);
- MULA(res, _sbc_proto_8[12], in[17]);
- MULA(res, _sbc_proto_8[13], in[33]);
- MULA(res, _sbc_proto_8[14], in[49]);
- MULA(res, _sbc_proto_8[15], in[65]);
- MULA(res, _sbc_proto_8[16], in[3]);
- MULA(res, _sbc_proto_8[17], in[19]);
- MULA(res, _sbc_proto_8[18], in[35]);
- MULA(res, _sbc_proto_8[19], in[51]);
- MULA(res, _sbc_proto_8[20], in[67]);
- t[2] = SCALE8_STAGE1(res);
-
- MUL(res, _sbc_proto_8[21], in[5]);
- MULA(res, _sbc_proto_8[22], in[21]);
- MULA(res, _sbc_proto_8[23], in[37]);
- MULA(res, _sbc_proto_8[24], in[53]);
- MULA(res, _sbc_proto_8[25], in[69]);
- MULA(res, -_sbc_proto_8[15], in[15]);
- MULA(res, -_sbc_proto_8[14], in[31]);
- MULA(res, -_sbc_proto_8[13], in[47]);
- MULA(res, -_sbc_proto_8[12], in[63]);
- MULA(res, -_sbc_proto_8[11], in[79]);
- t[3] = SCALE8_STAGE1(res);
-
- MUL(res, _sbc_proto_8[26], in[6]);
- MULA(res, _sbc_proto_8[27], in[22]);
- MULA(res, _sbc_proto_8[28], in[38]);
- MULA(res, _sbc_proto_8[29], in[54]);
- MULA(res, _sbc_proto_8[30], in[70]);
- MULA(res, -_sbc_proto_8[10], in[14]);
- MULA(res, -_sbc_proto_8[9], in[30]);
- MULA(res, -_sbc_proto_8[8], in[46]);
- MULA(res, -_sbc_proto_8[7], in[62]);
- MULA(res, -_sbc_proto_8[6], in[78]);
- t[4] = SCALE8_STAGE1(res);
-
- MUL(res, _sbc_proto_8[31], in[7]);
- MULA(res, _sbc_proto_8[32], in[23]);
- MULA(res, _sbc_proto_8[33], in[39]);
- MULA(res, _sbc_proto_8[34], in[55]);
- MULA(res, _sbc_proto_8[35], in[71]);
- MULA(res, -_sbc_proto_8[20], in[13]);
- MULA(res, -_sbc_proto_8[19], in[29]);
- MULA(res, -_sbc_proto_8[18], in[45]);
- MULA(res, -_sbc_proto_8[17], in[61]);
- MULA(res, -_sbc_proto_8[16], in[77]);
- t[5] = SCALE8_STAGE1(res);
-
- MUL(res, _sbc_proto_8[36], in[8] + in[72]);
- MULA(res, _sbc_proto_8[37], in[24] + in[56]);
- MULA(res, _sbc_proto_8[38], in[40]);
- MULA(res, -_sbc_proto_8[39], in[12]);
- MULA(res, -_sbc_proto_8[5], in[28]);
- MULA(res, -_sbc_proto_8[4], in[44]);
- MULA(res, -_sbc_proto_8[3], in[60]);
- MULA(res, -_sbc_proto_8[2], in[76]);
- t[6] = SCALE8_STAGE1(res);
-
- MUL(res, _sbc_proto_8[35], in[9]);
- MULA(res, _sbc_proto_8[34], in[25]);
- MULA(res, _sbc_proto_8[33], in[41]);
- MULA(res, _sbc_proto_8[32], in[57]);
- MULA(res, _sbc_proto_8[31], in[73]);
- MULA(res, -_sbc_proto_8[25], in[11]);
- MULA(res, -_sbc_proto_8[24], in[27]);
- MULA(res, -_sbc_proto_8[23], in[43]);
- MULA(res, -_sbc_proto_8[22], in[59]);
- MULA(res, -_sbc_proto_8[21], in[75]);
- t[7] = SCALE8_STAGE1(res);
-
- MUL(s[0], _anamatrix8[0], t[0]); /* = Q14 * Q10 */
- MULA(s[0], _anamatrix8[1], t[6]);
- MUL(s[1], _anamatrix8[7], t[1]);
- MUL(s[2], _anamatrix8[2], t[2]);
- MULA(s[2], _anamatrix8[3], t[3]);
- MULA(s[2], _anamatrix8[4], t[5]);
- MULA(s[2], _anamatrix8[5], t[7]);
- MUL(s[3], _anamatrix8[6], t[4]);
- MUL(s[4], _anamatrix8[3], t[2]);
- MULA(s[4], -_anamatrix8[5], t[3]);
- MULA(s[4], -_anamatrix8[2], t[5]);
- MULA(s[4], -_anamatrix8[4], t[7]);
- MUL(s[5], _anamatrix8[4], t[2]);
- MULA(s[5], -_anamatrix8[2], t[3]);
- MULA(s[5], _anamatrix8[5], t[5]);
- MULA(s[5], _anamatrix8[3], t[7]);
- MUL(s[6], _anamatrix8[1], t[0]);
- MULA(s[6], -_anamatrix8[0], t[6]);
- MUL(s[7], _anamatrix8[5], t[2]);
- MULA(s[7], -_anamatrix8[4], t[3]);
- MULA(s[7], _anamatrix8[3], t[5]);
- MULA(s[7], -_anamatrix8[2], t[7]);
+ t[0] = SCALE8_STAGE1( /* Q10 */
+ MULA(_sbc_proto_8[0], (in[16] - in[64]), /* Q18 = Q18 * Q0 */
+ MULA(_sbc_proto_8[1], (in[32] - in[48]),
+ MULA(_sbc_proto_8[2], in[4],
+ MULA(_sbc_proto_8[3], in[20],
+ MULA(_sbc_proto_8[4], in[36],
+ MUL( _sbc_proto_8[5], in[52])))))));
+
+ t[1] = SCALE8_STAGE1(
+ MULA(_sbc_proto_8[6], in[2],
+ MULA(_sbc_proto_8[7], in[18],
+ MULA(_sbc_proto_8[8], in[34],
+ MULA(_sbc_proto_8[9], in[50],
+ MUL(_sbc_proto_8[10], in[66]))))));
+
+ t[2] = SCALE8_STAGE1(
+ MULA(_sbc_proto_8[11], in[1],
+ MULA(_sbc_proto_8[12], in[17],
+ MULA(_sbc_proto_8[13], in[33],
+ MULA(_sbc_proto_8[14], in[49],
+ MULA(_sbc_proto_8[15], in[65],
+ MULA(_sbc_proto_8[16], in[3],
+ MULA(_sbc_proto_8[17], in[19],
+ MULA(_sbc_proto_8[18], in[35],
+ MULA(_sbc_proto_8[19], in[51],
+ MUL( _sbc_proto_8[20], in[67])))))))))));
+
+ t[3] = SCALE8_STAGE1(
+ MULA( _sbc_proto_8[21], in[5],
+ MULA( _sbc_proto_8[22], in[21],
+ MULA( _sbc_proto_8[23], in[37],
+ MULA( _sbc_proto_8[24], in[53],
+ MULA( _sbc_proto_8[25], in[69],
+ MULA(-_sbc_proto_8[15], in[15],
+ MULA(-_sbc_proto_8[14], in[31],
+ MULA(-_sbc_proto_8[13], in[47],
+ MULA(-_sbc_proto_8[12], in[63],
+ MUL( -_sbc_proto_8[11], in[79])))))))))));
+
+ t[4] = SCALE8_STAGE1(
+ MULA( _sbc_proto_8[26], in[6],
+ MULA( _sbc_proto_8[27], in[22],
+ MULA( _sbc_proto_8[28], in[38],
+ MULA( _sbc_proto_8[29], in[54],
+ MULA( _sbc_proto_8[30], in[70],
+ MULA(-_sbc_proto_8[10], in[14],
+ MULA(-_sbc_proto_8[9], in[30],
+ MULA(-_sbc_proto_8[8], in[46],
+ MULA(-_sbc_proto_8[7], in[62],
+ MUL( -_sbc_proto_8[6], in[78])))))))))));
+
+ t[5] = SCALE8_STAGE1(
+ MULA( _sbc_proto_8[31], in[7],
+ MULA( _sbc_proto_8[32], in[23],
+ MULA( _sbc_proto_8[33], in[39],
+ MULA( _sbc_proto_8[34], in[55],
+ MULA( _sbc_proto_8[35], in[71],
+ MULA(-_sbc_proto_8[20], in[13],
+ MULA(-_sbc_proto_8[19], in[29],
+ MULA(-_sbc_proto_8[18], in[45],
+ MULA(-_sbc_proto_8[17], in[61],
+ MUL( -_sbc_proto_8[16], in[77])))))))))));
+
+ t[6] = SCALE8_STAGE1(
+ MULA( _sbc_proto_8[36], (in[8] + in[72]),
+ MULA( _sbc_proto_8[37], (in[24] + in[56]),
+ MULA( _sbc_proto_8[38], in[40],
+ MULA(-_sbc_proto_8[39], in[12],
+ MULA(-_sbc_proto_8[5], in[28],
+ MULA(-_sbc_proto_8[4], in[44],
+ MULA(-_sbc_proto_8[3], in[60],
+ MUL( -_sbc_proto_8[2], in[76])))))))));
+
+ t[7] = SCALE8_STAGE1(
+ MULA( _sbc_proto_8[35], in[9],
+ MULA( _sbc_proto_8[34], in[25],
+ MULA( _sbc_proto_8[33], in[41],
+ MULA( _sbc_proto_8[32], in[57],
+ MULA( _sbc_proto_8[31], in[73],
+ MULA(-_sbc_proto_8[25], in[11],
+ MULA(-_sbc_proto_8[24], in[27],
+ MULA(-_sbc_proto_8[23], in[43],
+ MULA(-_sbc_proto_8[22], in[59],
+ MUL( -_sbc_proto_8[21], in[75])))))))))));
+
+ s[0] = MULA( _anamatrix8[0], t[0],
+ MUL( _anamatrix8[1], t[6]));
+ s[1] = MUL( _anamatrix8[7], t[1]);
+ s[2] = MULA( _anamatrix8[2], t[2],
+ MULA( _anamatrix8[3], t[3],
+ MULA( _anamatrix8[4], t[5],
+ MUL( _anamatrix8[5], t[7]))));
+ s[3] = MUL( _anamatrix8[6], t[4]);
+ s[4] = MULA( _anamatrix8[3], t[2],
+ MULA(-_anamatrix8[5], t[3],
+ MULA(-_anamatrix8[2], t[5],
+ MUL( -_anamatrix8[4], t[7]))));
+ s[5] = MULA( _anamatrix8[4], t[2],
+ MULA(-_anamatrix8[2], t[3],
+ MULA( _anamatrix8[5], t[5],
+ MUL( _anamatrix8[3], t[7]))));
+ s[6] = MULA( _anamatrix8[1], t[0],
+ MUL( -_anamatrix8[0], t[6]));
+ s[7] = MULA( _anamatrix8[5], t[2],
+ MULA(-_anamatrix8[4], t[3],
+ MULA( _anamatrix8[3], t[5],
+ MUL( -_anamatrix8[2], t[7]))));
+
out[0] = SCALE8_STAGE2( s[0] + s[1] + s[2] + s[3]);
out[1] = SCALE8_STAGE2( s[1] - s[3] + s[4] + s[6]);
out[2] = SCALE8_STAGE2( s[1] - s[3] + s[5] - s[6]);
Index: sbc_math.h
===================================================================
RCS file: /cvsroot/bluez/utils/sbc/sbc_math.h,v
retrieving revision 1.12
diff -u -p -r1.12 sbc_math.h
--- sbc_math.h 28 Jan 2008 18:00:51 -0000 1.12
+++ sbc_math.h 30 Jan 2008 13:22:53 -0000
@@ -64,6 +64,17 @@ typedef long long sbc_extended_t;
#define SBC_FIXED_0(val) { val = 0; }
#define ADD(dst, src) { dst += src; }
#define SUB(dst, src) { dst -= src; }
-#define MUL(dst, a, b) { dst = (sbc_extended_t) (a) * (b); }
+#define MUL(a, b) ((sbc_extended_t)(a) * (b))
#define DIV2(dst, src) { dst = ASR(src, 1); }
-#define MULA(dst, a, b) { dst += (sbc_extended_t) (a) * (b); }
+
+#ifdef __arm__
+#define MULA(a, b, res) ({ \
+ long long tmp = res; \
+ __asm__( \
+ "smlal %Q0, %R0, %2, %3" \
+ : "=&r" (tmp) \
+ : "0" (tmp), "r" (a), "r" (b)); \
+ tmp; })
+#else
+#define MULA(a, b, res) ((sbc_extended_t)(a) * (b) + (res))
+#endif
Brad
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
Bluez-devel mailing list
Bluez-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bluez-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [Bluez-devel] ARM optimization
2008-01-29 19:07 ` Brad Midgley
2008-01-30 16:19 ` Brad Midgley
@ 2008-01-30 17:07 ` Cidorvan Leite
2008-01-30 17:41 ` Brad Midgley
1 sibling, 1 reply; 9+ messages in thread
From: Cidorvan Leite @ 2008-01-30 17:07 UTC (permalink / raw)
To: BlueZ development
[-- Attachment #1: Type: text/plain, Size: 2146 bytes --]
Hi!
I found another way to remove the #ifdef from the C file.
I changed the format of the MUL and MULA macros, but the ARM assembly code
is still necessary.
This way it is no longer necessary to declare a temporary variable with
"register", and we can keep the functions static inline.
On Jan 29, 2008 4:07 PM, Brad Midgley <bmidgley@gmail.com> wrote:
> Hey
>
> The only thing I'd add here is the ugliness in putting the function
> declaration inside the ifdef is mitigated a bit by taking advantage
> instead of the ugliness of making the function non-static for all
> platforms.
>
> The thing to note here is the optimization will not take if gcc sees
> the function is static *or* inline.
>
> It isn't perfect, but replacing the whole function with assembly will
> be even harder to maintain. We're really hurting on omap so we need to
> take some steps to get performance up.
>
> diff -u -r1.56 sbc.c
> --- sbc/sbc.c 29 Jan 2008 18:56:13 -0000 1.56
> +++ sbc/sbc.c 29 Jan 2008 19:03:06 -0000
> @@ -709,9 +709,13 @@
> }
>
>
> -static inline void _sbc_analyze_four(const int32_t *in, int32_t *out)
> +void _sbc_analyze_four(const int32_t *in, int32_t *out)
> {
> +#ifdef __arm__
> + sbc_extended_t register res asm("r4");
> +#else
> sbc_extended_t res;
> +#endif
> sbc_fixed_t t[8];
> sbc_extended_t s[5];
>
> @@ -794,9 +798,13 @@
> state->position[ch] = 36;
> }
>
> -static inline void _sbc_analyze_eight(const int32_t *in, int32_t *out)
> +void _sbc_analyze_eight(const int32_t *in, int32_t *out)
> {
> +#ifdef __arm__
> + sbc_extended_t register res asm("r4");
> +#else
> sbc_extended_t res;
> +#endif
> sbc_fixed_t t[8];
> sbc_extended_t s[8];
>
>
> --
> Brad
>
> -------------------------------------------------------------------------
> This SF.net email is sponsored by: Microsoft
> Defy all challenges. Microsoft(R) Visual Studio 2008.
> http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
> _______________________________________________
> Bluez-devel mailing list
> Bluez-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/bluez-devel
>
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: p.diff --]
[-- Type: text/x-patch; name=p.diff, Size: 14499 bytes --]
Index: sbc.c
===================================================================
RCS file: /cvsroot/bluez/utils/sbc/sbc.c,v
retrieving revision 1.57
diff -u -p -r1.57 sbc.c
--- sbc.c 29 Jan 2008 19:47:49 -0000 1.57
+++ sbc.c 30 Jan 2008 13:22:53 -0000
@@ -614,7 +614,8 @@ static inline void sbc_synthesize_four(s
/* Distribute the new matrix value to the shifted position */
SBC_FIXED_0(res);
for (j = 0; j < 4; j++)
- MULA(res, synmatrix4[i][j], frame->sb_sample[blk][ch][j]);
+ res = MULA(synmatrix4[i][j],
+ frame->sb_sample[blk][ch][j], res);
state->V[ch][state->offset[ch][i]] = SCALE4_STAGED1(res);
}
@@ -623,10 +624,10 @@ static inline void sbc_synthesize_four(s
k = (i + 4) & 0xf;
SBC_FIXED_0(res);
for (j = 0; j < 10; idx++) {
- MULA(res, state->V[ch][state->offset[ch][i]+j++],
- sbc_proto_4_40m0[idx]);
- MULA(res, state->V[ch][state->offset[ch][k]+j++],
- sbc_proto_4_40m1[idx]);
+ res = MULA(state->V[ch][state->offset[ch][i]+j++],
+ sbc_proto_4_40m0[idx], res);
+ res = MULA(state->V[ch][state->offset[ch][k]+j++],
+ sbc_proto_4_40m1[idx], res);
}
/* Store in output, Q0 */
@@ -655,7 +656,8 @@ static inline void sbc_synthesize_eight(
SBC_FIXED_0(res);
for (j = 0; j < 8; j++) {
/* Q28 = Q15 * Q13 */
- MULA(res, synmatrix8[i][j], frame->sb_sample[blk][ch][j]);
+ res = MULA(synmatrix8[i][j],
+ frame->sb_sample[blk][ch][j], res);
}
/* Q10 */
state->V[ch][state->offset[ch][i]] = SCALE8_STAGED1(res);
@@ -666,8 +668,10 @@ static inline void sbc_synthesize_eight(
k = (i + 8) & 0xf;
SBC_FIXED_0(res);
for (j = 0; j < 10; idx++) {
- MULA(res, state->V[ch][state->offset[ch][i]+j++], sbc_proto_8_80m0[idx]);
- MULA(res, state->V[ch][state->offset[ch][k]+j++], sbc_proto_8_80m1[idx]);
+ res = MULA(state->V[ch][state->offset[ch][i]+j++],
+ sbc_proto_8_80m0[idx], res);
+ res = MULA(state->V[ch][state->offset[ch][k]+j++],
+ sbc_proto_8_80m1[idx], res);
}
/* Store in output */
frame->pcm_sample[ch][blk * 8 + i] = SCALE8_STAGED2(res); // Q0
@@ -708,67 +712,66 @@ static void sbc_encoder_init(struct sbc_
state->position[0] = state->position[1] = 9 * frame->subbands;
}
-
static inline void _sbc_analyze_four(const int32_t *in, int32_t *out)
{
- sbc_extended_t res;
sbc_fixed_t t[8];
sbc_extended_t s[5];
- MUL(res, _sbc_proto_4[0], in[8] - in[32]); /* Q18 */
- MULA(res, _sbc_proto_4[1], in[16] - in[24]);
- t[0] = SCALE4_STAGE1(res); /* Q8 */
-
- MUL(res, _sbc_proto_4[2], in[1]);
- MULA(res, _sbc_proto_4[3], in[9]);
- MULA(res, _sbc_proto_4[4], in[17]);
- MULA(res, _sbc_proto_4[5], in[25]);
- MULA(res, _sbc_proto_4[6], in[33]);
- t[1] = SCALE4_STAGE1(res);
-
- MUL(res, _sbc_proto_4[7], in[2]);
- MULA(res, _sbc_proto_4[8], in[10]);
- MULA(res, _sbc_proto_4[9], in[18]);
- MULA(res, _sbc_proto_4[10], in[26]);
- MULA(res, _sbc_proto_4[11], in[34]);
- t[2] = SCALE4_STAGE1(res);
-
- MUL(res, _sbc_proto_4[12], in[3]);
- MULA(res, _sbc_proto_4[13], in[11]);
- MULA(res, _sbc_proto_4[14], in[19]);
- MULA(res, _sbc_proto_4[15], in[27]);
- MULA(res, _sbc_proto_4[16], in[35]);
- t[3] = SCALE4_STAGE1(res);
-
- MUL(res, _sbc_proto_4[17], in[4] + in[36]);
- MULA(res, _sbc_proto_4[18], in[12] + in[28]);
- MULA(res, _sbc_proto_4[19], in[20]);
- t[4] = SCALE4_STAGE1(res);
-
- MUL(res, _sbc_proto_4[16], in[5]);
- MULA(res, _sbc_proto_4[15], in[13]);
- MULA(res, _sbc_proto_4[14], in[21]);
- MULA(res, _sbc_proto_4[13], in[29]);
- MULA(res, _sbc_proto_4[12], in[37]);
- t[5] = SCALE4_STAGE1(res);
+ t[0] = SCALE4_STAGE1( /* Q8 */
+ MULA(_sbc_proto_4[0], in[8] - in[32], /* Q18 */
+ MUL( _sbc_proto_4[1], in[16] - in[24])));
+
+ t[1] = SCALE4_STAGE1(
+ MULA(_sbc_proto_4[2], in[1],
+ MULA(_sbc_proto_4[3], in[9],
+ MULA(_sbc_proto_4[4], in[17],
+ MULA(_sbc_proto_4[5], in[25],
+ MUL( _sbc_proto_4[6], in[33]))))));
+
+ t[2] = SCALE4_STAGE1(
+ MULA(_sbc_proto_4[7], in[2],
+ MULA(_sbc_proto_4[8], in[10],
+ MULA(_sbc_proto_4[9], in[18],
+ MULA(_sbc_proto_4[10], in[26],
+ MUL( _sbc_proto_4[11], in[34]))))));
+
+ t[3] = SCALE4_STAGE1(
+ MULA(_sbc_proto_4[12], in[3],
+ MULA(_sbc_proto_4[13], in[11],
+ MULA(_sbc_proto_4[14], in[19],
+ MULA(_sbc_proto_4[15], in[27],
+ MUL( _sbc_proto_4[16], in[35]))))));
+
+ t[4] = SCALE4_STAGE1(
+ MULA(_sbc_proto_4[17], in[4] + in[36],
+ MULA(_sbc_proto_4[18], in[12] + in[28],
+ MUL( _sbc_proto_4[19], in[20]))));
+
+ t[5] = SCALE4_STAGE1(
+ MULA(_sbc_proto_4[16], in[5],
+ MULA(_sbc_proto_4[15], in[13],
+ MULA(_sbc_proto_4[14], in[21],
+ MULA(_sbc_proto_4[13], in[29],
+ MUL( _sbc_proto_4[12], in[37]))))));
/* don't compute t[6]... this term always multiplies
* with cos(pi/2) = 0 */
- MUL(res, _sbc_proto_4[6], in[7]);
- MULA(res, _sbc_proto_4[5], in[15]);
- MULA(res, _sbc_proto_4[4], in[23]);
- MULA(res, _sbc_proto_4[3], in[31]);
- MULA(res, _sbc_proto_4[2], in[39]);
- t[7] = SCALE4_STAGE1(res);
-
- MUL(s[0], _anamatrix4[0], t[0] + t[4]);
- MUL(s[1], _anamatrix4[2], t[2]);
- MUL(s[2], _anamatrix4[1], t[1] + t[3]);
- MULA(s[2], _anamatrix4[3], t[5]);
- MUL(s[3], _anamatrix4[3], t[1] + t[3]);
- MULA(s[3], _anamatrix4[1], - t[5] + t[7]);
- MUL(s[4], _anamatrix4[3], t[7]);
+ t[7] = SCALE4_STAGE1(
+ MULA(_sbc_proto_4[6], in[7],
+ MULA(_sbc_proto_4[5], in[15],
+ MULA(_sbc_proto_4[4], in[23],
+ MULA(_sbc_proto_4[3], in[31],
+ MUL( _sbc_proto_4[2], in[39]))))));
+
+ s[0] = MUL( _anamatrix4[0], t[0] + t[4]);
+ s[1] = MUL( _anamatrix4[2], t[2]);
+ s[2] = MULA(_anamatrix4[1], t[1] + t[3],
+ MUL(_anamatrix4[3], t[5]));
+ s[3] = MULA(_anamatrix4[3], t[1] + t[3],
+ MUL(_anamatrix4[1], -t[5] + t[7]));
+ s[4] = MUL( _anamatrix4[3], t[7]);
+
out[0] = SCALE4_STAGE2( s[0] + s[1] + s[2] + s[4]); /* Q0 */
out[1] = SCALE4_STAGE2(-s[0] + s[1] + s[3]);
out[2] = SCALE4_STAGE2(-s[0] + s[1] - s[3]);
@@ -796,117 +799,117 @@ static inline void sbc_analyze_four(stru
static inline void _sbc_analyze_eight(const int32_t *in, int32_t *out)
{
- sbc_extended_t res;
sbc_fixed_t t[8];
sbc_extended_t s[8];
- MUL(res, _sbc_proto_8[0], (in[16] - in[64])); /* Q18 = Q18 * Q0 */
- MULA(res, _sbc_proto_8[1], (in[32] - in[48]));
- MULA(res, _sbc_proto_8[2], in[4]);
- MULA(res, _sbc_proto_8[3], in[20]);
- MULA(res, _sbc_proto_8[4], in[36]);
- MULA(res, _sbc_proto_8[5], in[52]);
- t[0] = SCALE8_STAGE1(res); /* Q10 */
-
- MUL(res, _sbc_proto_8[6], in[2]);
- MULA(res, _sbc_proto_8[7], in[18]);
- MULA(res, _sbc_proto_8[8], in[34]);
- MULA(res, _sbc_proto_8[9], in[50]);
- MULA(res, _sbc_proto_8[10], in[66]);
- t[1] = SCALE8_STAGE1(res);
-
- MUL(res, _sbc_proto_8[11], in[1]);
- MULA(res, _sbc_proto_8[12], in[17]);
- MULA(res, _sbc_proto_8[13], in[33]);
- MULA(res, _sbc_proto_8[14], in[49]);
- MULA(res, _sbc_proto_8[15], in[65]);
- MULA(res, _sbc_proto_8[16], in[3]);
- MULA(res, _sbc_proto_8[17], in[19]);
- MULA(res, _sbc_proto_8[18], in[35]);
- MULA(res, _sbc_proto_8[19], in[51]);
- MULA(res, _sbc_proto_8[20], in[67]);
- t[2] = SCALE8_STAGE1(res);
-
- MUL(res, _sbc_proto_8[21], in[5]);
- MULA(res, _sbc_proto_8[22], in[21]);
- MULA(res, _sbc_proto_8[23], in[37]);
- MULA(res, _sbc_proto_8[24], in[53]);
- MULA(res, _sbc_proto_8[25], in[69]);
- MULA(res, -_sbc_proto_8[15], in[15]);
- MULA(res, -_sbc_proto_8[14], in[31]);
- MULA(res, -_sbc_proto_8[13], in[47]);
- MULA(res, -_sbc_proto_8[12], in[63]);
- MULA(res, -_sbc_proto_8[11], in[79]);
- t[3] = SCALE8_STAGE1(res);
-
- MUL(res, _sbc_proto_8[26], in[6]);
- MULA(res, _sbc_proto_8[27], in[22]);
- MULA(res, _sbc_proto_8[28], in[38]);
- MULA(res, _sbc_proto_8[29], in[54]);
- MULA(res, _sbc_proto_8[30], in[70]);
- MULA(res, -_sbc_proto_8[10], in[14]);
- MULA(res, -_sbc_proto_8[9], in[30]);
- MULA(res, -_sbc_proto_8[8], in[46]);
- MULA(res, -_sbc_proto_8[7], in[62]);
- MULA(res, -_sbc_proto_8[6], in[78]);
- t[4] = SCALE8_STAGE1(res);
-
- MUL(res, _sbc_proto_8[31], in[7]);
- MULA(res, _sbc_proto_8[32], in[23]);
- MULA(res, _sbc_proto_8[33], in[39]);
- MULA(res, _sbc_proto_8[34], in[55]);
- MULA(res, _sbc_proto_8[35], in[71]);
- MULA(res, -_sbc_proto_8[20], in[13]);
- MULA(res, -_sbc_proto_8[19], in[29]);
- MULA(res, -_sbc_proto_8[18], in[45]);
- MULA(res, -_sbc_proto_8[17], in[61]);
- MULA(res, -_sbc_proto_8[16], in[77]);
- t[5] = SCALE8_STAGE1(res);
-
- MUL(res, _sbc_proto_8[36], in[8] + in[72]);
- MULA(res, _sbc_proto_8[37], in[24] + in[56]);
- MULA(res, _sbc_proto_8[38], in[40]);
- MULA(res, -_sbc_proto_8[39], in[12]);
- MULA(res, -_sbc_proto_8[5], in[28]);
- MULA(res, -_sbc_proto_8[4], in[44]);
- MULA(res, -_sbc_proto_8[3], in[60]);
- MULA(res, -_sbc_proto_8[2], in[76]);
- t[6] = SCALE8_STAGE1(res);
-
- MUL(res, _sbc_proto_8[35], in[9]);
- MULA(res, _sbc_proto_8[34], in[25]);
- MULA(res, _sbc_proto_8[33], in[41]);
- MULA(res, _sbc_proto_8[32], in[57]);
- MULA(res, _sbc_proto_8[31], in[73]);
- MULA(res, -_sbc_proto_8[25], in[11]);
- MULA(res, -_sbc_proto_8[24], in[27]);
- MULA(res, -_sbc_proto_8[23], in[43]);
- MULA(res, -_sbc_proto_8[22], in[59]);
- MULA(res, -_sbc_proto_8[21], in[75]);
- t[7] = SCALE8_STAGE1(res);
-
- MUL(s[0], _anamatrix8[0], t[0]); /* = Q14 * Q10 */
- MULA(s[0], _anamatrix8[1], t[6]);
- MUL(s[1], _anamatrix8[7], t[1]);
- MUL(s[2], _anamatrix8[2], t[2]);
- MULA(s[2], _anamatrix8[3], t[3]);
- MULA(s[2], _anamatrix8[4], t[5]);
- MULA(s[2], _anamatrix8[5], t[7]);
- MUL(s[3], _anamatrix8[6], t[4]);
- MUL(s[4], _anamatrix8[3], t[2]);
- MULA(s[4], -_anamatrix8[5], t[3]);
- MULA(s[4], -_anamatrix8[2], t[5]);
- MULA(s[4], -_anamatrix8[4], t[7]);
- MUL(s[5], _anamatrix8[4], t[2]);
- MULA(s[5], -_anamatrix8[2], t[3]);
- MULA(s[5], _anamatrix8[5], t[5]);
- MULA(s[5], _anamatrix8[3], t[7]);
- MUL(s[6], _anamatrix8[1], t[0]);
- MULA(s[6], -_anamatrix8[0], t[6]);
- MUL(s[7], _anamatrix8[5], t[2]);
- MULA(s[7], -_anamatrix8[4], t[3]);
- MULA(s[7], _anamatrix8[3], t[5]);
- MULA(s[7], -_anamatrix8[2], t[7]);
+ t[0] = SCALE8_STAGE1( /* Q10 */
+ MULA(_sbc_proto_8[0], (in[16] - in[64]), /* Q18 = Q18 * Q0 */
+ MULA(_sbc_proto_8[1], (in[32] - in[48]),
+ MULA(_sbc_proto_8[2], in[4],
+ MULA(_sbc_proto_8[3], in[20],
+ MULA(_sbc_proto_8[4], in[36],
+ MUL( _sbc_proto_8[5], in[52])))))));
+
+ t[1] = SCALE8_STAGE1(
+ MULA(_sbc_proto_8[6], in[2],
+ MULA(_sbc_proto_8[7], in[18],
+ MULA(_sbc_proto_8[8], in[34],
+ MULA(_sbc_proto_8[9], in[50],
+ MUL(_sbc_proto_8[10], in[66]))))));
+
+ t[2] = SCALE8_STAGE1(
+ MULA(_sbc_proto_8[11], in[1],
+ MULA(_sbc_proto_8[12], in[17],
+ MULA(_sbc_proto_8[13], in[33],
+ MULA(_sbc_proto_8[14], in[49],
+ MULA(_sbc_proto_8[15], in[65],
+ MULA(_sbc_proto_8[16], in[3],
+ MULA(_sbc_proto_8[17], in[19],
+ MULA(_sbc_proto_8[18], in[35],
+ MULA(_sbc_proto_8[19], in[51],
+ MUL( _sbc_proto_8[20], in[67])))))))))));
+
+ t[3] = SCALE8_STAGE1(
+ MULA( _sbc_proto_8[21], in[5],
+ MULA( _sbc_proto_8[22], in[21],
+ MULA( _sbc_proto_8[23], in[37],
+ MULA( _sbc_proto_8[24], in[53],
+ MULA( _sbc_proto_8[25], in[69],
+ MULA(-_sbc_proto_8[15], in[15],
+ MULA(-_sbc_proto_8[14], in[31],
+ MULA(-_sbc_proto_8[13], in[47],
+ MULA(-_sbc_proto_8[12], in[63],
+ MUL( -_sbc_proto_8[11], in[79])))))))))));
+
+ t[4] = SCALE8_STAGE1(
+ MULA( _sbc_proto_8[26], in[6],
+ MULA( _sbc_proto_8[27], in[22],
+ MULA( _sbc_proto_8[28], in[38],
+ MULA( _sbc_proto_8[29], in[54],
+ MULA( _sbc_proto_8[30], in[70],
+ MULA(-_sbc_proto_8[10], in[14],
+ MULA(-_sbc_proto_8[9], in[30],
+ MULA(-_sbc_proto_8[8], in[46],
+ MULA(-_sbc_proto_8[7], in[62],
+ MUL( -_sbc_proto_8[6], in[78])))))))))));
+
+ t[5] = SCALE8_STAGE1(
+ MULA( _sbc_proto_8[31], in[7],
+ MULA( _sbc_proto_8[32], in[23],
+ MULA( _sbc_proto_8[33], in[39],
+ MULA( _sbc_proto_8[34], in[55],
+ MULA( _sbc_proto_8[35], in[71],
+ MULA(-_sbc_proto_8[20], in[13],
+ MULA(-_sbc_proto_8[19], in[29],
+ MULA(-_sbc_proto_8[18], in[45],
+ MULA(-_sbc_proto_8[17], in[61],
+ MUL( -_sbc_proto_8[16], in[77])))))))))));
+
+ t[6] = SCALE8_STAGE1(
+ MULA( _sbc_proto_8[36], (in[8] + in[72]),
+ MULA( _sbc_proto_8[37], (in[24] + in[56]),
+ MULA( _sbc_proto_8[38], in[40],
+ MULA(-_sbc_proto_8[39], in[12],
+ MULA(-_sbc_proto_8[5], in[28],
+ MULA(-_sbc_proto_8[4], in[44],
+ MULA(-_sbc_proto_8[3], in[60],
+ MUL( -_sbc_proto_8[2], in[76])))))))));
+
+ t[7] = SCALE8_STAGE1(
+ MULA( _sbc_proto_8[35], in[9],
+ MULA( _sbc_proto_8[34], in[25],
+ MULA( _sbc_proto_8[33], in[41],
+ MULA( _sbc_proto_8[32], in[57],
+ MULA( _sbc_proto_8[31], in[73],
+ MULA(-_sbc_proto_8[25], in[11],
+ MULA(-_sbc_proto_8[24], in[27],
+ MULA(-_sbc_proto_8[23], in[43],
+ MULA(-_sbc_proto_8[22], in[59],
+ MUL( -_sbc_proto_8[21], in[75])))))))))));
+
+ s[0] = MULA( _anamatrix8[0], t[0],
+ MUL( _anamatrix8[1], t[6]));
+ s[1] = MUL( _anamatrix8[7], t[1]);
+ s[2] = MULA( _anamatrix8[2], t[2],
+ MULA( _anamatrix8[3], t[3],
+ MULA( _anamatrix8[4], t[5],
+ MUL( _anamatrix8[5], t[7]))));
+ s[3] = MUL( _anamatrix8[6], t[4]);
+ s[4] = MULA( _anamatrix8[3], t[2],
+ MULA(-_anamatrix8[5], t[3],
+ MULA(-_anamatrix8[2], t[5],
+ MUL( -_anamatrix8[4], t[7]))));
+ s[5] = MULA( _anamatrix8[4], t[2],
+ MULA(-_anamatrix8[2], t[3],
+ MULA( _anamatrix8[5], t[5],
+ MUL( _anamatrix8[3], t[7]))));
+ s[6] = MULA( _anamatrix8[1], t[0],
+ MUL( -_anamatrix8[0], t[6]));
+ s[7] = MULA( _anamatrix8[5], t[2],
+ MULA(-_anamatrix8[4], t[3],
+ MULA( _anamatrix8[3], t[5],
+ MUL( -_anamatrix8[2], t[7]))));
+
out[0] = SCALE8_STAGE2( s[0] + s[1] + s[2] + s[3]);
out[1] = SCALE8_STAGE2( s[1] - s[3] + s[4] + s[6]);
out[2] = SCALE8_STAGE2( s[1] - s[3] + s[5] - s[6]);
Index: sbc_math.h
===================================================================
RCS file: /cvsroot/bluez/utils/sbc/sbc_math.h,v
retrieving revision 1.12
diff -u -p -r1.12 sbc_math.h
--- sbc_math.h 28 Jan 2008 18:00:51 -0000 1.12
+++ sbc_math.h 30 Jan 2008 13:22:53 -0000
@@ -64,6 +64,17 @@ typedef long long sbc_extended_t;
#define SBC_FIXED_0(val) { val = 0; }
#define ADD(dst, src) { dst += src; }
#define SUB(dst, src) { dst -= src; }
-#define MUL(dst, a, b) { dst = (sbc_extended_t) (a) * (b); }
+#define MUL(a, b) ((sbc_extended_t)(a) * (b))
#define DIV2(dst, src) { dst = ASR(src, 1); }
-#define MULA(dst, a, b) { dst += (sbc_extended_t) (a) * (b); }
+
+#ifdef __arm__
+#define MULA(a, b, res) ({ \
+ long long tmp = res; \
+ __asm__( \
+ "smlal %Q0, %R0, %2, %3" \
+ : "=&r" (tmp) \
+ : "0" (tmp), "r" (a), "r" (b)); \
+ tmp; })
+#else
+#define MULA(a, b, res) ((sbc_extended_t)(a) * (b) + (res))
+#endif
[-- Attachment #3: Type: text/plain, Size: 228 bytes --]
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
[-- Attachment #4: Type: text/plain, Size: 164 bytes --]
_______________________________________________
Bluez-devel mailing list
Bluez-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bluez-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [Bluez-devel] ARM optimization
2008-01-30 17:07 ` Cidorvan Leite
@ 2008-01-30 17:41 ` Brad Midgley
0 siblings, 0 replies; 9+ messages in thread
From: Brad Midgley @ 2008-01-30 17:41 UTC (permalink / raw)
To: BlueZ development
Guys
It may be useful to think of this as two different optimizations.
First, making assignment happen outside the macro and nesting macros;
second, adding an arm version of MULA.
Brad
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
Bluez-devel mailing list
Bluez-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bluez-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [Bluez-devel] ARM optimization
2008-01-30 16:19 ` Brad Midgley
@ 2008-01-30 17:47 ` Marcel Holtmann
2008-01-30 19:26 ` Brad Midgley
0 siblings, 1 reply; 9+ messages in thread
From: Marcel Holtmann @ 2008-01-30 17:47 UTC (permalink / raw)
To: BlueZ development
Hi Brad,
> Cidorvan has a new proposal. It shifts MUL/MULA around so the
> assignment happens outside the macro. The good news is the assembly is
> now limited to a single ifdef around the macro definition. If we had
> to define eg MIPS code or something specialized it would be relatively
> quick & clean.
>
> The tricky part is he nests the use of the macro. I'll show you how he
> formatted it. Formatting it in a traditional way will get messy
> quickly unfortunately.
This is tricky since the semantics of the other macros are different.
Having two separate semantics is a bad thing, so changing the semantics
of MULA is a bad idea.
My idea would be to keep the semantics of MULA and only make it a
two-stage thing. Without assembler we still use MULA as it is right
now, but with assembler it calls MULA_ARM, which will do the
calculations and then the assignment. Will this work? Is the compiler
smart enough to optimize the code properly?
Regards
Marcel
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
Bluez-devel mailing list
Bluez-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bluez-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [Bluez-devel] ARM optimization
2008-01-30 17:47 ` Marcel Holtmann
@ 2008-01-30 19:26 ` Brad Midgley
2008-01-30 19:56 ` Cidorvan Leite
0 siblings, 1 reply; 9+ messages in thread
From: Brad Midgley @ 2008-01-30 19:26 UTC (permalink / raw)
To: BlueZ development
Marcel
The other macros are not being used and I had intended to remove them.
We use only MUL/MULA.
> My idea would be to keep the semantic of MULA and only make it a two
> stage thing. In case of no assembler we still use MULA as it is right
> now, but in case of assembler it calls MULA_ARM which will do the
> calculations and then the assignment. Will this work? Is the compiler
> smart enough to optimize the code properly.
I don't know how to do this but maybe Cidorvan can see it.
--
Brad
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
Bluez-devel mailing list
Bluez-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bluez-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [Bluez-devel] ARM optimization
2008-01-30 19:26 ` Brad Midgley
@ 2008-01-30 19:56 ` Cidorvan Leite
2008-01-30 20:21 ` Marcel Holtmann
0 siblings, 1 reply; 9+ messages in thread
From: Cidorvan Leite @ 2008-01-30 19:56 UTC (permalink / raw)
To: BlueZ development
Hi!
On Jan 30, 2008 4:26 PM, Brad Midgley <bmidgley@gmail.com> wrote:
> Marcel
>
> The other macros are not being used and I had intended to remove them.
> We use only MUL/MULA.
>
> > My idea would be to keep the semantic of MULA and only make it a two
> > stage thing. In case of no assembler we still use MULA as it is right
> > now, but in case of assembler it calls MULA_ARM which will do the
> > calculations and then the assignment. Will this work? Is the compiler
> > smart enough to optimize the code properly.
>
> I don't know how to do this but maybe Cidorvan can see it.
>
As you can see in my first e-mail (gcc-no-arm-optimization.txt), gcc
wasn't smart enough to optimize the assignments.
The reason for writing the macro in assembly is to force gcc to use
the right instruction (smlal) and keep the generated code clean.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
Bluez-devel mailing list
Bluez-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bluez-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [Bluez-devel] ARM optimization
2008-01-30 19:56 ` Cidorvan Leite
@ 2008-01-30 20:21 ` Marcel Holtmann
0 siblings, 0 replies; 9+ messages in thread
From: Marcel Holtmann @ 2008-01-30 20:21 UTC (permalink / raw)
To: BlueZ development
Hi Cidorvan,
> > The other macros are not being used and I had intended to remove them.
> > We use only MUL/MULA.
> >
> > > My idea would be to keep the semantic of MULA and only make it a two
> > > stage thing. In case of no assembler we still use MULA as it is right
> > > now, but in case of assembler it calls MULA_ARM which will do the
> > > calculations and then the assignment. Will this work? Is the compiler
> > > smart enough to optimize the code properly.
> >
> > I don't know how to do this but maybe Cidorvan can see it.
> >
>
> Like you can see in my first e-mail (gcc-no-arm-optimization.txt), gcc
> wasn't smart enough to optimize assignments.
> The reason to do a macro with assembly code is to force the gcc to use
> the right instruction (smlal) and keep the output code clean.
I think you didn't get my idea, but I also have no time to actually put
it into code right now. I want the semantics of all the macros to be the
same. According to Brad, the others can be removed. If so, then I am
fine with this change.
Regards
Marcel
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
Bluez-devel mailing list
Bluez-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bluez-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2008-01-30 20:21 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-01-29 17:46 [Bluez-devel] ARM optimization Cidorvan Leite
2008-01-29 19:07 ` Brad Midgley
2008-01-30 16:19 ` Brad Midgley
2008-01-30 17:47 ` Marcel Holtmann
2008-01-30 19:26 ` Brad Midgley
2008-01-30 19:56 ` Cidorvan Leite
2008-01-30 20:21 ` Marcel Holtmann
2008-01-30 17:07 ` Cidorvan Leite
2008-01-30 17:41 ` Brad Midgley
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox