* [PATCH] Add iwmmxt optimization for sbc for pxa series cpu
@ 2010-11-11 8:05 Keith Mok
2010-11-11 11:46 ` Siarhei Siamashka
0 siblings, 1 reply; 10+ messages in thread
From: Keith Mok @ 2010-11-11 8:05 UTC (permalink / raw)
To: linux-bluetooth
Hi all,
This patch adds an iwmmxt (Intel Wireless MMX, PXA platform) optimization
for sbc, based on the MMX code.
I have verified the encoded result against the one generated by the MMX code.
Keith
Signed-off-by: Keith Mok <ek9852@gmail.com>
---
Makefile.am | 1 +
sbc/sbc_primitives.c | 4 +
sbc/sbc_primitives_iwmmxt.c | 361 +++++++++++++++++++++++++++++++++++++++++++
sbc/sbc_primitives_iwmmxt.h | 38 +++++
4 files changed, 404 insertions(+), 0 deletions(-)
create mode 100644 sbc/sbc_primitives_iwmmxt.c
create mode 100644 sbc/sbc_primitives_iwmmxt.h
diff --git a/Makefile.am b/Makefile.am
index da308a7..03a9bf2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -65,6 +65,7 @@ noinst_LTLIBRARIES += sbc/libsbc.la
sbc_libsbc_la_SOURCES = sbc/sbc.h sbc/sbc.c sbc/sbc_math.h sbc/sbc_tables.h \
sbc/sbc_primitives.h sbc/sbc_primitives.c \
sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
+ sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
sbc/sbc_primitives_armv6.h sbc/sbc_primitives_armv6.c
diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index f87fb5a..ad780d0 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -33,6 +33,7 @@
#include "sbc_primitives.h"
#include "sbc_primitives_mmx.h"
+#include "sbc_primitives_iwmmxt.h"
#include "sbc_primitives_neon.h"
#include "sbc_primitives_armv6.h"
@@ -544,6 +545,9 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
#ifdef SBC_BUILD_WITH_ARMV6_SUPPORT
sbc_init_primitives_armv6(state);
#endif
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+ sbc_init_primitives_iwmmxt(state);
+#endif
#ifdef SBC_BUILD_WITH_NEON_SUPPORT
sbc_init_primitives_neon(state);
#endif
diff --git a/sbc/sbc_primitives_iwmmxt.c b/sbc/sbc_primitives_iwmmxt.c
new file mode 100644
index 0000000..4825998
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.c
@@ -0,0 +1,361 @@
+/*
+ *
+ * Bluetooth low-complexity, subband codec (SBC) library
+ *
+ * Copyright (C) 2010 Keith Mok <ek9852@gmail.com>
+ * Based on sbc_primitives_mmx.c
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <stdint.h>
+#include <limits.h>
+#include "sbc.h"
+#include "sbc_math.h"
+#include "sbc_tables.h"
+
+#include "sbc_primitives_iwmmxt.h"
+
+/*
+ * IWMMXT optimizations
+ */
+
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+static inline void sbc_analyze_four_iwmmxt(const int16_t *in, int32_t *out,
+ const FIXED_T *consts)
+{
+ asm volatile (
+ "tbcstw wr4, %2\n"
+ "wldrd wr0, [%0]\n"
+ "wldrd wr1, [%0, #8]\n"
+ "wldrd wr2, [%1]\n"
+ "wldrd wr3, [%1, #8]\n"
+ "wmadds wr0, wr2, wr0\n"
+ "wmadds wr1, wr3, wr1\n"
+ "waddwss wr0, wr0, wr4\n"
+ "waddwss wr1, wr1, wr4\n"
+ "\n"
+ "wldrd wr2, [%0, #16]\n"
+ "wldrd wr3, [%0, #24]\n"
+ "wldrd wr4, [%1, #16]\n"
+ "wldrd wr5, [%1, #24]\n"
+ "wmadds wr2, wr4, wr2\n"
+ "wmadds wr3, wr5, wr3\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr1, wr3, wr1\n"
+ "\n"
+ "wldrd wr2, [%0, #32]\n"
+ "wldrd wr3, [%0, #40]\n"
+ "wldrd wr4, [%1, #32]\n"
+ "wldrd wr5, [%1, #40]\n"
+ "wmadds wr2, wr4, wr2\n"
+ "wmadds wr3, wr5, wr3\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr1, wr3, wr1\n"
+ "\n"
+ "wldrd wr2, [%0, #48]\n"
+ "wldrd wr3, [%0, #56]\n"
+ "wldrd wr4, [%1, #48]\n"
+ "wldrd wr5, [%1, #56]\n"
+ "wmadds wr2, wr4, wr2\n"
+ "wmadds wr3, wr5, wr3\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr1, wr3, wr1\n"
+ "\n"
+ "wldrd wr2, [%0, #64]\n"
+ "wldrd wr3, [%0, #72]\n"
+ "wldrd wr4, [%1, #64]\n"
+ "wldrd wr5, [%1, #72]\n"
+ "wmadds wr2, wr4, wr2\n"
+ "wmadds wr3, wr5, wr3\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr1, wr3, wr1\n"
+ "\n"
+ "tmcr wcgr0, %4\n"
+ "wsrawg wr0, wr0, wcgr0\n"
+ "wsrawg wr1, wr1, wcgr0\n"
+ "wpackwss wr0, wr0, wr0\n"
+ "wpackwss wr1, wr1, wr1\n"
+ "\n"
+ "wldrd wr4, [%1, #80]\n"
+ "wldrd wr5, [%1, #88]\n"
+ "wldrd wr6, [%1, #96]\n"
+ "wldrd wr7, [%1, #104]\n"
+ "wmadds wr2, wr5, wr0\n"
+ "wmadds wr0, wr4, wr0\n"
+ "\n"
+ "wmadds wr3, wr7, wr1\n"
+ "wmadds wr1, wr6, wr1\n"
+ "waddwss wr0, wr1, wr0\n"
+ "waddwss wr2, wr3, wr2\n"
+ "\n"
+ "wstrd wr0, [%3]\n"
+ "wstrd wr2, [%3, #8]\n"
+ :
+ : "r" (in), "r" (consts),
+ "r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
+ "r" (SBC_PROTO_FIXED4_SCALE)
+ : "memory");
+}
+
+static inline void sbc_analyze_eight_iwmmxt(const int16_t *in, int32_t *out,
+ const FIXED_T *consts)
+{
+ asm volatile (
+ "tbcstw wr8, %2\n"
+ "wldrd wr0, [%0]\n"
+ "wldrd wr1, [%0, #8]\n"
+ "wldrd wr2, [%0, #16]\n"
+ "wldrd wr3, [%0, #24]\n"
+ "wldrd wr4, [%1]\n"
+ "wldrd wr5, [%1, #8]\n"
+ "wldrd wr6, [%1, #16]\n"
+ "wldrd wr7, [%1, #24]\n"
+ "wmadds wr0, wr0, wr4\n"
+ "wmadds wr1, wr1, wr5\n"
+ "wmadds wr2, wr2, wr6\n"
+ "wmadds wr3, wr3, wr7\n"
+ "waddwss wr0, wr0, wr8\n"
+ "waddwss wr1, wr1, wr8\n"
+ "waddwss wr2, wr2, wr8\n"
+ "waddwss wr3, wr3, wr8\n"
+ "\n"
+ "wldrd wr4, [%0, #32]\n"
+ "wldrd wr5, [%0, #40]\n"
+ "wldrd wr6, [%0, #48]\n"
+ "wldrd wr7, [%0, #56]\n"
+ "wldrd wr8, [%1, #32]\n"
+ "wldrd wr9, [%1, #40]\n"
+ "wldrd wr10, [%1, #48]\n"
+ "wldrd wr11, [%1, #56]\n"
+ "wmadds wr4, wr4, wr8\n"
+ "wmadds wr5, wr5, wr9\n"
+ "wmadds wr6, wr6, wr10\n"
+ "wmadds wr7, wr7, wr11\n"
+ "waddwss wr0, wr4, wr0\n"
+ "waddwss wr1, wr5, wr1\n"
+ "waddwss wr2, wr6, wr2\n"
+ "waddwss wr3, wr7, wr3\n"
+ "\n"
+ "wldrd wr4, [%0, #64]\n"
+ "wldrd wr5, [%0, #72]\n"
+ "wldrd wr6, [%0, #80]\n"
+ "wldrd wr7, [%0, #88]\n"
+ "wldrd wr8, [%1, #64]\n"
+ "wldrd wr9, [%1, #72]\n"
+ "wldrd wr10, [%1, #80]\n"
+ "wldrd wr11, [%1, #88]\n"
+ "wmadds wr4, wr4, wr8\n"
+ "wmadds wr5, wr5, wr9\n"
+ "wmadds wr6, wr6, wr10\n"
+ "wmadds wr7, wr7, wr11\n"
+ "waddwss wr0, wr4, wr0\n"
+ "waddwss wr1, wr5, wr1\n"
+ "waddwss wr2, wr6, wr2\n"
+ "waddwss wr3, wr7, wr3\n"
+ "\n"
+ "wldrd wr4, [%0, #96]\n"
+ "wldrd wr5, [%0, #104]\n"
+ "wldrd wr6, [%0, #112]\n"
+ "wldrd wr7, [%0, #120]\n"
+ "wldrd wr8, [%1, #96]\n"
+ "wldrd wr9, [%1, #104]\n"
+ "wldrd wr10, [%1, #112]\n"
+ "wldrd wr11, [%1, #120]\n"
+ "wmadds wr4, wr4, wr8\n"
+ "wmadds wr5, wr5, wr9\n"
+ "wmadds wr6, wr6, wr10\n"
+ "wmadds wr7, wr7, wr11\n"
+ "waddwss wr0, wr4, wr0\n"
+ "waddwss wr1, wr5, wr1\n"
+ "waddwss wr2, wr6, wr2\n"
+ "waddwss wr3, wr7, wr3\n"
+ "\n"
+ "wldrd wr4, [%0, #128]\n"
+ "wldrd wr5, [%0, #136]\n"
+ "wldrd wr6, [%0, #144]\n"
+ "wldrd wr7, [%0, #152]\n"
+ "wldrd wr8, [%1, #128]\n"
+ "wldrd wr9, [%1, #136]\n"
+ "wldrd wr10, [%1, #144]\n"
+ "wldrd wr11, [%1, #152]\n"
+ "wmadds wr4, wr4, wr8\n"
+ "wmadds wr5, wr5, wr9\n"
+ "wmadds wr6, wr6, wr10\n"
+ "wmadds wr7, wr7, wr11\n"
+ "waddwss wr0, wr4, wr0\n"
+ "waddwss wr1, wr5, wr1\n"
+ "waddwss wr2, wr6, wr2\n"
+ "waddwss wr3, wr7, wr3\n"
+ "\n"
+ "tmcr wcgr0, %4\n"
+ "wsrawg wr0, wr0, wcgr0\n"
+ "wsrawg wr1, wr1, wcgr0\n"
+ "wsrawg wr2, wr2, wcgr0\n"
+ "wsrawg wr3, wr3, wcgr0\n"
+ "\n"
+ "wpackwss wr0, wr0, wr0\n"
+ "wpackwss wr1, wr1, wr1\n"
+ "wpackwss wr2, wr2, wr2\n"
+ "wpackwss wr3, wr3, wr3\n"
+ "\n"
+ "wldrd wr4, [%1, #160]\n"
+ "wldrd wr5, [%1, #168]\n"
+ "wmadds wr4, wr4, wr0\n"
+ "wmadds wr5, wr5, wr0\n"
+ "\n"
+ "wldrd wr6, [%1, #192]\n"
+ "wldrd wr7, [%1, #200]\n"
+ "wmadds wr6, wr6, wr1\n"
+ "wmadds wr7, wr7, wr1\n"
+ "waddwss wr4, wr6, wr4\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wldrd wr6, [%1, #224]\n"
+ "wldrd wr7, [%1, #232]\n"
+ "wmadds wr6, wr6, wr2\n"
+ "wmadds wr7, wr7, wr2\n"
+ "waddwss wr4, wr6, wr4\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wldrd wr6, [%1, #256]\n"
+ "wldrd wr7, [%1, #264]\n"
+ "wmadds wr6, wr6, wr3\n"
+ "wmadds wr7, wr7, wr3\n"
+ "waddwss wr4, wr6, wr4\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wstrd wr4, [%3]\n"
+ "wstrd wr5, [%3, #8]\n"
+ "\n"
+ "wldrd wr4, [%1, #176]\n"
+ "wldrd wr5, [%1, #184]\n"
+ "wmadds wr5, wr5, wr0\n"
+ "wmadds wr0, wr4, wr0\n"
+ "\n"
+ "wldrd wr4, [%1, #208]\n"
+ "wldrd wr7, [%1, #216]\n"
+ "wmadds wr7, wr7, wr1\n"
+ "wmadds wr1, wr4, wr1\n"
+ "waddwss wr0, wr1, wr0\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wldrd wr4, [%1, #240]\n"
+ "wldrd wr7, [%1, #248]\n"
+ "wmadds wr7, wr7, wr2\n"
+ "wmadds wr2, wr4, wr2\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wldrd wr4, [%1, #272]\n"
+ "wldrd wr7, [%1, #280]\n"
+ "wmadds wr7, wr7, wr3\n"
+ "wmadds wr3, wr4, wr3\n"
+ "waddwss wr0, wr3, wr0\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wstrd wr0, [%3, #16]\n"
+ "wstrd wr5, [%3, #24]\n"
+ :
+ : "r" (in), "r" (consts),
+ "r" (1 << (SBC_PROTO_FIXED8_SCALE - 1)), "r" (out),
+ "r" (SBC_PROTO_FIXED8_SCALE)
+ : "memory");
+}
+
+static inline void sbc_analyze_4b_4s_iwmmxt(int16_t *x, int32_t *out,
+ int out_stride)
+{
+ /* Analyze blocks */
+ sbc_analyze_four_iwmmxt(x + 12, out, analysis_consts_fixed4_simd_odd);
+ out += out_stride;
+ sbc_analyze_four_iwmmxt(x + 8, out, analysis_consts_fixed4_simd_even);
+ out += out_stride;
+ sbc_analyze_four_iwmmxt(x + 4, out, analysis_consts_fixed4_simd_odd);
+ out += out_stride;
+ sbc_analyze_four_iwmmxt(x + 0, out, analysis_consts_fixed4_simd_even);
+}
+
+static inline void sbc_analyze_4b_8s_iwmmxt(int16_t *x, int32_t *out,
+ int out_stride)
+{
+ /* Analyze blocks */
+ sbc_analyze_eight_iwmmxt(x + 24, out, analysis_consts_fixed8_simd_odd);
+ out += out_stride;
+ sbc_analyze_eight_iwmmxt(x + 16, out, analysis_consts_fixed8_simd_even);
+ out += out_stride;
+ sbc_analyze_eight_iwmmxt(x + 8, out, analysis_consts_fixed8_simd_odd);
+ out += out_stride;
+ sbc_analyze_eight_iwmmxt(x + 0, out, analysis_consts_fixed8_simd_even);
+}
+
+static void sbc_calc_scalefactors_iwmmxt(
+ int32_t sb_sample_f[16][2][8],
+ uint32_t scale_factor[2][8],
+ int blocks, int channels, int subbands)
+{
+ int ch, sb;
+ intptr_t blk;
+ for (ch = 0; ch < channels; ch++) {
+ for (sb = 0; sb < subbands; sb += 2) {
+ int b;
+ blk = &sb_sample_f[0][ch][sb];
+ b = blocks;
+ asm volatile (
+ "tbcstw wr0, %4\n"
+ "1:\n"
+ "wldrd wr1, [%0], %2\n"
+ "wxor wr2, wr2, wr2\n"
+ "wcmpgtsw wr3, wr1, wr2\n"
+ "waddwss wr1, wr1, wr3\n"
+ "wcmpgtsw wr2, wr2, wr1\n"
+ "wxor wr1, wr1, wr2\n"
+
+ "wor wr0, wr0, wr1\n"
+
+ "subs %1, %1, #1\n"
+ "bne 1b\n"
+
+ "tmrrc %0, %1, wr0\n"
+ "clz %0, %0\n"
+ "rsb %0, %0, %5\n"
+ "str %0, [%3]\n"
+
+ "clz %1, %1\n"
+ "rsb %1, %1, %5\n"
+ "str %1, [%3, #4]\n"
+ : "+&r" (blk), "+&r" (b)
+ : "i" ((char *) &sb_sample_f[1][0][0] -
+ (char *) &sb_sample_f[0][0][0]),
+ "r" (&scale_factor[ch][sb]),
+ "r" (1 << SCALE_OUT_BITS),
+ "i" (SCALE_OUT_BITS+1)
+ : "memory");
+ }
+ }
+}
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *state)
+{
+ state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_iwmmxt;
+ state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_iwmmxt;
+ state->sbc_calc_scalefactors = sbc_calc_scalefactors_iwmmxt;
+ state->implementation_info = "IWMMXT";
+}
+
+#endif
diff --git a/sbc/sbc_primitives_iwmmxt.h b/sbc/sbc_primitives_iwmmxt.h
new file mode 100644
index 0000000..827d811
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.h
@@ -0,0 +1,38 @@
+/*
+ *
+ * Bluetooth low-complexity, subband codec (SBC) library
+ *
+ * Based on sbc_primitives_mmx.c
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef __SBC_PRIMITIVES_IWMMXT_H
+#define __SBC_PRIMITIVES_IWMMXT_H
+
+#include "sbc_primitives.h"
+
+#if defined(__GNUC__) && defined(__IWMMXT__) && \
+ !defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15)
+
+#define SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *encoder_state);
+
+#endif
+
+#endif
--
1.6.3.3
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH] Add iwmmxt optimization for sbc for pxa series cpu
2010-11-11 8:05 [PATCH] Add iwmmxt optimization for sbc for pxa series cpu Keith Mok
@ 2010-11-11 11:46 ` Siarhei Siamashka
2010-11-12 7:35 ` [PATCH v2] " Keith Mok
0 siblings, 1 reply; 10+ messages in thread
From: Siarhei Siamashka @ 2010-11-11 11:46 UTC (permalink / raw)
To: Keith Mok; +Cc: linux-bluetooth
On Thursday 11 November 2010 10:05:46 Keith Mok wrote:
> This patch add iwmmxt (Intel wireless mmx, pxa platform) optimzation
> for sbc, based on the mmx code.
> Have verified the encoded result against the mmx generated one.
Nice, I guess it should provide a noticeable performance improvement on this
hardware.
Did you run some benchmarks with these optimizations to measure how much they
are helping? The most interesting numbers are for the "44100 Hz audio
with bitpool set to 53, 8 subbands, joint stereo" case, which is typically
used for A2DP. This can be done by running:
$ time ./sbcenc -b53 -s8 -j test.au > /dev/null
In my opinion, commit messages for the performance patches are more descriptive
in the following format:
http://git.kernel.org/?p=bluetooth/bluez.git;a=commit;h=e80454d08b4ec098024ddfbdffbd71e9d2f81bd0
And splitting the patch into parts, adding one optimization at a time may be a
good idea (for bisecting purposes).
A few other comments below.
I don't have any IWMMXT capable hardware to test/benchmark, but I checked the
following manuals:
http://download.intel.com/design/intelxscale/31451001.pdf
http://download.intel.com/design/intelxscale/27347302.pdf
> +static inline void sbc_analyze_four_iwmmxt(const int16_t *in, int32_t
> *out, + const FIXED_T *consts)
> +{
> + asm volatile (
> + "tbcstw wr4, %2\n"
> + "wldrd wr0, [%0]\n"
> + "wldrd wr1, [%0, #8]\n"
> + "wldrd wr2, [%1]\n"
> + "wldrd wr3, [%1, #8]\n"
Using back-to-back WLDRD instructions has some performance penalty:
"D.3.2.3 Memory Control Pipeline
There is also an additional stall introduced by the core when 2 double word (64
bits) are issued back to back such as:
WLDRD or WSTRD
WLDR[B,H,W,D] or WSTR[B,H,W,D] <- 1 cycle stall.
Critical inner loop sequences can use non memory related instructions following
a WLDRD or WSTRD."
It's better to try rearranging the code so that load instructions are
interleaved with the others whenever it is possible.
> + "wmadds wr0, wr2, wr0\n"
> + "wmadds wr1, wr3, wr1\n"
> + "waddwss wr0, wr0, wr4\n"
> + "waddwss wr1, wr1, wr4\n"
> + "\n"
> + "wldrd wr2, [%0, #16]\n"
> + "wldrd wr3, [%0, #24]\n"
> + "wldrd wr4, [%1, #16]\n"
^^^^^^ (1)
> + "wldrd wr5, [%1, #24]\n"
> + "wmadds wr2, wr4, wr2\n"
^^^^^^^ (2)
It also makes sense to pay attention to instruction latencies. Here you use wr4
register (2) after loading (1) with only one unrelated instruction in between.
And according to "Table D-1. Issue Cycle and Result Latency of the Intel®
Wireless MMX™ 2 Coprocessor Instructions", WLDRD has a result latency of 3, so
it works best if you insert 2 unrelated instructions in between.
> + "wmadds wr3, wr5, wr3\n"
> + "waddwss wr0, wr2, wr0\n"
> + "waddwss wr1, wr3, wr1\n"
> + "\n"
> + "wldrd wr2, [%0, #32]\n"
> + "wldrd wr3, [%0, #40]\n"
> + "wldrd wr4, [%1, #32]\n"
> + "wldrd wr5, [%1, #40]\n"
> + "wmadds wr2, wr4, wr2\n"
> + "wmadds wr3, wr5, wr3\n"
According to "Table D-3. Resource Availability Delay for the Multiplier
Pipeline", back-to-back WMADD instructions may have a performance penalty.
> + "waddwss wr0, wr2, wr0\n"
> + "waddwss wr1, wr3, wr1\n"
> + "\n"
> + "wldrd wr2, [%0, #48]\n"
> + "wldrd wr3, [%0, #56]\n"
> + "wldrd wr4, [%1, #48]\n"
> + "wldrd wr5, [%1, #56]\n"
> + "wmadds wr2, wr4, wr2\n"
> + "wmadds wr3, wr5, wr3\n"
> + "waddwss wr0, wr2, wr0\n"
> + "waddwss wr1, wr3, wr1\n"
> + "\n"
> + "wldrd wr2, [%0, #64]\n"
> + "wldrd wr3, [%0, #72]\n"
> + "wldrd wr4, [%1, #64]\n"
> + "wldrd wr5, [%1, #72]\n"
> + "wmadds wr2, wr4, wr2\n"
> + "wmadds wr3, wr5, wr3\n"
> + "waddwss wr0, wr2, wr0\n"
> + "waddwss wr1, wr3, wr1\n"
> + "\n"
> + "tmcr wcgr0, %4\n"
> + "wsrawg wr0, wr0, wcgr0\n"
> + "wsrawg wr1, wr1, wcgr0\n"
> + "wpackwss wr0, wr0, wr0\n"
> + "wpackwss wr1, wr1, wr1\n"
> + "\n"
> + "wldrd wr4, [%1, #80]\n"
> + "wldrd wr5, [%1, #88]\n"
> + "wldrd wr6, [%1, #96]\n"
> + "wldrd wr7, [%1, #104]\n"
> + "wmadds wr2, wr5, wr0\n"
> + "wmadds wr0, wr4, wr0\n"
> + "\n"
> + "wmadds wr3, wr7, wr1\n"
> + "wmadds wr1, wr6, wr1\n"
> + "waddwss wr0, wr1, wr0\n"
> + "waddwss wr2, wr3, wr2\n"
> + "\n"
> + "wstrd wr0, [%3]\n"
> + "wstrd wr2, [%3, #8]\n"
> + :
> + : "r" (in), "r" (consts),
> + "r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
> + "r" (SBC_PROTO_FIXED4_SCALE)
> + : "memory");
> +}
> +static void sbc_calc_scalefactors_iwmmxt(
> + int32_t sb_sample_f[16][2][8],
> + uint32_t scale_factor[2][8],
> + int blocks, int channels, int subbands)
> +{
> + int ch, sb;
> + intptr_t blk;
> + for (ch = 0; ch < channels; ch++) {
> + for (sb = 0; sb < subbands; sb += 2) {
> + int b;
> + blk = &sb_sample_f[0][ch][sb];
> + b = blocks;
> + asm volatile (
> + "tbcstw wr0, %4\n"
> + "1:\n"
> + "wldrd wr1, [%0], %2\n"
> + "wxor wr2, wr2, wr2\n"
> + "wcmpgtsw wr3, wr1, wr2\n"
The MMX code was using PCMPGTD and the other instructions just because MMX
instruction set is very limited and did not have the needed instructions. But
you can use WABS and WMAX instructions to do this job better. You can refer to
the original C code and also to ARM NEON optimizations to get some ideas about
how to do this operation faster.
> + "waddwss wr1, wr1, wr3\n"
> + "wcmpgtsw wr2, wr2, wr1\n"
> + "wxor wr1, wr1, wr2\n"
> +
> + "wor wr0, wr0, wr1\n"
> +
> + "subs %1, %1, #1\n"
> + "bne 1b\n"
> +
> + "tmrrc %0, %1, wr0\n"
> + "clz %0, %0\n"
> + "rsb %0, %0, %5\n"
> + "str %0, [%3]\n"
> +
> + "clz %1, %1\n"
> + "rsb %1, %1, %5\n"
> + "str %1, [%3, #4]\n"
> + : "+&r" (blk), "+&r" (b)
> + : "i" ((char *) &sb_sample_f[1][0][0] -
> + (char *) &sb_sample_f[0][0][0]),
> + "r" (&scale_factor[ch][sb]),
> + "r" (1 << SCALE_OUT_BITS),
> + "i" (SCALE_OUT_BITS+1)
> + : "memory");
And this is actually a bug, which exists in the original MMX code too (my
fault). In order to fix it, "cc" needs to be added to the clobber list. I have
just sent a patch for MMX code here:
http://marc.info/?l=linux-bluetooth&m=128946780706187&w=2
Such a bug is more dangerous on ARM, because it is up to the developer whether to
update flags in each particular instruction or not. So while almost every
arithmetic x86 instruction updates flags unconditionally, on ARM the flags can
easily survive long enough. That makes it possible for the compiler to
implement more clever optimizations related to setting and checking flags, and
fail if the clobber list does not contain correct information.
> + }
> + }
> +}
--
Best regards,
Siarhei Siamashka
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v2] Add iwmmxt optimization for sbc for pxa series cpu
2010-11-11 11:46 ` Siarhei Siamashka
@ 2010-11-12 7:35 ` Keith Mok
2010-11-12 13:22 ` Siarhei Siamashka
0 siblings, 1 reply; 10+ messages in thread
From: Keith Mok @ 2010-11-12 7:35 UTC (permalink / raw)
To: Siarhei Siamashka; +Cc: linux-bluetooth
> Did you run some benchmarks with these optimizations to measure how much they
> are helping?
Tested on Marvell PXA platform.
== Before ==
$ time ./sbcenc -b53 -s8 -j c.au > /dev/null
real 0m 0.41s
user 0m 0.40s
sys 0m 0.00s
== After ==
$ time ./sbcenc -b53 -s8 -j c.au > /dev/null
real 0m 0.19s
user 0m 0.17s
sys 0m 0.02s
> Using back-to-back WLDRD instructions has some performance penalty
I rearranged the instructions and kept the original sequence for reference in
a disabled (#if 0) block, since the code is really difficult to read
once it is interleaved.
> The MMX code was using PCMPGTD and the other instructions just because MMX
> instruction set is very limited and did not have the needed instructions. But
> you can use WABS and WMAX instructions to do this job better. You can refer to
> the original C code and also to ARM NEON optimizations to get some ideas about
> how to do this operation faster.
Changed as suggested.
But I have a question: the __IWMMXT__ built-in gcc definition is not a
reliable way to determine whether -mcpu=iwmmxt2 is turned on or not.
The build will break when compiling for the PXA270, which does not
support wabs, with just -mcpu=iwmmxt on.
Keith
Signed-off-by: Keith Mok <ek9852@gmail.com>
---
diff --git a/Makefile.am b/Makefile.am
index da308a7..03a9bf2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -65,6 +65,7 @@ noinst_LTLIBRARIES += sbc/libsbc.la
sbc_libsbc_la_SOURCES = sbc/sbc.h sbc/sbc.c sbc/sbc_math.h sbc/sbc_tables.h \
sbc/sbc_primitives.h sbc/sbc_primitives.c \
sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
+ sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
sbc/sbc_primitives_armv6.h sbc/sbc_primitives_armv6.c
diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index f87fb5a..ad780d0 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -33,6 +33,7 @@
#include "sbc_primitives.h"
#include "sbc_primitives_mmx.h"
+#include "sbc_primitives_iwmmxt.h"
#include "sbc_primitives_neon.h"
#include "sbc_primitives_armv6.h"
@@ -544,6 +545,9 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
#ifdef SBC_BUILD_WITH_ARMV6_SUPPORT
sbc_init_primitives_armv6(state);
#endif
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+ sbc_init_primitives_iwmmxt(state);
+#endif
#ifdef SBC_BUILD_WITH_NEON_SUPPORT
sbc_init_primitives_neon(state);
#endif
diff --git a/sbc/sbc_primitives_iwmmxt.c b/sbc/sbc_primitives_iwmmxt.c
new file mode 100644
index 0000000..b988bb1
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.c
@@ -0,0 +1,599 @@
+/*
+ *
+ * Bluetooth low-complexity, subband codec (SBC) library
+ *
+ * Copyright (C) 2010 Keith Mok <ek9852@gmail.com>
+ * Based on sbc_primitives_mmx.c
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <stdint.h>
+#include <limits.h>
+#include "sbc.h"
+#include "sbc_math.h"
+#include "sbc_tables.h"
+
+#include "sbc_primitives_iwmmxt.h"
+
+/*
+ * IWMMXT optimizations
+ */
+
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+static inline void sbc_analyze_four_iwmmxt(const int16_t *in, int32_t *out,
+ const FIXED_T *consts)
+{
+ asm volatile (
+ "wldrd wr0, [%0]\n"
+ "tbcstw wr4, %2\n"
+ "wldrd wr2, [%1]\n"
+ "wldrd wr1, [%0, #8]\n"
+ "wldrd wr3, [%1, #8]\n"
+ "wmadds wr0, wr2, wr0\n"
+ "wldrd wr6, [%0, #16]\n"
+ "wmadds wr1, wr3, wr1\n"
+ "wldrd wr7, [%0, #24]\n"
+ "waddwss wr0, wr0, wr4\n"
+ "wldrd wr8, [%1, #16]\n"
+ "waddwss wr1, wr1, wr4\n"
+ "wldrd wr9, [%1, #24]\n"
+ "wmadds wr6, wr8, wr6\n"
+ "wldrd wr2, [%0, #32]\n"
+ "wmadds wr7, wr9, wr7\n"
+ "wldrd wr3, [%0, #40]\n"
+ "waddwss wr0, wr6, wr0\n"
+ "wldrd wr4, [%1, #32]\n"
+ "waddwss wr1, wr7, wr1\n"
+ "wldrd wr5, [%1, #40]\n"
+ "wmadds wr2, wr4, wr2\n"
+ "wldrd wr6, [%0, #48]\n"
+ "wmadds wr3, wr5, wr3\n"
+ "wldrd wr7, [%0, #56]\n"
+ "waddwss wr0, wr2, wr0\n"
+ "wldrd wr8, [%1, #48]\n"
+ "waddwss wr1, wr3, wr1\n"
+ "wldrd wr9, [%1, #56]\n"
+ "wmadds wr6, wr8, wr6\n"
+ "wldrd wr2, [%0, #64]\n"
+ "wmadds wr7, wr9, wr7\n"
+ "wldrd wr3, [%0, #72]\n"
+ "waddwss wr0, wr6, wr0\n"
+ "wldrd wr4, [%1, #64]\n"
+ "waddwss wr1, wr7, wr1\n"
+ "wldrd wr5, [%1, #72]\n"
+ "wmadds wr2, wr4, wr2\n"
+ "wmadds wr3, wr5, wr3\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr1, wr3, wr1\n"
+ "\n"
+ "tmcr wcgr0, %4\n"
+ "wsrawg wr0, wr0, wcgr0\n"
+ "wldrd wr4, [%1, #80]\n"
+ "wsrawg wr1, wr1, wcgr0\n"
+ "wldrd wr5, [%1, #88]\n"
+ "wpackwss wr0, wr0, wr0\n"
+ "wldrd wr6, [%1, #96]\n"
+ "wpackwss wr1, wr1, wr1\n"
+ "wldrd wr7, [%1, #104]\n"
+ "wmadds wr2, wr5, wr0\n"
+ "wmadds wr0, wr4, wr0\n"
+ "\n"
+ "wmadds wr3, wr7, wr1\n"
+ "wmadds wr1, wr6, wr1\n"
+ "waddwss wr0, wr1, wr0\n"
+ "waddwss wr2, wr3, wr2\n"
+ "\n"
+ "wstrd wr0, [%3]\n"
+ "wstrd wr2, [%3, #8]\n"
+ :
+ : "r" (in), "r" (consts),
+ "r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
+ "r" (SBC_PROTO_FIXED4_SCALE)
+ : "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+ "wr8", "wr9", "wcgr0", "memory");
+#if 0
+ /* without pipeline and resultant latency consideration
+ * keep it here for reference
+ * since the latency optimizated code above is difficult to read */
+ asm volatile (
+ "tbcstw wr4, %2\n"
+ "wldrd wr0, [%0]\n"
+ "wldrd wr1, [%0, #8]\n"
+ "wldrd wr2, [%1]\n"
+ "wldrd wr3, [%1, #8]\n"
+ "wmadds wr0, wr2, wr0\n"
+ "wmadds wr1, wr3, wr1\n"
+ "waddwss wr0, wr0, wr4\n"
+ "waddwss wr1, wr1, wr4\n"
+ "\n"
+ "wldrd wr2, [%0, #16]\n"
+ "wldrd wr3, [%0, #24]\n"
+ "wldrd wr4, [%1, #16]\n"
+ "wldrd wr5, [%1, #24]\n"
+ "wmadds wr2, wr4, wr2\n"
+ "wmadds wr3, wr5, wr3\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr1, wr3, wr1\n"
+ "\n"
+ "wldrd wr2, [%0, #32]\n"
+ "wldrd wr3, [%0, #40]\n"
+ "wldrd wr4, [%1, #32]\n"
+ "wldrd wr5, [%1, #40]\n"
+ "wmadds wr2, wr4, wr2\n"
+ "wmadds wr3, wr5, wr3\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr1, wr3, wr1\n"
+ "\n"
+ "wldrd wr2, [%0, #48]\n"
+ "wldrd wr3, [%0, #56]\n"
+ "wldrd wr4, [%1, #48]\n"
+ "wldrd wr5, [%1, #56]\n"
+ "wmadds wr2, wr4, wr2\n"
+ "wmadds wr3, wr5, wr3\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr1, wr3, wr1\n"
+ "\n"
+ "wldrd wr2, [%0, #64]\n"
+ "wldrd wr3, [%0, #72]\n"
+ "wldrd wr4, [%1, #64]\n"
+ "wldrd wr5, [%1, #72]\n"
+ "wmadds wr2, wr4, wr2\n"
+ "wmadds wr3, wr5, wr3\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr1, wr3, wr1\n"
+ "\n"
+ "tmcr wcgr0, %4\n"
+ "wsrawg wr0, wr0, wcgr0\n"
+ "wsrawg wr1, wr1, wcgr0\n"
+ "wpackwss wr0, wr0, wr0\n"
+ "wpackwss wr1, wr1, wr1\n"
+ "\n"
+ "wldrd wr4, [%1, #80]\n"
+ "wldrd wr5, [%1, #88]\n"
+ "wldrd wr6, [%1, #96]\n"
+ "wldrd wr7, [%1, #104]\n"
+ "wmadds wr2, wr5, wr0\n"
+ "wmadds wr0, wr4, wr0\n"
+ "\n"
+ "wmadds wr3, wr7, wr1\n"
+ "wmadds wr1, wr6, wr1\n"
+ "waddwss wr0, wr1, wr0\n"
+ "waddwss wr2, wr3, wr2\n"
+ "\n"
+ "wstrd wr0, [%3]\n"
+ "wstrd wr2, [%3, #8]\n"
+ :
+ : "r" (in), "r" (consts),
+ "r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
+ "r" (SBC_PROTO_FIXED4_SCALE)
+ : "memory");
+#endif
+}
+
+static inline void sbc_analyze_eight_iwmmxt(const int16_t *in, int32_t *out,
+ const FIXED_T *consts)
+{
+ asm volatile (
+ "wldrd wr0, [%0]\n"
+ "tbcstw wr15, %2\n"
+ "wldrd wr1, [%0, #8]\n"
+ "wldrd wr2, [%0, #16]\n"
+ "wldrd wr3, [%0, #24]\n"
+ "wldrd wr4, [%1]\n"
+ "wldrd wr5, [%1, #8]\n"
+ "wldrd wr6, [%1, #16]\n"
+ "wldrd wr7, [%1, #24]\n"
+ "wmadds wr0, wr0, wr4\n"
+ "wldrd wr8, [%1, #32]\n"
+ "wmadds wr1, wr1, wr5\n"
+ "wldrd wr9, [%1, #40]\n"
+ "wmadds wr2, wr2, wr6\n"
+ "wldrd wr10, [%1, #48]\n"
+ "wmadds wr3, wr3, wr7\n"
+ "wldrd wr11, [%1, #56]\n"
+ "waddwss wr0, wr0, wr15\n"
+ "wldrd wr4, [%0, #32]\n"
+ "waddwss wr1, wr1, wr15\n"
+ "wldrd wr5, [%0, #40]\n"
+ "waddwss wr2, wr2, wr15\n"
+ "wldrd wr6, [%0, #48]\n"
+ "waddwss wr3, wr3, wr15\n"
+ "wldrd wr7, [%0, #56]\n"
+ "wmadds wr4, wr4, wr8\n"
+ "wldrd wr12, [%0, #64]\n"
+ "wmadds wr5, wr5, wr9\n"
+ "wldrd wr13, [%0, #72]\n"
+ "wmadds wr6, wr6, wr10\n"
+ "wldrd wr14, [%0, #80]\n"
+ "wmadds wr7, wr7, wr11\n"
+ "wldrd wr15, [%0, #88]\n"
+ "waddwss wr0, wr4, wr0\n"
+ "wldrd wr8, [%1, #64]\n"
+ "waddwss wr1, wr5, wr1\n"
+ "wldrd wr9, [%1, #72]\n"
+ "waddwss wr2, wr6, wr2\n"
+ "wldrd wr10, [%1, #80]\n"
+ "waddwss wr3, wr7, wr3\n"
+ "wldrd wr11, [%1, #88]\n"
+ "wmadds wr12, wr12, wr8\n"
+ "wldrd wr4, [%0, #96]\n"
+ "wmadds wr13, wr13, wr9\n"
+ "wldrd wr5, [%0, #104]\n"
+ "wmadds wr14, wr14, wr10\n"
+ "wldrd wr6, [%0, #112]\n"
+ "wmadds wr15, wr15, wr11\n"
+ "wldrd wr7, [%0, #120]\n"
+ "waddwss wr0, wr12, wr0\n"
+ "wldrd wr8, [%1, #96]\n"
+ "waddwss wr1, wr13, wr1\n"
+ "wldrd wr9, [%1, #104]\n"
+ "waddwss wr2, wr14, wr2\n"
+ "wldrd wr10, [%1, #112]\n"
+ "waddwss wr3, wr15, wr3\n"
+ "wldrd wr11, [%1, #120]\n"
+ "wmadds wr4, wr4, wr8\n"
+ "wldrd wr12, [%0, #128]\n"
+ "wmadds wr5, wr5, wr9\n"
+ "wldrd wr13, [%0, #136]\n"
+ "wmadds wr6, wr6, wr10\n"
+ "wldrd wr14, [%0, #144]\n"
+ "wmadds wr7, wr7, wr11\n"
+ "wldrd wr15, [%0, #152]\n"
+ "waddwss wr0, wr4, wr0\n"
+ "wldrd wr8, [%1, #128]\n"
+ "waddwss wr1, wr5, wr1\n"
+ "wldrd wr9, [%1, #136]\n"
+ "waddwss wr2, wr6, wr2\n"
+ "wldrd wr10, [%1, #144]\n"
+ "waddwss wr3, wr7, wr3\n"
+ "wldrd wr11, [%1, #152]\n"
+ "wmadds wr12, wr12, wr8\n"
+ "wmadds wr13, wr13, wr9\n"
+ "wmadds wr14, wr14, wr10\n"
+ "wmadds wr15, wr15, wr11\n"
+ "waddwss wr0, wr12, wr0\n"
+ "waddwss wr1, wr13, wr1\n"
+ "waddwss wr2, wr14, wr2\n"
+ "waddwss wr3, wr15, wr3\n"
+ "\n"
+ "tmcr wcgr0, %4\n"
+ "wsrawg wr0, wr0, wcgr0\n"
+ "wsrawg wr1, wr1, wcgr0\n"
+ "wsrawg wr2, wr2, wcgr0\n"
+ "wsrawg wr3, wr3, wcgr0\n"
+ "\n"
+ "wpackwss wr0, wr0, wr0\n"
+ "wpackwss wr1, wr1, wr1\n"
+ "wldrd wr4, [%1, #160]\n"
+ "wpackwss wr2, wr2, wr2\n"
+ "wldrd wr5, [%1, #168]\n"
+ "wpackwss wr3, wr3, wr3\n"
+ "wldrd wr6, [%1, #192]\n"
+ "wmadds wr4, wr4, wr0\n"
+ "wldrd wr7, [%1, #200]\n"
+ "wmadds wr5, wr5, wr0\n"
+ "wldrd wr8, [%1, #224]\n"
+ "wmadds wr6, wr6, wr1\n"
+ "wldrd wr9, [%1, #232]\n"
+ "wmadds wr7, wr7, wr1\n"
+ "waddwss wr4, wr6, wr4\n"
+ "waddwss wr5, wr7, wr5\n"
+ "wmadds wr8, wr8, wr2\n"
+ "wldrd wr6, [%1, #256]\n"
+ "wmadds wr9, wr9, wr2\n"
+ "wldrd wr7, [%1, #264]\n"
+ "waddwss wr4, wr8, wr4\n"
+ "waddwss wr5, wr9, wr5\n"
+ "wmadds wr6, wr6, wr3\n"
+ "wmadds wr7, wr7, wr3\n"
+ "waddwss wr4, wr6, wr4\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wstrd wr4, [%3]\n"
+ "wstrd wr5, [%3, #8]\n"
+ "\n"
+ "wldrd wr6, [%1, #176]\n"
+ "wldrd wr5, [%1, #184]\n"
+ "wmadds wr5, wr5, wr0\n"
+ "wldrd wr8, [%1, #208]\n"
+ "wmadds wr0, wr6, wr0\n"
+ "wldrd wr9, [%1, #216]\n"
+ "wmadds wr9, wr9, wr1\n"
+ "wldrd wr6, [%1, #240]\n"
+ "wmadds wr1, wr8, wr1\n"
+ "wldrd wr7, [%1, #248]\n"
+ "waddwss wr0, wr1, wr0\n"
+ "waddwss wr5, wr9, wr5\n"
+ "wmadds wr7, wr7, wr2\n"
+ "wldrd wr8, [%1, #272]\n"
+ "wmadds wr2, wr6, wr2\n"
+ "wldrd wr9, [%1, #280]\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr5, wr7, wr5\n"
+ "wmadds wr9, wr9, wr3\n"
+ "wmadds wr3, wr8, wr3\n"
+ "waddwss wr0, wr3, wr0\n"
+ "waddwss wr5, wr9, wr5\n"
+ "\n"
+ "wstrd wr0, [%3, #16]\n"
+ "wstrd wr5, [%3, #24]\n"
+ :
+ : "r" (in), "r" (consts),
+ "r" (1 << (SBC_PROTO_FIXED8_SCALE - 1)), "r" (out),
+ "r" (SBC_PROTO_FIXED8_SCALE)
+ : "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+ "wr8", "wr9", "wr10", "wr11", "wr12", "wr13", "wr14", "wr15",
+ "wcgr0", "memory");
+#if 0
+ /* without pipeline and resultant latency consideration
+ * keep it here for reference
+ * since the latency optimizated code above is difficult to read */
+ asm volatile (
+ "tbcstw wr8, %2\n"
+ "wldrd wr0, [%0]\n"
+ "wldrd wr1, [%0, #8]\n"
+ "wldrd wr2, [%0, #16]\n"
+ "wldrd wr3, [%0, #24]\n"
+ "wldrd wr4, [%1]\n"
+ "wldrd wr5, [%1, #8]\n"
+ "wldrd wr6, [%1, #16]\n"
+ "wldrd wr7, [%1, #24]\n"
+ "wmadds wr0, wr0, wr4\n"
+ "wmadds wr1, wr1, wr5\n"
+ "wmadds wr2, wr2, wr6\n"
+ "wmadds wr3, wr3, wr7\n"
+ "waddwss wr0, wr0, wr8\n"
+ "waddwss wr1, wr1, wr8\n"
+ "waddwss wr2, wr2, wr8\n"
+ "waddwss wr3, wr3, wr8\n"
+ "\n"
+ "wldrd wr4, [%0, #32]\n"
+ "wldrd wr5, [%0, #40]\n"
+ "wldrd wr6, [%0, #48]\n"
+ "wldrd wr7, [%0, #56]\n"
+ "wldrd wr8, [%1, #32]\n"
+ "wldrd wr9, [%1, #40]\n"
+ "wldrd wr10, [%1, #48]\n"
+ "wldrd wr11, [%1, #56]\n"
+ "wmadds wr4, wr4, wr8\n"
+ "wmadds wr5, wr5, wr9\n"
+ "wmadds wr6, wr6, wr10\n"
+ "wmadds wr7, wr7, wr11\n"
+ "waddwss wr0, wr4, wr0\n"
+ "waddwss wr1, wr5, wr1\n"
+ "waddwss wr2, wr6, wr2\n"
+ "waddwss wr3, wr7, wr3\n"
+ "\n"
+ "wldrd wr4, [%0, #64]\n"
+ "wldrd wr5, [%0, #72]\n"
+ "wldrd wr6, [%0, #80]\n"
+ "wldrd wr7, [%0, #88]\n"
+ "wldrd wr8, [%1, #64]\n"
+ "wldrd wr9, [%1, #72]\n"
+ "wldrd wr10, [%1, #80]\n"
+ "wldrd wr11, [%1, #88]\n"
+ "wmadds wr4, wr4, wr8\n"
+ "wmadds wr5, wr5, wr9\n"
+ "wmadds wr6, wr6, wr10\n"
+ "wmadds wr7, wr7, wr11\n"
+ "waddwss wr0, wr4, wr0\n"
+ "waddwss wr1, wr5, wr1\n"
+ "waddwss wr2, wr6, wr2\n"
+ "waddwss wr3, wr7, wr3\n"
+ "\n"
+ "wldrd wr4, [%0, #96]\n"
+ "wldrd wr5, [%0, #104]\n"
+ "wldrd wr6, [%0, #112]\n"
+ "wldrd wr7, [%0, #120]\n"
+ "wldrd wr8, [%1, #96]\n"
+ "wldrd wr9, [%1, #104]\n"
+ "wldrd wr10, [%1, #112]\n"
+ "wldrd wr11, [%1, #120]\n"
+ "wmadds wr4, wr4, wr8\n"
+ "wmadds wr5, wr5, wr9\n"
+ "wmadds wr6, wr6, wr10\n"
+ "wmadds wr7, wr7, wr11\n"
+ "waddwss wr0, wr4, wr0\n"
+ "waddwss wr1, wr5, wr1\n"
+ "waddwss wr2, wr6, wr2\n"
+ "waddwss wr3, wr7, wr3\n"
+ "\n"
+ "wldrd wr4, [%0, #128]\n"
+ "wldrd wr5, [%0, #136]\n"
+ "wldrd wr6, [%0, #144]\n"
+ "wldrd wr7, [%0, #152]\n"
+ "wldrd wr8, [%1, #128]\n"
+ "wldrd wr9, [%1, #136]\n"
+ "wldrd wr10, [%1, #144]\n"
+ "wldrd wr11, [%1, #152]\n"
+ "wmadds wr4, wr4, wr8\n"
+ "wmadds wr5, wr5, wr9\n"
+ "wmadds wr6, wr6, wr10\n"
+ "wmadds wr7, wr7, wr11\n"
+ "waddwss wr0, wr4, wr0\n"
+ "waddwss wr1, wr5, wr1\n"
+ "waddwss wr2, wr6, wr2\n"
+ "waddwss wr3, wr7, wr3\n"
+ "\n"
+ "tmcr wcgr0, %4\n"
+ "wsrawg wr0, wr0, wcgr0\n"
+ "wsrawg wr1, wr1, wcgr0\n"
+ "wsrawg wr2, wr2, wcgr0\n"
+ "wsrawg wr3, wr3, wcgr0\n"
+ "\n"
+ "wpackwss wr0, wr0, wr0\n"
+ "wpackwss wr1, wr1, wr1\n"
+ "wpackwss wr2, wr2, wr2\n"
+ "wpackwss wr3, wr3, wr3\n"
+ "\n"
+ "wldrd wr4, [%1, #160]\n"
+ "wldrd wr5, [%1, #168]\n"
+ "wmadds wr4, wr4, wr0\n"
+ "wmadds wr5, wr5, wr0\n"
+ "\n"
+ "wldrd wr6, [%1, #192]\n"
+ "wldrd wr7, [%1, #200]\n"
+ "wmadds wr6, wr6, wr1\n"
+ "wmadds wr7, wr7, wr1\n"
+ "waddwss wr4, wr6, wr4\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wldrd wr6, [%1, #224]\n"
+ "wldrd wr7, [%1, #232]\n"
+ "wmadds wr6, wr6, wr2\n"
+ "wmadds wr7, wr7, wr2\n"
+ "waddwss wr4, wr6, wr4\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wldrd wr6, [%1, #256]\n"
+ "wldrd wr7, [%1, #264]\n"
+ "wmadds wr6, wr6, wr3\n"
+ "wmadds wr7, wr7, wr3\n"
+ "waddwss wr4, wr6, wr4\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wstrd wr4, [%3]\n"
+ "wstrd wr5, [%3, #8]\n"
+ "\n"
+ "wldrd wr4, [%1, #176]\n"
+ "wldrd wr5, [%1, #184]\n"
+ "wmadds wr5, wr5, wr0\n"
+ "wmadds wr0, wr4, wr0\n"
+ "\n"
+ "wldrd wr4, [%1, #208]\n"
+ "wldrd wr7, [%1, #216]\n"
+ "wmadds wr7, wr7, wr1\n"
+ "wmadds wr1, wr4, wr1\n"
+ "waddwss wr0, wr1, wr0\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wldrd wr4, [%1, #240]\n"
+ "wldrd wr7, [%1, #248]\n"
+ "wmadds wr7, wr7, wr2\n"
+ "wmadds wr2, wr4, wr2\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wldrd wr4, [%1, #272]\n"
+ "wldrd wr7, [%1, #280]\n"
+ "wmadds wr7, wr7, wr3\n"
+ "wmadds wr3, wr4, wr3\n"
+ "waddwss wr0, wr3, wr0\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wstrd wr0, [%3, #16]\n"
+ "wstrd wr5, [%3, #24]\n"
+ :
+ : "r" (in), "r" (consts),
+ "r" (1 << (SBC_PROTO_FIXED8_SCALE - 1)), "r" (out),
+ "r" (SBC_PROTO_FIXED8_SCALE)
+ : "memory");
+#endif
+}
+
+static inline void sbc_analyze_4b_4s_iwmmxt(int16_t *x, int32_t *out,
+ int out_stride)
+{
+ /* Analyze four blocks: inputs x + 12, x + 8, x + 4, x + 0, alternating the odd/even constant tables */
+ sbc_analyze_four_iwmmxt(x + 12, out, analysis_consts_fixed4_simd_odd);
+ out += out_stride; /* step to the next block's output */
+ sbc_analyze_four_iwmmxt(x + 8, out, analysis_consts_fixed4_simd_even);
+ out += out_stride;
+ sbc_analyze_four_iwmmxt(x + 4, out, analysis_consts_fixed4_simd_odd);
+ out += out_stride;
+ sbc_analyze_four_iwmmxt(x + 0, out, analysis_consts_fixed4_simd_even);
+}
+
+static inline void sbc_analyze_4b_8s_iwmmxt(int16_t *x, int32_t *out,
+ int out_stride)
+{
+ /* Analyze four blocks: inputs x + 24, x + 16, x + 8, x + 0, alternating the odd/even constant tables */
+ sbc_analyze_eight_iwmmxt(x + 24, out, analysis_consts_fixed8_simd_odd);
+ out += out_stride; /* step to the next block's output */
+ sbc_analyze_eight_iwmmxt(x + 16, out, analysis_consts_fixed8_simd_even);
+ out += out_stride;
+ sbc_analyze_eight_iwmmxt(x + 8, out, analysis_consts_fixed8_simd_odd);
+ out += out_stride;
+ sbc_analyze_eight_iwmmxt(x + 0, out, analysis_consts_fixed8_simd_even);
+}
+
+static void sbc_calc_scalefactors_iwmmxt2(
+ int32_t sb_sample_f[16][2][8],
+ uint32_t scale_factor[2][8],
+ int blocks, int channels, int subbands)
+{
+ int ch, sb;
+ for (ch = 0; ch < channels; ch++) {
+ for (sb = 0; sb < subbands; sb += 2) { /* two adjacent subbands per pass, one per 32-bit word */
+ int blk = blocks;
+ int32_t *in = &sb_sample_f[0][ch][sb];
+ /* Requires iwmmxt2 because wabsw is used; writes scale_factor[ch][sb] and [sb + 1] */
+ asm volatile (
+ "wldrd wr1, [%[in]], %[inc]\n" /* preload four rows; each load post-increments in by one block */
+ "tbcstw wr0, %[c1]\n" /* running maximum starts at c1 in both words */
+ "wldrd wr2, [%[in]], %[inc]\n"
+ "wldrd wr3, [%[in]], %[inc]\n"
+ "wldrd wr4, [%[in]], %[inc]\n"
+ "1:\n"
+ "wabsw wr1, wr1\n" /* absolute values of the four loaded rows */
+ "wabsw wr2, wr2\n"
+ "wabsw wr3, wr3\n"
+ "wabsw wr4, wr4\n"
+ "wmaxuw wr5, wr1, wr2\n" /* unsigned word-wise maximum, reduced pairwise into wr0 */
+ "wldrd wr1, [%[in]], %[inc]\n" /* NOTE(review): these reloads also run on the last pass, */
+ "wmaxuw wr6, wr3, wr4\n"
+ "wldrd wr2, [%[in]], %[inc]\n" /* so four rows past the last processed block are read; */
+ "wmaxuw wr5, wr5, wr6\n"
+ "wldrd wr3, [%[in]], %[inc]\n" /* confirm the caller's buffer makes that safe */
+ "wmaxuw wr0, wr0, wr5\n"
+ "wldrd wr4, [%[in]], %[inc]\n"
+ "subs %[blk], %[blk], #4\n" /* four blocks per pass; presumably blocks is a multiple of 4 - TODO confirm */
+ "bgt 1b\n"
+
+ "tmrrc %0, %1, wr0\n" /* split wr0 into two ARM registers, reusing operands 0 (in) and 1 (blk) as scratch */
+ "sub %0, %0, #1\n" /* scale factor = c2 - clz(max - 1) */
+ "clz %0, %0\n"
+ "rsb %0, %0, %[c2]\n"
+ "str %0, [%[out]]\n"
+
+ "sub %1, %1, #1\n" /* same computation for the second word */
+ "clz %1, %1\n"
+ "rsb %1, %1, %[c2]\n"
+ "str %1, [%[out], #4]\n"
+ : [in] "+r" (in), [blk] "+r" (blk)
+ : [inc] "i" ((char *) &sb_sample_f[1][0][0] -
+ (char *) &sb_sample_f[0][0][0]), /* byte distance between consecutive blocks */
+ [out] "r" (&scale_factor[ch][sb]),
+ [c1] "r" ((1 << SCALE_OUT_BITS) + 1),
+ [c2] "i" (SCALE_OUT_BITS+1)
+ : "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6",
+ "cc", "memory");
+ }
+ }
+}
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *state)
+{
+ state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_iwmmxt; /* 4-subband analysis */
+ state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_iwmmxt; /* 8-subband analysis */
+ state->sbc_calc_scalefactors = sbc_calc_scalefactors_iwmmxt2; /* NOTE(review): uses wabsw (iwmmxt2), but the build guard only tests __IWMMXT__ - confirm for pxa270 */
+ state->implementation_info = "IWMMXT";
+}
+
+#endif
diff --git a/sbc/sbc_primitives_iwmmxt.h b/sbc/sbc_primitives_iwmmxt.h
new file mode 100644
index 0000000..827d811
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.h
@@ -0,0 +1,38 @@
+/*
+ *
+ * Bluetooth low-complexity, subband codec (SBC) library
+ *
+ * Based on sbc_primitives_mmx.c
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef __SBC_PRIMITIVES_IWMMXT_H
+#define __SBC_PRIMITIVES_IWMMXT_H
+
+#include "sbc_primitives.h"
+
+#if defined(__GNUC__) && defined(__IWMMXT__) && \
+ !defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15)
+
+#define SBC_BUILD_WITH_IWMMXT_SUPPORT /* tested by sbc_primitives.c and sbc_primitives_iwmmxt.c */
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *encoder_state); /* installs the iwMMXt primitives into the encoder state */
+
+#endif /* __GNUC__ && __IWMMXT__ && !SBC_HIGH_PRECISION && SCALE_OUT_BITS == 15 */
+
+#endif /* __SBC_PRIMITIVES_IWMMXT_H */
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH v2] Add iwmmxt optimization for sbc for pxa series cpu
2010-11-12 7:35 ` [PATCH v2] " Keith Mok
@ 2010-11-12 13:22 ` Siarhei Siamashka
2010-11-15 2:46 ` [PATCH v3] " Keith Mok
0 siblings, 1 reply; 10+ messages in thread
From: Siarhei Siamashka @ 2010-11-12 13:22 UTC (permalink / raw)
To: Keith Mok; +Cc: linux-bluetooth
[-- Attachment #1: Type: Text/Plain, Size: 5034 bytes --]
On Friday 12 November 2010 09:35:04 Keith Mok wrote:
> > Did you run some benchmarks with these optimizations to measure how much
> > they are helping?
>
> Tested on Marvell PXA platform.
> == Before ==
> $ time ./sbcenc -b53 -s8 -j c.au > /dev/null
> real 0m 0.41s
> user 0m 0.40s
> sys 0m 0.00s
>
> == After ==
> $ time ./sbcenc -b53 -s8 -j c.au > /dev/null
> real 0m 0.19s
> user 0m 0.17s
> sys 0m 0.02s
Thanks, this looks consistent with the results of optimizations on the other
platforms where the performance increases roughly twice after adding SIMD
optimizations to the sbc analysis filter.
But maybe it's better to use a bit bigger test file, so that the total time
increases to at least several seconds. With very small times, it's hard to say
whether it is an actual improvement or random noise. It may be ok for such a
huge performance improvement, but with less significant optimizations the
precision of measurements may become a problem.
Also do you have oprofile available on PXA platform? It may provide a nice
statistics about what functions are used and are the performance hot spots.
> > Using back-to-back WLDRD instructions has some performance penalty
>
> I rearrange the instructions and keep the original one as for reference in
> the block that comment out. Since the code is really difficult to read
> after interleaved.
Thanks, this looks like it really should run quite a bit faster than the
previous variant (based on my understanding of intel pdf files).
I sometimes use different indentation levels in such cases in order to improve
readability after instructions reordering, so that each logically independent
block of code has its own indentation level and it is still easily visible
after instructions reordering. For example, with the original code:
A1
A2
A3
A4
B1
B2
B3
B4
If the instructions need to be reordered in order to improve scheduling for the
cpu pipeline, then for example
A1
A2
    B1
A3
    B2
A4
    B3
    B4
looks much more readable to me than
A1
A2
B1
A3
B2
A4
B3
B4
With different indentation levels, one can still see the flow of instructions
as independent streams. If different levels of indentation in inline assembly
pass coding style test by checkpatch.pl script, then it should be fine.
Also I'm quite curious whether better instructions scheduling provide any clear
improvement, so some numbers comparing older and newer implementation would
be appreciated. I did not suggest that just for entertainment purposes ;) It
really should provide some practical benefit.
If you have time and want to make such a test, iwmmxt intrinsics could be also
tried, so that instructions scheduling and registers allocation becomes a
responsibility of the compiler. But my previous experiments with arm neon
intrinsics showed that the compiler does a very poor job and can't be trusted
to generate fast code. But maybe iwmmxt could be different or gcc could have
improved since then.
> > The MMX code was using PCMPGTD and the other instructions just because
> > MMX instruction set is very limited and did not have the needed
> > instructions. But you can use WABS and WMAX instructions to do this job
> > better. You can refer to the original C code and also to ARM NEON
> > optimizations to get some ideas about how to do this operation faster.
>
> Changed as suggested.
> But I have a question: the __IWMMXT__ builtin gcc definition is not a
> reliable way to
> determine whether -mcpu=iwmmxt2 is turned on or not. It will break when
> compiling for pxa270,
> which does not support wabs, with just -mcpu=iwmmxt on.
Well, as I said before, I'm not familiar with iwmmxt and pxa platform. And I
did not notice that there are actually several revisions of iwmmxt isa, my bad.
So looks like iwmmxt1 is just as restrictive as the original mmx and the direct
conversion from mmx like you did before may be the right thing. For arm neon
optimizations, the effect of using vector ABS/MAX instructions was just
about 1% of overall performance improvement. Not so much, but every little bit
helps. And if for iwmmxt it causes such backwards compatibility issues, then it
might be not worth it. It's up to you to decide.
I would still suggest to initially have just optimizations for
sbc_analyze_four_iwmmxt/sbc_analyze_eight_iwmmxt in the first patch (or maybe
in two patches). And then add optimization for sbc_calc_scalefactors in a
separate patch later.
Regarding the benchmarks and functions usage:
1. sbc_analyze_four_iwmmxt is important for 4 subbands case ('-s4' option for
sbcenc)
2. sbc_analyze_eight_iwmmxt is important for 8 subbands case ('-s8' option for
sbcenc)
3. sbc_calc_scalefactors is important for either mono audio, or when joint
stereo is *not* used (sbcenc is run without '-j' option).
All of this is better to be benchmarked/tested separately.
--
Best regards,
Siarhei Siamashka
[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 198 bytes --]
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3] Add iwmmxt optimization for sbc for pxa series cpu
2010-11-12 13:22 ` Siarhei Siamashka
@ 2010-11-15 2:46 ` Keith Mok
2010-11-15 11:08 ` Siarhei Siamashka
0 siblings, 1 reply; 10+ messages in thread
From: Keith Mok @ 2010-11-15 2:46 UTC (permalink / raw)
To: Siarhei Siamashka; +Cc: linux-bluetooth
> I sometimes use different indentation levels in such cases in order to improve
> readability after instructions reordering, so that each logically independent
> block of code has its own indentation level and it is still easily visible
> after instructions reordering. For example, with the original code:
Thanks for the hints. I rearranged the code.
> Not so much, but every little bit
> helps. And if for iwmmxt it causes such backwards compatibility issues, then it
> might be not worth it. It's up to you to decide.
I removed the scale_factor optimization since from the result I
tested, it shows little help in performance.
> Regarding the benchmarks and functions usage:
> 1. sbc_analyze_four_iwmmxt is important for 4 subbands case ('-s4' option for
> sbcenc)
> 2. sbc_analyze_eight_iwmmxt is important for 8 subbands case ('-s8' option for
> sbcenc)
=== Before (4 bands) ====
$ time ./sbcenc_orig -s 4 long.au > /dev/null
real 0m 2.44s
user 0m 2.39s
sys 0m 0.05s
=== After (4 bands) ====
$ time ./sbcenc -s 4 long.au > /dev/null
real 0m 1.59s
user 0m 1.49s
sys 0m 0.10s
=== Before (8 bands) ====
$ time ./sbcenc_orig -s 8 long.au > /dev/null
real 0m 4.05s
user 0m 3.98s
sys 0m 0.07s
=== After (8 bands) ====
$ time ./sbcenc -s 8 long.au > /dev/null
real 0m 1.48s
user 0m 1.41s
sys 0m 0.06s
=== Before (a2dp usage) ====
$ time ./sbcenc_orig -b53 -s8 -j long.au > /dev/null
real 0m 4.51s
user 0m 4.41s
sys 0m 0.10s
=== After (a2dp usage) ====
$ time ./sbcenc -b53 -s8 -j long.au > /dev/null
real 0m 2.05s
user 0m 1.99s
sys 0m 0.06s
Keith
Signed-off-by: Keith Mok <ek9852@gmail.com>
---
Makefile.am | 1 +
sbc/sbc_primitives.c | 4 +
sbc/sbc_primitives_iwmmxt.c | 301 +++++++++++++++++++++++++++++++++++++++++++
sbc/sbc_primitives_iwmmxt.h | 38 ++++++
4 files changed, 344 insertions(+), 0 deletions(-)
create mode 100644 sbc/sbc_primitives_iwmmxt.c
create mode 100644 sbc/sbc_primitives_iwmmxt.h
diff --git a/Makefile.am b/Makefile.am
index da308a7..03a9bf2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -65,6 +65,7 @@ noinst_LTLIBRARIES += sbc/libsbc.la
sbc_libsbc_la_SOURCES = sbc/sbc.h sbc/sbc.c sbc/sbc_math.h sbc/sbc_tables.h \
sbc/sbc_primitives.h sbc/sbc_primitives.c \
sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
+ sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
sbc/sbc_primitives_armv6.h sbc/sbc_primitives_armv6.c
diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index f87fb5a..ad780d0 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -33,6 +33,7 @@
#include "sbc_primitives.h"
#include "sbc_primitives_mmx.h"
+#include "sbc_primitives_iwmmxt.h"
#include "sbc_primitives_neon.h"
#include "sbc_primitives_armv6.h"
@@ -544,6 +545,9 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
#ifdef SBC_BUILD_WITH_ARMV6_SUPPORT
sbc_init_primitives_armv6(state);
#endif
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+ sbc_init_primitives_iwmmxt(state);
+#endif
#ifdef SBC_BUILD_WITH_NEON_SUPPORT
sbc_init_primitives_neon(state);
#endif
diff --git a/sbc/sbc_primitives_iwmmxt.c b/sbc/sbc_primitives_iwmmxt.c
new file mode 100644
index 0000000..fc462d2
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.c
@@ -0,0 +1,301 @@
+/*
+ *
+ * Bluetooth low-complexity, subband codec (SBC) library
+ *
+ * Copyright (C) 2010 Keith Mok <ek9852@gmail.com>
+ * Based on sbc_primitives_mmx.c
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <stdint.h>
+#include <limits.h>
+#include "sbc.h"
+#include "sbc_math.h"
+#include "sbc_tables.h"
+
+#include "sbc_primitives_iwmmxt.h"
+
+/*
+ * IWMMXT optimizations
+ */
+
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+static inline void sbc_analyze_four_iwmmxt(const int16_t *in, int32_t *out,
+ const FIXED_T *consts)
+{
+ asm volatile ( /* %0 = in, %1 = consts, %2 = rounding constant, %3 = out, %4 = shift count */
+ "wldrd wr0, [%0]\n"
+ "tbcstw wr4, %2\n" /* wr4 = rounding constant in both 32-bit words */
+ "wldrd wr2, [%1]\n"
+ "wldrd wr1, [%0, #8]\n"
+ "wldrd wr3, [%1, #8]\n"
+ "wmadds wr0, wr2, wr0\n" /* signed 16-bit multiplies, adjacent products summed into two 32-bit words */
+ " wldrd wr6, [%0, #16]\n"
+ "wmadds wr1, wr3, wr1\n"
+ " wldrd wr7, [%0, #24]\n"
+ "waddwss wr0, wr0, wr4\n" /* add the rounding constant (signed saturating); wr0/wr1 are the accumulators */
+ " wldrd wr8, [%1, #16]\n"
+ "waddwss wr1, wr1, wr4\n"
+ " wldrd wr9, [%1, #24]\n"
+ " wmadds wr6, wr8, wr6\n"
+ " wldrd wr2, [%0, #32]\n"
+ " wmadds wr7, wr9, wr7\n"
+ " wldrd wr3, [%0, #40]\n"
+ " waddwss wr0, wr6, wr0\n"
+ " wldrd wr4, [%1, #32]\n" /* wr4 is reused for constants; the rounding value is no longer needed */
+ " waddwss wr1, wr7, wr1\n"
+ " wldrd wr5, [%1, #40]\n"
+ " wmadds wr2, wr4, wr2\n"
+ "wldrd wr6, [%0, #48]\n"
+ " wmadds wr3, wr5, wr3\n"
+ "wldrd wr7, [%0, #56]\n"
+ " waddwss wr0, wr2, wr0\n"
+ "wldrd wr8, [%1, #48]\n"
+ " waddwss wr1, wr3, wr1\n"
+ "wldrd wr9, [%1, #56]\n"
+ "wmadds wr6, wr8, wr6\n"
+ " wldrd wr2, [%0, #64]\n"
+ "wmadds wr7, wr9, wr7\n"
+ " wldrd wr3, [%0, #72]\n"
+ "waddwss wr0, wr6, wr0\n"
+ " wldrd wr4, [%1, #64]\n"
+ "waddwss wr1, wr7, wr1\n"
+ " wldrd wr5, [%1, #72]\n"
+ " wmadds wr2, wr4, wr2\n"
+ "tmcr wcgr0, %4\n" /* wcgr0 = shift count for wsrawg below */
+ " wmadds wr3, wr5, wr3\n"
+ " waddwss wr0, wr2, wr0\n"
+ " waddwss wr1, wr3, wr1\n"
+ "\n"
+ "wsrawg wr0, wr0, wcgr0\n" /* arithmetic shift right of each word by wcgr0 */
+ " wldrd wr4, [%1, #80]\n"
+ "wsrawg wr1, wr1, wcgr0\n"
+ " wldrd wr5, [%1, #88]\n"
+ "wpackwss wr0, wr0, wr0\n" /* pack words to saturated halfwords (same register as both sources) */
+ " wldrd wr6, [%1, #96]\n"
+ "wpackwss wr1, wr1, wr1\n"
+ "wmadds wr2, wr5, wr0\n" /* second stage: packed values times constants at consts + 80..111 bytes */
+ " wldrd wr7, [%1, #104]\n"
+ "wmadds wr0, wr4, wr0\n"
+ "\n"
+ " wmadds wr3, wr7, wr1\n"
+ " wmadds wr1, wr6, wr1\n"
+ " waddwss wr2, wr3, wr2\n"
+ " waddwss wr0, wr1, wr0\n"
+ "\n"
+ "wstrd wr0, [%3]\n" /* store four 32-bit results to out */
+ "wstrd wr2, [%3, #8]\n"
+ :
+ : "r" (in), "r" (consts),
+ "r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
+ "r" (SBC_PROTO_FIXED4_SCALE)
+ : "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+ "wr8", "wr9", "wcgr0", "memory");
+}
+
+static inline void sbc_analyze_eight_iwmmxt(const int16_t *in, int32_t *out,
+ const FIXED_T *consts)
+{
+ asm volatile ( /* %0 = in, %1 = consts, %2 = rounding constant, %3 = out, %4 = shift count */
+ "wldrd wr0, [%0]\n"
+ "tbcstw wr15, %2\n" /* wr15 = rounding constant in both 32-bit words */
+ "wldrd wr1, [%0, #8]\n"
+ "wldrd wr2, [%0, #16]\n"
+ "wldrd wr3, [%0, #24]\n"
+ "wldrd wr4, [%1]\n"
+ "wldrd wr5, [%1, #8]\n"
+ "wldrd wr6, [%1, #16]\n"
+ "wldrd wr7, [%1, #24]\n"
+ "wmadds wr0, wr0, wr4\n" /* signed 16-bit multiplies, adjacent products summed into two 32-bit words */
+ " wldrd wr8, [%1, #32]\n"
+ "wmadds wr1, wr1, wr5\n"
+ " wldrd wr9, [%1, #40]\n"
+ "wmadds wr2, wr2, wr6\n"
+ " wldrd wr10, [%1, #48]\n"
+ "wmadds wr3, wr3, wr7\n"
+ " wldrd wr11, [%1, #56]\n"
+ "waddwss wr0, wr0, wr15\n" /* add the rounding constant (signed saturating); wr0..wr3 are the accumulators */
+ " wldrd wr4, [%0, #32]\n"
+ "waddwss wr1, wr1, wr15\n"
+ " wldrd wr5, [%0, #40]\n"
+ "waddwss wr2, wr2, wr15\n"
+ " wldrd wr6, [%0, #48]\n"
+ "waddwss wr3, wr3, wr15\n"
+ " wldrd wr7, [%0, #56]\n"
+ " wmadds wr4, wr4, wr8\n"
+ " wldrd wr12, [%0, #64]\n"
+ " wmadds wr5, wr5, wr9\n"
+ " wldrd wr13, [%0, #72]\n"
+ " wmadds wr6, wr6, wr10\n"
+ " wldrd wr14, [%0, #80]\n"
+ " wmadds wr7, wr7, wr11\n"
+ " wldrd wr15, [%0, #88]\n" /* wr15 is reused for input data; the rounding value is no longer needed */
+ " waddwss wr0, wr4, wr0\n"
+ " wldrd wr8, [%1, #64]\n"
+ " waddwss wr1, wr5, wr1\n"
+ " wldrd wr9, [%1, #72]\n"
+ " waddwss wr2, wr6, wr2\n"
+ " wldrd wr10, [%1, #80]\n"
+ " waddwss wr3, wr7, wr3\n"
+ " wldrd wr11, [%1, #88]\n"
+ " wmadds wr12, wr12, wr8\n"
+ "wldrd wr4, [%0, #96]\n"
+ " wmadds wr13, wr13, wr9\n"
+ "wldrd wr5, [%0, #104]\n"
+ " wmadds wr14, wr14, wr10\n"
+ "wldrd wr6, [%0, #112]\n"
+ " wmadds wr15, wr15, wr11\n"
+ "wldrd wr7, [%0, #120]\n"
+ " waddwss wr0, wr12, wr0\n"
+ "wldrd wr8, [%1, #96]\n"
+ " waddwss wr1, wr13, wr1\n"
+ "wldrd wr9, [%1, #104]\n"
+ " waddwss wr2, wr14, wr2\n"
+ "wldrd wr10, [%1, #112]\n"
+ " waddwss wr3, wr15, wr3\n"
+ "wldrd wr11, [%1, #120]\n"
+ "wmadds wr4, wr4, wr8\n"
+ " wldrd wr12, [%0, #128]\n"
+ "wmadds wr5, wr5, wr9\n"
+ " wldrd wr13, [%0, #136]\n"
+ "wmadds wr6, wr6, wr10\n"
+ " wldrd wr14, [%0, #144]\n"
+ "wmadds wr7, wr7, wr11\n"
+ " wldrd wr15, [%0, #152]\n"
+ "waddwss wr0, wr4, wr0\n"
+ " wldrd wr8, [%1, #128]\n"
+ "waddwss wr1, wr5, wr1\n"
+ " wldrd wr9, [%1, #136]\n"
+ "waddwss wr2, wr6, wr2\n"
+ " wldrd wr10, [%1, #144]\n"
+ " waddwss wr3, wr7, wr3\n"
+ " wldrd wr11, [%1, #152]\n"
+ " wmadds wr12, wr12, wr8\n"
+ "tmcr wcgr0, %4\n" /* wcgr0 = shift count for wsrawg below */
+ " wmadds wr13, wr13, wr9\n"
+ " wmadds wr14, wr14, wr10\n"
+ " wmadds wr15, wr15, wr11\n"
+ " waddwss wr0, wr12, wr0\n"
+ " waddwss wr1, wr13, wr1\n"
+ " waddwss wr2, wr14, wr2\n"
+ " waddwss wr3, wr15, wr3\n"
+ "\n"
+ "wsrawg wr0, wr0, wcgr0\n" /* arithmetic shift right of each word by wcgr0 */
+ "wsrawg wr1, wr1, wcgr0\n"
+ "wsrawg wr2, wr2, wcgr0\n"
+ "wsrawg wr3, wr3, wcgr0\n"
+ "\n"
+ "wpackwss wr0, wr0, wr0\n" /* pack words to saturated halfwords (same register as both sources) */
+ "wpackwss wr1, wr1, wr1\n"
+ " wldrd wr4, [%1, #160]\n"
+ "wpackwss wr2, wr2, wr2\n"
+ " wldrd wr5, [%1, #168]\n"
+ "wpackwss wr3, wr3, wr3\n"
+ " wldrd wr6, [%1, #192]\n"
+ " wmadds wr4, wr4, wr0\n" /* second stage, first half: results accumulate in wr4/wr5 */
+ " wldrd wr7, [%1, #200]\n"
+ " wmadds wr5, wr5, wr0\n"
+ " wldrd wr8, [%1, #224]\n"
+ " wmadds wr6, wr6, wr1\n"
+ " wldrd wr9, [%1, #232]\n"
+ " wmadds wr7, wr7, wr1\n"
+ " waddwss wr4, wr6, wr4\n"
+ " waddwss wr5, wr7, wr5\n"
+ " wmadds wr8, wr8, wr2\n"
+ "wldrd wr6, [%1, #256]\n"
+ " wmadds wr9, wr9, wr2\n"
+ "wldrd wr7, [%1, #264]\n"
+ "waddwss wr4, wr8, wr4\n"
+ " waddwss wr5, wr9, wr5\n"
+ "wmadds wr6, wr6, wr3\n"
+ "wmadds wr7, wr7, wr3\n"
+ "waddwss wr4, wr6, wr4\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n"
+ "wstrd wr4, [%3]\n" /* store the first four 32-bit results to out */
+ "wstrd wr5, [%3, #8]\n"
+ "\n"
+ "wldrd wr6, [%1, #176]\n"
+ "wldrd wr5, [%1, #184]\n"
+ "wmadds wr5, wr5, wr0\n" /* second stage, second half: results accumulate in wr0/wr5 */
+ "wldrd wr8, [%1, #208]\n"
+ "wmadds wr0, wr6, wr0\n" /* wr0..wr3 are overwritten here, after their last use as packed inputs */
+ "wldrd wr9, [%1, #216]\n"
+ "wmadds wr9, wr9, wr1\n"
+ "wldrd wr6, [%1, #240]\n"
+ "wmadds wr1, wr8, wr1\n"
+ "wldrd wr7, [%1, #248]\n"
+ "waddwss wr0, wr1, wr0\n"
+ "waddwss wr5, wr9, wr5\n"
+ "wmadds wr7, wr7, wr2\n"
+ "wldrd wr8, [%1, #272]\n"
+ "wmadds wr2, wr6, wr2\n"
+ "wldrd wr9, [%1, #280]\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr5, wr7, wr5\n"
+ "wmadds wr9, wr9, wr3\n"
+ "wmadds wr3, wr8, wr3\n"
+ "waddwss wr0, wr3, wr0\n"
+ "waddwss wr5, wr9, wr5\n"
+ "\n"
+ "wstrd wr0, [%3, #16]\n" /* store the remaining four 32-bit results */
+ "wstrd wr5, [%3, #24]\n"
+ :
+ : "r" (in), "r" (consts),
+ "r" (1 << (SBC_PROTO_FIXED8_SCALE - 1)), "r" (out),
+ "r" (SBC_PROTO_FIXED8_SCALE)
+ : "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+ "wr8", "wr9", "wr10", "wr11", "wr12", "wr13", "wr14", "wr15",
+ "wcgr0", "memory");
+}
+
+static inline void sbc_analyze_4b_4s_iwmmxt(int16_t *x, int32_t *out,
+ int out_stride)
+{
+ /* Analyze four blocks: inputs x + 12, x + 8, x + 4, x + 0, alternating the odd/even constant tables */
+ sbc_analyze_four_iwmmxt(x + 12, out, analysis_consts_fixed4_simd_odd);
+ out += out_stride; /* step to the next block's output */
+ sbc_analyze_four_iwmmxt(x + 8, out, analysis_consts_fixed4_simd_even);
+ out += out_stride;
+ sbc_analyze_four_iwmmxt(x + 4, out, analysis_consts_fixed4_simd_odd);
+ out += out_stride;
+ sbc_analyze_four_iwmmxt(x + 0, out, analysis_consts_fixed4_simd_even);
+}
+
+static inline void sbc_analyze_4b_8s_iwmmxt(int16_t *x, int32_t *out,
+ int out_stride)
+{
+ /* Analyze four blocks: inputs x + 24, x + 16, x + 8, x + 0, alternating the odd/even constant tables */
+ sbc_analyze_eight_iwmmxt(x + 24, out, analysis_consts_fixed8_simd_odd);
+ out += out_stride; /* step to the next block's output */
+ sbc_analyze_eight_iwmmxt(x + 16, out, analysis_consts_fixed8_simd_even);
+ out += out_stride;
+ sbc_analyze_eight_iwmmxt(x + 8, out, analysis_consts_fixed8_simd_odd);
+ out += out_stride;
+ sbc_analyze_eight_iwmmxt(x + 0, out, analysis_consts_fixed8_simd_even);
+}
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *state)
+{
+ state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_iwmmxt; /* 4-subband analysis */
+ state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_iwmmxt; /* 8-subband analysis */
+ state->implementation_info = "IWMMXT"; /* only the analysis hooks are replaced; other members are left untouched */
+}
+
+#endif
diff --git a/sbc/sbc_primitives_iwmmxt.h b/sbc/sbc_primitives_iwmmxt.h
new file mode 100644
index 0000000..827d811
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.h
@@ -0,0 +1,38 @@
+/*
+ *
+ * Bluetooth low-complexity, subband codec (SBC) library
+ *
+ * Based on sbc_primitives_mmx.c
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef __SBC_PRIMITIVES_IWMMXT_H
+#define __SBC_PRIMITIVES_IWMMXT_H
+
+#include "sbc_primitives.h"
+
+#if defined(__GNUC__) && defined(__IWMMXT__) && \
+ !defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15)
+
+#define SBC_BUILD_WITH_IWMMXT_SUPPORT /* tested by sbc_primitives.c and sbc_primitives_iwmmxt.c */
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *encoder_state); /* installs the iwMMXt primitives into the encoder state */
+
+#endif /* __GNUC__ && __IWMMXT__ && !SBC_HIGH_PRECISION && SCALE_OUT_BITS == 15 */
+
+#endif /* __SBC_PRIMITIVES_IWMMXT_H */
--
1.6.3.3
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH v3] Add iwmmxt optimization for sbc for pxa series cpu
2010-11-15 2:46 ` [PATCH v3] " Keith Mok
@ 2010-11-15 11:08 ` Siarhei Siamashka
2010-11-18 13:05 ` Siarhei Siamashka
0 siblings, 1 reply; 10+ messages in thread
From: Siarhei Siamashka @ 2010-11-15 11:08 UTC (permalink / raw)
To: Keith Mok; +Cc: linux-bluetooth
[-- Attachment #1: Type: Text/Plain, Size: 2142 bytes --]
On Monday 15 November 2010 04:46:25 Keith Mok wrote:
> > I sometimes use different indentation levels in such cases in order to
> > improve readability after instructions reordering, so that each
> > logically independent block of code has its own indentation level and it
> > is still easily visible
>
> > after instructions reordering. For example, with the original code:
> Thanks for the hints. I rearranged the code.
Thanks, now the assembly code looks ok to me. I also discovered that qemu
supports iwmmxt1 emulation just fine and also tried to test your optimizations
for correctness myself (with a script which tries different encoding parameters
for different audio samples and checks md5 checksums), no problems detected.
So if somebody else could check whether the other things are right (copyright
notices for example), then we are done with it.
> I removed the scale_factor optimization since from the result I
> tested, it shows little help in performance.
I guess after easily doubling performance by adding simd optimizations to the
sbc analysis filter, just roughly ~10% improvement (as measured for x86 and
arm neon) does not look particularly impressive anymore:
http://git.kernel.org/?p=bluetooth/bluez.git;a=commit;h=95465b816f0ce7f0ec10a183ce7ff0c6f83d86eb
http://git.kernel.org/?p=bluetooth/bluez.git;a=commit;h=d049a9a2aec2b518e04f11ef0ecc355db8237291
But I still think that every little bit helps. Did you also get something like
10% speedup, or was it even worse than that?
A bit more important in practice is the optimization for joint stereo scale
factors calculation (because it is typically used for A2DP). And it provided
almost 20% of performance improvement for arm neon:
http://git.kernel.org/?p=bluetooth/bluez.git;a=commit;h=e1ea3e76c72d56041c30b317818e8d7b5a0c7350
So 'sbc_calc_scalefactors_j_iwmmxt' may be a nice addition too, optimized
either as a whole for best performance (like in arm neon code), or just with
some small chunks of assembly like in 'sbc_calc_scalefactors_mmx' because it
is easier this way.
--
Best regards,
Siarhei Siamashka
[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 198 bytes --]
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3] Add iwmmxt optimization for sbc for pxa series cpu
2010-11-15 11:08 ` Siarhei Siamashka
@ 2010-11-18 13:05 ` Siarhei Siamashka
2010-11-18 13:31 ` Johan Hedberg
2010-11-18 13:33 ` [PATCH] " Keith Mok
0 siblings, 2 replies; 10+ messages in thread
From: Siarhei Siamashka @ 2010-11-18 13:05 UTC (permalink / raw)
To: Keith Mok; +Cc: linux-bluetooth
On Monday 15 November 2010 13:08:19 Siarhei Siamashka wrote:
> On Monday 15 November 2010 04:46:25 Keith Mok wrote:
> > > I sometimes use different indentation levels in such cases in order to
> > > improve readability after instructions reordering, so that each
> > > logically independent block of code has its own indentation level and
> > > it is still easily visible
> >
> > > after instructions reordering. For example, with the original code:
> > Thanks for the hints. I rearranged the code.
>
> Thanks, now the assembly code looks ok to me. I also discovered that qemu
> supports iwmmxt1 emulation just fine and also tried to test your
> optimizations for correctness myself (with a script which tries different
> encoding parameters for different audio samples and checks md5 checksums),
> no problems detected.
>
> So if somebody else could check whether the other things are right
> (copyright notices for example), then we are done with it.
As nobody else has stepped in, I guess it's still my responsibility to provide
some further guidance even though I'm a very infrequent contributor myself.
Hopefully somebody will correct me if I'm wrong.
So please
1. Make a final patch in such a form that can be pushed to git repository
without any modifications, it means that you need a clean commit message and
not just some text intermixed with the parts and quotations of discussion from
this mailing list.
2. "Signed-off-by" header is not needed for the userspace parts of bluez.
3. All files must have copyright notices, even a small one like
'sbc_primitives_iwmmxt.h'. And probably you should just replicate all the
copyright notices from the source files with sbc mmx optimizations and add your
own copyright on top.
Hopefully that should be enough to get your optimizations applied. Thanks.
--
Best regards,
Siarhei Siamashka
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3] Add iwmmxt optimization for sbc for pxa series cpu
2010-11-18 13:05 ` Siarhei Siamashka
@ 2010-11-18 13:31 ` Johan Hedberg
2010-11-18 13:33 ` [PATCH] " Keith Mok
1 sibling, 0 replies; 10+ messages in thread
From: Johan Hedberg @ 2010-11-18 13:31 UTC (permalink / raw)
To: Siarhei Siamashka; +Cc: Keith Mok, linux-bluetooth
Hi Siarhei,
On Thu, Nov 18, 2010, Siarhei Siamashka wrote:
> 1. Make a final patch in such a form that can be pushed to git repository
> without any modifications, it means that you need a clean commit message and
> not just some text intermixed with the parts and quotations of discussion from
> this mailing list.
> 2. "Signed-off-by" header is not needed for the userspace parts of bluez.
> 3. All files must have copyright notices, even a small one like
> 'sbc_primitives_iwmmxt.h'. And probably you should just replicate all the
> copyright notices from the source files with sbc mmx optimizations and add your
> own copyright on top.
>
> Hopefully that should be enough to get your optimizations applied. Thanks.
Yep, those things would be needed before pushing upstream. Thanks for
reminding me about this patch. I had actually forgotten about it.
Johan
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH] Add iwmmxt optimization for sbc for pxa series cpu
2010-11-18 13:05 ` Siarhei Siamashka
2010-11-18 13:31 ` Johan Hedberg
@ 2010-11-18 13:33 ` Keith Mok
2010-11-18 16:53 ` Johan Hedberg
1 sibling, 1 reply; 10+ messages in thread
From: Keith Mok @ 2010-11-18 13:33 UTC (permalink / raw)
To: Siarhei Siamashka; +Cc: linux-bluetooth
Add iwmmxt optimization for sbc for pxa series cpu.
Benchmarked on ARM PXA platform:
=== Before (4 bands) ====
$ time ./sbcenc_orig -s 4 long.au > /dev/null
real 0m 2.44s
user 0m 2.39s
sys 0m 0.05s
=== After (4 bands) ====
$ time ./sbcenc -s 4 long.au > /dev/null
real 0m 1.59s
user 0m 1.49s
sys 0m 0.10s
=== Before (8 bands) ====
$ time ./sbcenc_orig -s 8 long.au > /dev/null
real 0m 4.05s
user 0m 3.98s
sys 0m 0.07s
=== After (8 bands) ====
$ time ./sbcenc -s 8 long.au > /dev/null
real 0m 1.48s
user 0m 1.41s
sys 0m 0.06s
=== Before (a2dp usage) ====
$ time ./sbcenc_orig -b53 -s8 -j long.au > /dev/null
real 0m 4.51s
user 0m 4.41s
sys 0m 0.10s
=== After (a2dp usage) ====
$ time ./sbcenc -b53 -s8 -j long.au > /dev/null
real 0m 2.05s
user 0m 1.99s
sys 0m 0.06s
---
Makefile.am | 1 +
sbc/sbc_primitives.c | 4 +
sbc/sbc_primitives_iwmmxt.c | 304 +++++++++++++++++++++++++++++++++++++++++++
sbc/sbc_primitives_iwmmxt.h | 42 ++++++
4 files changed, 351 insertions(+), 0 deletions(-)
create mode 100644 sbc/sbc_primitives_iwmmxt.c
create mode 100644 sbc/sbc_primitives_iwmmxt.h
diff --git a/Makefile.am b/Makefile.am
index da308a7..03a9bf2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -65,6 +65,7 @@ noinst_LTLIBRARIES += sbc/libsbc.la
sbc_libsbc_la_SOURCES = sbc/sbc.h sbc/sbc.c sbc/sbc_math.h sbc/sbc_tables.h \
sbc/sbc_primitives.h sbc/sbc_primitives.c \
sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
+ sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
sbc/sbc_primitives_armv6.h sbc/sbc_primitives_armv6.c
diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index f87fb5a..ad780d0 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -33,6 +33,7 @@
#include "sbc_primitives.h"
#include "sbc_primitives_mmx.h"
+#include "sbc_primitives_iwmmxt.h"
#include "sbc_primitives_neon.h"
#include "sbc_primitives_armv6.h"
@@ -544,6 +545,9 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
#ifdef SBC_BUILD_WITH_ARMV6_SUPPORT
sbc_init_primitives_armv6(state);
#endif
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+ sbc_init_primitives_iwmmxt(state);
+#endif
#ifdef SBC_BUILD_WITH_NEON_SUPPORT
sbc_init_primitives_neon(state);
#endif
diff --git a/sbc/sbc_primitives_iwmmxt.c b/sbc/sbc_primitives_iwmmxt.c
new file mode 100644
index 0000000..213967e
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.c
@@ -0,0 +1,304 @@
+/*
+ *
+ * Bluetooth low-complexity, subband codec (SBC) library
+ *
+ * Copyright (C) 2010 Keith Mok <ek9852@gmail.com>
+ * Copyright (C) 2008-2010 Nokia Corporation
+ * Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com>
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <stdint.h>
+#include <limits.h>
+#include "sbc.h"
+#include "sbc_math.h"
+#include "sbc_tables.h"
+
+#include "sbc_primitives_iwmmxt.h"
+
+/*
+ * IWMMXT optimizations
+ */
+
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+static inline void sbc_analyze_four_iwmmxt(const int16_t *in, int32_t *out,
+ const FIXED_T *consts)
+{ /* reads 80 bytes at in and 112 bytes at consts; stores 16 bytes to out */
+ asm volatile (
+ "wldrd wr0, [%0]\n"
+ "tbcstw wr4, %2\n" /* wr4 = operand 2 in both 32-bit words */
+ "wldrd wr2, [%1]\n"
+ "wldrd wr1, [%0, #8]\n"
+ "wldrd wr3, [%1, #8]\n"
+ "wmadds wr0, wr2, wr0\n" /* 16-bit products of in and consts */
+ " wldrd wr6, [%0, #16]\n"
+ "wmadds wr1, wr3, wr1\n"
+ " wldrd wr7, [%0, #24]\n"
+ "waddwss wr0, wr0, wr4\n" /* seed accumulators wr0/wr1 with wr4 */
+ " wldrd wr8, [%1, #16]\n"
+ "waddwss wr1, wr1, wr4\n"
+ " wldrd wr9, [%1, #24]\n"
+ " wmadds wr6, wr8, wr6\n"
+ " wldrd wr2, [%0, #32]\n"
+ " wmadds wr7, wr9, wr7\n"
+ " wldrd wr3, [%0, #40]\n"
+ " waddwss wr0, wr6, wr0\n" /* accumulate into wr0/wr1 */
+ " wldrd wr4, [%1, #32]\n" /* wr4 is reused for data from here on */
+ " waddwss wr1, wr7, wr1\n"
+ " wldrd wr5, [%1, #40]\n"
+ " wmadds wr2, wr4, wr2\n"
+ "wldrd wr6, [%0, #48]\n"
+ " wmadds wr3, wr5, wr3\n"
+ "wldrd wr7, [%0, #56]\n"
+ " waddwss wr0, wr2, wr0\n"
+ "wldrd wr8, [%1, #48]\n"
+ " waddwss wr1, wr3, wr1\n"
+ "wldrd wr9, [%1, #56]\n"
+ "wmadds wr6, wr8, wr6\n"
+ " wldrd wr2, [%0, #64]\n"
+ "wmadds wr7, wr9, wr7\n"
+ " wldrd wr3, [%0, #72]\n"
+ "waddwss wr0, wr6, wr0\n"
+ " wldrd wr4, [%1, #64]\n"
+ "waddwss wr1, wr7, wr1\n"
+ " wldrd wr5, [%1, #72]\n"
+ " wmadds wr2, wr4, wr2\n"
+ "tmcr wcgr0, %4\n" /* wcgr0 = shift count (operand 4) */
+ " wmadds wr3, wr5, wr3\n"
+ " waddwss wr0, wr2, wr0\n"
+ " waddwss wr1, wr3, wr1\n"
+ "\n" /* first stage done: shift right, then pack to 16 bit */
+ "wsrawg wr0, wr0, wcgr0\n"
+ " wldrd wr4, [%1, #80]\n"
+ "wsrawg wr1, wr1, wcgr0\n"
+ " wldrd wr5, [%1, #88]\n"
+ "wpackwss wr0, wr0, wr0\n"
+ " wldrd wr6, [%1, #96]\n"
+ "wpackwss wr1, wr1, wr1\n"
+ "wmadds wr2, wr5, wr0\n"
+ " wldrd wr7, [%1, #104]\n"
+ "wmadds wr0, wr4, wr0\n"
+ "\n" /* second stage: packed values times consts at #80..#104 */
+ " wmadds wr3, wr7, wr1\n"
+ " wmadds wr1, wr6, wr1\n"
+ " waddwss wr2, wr3, wr2\n"
+ " waddwss wr0, wr1, wr0\n"
+ "\n" /* write the four 32-bit results */
+ "wstrd wr0, [%3]\n"
+ "wstrd wr2, [%3, #8]\n"
+ : /* no outputs: results are stored through %3, hence "memory" */
+ : "r" (in), "r" (consts),
+ "r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
+ "r" (SBC_PROTO_FIXED4_SCALE)
+ : "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+ "wr8", "wr9", "wcgr0", "memory");
+}
+
+static inline void sbc_analyze_eight_iwmmxt(const int16_t *in, int32_t *out,
+ const FIXED_T *consts)
+{ /* reads 160 bytes at in and 288 bytes at consts; stores 32 bytes to out */
+ asm volatile (
+ "wldrd wr0, [%0]\n"
+ "tbcstw wr15, %2\n" /* wr15 = operand 2 in both 32-bit words */
+ "wldrd wr1, [%0, #8]\n"
+ "wldrd wr2, [%0, #16]\n"
+ "wldrd wr3, [%0, #24]\n"
+ "wldrd wr4, [%1]\n"
+ "wldrd wr5, [%1, #8]\n"
+ "wldrd wr6, [%1, #16]\n"
+ "wldrd wr7, [%1, #24]\n"
+ "wmadds wr0, wr0, wr4\n" /* 16-bit products of in and consts */
+ " wldrd wr8, [%1, #32]\n"
+ "wmadds wr1, wr1, wr5\n"
+ " wldrd wr9, [%1, #40]\n"
+ "wmadds wr2, wr2, wr6\n"
+ " wldrd wr10, [%1, #48]\n"
+ "wmadds wr3, wr3, wr7\n"
+ " wldrd wr11, [%1, #56]\n"
+ "waddwss wr0, wr0, wr15\n" /* seed accumulators wr0..wr3 with wr15 */
+ " wldrd wr4, [%0, #32]\n"
+ "waddwss wr1, wr1, wr15\n"
+ " wldrd wr5, [%0, #40]\n"
+ "waddwss wr2, wr2, wr15\n"
+ " wldrd wr6, [%0, #48]\n"
+ "waddwss wr3, wr3, wr15\n"
+ " wldrd wr7, [%0, #56]\n"
+ " wmadds wr4, wr4, wr8\n"
+ " wldrd wr12, [%0, #64]\n"
+ " wmadds wr5, wr5, wr9\n"
+ " wldrd wr13, [%0, #72]\n"
+ " wmadds wr6, wr6, wr10\n"
+ " wldrd wr14, [%0, #80]\n"
+ " wmadds wr7, wr7, wr11\n"
+ " wldrd wr15, [%0, #88]\n" /* wr15 is reused for data from here on */
+ " waddwss wr0, wr4, wr0\n" /* accumulate into wr0..wr3 */
+ " wldrd wr8, [%1, #64]\n"
+ " waddwss wr1, wr5, wr1\n"
+ " wldrd wr9, [%1, #72]\n"
+ " waddwss wr2, wr6, wr2\n"
+ " wldrd wr10, [%1, #80]\n"
+ " waddwss wr3, wr7, wr3\n"
+ " wldrd wr11, [%1, #88]\n"
+ " wmadds wr12, wr12, wr8\n"
+ "wldrd wr4, [%0, #96]\n"
+ " wmadds wr13, wr13, wr9\n"
+ "wldrd wr5, [%0, #104]\n"
+ " wmadds wr14, wr14, wr10\n"
+ "wldrd wr6, [%0, #112]\n"
+ " wmadds wr15, wr15, wr11\n"
+ "wldrd wr7, [%0, #120]\n"
+ " waddwss wr0, wr12, wr0\n"
+ "wldrd wr8, [%1, #96]\n"
+ " waddwss wr1, wr13, wr1\n"
+ "wldrd wr9, [%1, #104]\n"
+ " waddwss wr2, wr14, wr2\n"
+ "wldrd wr10, [%1, #112]\n"
+ " waddwss wr3, wr15, wr3\n"
+ "wldrd wr11, [%1, #120]\n"
+ "wmadds wr4, wr4, wr8\n"
+ " wldrd wr12, [%0, #128]\n"
+ "wmadds wr5, wr5, wr9\n"
+ " wldrd wr13, [%0, #136]\n"
+ "wmadds wr6, wr6, wr10\n"
+ " wldrd wr14, [%0, #144]\n"
+ "wmadds wr7, wr7, wr11\n"
+ " wldrd wr15, [%0, #152]\n"
+ "waddwss wr0, wr4, wr0\n"
+ " wldrd wr8, [%1, #128]\n"
+ "waddwss wr1, wr5, wr1\n"
+ " wldrd wr9, [%1, #136]\n"
+ "waddwss wr2, wr6, wr2\n"
+ " wldrd wr10, [%1, #144]\n"
+ " waddwss wr3, wr7, wr3\n"
+ " wldrd wr11, [%1, #152]\n"
+ " wmadds wr12, wr12, wr8\n"
+ "tmcr wcgr0, %4\n" /* wcgr0 = shift count (operand 4) */
+ " wmadds wr13, wr13, wr9\n"
+ " wmadds wr14, wr14, wr10\n"
+ " wmadds wr15, wr15, wr11\n"
+ " waddwss wr0, wr12, wr0\n"
+ " waddwss wr1, wr13, wr1\n"
+ " waddwss wr2, wr14, wr2\n"
+ " waddwss wr3, wr15, wr3\n"
+ "\n" /* first stage done: shift the four accumulators right */
+ "wsrawg wr0, wr0, wcgr0\n"
+ "wsrawg wr1, wr1, wcgr0\n"
+ "wsrawg wr2, wr2, wcgr0\n"
+ "wsrawg wr3, wr3, wcgr0\n"
+ "\n" /* pack to 16 bit; second stage uses consts from #160 on */
+ "wpackwss wr0, wr0, wr0\n"
+ "wpackwss wr1, wr1, wr1\n"
+ " wldrd wr4, [%1, #160]\n"
+ "wpackwss wr2, wr2, wr2\n"
+ " wldrd wr5, [%1, #168]\n"
+ "wpackwss wr3, wr3, wr3\n"
+ " wldrd wr6, [%1, #192]\n"
+ " wmadds wr4, wr4, wr0\n"
+ " wldrd wr7, [%1, #200]\n"
+ " wmadds wr5, wr5, wr0\n"
+ " wldrd wr8, [%1, #224]\n"
+ " wmadds wr6, wr6, wr1\n"
+ " wldrd wr9, [%1, #232]\n"
+ " wmadds wr7, wr7, wr1\n"
+ " waddwss wr4, wr6, wr4\n"
+ " waddwss wr5, wr7, wr5\n"
+ " wmadds wr8, wr8, wr2\n"
+ "wldrd wr6, [%1, #256]\n"
+ " wmadds wr9, wr9, wr2\n"
+ "wldrd wr7, [%1, #264]\n"
+ "waddwss wr4, wr8, wr4\n"
+ " waddwss wr5, wr9, wr5\n"
+ "wmadds wr6, wr6, wr3\n"
+ "wmadds wr7, wr7, wr3\n"
+ "waddwss wr4, wr6, wr4\n"
+ "waddwss wr5, wr7, wr5\n"
+ "\n" /* first four 32-bit results */
+ "wstrd wr4, [%3]\n"
+ "wstrd wr5, [%3, #8]\n"
+ "\n" /* remaining consts columns give the last four results */
+ "wldrd wr6, [%1, #176]\n"
+ "wldrd wr5, [%1, #184]\n"
+ "wmadds wr5, wr5, wr0\n"
+ "wldrd wr8, [%1, #208]\n"
+ "wmadds wr0, wr6, wr0\n"
+ "wldrd wr9, [%1, #216]\n"
+ "wmadds wr9, wr9, wr1\n"
+ "wldrd wr6, [%1, #240]\n"
+ "wmadds wr1, wr8, wr1\n"
+ "wldrd wr7, [%1, #248]\n"
+ "waddwss wr0, wr1, wr0\n"
+ "waddwss wr5, wr9, wr5\n"
+ "wmadds wr7, wr7, wr2\n"
+ "wldrd wr8, [%1, #272]\n"
+ "wmadds wr2, wr6, wr2\n"
+ "wldrd wr9, [%1, #280]\n"
+ "waddwss wr0, wr2, wr0\n"
+ "waddwss wr5, wr7, wr5\n"
+ "wmadds wr9, wr9, wr3\n"
+ "wmadds wr3, wr8, wr3\n"
+ "waddwss wr0, wr3, wr0\n"
+ "waddwss wr5, wr9, wr5\n"
+ "\n" /* last four 32-bit results */
+ "wstrd wr0, [%3, #16]\n"
+ "wstrd wr5, [%3, #24]\n"
+ : /* no outputs: results are stored through %3, hence "memory" */
+ : "r" (in), "r" (consts),
+ "r" (1 << (SBC_PROTO_FIXED8_SCALE - 1)), "r" (out),
+ "r" (SBC_PROTO_FIXED8_SCALE)
+ : "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+ "wr8", "wr9", "wr10", "wr11", "wr12", "wr13", "wr14", "wr15",
+ "wcgr0", "memory");
+}
+
+static inline void sbc_analyze_4b_4s_iwmmxt(int16_t *x, int32_t *out,
+ int out_stride)
+{
+ /* Four blocks, x + 12 down to x + 0; odd/even tables alternate */
+ sbc_analyze_four_iwmmxt(x + 12, out, analysis_consts_fixed4_simd_odd);
+ out += out_stride; /* next block's results go out_stride int32s on */
+ sbc_analyze_four_iwmmxt(x + 8, out, analysis_consts_fixed4_simd_even);
+ out += out_stride;
+ sbc_analyze_four_iwmmxt(x + 4, out, analysis_consts_fixed4_simd_odd);
+ out += out_stride;
+ sbc_analyze_four_iwmmxt(x + 0, out, analysis_consts_fixed4_simd_even);
+}
+
+static inline void sbc_analyze_4b_8s_iwmmxt(int16_t *x, int32_t *out,
+ int out_stride)
+{
+ /* Four blocks, x + 24 down to x + 0; odd/even tables alternate */
+ sbc_analyze_eight_iwmmxt(x + 24, out, analysis_consts_fixed8_simd_odd);
+ out += out_stride; /* next block's results go out_stride int32s on */
+ sbc_analyze_eight_iwmmxt(x + 16, out, analysis_consts_fixed8_simd_even);
+ out += out_stride;
+ sbc_analyze_eight_iwmmxt(x + 8, out, analysis_consts_fixed8_simd_odd);
+ out += out_stride;
+ sbc_analyze_eight_iwmmxt(x + 0, out, analysis_consts_fixed8_simd_even);
+}
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *state)
+{ /* install the IWMMXT analysis routines into the encoder state */
+ state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_iwmmxt;
+ state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_iwmmxt;
+ state->implementation_info = "IWMMXT"; /* label for this code path */
+}
+
+#endif
diff --git a/sbc/sbc_primitives_iwmmxt.h b/sbc/sbc_primitives_iwmmxt.h
new file mode 100644
index 0000000..b535e68
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.h
@@ -0,0 +1,42 @@
+/*
+ *
+ * Bluetooth low-complexity, subband codec (SBC) library
+ *
+ * Copyright (C) 2010 Keith Mok <ek9852@gmail.com>
+ * Copyright (C) 2008-2010 Nokia Corporation
+ * Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com>
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef __SBC_PRIMITIVES_IWMMXT_H
+#define __SBC_PRIMITIVES_IWMMXT_H
+
+#include "sbc_primitives.h"
+
+#if defined(__GNUC__) && defined(__IWMMXT__) && \
+ !defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15)
+
+#define SBC_BUILD_WITH_IWMMXT_SUPPORT /* tested by sbc_primitives*.c */
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *encoder_state);
+
+#endif /* IWMMXT build conditions */
+
+#endif /* __SBC_PRIMITIVES_IWMMXT_H */
--
1.6.3.3
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH] Add iwmmxt optimization for sbc for pxa series cpu
2010-11-18 13:33 ` [PATCH] " Keith Mok
@ 2010-11-18 16:53 ` Johan Hedberg
0 siblings, 0 replies; 10+ messages in thread
From: Johan Hedberg @ 2010-11-18 16:53 UTC (permalink / raw)
To: Keith Mok; +Cc: Siarhei Siamashka, linux-bluetooth
Hi Keith,
On Thu, Nov 18, 2010, Keith Mok wrote:
> Add iwmmxt optimization for sbc for pxa series cpu.
>
> Benchmarked on ARM PXA platform:
> === Before (4 bands) ====
> $ time ./sbcenc_orig -s 4 long.au > /dev/null
> real 0m 2.44s
> user 0m 2.39s
> sys 0m 0.05s
> === After (4 bands) ====
> $ time ./sbcenc -s 4 long.au > /dev/null
> real 0m 1.59s
> user 0m 1.49s
> sys 0m 0.10s
>
> === Before (8 bands) ====
> $ time ./sbcenc_orig -s 8 long.au > /dev/null
> real 0m 4.05s
> user 0m 3.98s
> sys 0m 0.07s
> === After (8 bands) ====
> $ time ./sbcenc -s 8 long.au > /dev/null
> real 0m 1.48s
> user 0m 1.41s
> sys 0m 0.06s
>
> === Before (a2dp usage) ====
> $ time ./sbcenc_orig -b53 -s8 -j long.au > /dev/null
> real 0m 4.51s
> user 0m 4.41s
> sys 0m 0.10s
> === After (a2dp usage) ====
> $ time ./sbcenc -b53 -s8 -j long.au > /dev/null
> real 0m 2.05s
> user 0m 1.99s
> sys 0m 0.06s
>
> ---
> Makefile.am | 1 +
> sbc/sbc_primitives.c | 4 +
> sbc/sbc_primitives_iwmmxt.c | 304 +++++++++++++++++++++++++++++++++++++++++++
> sbc/sbc_primitives_iwmmxt.h | 42 ++++++
> 4 files changed, 351 insertions(+), 0 deletions(-)
> create mode 100644 sbc/sbc_primitives_iwmmxt.c
> create mode 100644 sbc/sbc_primitives_iwmmxt.h
Pushed upstream. Thanks.
Johan
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2010-11-18 16:53 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-11-11 8:05 [PATCH] Add iwmmxt optimization for sbc for pxa series cpu Keith Mok
2010-11-11 11:46 ` Siarhei Siamashka
2010-11-12 7:35 ` [PATCH v2] " Keith Mok
2010-11-12 13:22 ` Siarhei Siamashka
2010-11-15 2:46 ` [PATCH v3] " Keith Mok
2010-11-15 11:08 ` Siarhei Siamashka
2010-11-18 13:05 ` Siarhei Siamashka
2010-11-18 13:31 ` Johan Hedberg
2010-11-18 13:33 ` [PATCH] " Keith Mok
2010-11-18 16:53 ` Johan Hedberg
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).