[PATCH] Add iwmmxt optimization for sbc for pxa series cpu

linux-bluetooth.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] Add iwmmxt optimization for sbc for pxa series cpu
@ 2010-11-11  8:05 Keith Mok
  2010-11-11 11:46 ` Siarhei Siamashka
  0 siblings, 1 reply; 10+ messages in thread
From: Keith Mok @ 2010-11-11  8:05 UTC (permalink / raw)
  To: linux-bluetooth

Hi all,

This patch add iwmmxt (Intel wireless mmx, pxa platform) optimzation
for sbc, based on the mmx code.
Have verified the encoded result against the mmx generated one.

Keith

Signed-off-by: Keith Mok <ek9852@gmail.com>
---
 Makefile.am                 |    1 +
 sbc/sbc_primitives.c        |    4 +
 sbc/sbc_primitives_iwmmxt.c |  361 +++++++++++++++++++++++++++++++++++++++++++
 sbc/sbc_primitives_iwmmxt.h |   38 +++++
 4 files changed, 404 insertions(+), 0 deletions(-)
 create mode 100644 sbc/sbc_primitives_iwmmxt.c
 create mode 100644 sbc/sbc_primitives_iwmmxt.h

diff --git a/Makefile.am b/Makefile.am
index da308a7..03a9bf2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -65,6 +65,7 @@ noinst_LTLIBRARIES += sbc/libsbc.la
 sbc_libsbc_la_SOURCES = sbc/sbc.h sbc/sbc.c sbc/sbc_math.h sbc/sbc_tables.h \
 			sbc/sbc_primitives.h sbc/sbc_primitives.c \
 			sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
+			sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
 			sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
 			sbc/sbc_primitives_armv6.h sbc/sbc_primitives_armv6.c

diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index f87fb5a..ad780d0 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -33,6 +33,7 @@

 #include "sbc_primitives.h"
 #include "sbc_primitives_mmx.h"
+#include "sbc_primitives_iwmmxt.h"
 #include "sbc_primitives_neon.h"
 #include "sbc_primitives_armv6.h"

@@ -544,6 +545,9 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
 #ifdef SBC_BUILD_WITH_ARMV6_SUPPORT
 	sbc_init_primitives_armv6(state);
 #endif
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+	sbc_init_primitives_iwmmxt(state);
+#endif
 #ifdef SBC_BUILD_WITH_NEON_SUPPORT
 	sbc_init_primitives_neon(state);
 #endif
diff --git a/sbc/sbc_primitives_iwmmxt.c b/sbc/sbc_primitives_iwmmxt.c
new file mode 100644
index 0000000..4825998
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.c
@@ -0,0 +1,361 @@
+/*
+ *
+ *  Bluetooth low-complexity, subband codec (SBC) library
+ *
+ *  Copyright (C) 2010 Keith Mok <ek9852@gmail.com>
+ *  Based on sbc_primitives_mmx.c
+ *
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include <stdint.h>
+#include <limits.h>
+#include "sbc.h"
+#include "sbc_math.h"
+#include "sbc_tables.h"
+
+#include "sbc_primitives_iwmmxt.h"
+
+/*
+ * IWMMXT optimizations
+ */
+
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+static inline void sbc_analyze_four_iwmmxt(const int16_t *in, int32_t *out,
+					const FIXED_T *consts)
+{
+	asm volatile (
+		"tbcstw       wr4, %2\n"
+		"wldrd        wr0, [%0]\n"
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr2, [%1]\n"
+		"wldrd        wr3, [%1, #8]\n"
+		"wmadds       wr0, wr2, wr0\n"
+		"wmadds       wr1, wr3, wr1\n"
+		"waddwss      wr0, wr0, wr4\n"
+		"waddwss      wr1, wr1, wr4\n"
+		"\n"
+		"wldrd        wr2, [%0, #16]\n"
+		"wldrd        wr3, [%0, #24]\n"
+		"wldrd        wr4, [%1, #16]\n"
+		"wldrd        wr5, [%1, #24]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"\n"
+		"wldrd        wr2, [%0, #32]\n"
+		"wldrd        wr3, [%0, #40]\n"
+		"wldrd        wr4, [%1, #32]\n"
+		"wldrd        wr5, [%1, #40]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"\n"
+		"wldrd        wr2, [%0, #48]\n"
+		"wldrd        wr3, [%0, #56]\n"
+		"wldrd        wr4, [%1, #48]\n"
+		"wldrd        wr5, [%1, #56]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"\n"
+		"wldrd        wr2, [%0, #64]\n"
+		"wldrd        wr3, [%0, #72]\n"
+		"wldrd        wr4, [%1, #64]\n"
+		"wldrd        wr5, [%1, #72]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"\n"
+		"tmcr       wcgr0, %4\n"
+		"wsrawg       wr0, wr0, wcgr0\n"
+		"wsrawg       wr1, wr1, wcgr0\n"
+		"wpackwss     wr0, wr0, wr0\n"
+		"wpackwss     wr1, wr1, wr1\n"
+		"\n"
+		"wldrd        wr4, [%1, #80]\n"
+		"wldrd        wr5, [%1, #88]\n"
+		"wldrd        wr6, [%1, #96]\n"
+		"wldrd        wr7, [%1, #104]\n"
+		"wmadds       wr2, wr5, wr0\n"
+		"wmadds       wr0, wr4, wr0\n"
+		"\n"
+		"wmadds       wr3, wr7, wr1\n"
+		"wmadds       wr1, wr6, wr1\n"
+		"waddwss      wr0, wr1, wr0\n"
+		"waddwss      wr2, wr3, wr2\n"
+		"\n"
+		"wstrd        wr0, [%3]\n"
+		"wstrd        wr2, [%3, #8]\n"
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED4_SCALE)
+		: "memory");
+}
+
+static inline void sbc_analyze_eight_iwmmxt(const int16_t *in, int32_t *out,
+							const FIXED_T *consts)
+{
+	asm volatile (
+		"tbcstw       wr8, %2\n"
+		"wldrd        wr0, [%0]\n"
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr2, [%0, #16]\n"
+		"wldrd        wr3, [%0, #24]\n"
+		"wldrd        wr4, [%1]\n"
+		"wldrd        wr5, [%1, #8]\n"
+		"wldrd        wr6, [%1, #16]\n"
+		"wldrd        wr7, [%1, #24]\n"
+		"wmadds       wr0, wr0, wr4\n"
+		"wmadds       wr1, wr1, wr5\n"
+		"wmadds       wr2, wr2, wr6\n"
+		"wmadds       wr3, wr3, wr7\n"
+		"waddwss      wr0, wr0, wr8\n"
+		"waddwss      wr1, wr1, wr8\n"
+		"waddwss      wr2, wr2, wr8\n"
+		"waddwss      wr3, wr3, wr8\n"
+		"\n"
+		"wldrd        wr4, [%0, #32]\n"
+		"wldrd        wr5, [%0, #40]\n"
+		"wldrd        wr6, [%0, #48]\n"
+		"wldrd        wr7, [%0, #56]\n"
+		"wldrd        wr8, [%1, #32]\n"
+		"wldrd        wr9, [%1, #40]\n"
+		"wldrd       wr10, [%1, #48]\n"
+		"wldrd       wr11, [%1, #56]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"\n"
+		"wldrd        wr4, [%0, #64]\n"
+		"wldrd        wr5, [%0, #72]\n"
+		"wldrd        wr6, [%0, #80]\n"
+		"wldrd        wr7, [%0, #88]\n"
+		"wldrd        wr8, [%1, #64]\n"
+		"wldrd        wr9, [%1, #72]\n"
+		"wldrd       wr10, [%1, #80]\n"
+		"wldrd       wr11, [%1, #88]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"\n"
+		"wldrd        wr4, [%0, #96]\n"
+		"wldrd        wr5, [%0, #104]\n"
+		"wldrd        wr6, [%0, #112]\n"
+		"wldrd        wr7, [%0, #120]\n"
+		"wldrd        wr8, [%1, #96]\n"
+		"wldrd        wr9, [%1, #104]\n"
+		"wldrd       wr10, [%1, #112]\n"
+		"wldrd       wr11, [%1, #120]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"\n"
+		"wldrd        wr4, [%0, #128]\n"
+		"wldrd        wr5, [%0, #136]\n"
+		"wldrd        wr6, [%0, #144]\n"
+		"wldrd        wr7, [%0, #152]\n"
+		"wldrd        wr8, [%1, #128]\n"
+		"wldrd        wr9, [%1, #136]\n"
+		"wldrd       wr10, [%1, #144]\n"
+		"wldrd       wr11, [%1, #152]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"\n"
+		"tmcr       wcgr0, %4\n"
+		"wsrawg       wr0, wr0, wcgr0\n"
+		"wsrawg       wr1, wr1, wcgr0\n"
+		"wsrawg       wr2, wr2, wcgr0\n"
+		"wsrawg       wr3, wr3, wcgr0\n"
+		"\n"
+		"wpackwss     wr0, wr0, wr0\n"
+		"wpackwss     wr1, wr1, wr1\n"
+		"wpackwss     wr2, wr2, wr2\n"
+		"wpackwss     wr3, wr3, wr3\n"
+		"\n"
+		"wldrd        wr4, [%1, #160]\n"
+		"wldrd        wr5, [%1, #168]\n"
+		"wmadds       wr4, wr4, wr0\n"
+		"wmadds       wr5, wr5, wr0\n"
+		"\n"
+		"wldrd        wr6, [%1, #192]\n"
+		"wldrd        wr7, [%1, #200]\n"
+		"wmadds       wr6, wr6, wr1\n"
+		"wmadds       wr7, wr7, wr1\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wldrd        wr6, [%1, #224]\n"
+		"wldrd        wr7, [%1, #232]\n"
+		"wmadds       wr6, wr6, wr2\n"
+		"wmadds       wr7, wr7, wr2\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wldrd        wr6, [%1, #256]\n"
+		"wldrd        wr7, [%1, #264]\n"
+		"wmadds       wr6, wr6, wr3\n"
+		"wmadds       wr7, wr7, wr3\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wstrd        wr4, [%3]\n"
+		"wstrd        wr5, [%3, #8]\n"
+		"\n"
+		"wldrd        wr4, [%1, #176]\n"
+		"wldrd        wr5, [%1, #184]\n"
+		"wmadds       wr5, wr5, wr0\n"
+		"wmadds       wr0, wr4, wr0\n"
+		"\n"
+		"wldrd        wr4, [%1, #208]\n"
+		"wldrd        wr7, [%1, #216]\n"
+		"wmadds       wr7, wr7, wr1\n"
+		"wmadds       wr1, wr4, wr1\n"
+		"waddwss      wr0, wr1, wr0\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wldrd        wr4, [%1, #240]\n"
+		"wldrd        wr7, [%1, #248]\n"
+		"wmadds       wr7, wr7, wr2\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wldrd        wr4, [%1, #272]\n"
+		"wldrd        wr7, [%1, #280]\n"
+		"wmadds       wr7, wr7, wr3\n"
+		"wmadds       wr3, wr4, wr3\n"
+		"waddwss      wr0, wr3, wr0\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wstrd        wr0, [%3, #16]\n"
+		"wstrd        wr5, [%3, #24]\n"
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED8_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED8_SCALE)
+		: "memory");
+}
+
+static inline void sbc_analyze_4b_4s_iwmmxt(int16_t *x, int32_t *out,
+						int out_stride)
+{
+	/* Analyze blocks */
+	sbc_analyze_four_iwmmxt(x + 12, out, analysis_consts_fixed4_simd_odd);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 8, out, analysis_consts_fixed4_simd_even);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 4, out, analysis_consts_fixed4_simd_odd);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 0, out, analysis_consts_fixed4_simd_even);
+}
+
+static inline void sbc_analyze_4b_8s_iwmmxt(int16_t *x, int32_t *out,
+						int out_stride)
+{
+	/* Analyze blocks */
+	sbc_analyze_eight_iwmmxt(x + 24, out, analysis_consts_fixed8_simd_odd);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 16, out, analysis_consts_fixed8_simd_even);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 8, out, analysis_consts_fixed8_simd_odd);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 0, out, analysis_consts_fixed8_simd_even);
+}
+
+static void sbc_calc_scalefactors_iwmmxt(
+	int32_t sb_sample_f[16][2][8],
+	uint32_t scale_factor[2][8],
+	int blocks, int channels, int subbands)
+{
+	int ch, sb;
+	intptr_t blk;
+	for (ch = 0; ch < channels; ch++) {
+		for (sb = 0; sb < subbands; sb += 2) {
+			int b;
+			blk = &sb_sample_f[0][ch][sb];
+			b = blocks;
+			asm volatile (
+				"tbcstw       wr0, %4\n"
+			"1:\n"
+				"wldrd        wr1, [%0], %2\n"
+				"wxor         wr2, wr2, wr2\n"
+				"wcmpgtsw     wr3, wr1, wr2\n"
+				"waddwss      wr1, wr1, wr3\n"
+				"wcmpgtsw     wr2, wr2, wr1\n"
+				"wxor         wr1, wr1, wr2\n"
+
+				"wor          wr0, wr0, wr1\n"
+
+				"subs         %1, %1, #1\n"
+				"bne          1b\n"
+
+				"tmrrc        %0, %1, wr0\n"
+				"clz          %0, %0\n"
+				"rsb          %0, %0, %5\n"
+				"str          %0, [%3]\n"
+
+				"clz          %1, %1\n"
+				"rsb          %1, %1, %5\n"
+				"str          %1, [%3, #4]\n"
+			: "+&r" (blk), "+&r" (b)
+			: "i" ((char *) &sb_sample_f[1][0][0] -
+				(char *) &sb_sample_f[0][0][0]),
+				"r" (&scale_factor[ch][sb]),
+				"r" (1 << SCALE_OUT_BITS),
+				"i" (SCALE_OUT_BITS+1)
+			: "memory");
+		}
+	}
+}
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *state)
+{
+	state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_iwmmxt;
+	state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_iwmmxt;
+	state->sbc_calc_scalefactors = sbc_calc_scalefactors_iwmmxt;
+	state->implementation_info = "IWMMXT";
+}
+
+#endif
diff --git a/sbc/sbc_primitives_iwmmxt.h b/sbc/sbc_primitives_iwmmxt.h
new file mode 100644
index 0000000..827d811
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.h
@@ -0,0 +1,38 @@
+/*
+ *
+ *  Bluetooth low-complexity, subband codec (SBC) library
+ *
+ *  Based on sbc_primitives_mmx.c
+ *
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __SBC_PRIMITIVES_IWMMXT_H
+#define __SBC_PRIMITIVES_IWMMXT_H
+
+#include "sbc_primitives.h"
+
+#if defined(__GNUC__) && defined(__IWMMXT__) && \
+		!defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15)
+
+#define SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *encoder_state);
+
+#endif
+
+#endif
-- 
1.6.3.3

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] Add iwmmxt optimization for sbc for pxa series cpu
  2010-11-11  8:05 [PATCH] Add iwmmxt optimization for sbc for pxa series cpu Keith Mok
@ 2010-11-11 11:46 ` Siarhei Siamashka
  2010-11-12  7:35   ` [PATCH v2] " Keith Mok
  0 siblings, 1 reply; 10+ messages in thread
From: Siarhei Siamashka @ 2010-11-11 11:46 UTC (permalink / raw)
  To: Keith Mok; +Cc: linux-bluetooth

On Thursday 11 November 2010 10:05:46 Keith Mok wrote:
> This patch add iwmmxt (Intel wireless mmx, pxa platform) optimzation
> for sbc, based on the mmx code.
> Have verified the encoded result against the mmx generated one.

Nice, I guess it should provide a noticeable performance improvement on this
hardware.

Did you run some benchmarks with these optimizations to measure how much they
are helping? The most interesting numbers are for the "44100kHz audio
with bitpool set to 53, 8 subbands, joint stereo" case, which is typically
used for A2DP. This can be done by running:
    $ time ./sbcenc -b53 -s8 -j test.au > /dev/null

In my opinion, commit messages for the performance patches are more descriptive
in the following format: 
http://git.kernel.org/?p=bluetooth/bluez.git;a=commit;h=e80454d08b4ec098024ddfbdffbd71e9d2f81bd0

And splitting the patch into parts, adding one optimization at a time may be a
good idea (for bisecting purposes).

A few other comments below.

I don't have any IWMMXT capable hardware to test/benchmark, but I checked the
following manuals:
http://download.intel.com/design/intelxscale/31451001.pdf
http://download.intel.com/design/intelxscale/27347302.pdf 

> +static inline void sbc_analyze_four_iwmmxt(const int16_t *in, int32_t
> *out, +					const FIXED_T *consts)
> +{
> +	asm volatile (
> +		"tbcstw       wr4, %2\n"
> +		"wldrd        wr0, [%0]\n"
> +		"wldrd        wr1, [%0, #8]\n"
> +		"wldrd        wr2, [%1]\n"
> +		"wldrd        wr3, [%1, #8]\n"

Using back-to-back WLDRD instructions has some performance penalty 

"D.3.2.3 Memory Control Pipeline

There is also an additional stall introduced by the core when 2 double word (64 
bits) are issued back to back such as:
WLDRD or WSTRD
WLDR[B,H,W,D] or WSTR[B,H,W,D] <- 1 cycle stall.
Critical inner loop sequences can use non memory related instructions following 
a WLDRD or WSTRD."

It's better to try rearranging the code so that load instructions are 
interleaved with the others whenever it is possible.

> +		"wmadds       wr0, wr2, wr0\n"
> +		"wmadds       wr1, wr3, wr1\n"
> +		"waddwss      wr0, wr0, wr4\n"
> +		"waddwss      wr1, wr1, wr4\n"
> +		"\n"
> +		"wldrd        wr2, [%0, #16]\n"
> +		"wldrd        wr3, [%0, #24]\n"
> +		"wldrd        wr4, [%1, #16]\n"
                ^^^^^^ (1)
> +		"wldrd        wr5, [%1, #24]\n"
> +		"wmadds       wr2, wr4, wr2\n"
                ^^^^^^^ (2)

It also makes sense to pay attention to instruction latencies. Here you use wr4
register (2) after loading (1) with only one unrelated instruction in between. 

And according to "Table D-1. Issue Cycle and Result Latency of the Intel® 
Wireless MMX™ 2 Coprocessor Instructions", WLDRD has result latency 3, so that 
it works best if you insert 2 unrelated instruction in between.

> +		"wmadds       wr3, wr5, wr3\n"
> +		"waddwss      wr0, wr2, wr0\n"
> +		"waddwss      wr1, wr3, wr1\n"
> +		"\n"
> +		"wldrd        wr2, [%0, #32]\n"
> +		"wldrd        wr3, [%0, #40]\n"
> +		"wldrd        wr4, [%1, #32]\n"
> +		"wldrd        wr5, [%1, #40]\n"
> +		"wmadds       wr2, wr4, wr2\n"
> +		"wmadds       wr3, wr5, wr3\n"

According to "Table D-3. Resource Availability Delay for the Multiplier 
Pipeline", back-to-back WMADD instructions may have a performance penalty. 

> +		"waddwss      wr0, wr2, wr0\n"
> +		"waddwss      wr1, wr3, wr1\n"
> +		"\n"
> +		"wldrd        wr2, [%0, #48]\n"
> +		"wldrd        wr3, [%0, #56]\n"
> +		"wldrd        wr4, [%1, #48]\n"
> +		"wldrd        wr5, [%1, #56]\n"
> +		"wmadds       wr2, wr4, wr2\n"
> +		"wmadds       wr3, wr5, wr3\n"
> +		"waddwss      wr0, wr2, wr0\n"
> +		"waddwss      wr1, wr3, wr1\n"
> +		"\n"
> +		"wldrd        wr2, [%0, #64]\n"
> +		"wldrd        wr3, [%0, #72]\n"
> +		"wldrd        wr4, [%1, #64]\n"
> +		"wldrd        wr5, [%1, #72]\n"
> +		"wmadds       wr2, wr4, wr2\n"
> +		"wmadds       wr3, wr5, wr3\n"
> +		"waddwss      wr0, wr2, wr0\n"
> +		"waddwss      wr1, wr3, wr1\n"
> +		"\n"
> +		"tmcr       wcgr0, %4\n"
> +		"wsrawg       wr0, wr0, wcgr0\n"
> +		"wsrawg       wr1, wr1, wcgr0\n"
> +		"wpackwss     wr0, wr0, wr0\n"
> +		"wpackwss     wr1, wr1, wr1\n"
> +		"\n"
> +		"wldrd        wr4, [%1, #80]\n"
> +		"wldrd        wr5, [%1, #88]\n"
> +		"wldrd        wr6, [%1, #96]\n"
> +		"wldrd        wr7, [%1, #104]\n"
> +		"wmadds       wr2, wr5, wr0\n"
> +		"wmadds       wr0, wr4, wr0\n"
> +		"\n"
> +		"wmadds       wr3, wr7, wr1\n"
> +		"wmadds       wr1, wr6, wr1\n"
> +		"waddwss      wr0, wr1, wr0\n"
> +		"waddwss      wr2, wr3, wr2\n"
> +		"\n"
> +		"wstrd        wr0, [%3]\n"
> +		"wstrd        wr2, [%3, #8]\n"
> +		:
> +		: "r" (in), "r" (consts),
> +			"r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
> +			"r" (SBC_PROTO_FIXED4_SCALE)
> +		: "memory");
> +}


> +static void sbc_calc_scalefactors_iwmmxt(
> +	int32_t sb_sample_f[16][2][8],
> +	uint32_t scale_factor[2][8],
> +	int blocks, int channels, int subbands)
> +{
> +	int ch, sb;
> +	intptr_t blk;
> +	for (ch = 0; ch < channels; ch++) {
> +		for (sb = 0; sb < subbands; sb += 2) {
> +			int b;
> +			blk = &sb_sample_f[0][ch][sb];
> +			b = blocks;
> +			asm volatile (
> +				"tbcstw       wr0, %4\n"
> +			"1:\n"
> +				"wldrd        wr1, [%0], %2\n"
> +				"wxor         wr2, wr2, wr2\n"
> +				"wcmpgtsw     wr3, wr1, wr2\n"

The MMX code was using PCMPGTD and the other instructions just because MMX 
instruction set is very limited and did not have the needed instructions. But 
you can use WABS and WMAX instructions to do this job better. You can refer to
the original C code and also to ARM NEON optimizations to get some ideas about
how to do this operation faster.  

> +				"waddwss      wr1, wr1, wr3\n"
> +				"wcmpgtsw     wr2, wr2, wr1\n"
> +				"wxor         wr1, wr1, wr2\n"
> +
> +				"wor          wr0, wr0, wr1\n"
> +
> +				"subs         %1, %1, #1\n"
> +				"bne          1b\n"
> +
> +				"tmrrc        %0, %1, wr0\n"
> +				"clz          %0, %0\n"
> +				"rsb          %0, %0, %5\n"
> +				"str          %0, [%3]\n"
> +
> +				"clz          %1, %1\n"
> +				"rsb          %1, %1, %5\n"
> +				"str          %1, [%3, #4]\n"
> +			: "+&r" (blk), "+&r" (b)
> +			: "i" ((char *) &sb_sample_f[1][0][0] -
> +				(char *) &sb_sample_f[0][0][0]),
> +				"r" (&scale_factor[ch][sb]),
> +				"r" (1 << SCALE_OUT_BITS),
> +				"i" (SCALE_OUT_BITS+1)
> +			: "memory");

And this is actually a bug, which exists in the original MMX code too (my
fault). In order to fix it, "cc" needs to be added to the clobber list. I have
just sent a patch for MMX code here:
http://marc.info/?l=linux-bluetooth&m=128946780706187&w=2

Such bug is more dangerous on ARM, because it is up to the developer whether to
update flags in each particular instruction or not. So while almost every
arithmetic x86 instruction updates flags unconditionally, on ARM the flags can
easily survive long enough. That makes it possible for the compiler to
implement more clever optimizations related to setting and checking flags, and
fail if the clobber list does not contain correct information.

> +		}
> +	}
> +}

-- 
Best regards,
Siarhei Siamashka

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] Add iwmmxt optimization for sbc for pxa series cpu
  2010-11-11 11:46 ` Siarhei Siamashka
@ 2010-11-12  7:35   ` Keith Mok
  2010-11-12 13:22     ` Siarhei Siamashka
  0 siblings, 1 reply; 10+ messages in thread
From: Keith Mok @ 2010-11-12  7:35 UTC (permalink / raw)
  To: Siarhei Siamashka; +Cc: linux-bluetooth

> Did you run some benchmarks with these optimizations to measure how much they
> are helping?
Tested on Marvell PXA platform.
== Before ==
$ time ./sbcenc   -b53 -s8 -j  c.au  > /dev/null
real    0m 0.41s
user    0m 0.40s
sys     0m 0.00s

== After ==
$ time ./sbcenc   -b53 -s8 -j  c.au  > /dev/null
real    0m 0.19s
user    0m 0.17s
sys     0m 0.02s

> Using back-to-back WLDRD instructions has some performance penalty
I rearrange the instructions and keep the original one as for reference in
the block that comment out. Since the code is really difficult to read
after interleaved.

> The MMX code was using PCMPGTD and the other instructions just because MMX
> instruction set is very limited and did not have the needed instructions. But
> you can use WABS and WMAX instructions to do this job better. You can refer to
> the original C code and also to ARM NEON optimizations to get some ideas about
> how to do this operation faster.
Changed as suggested.
But got a question that the __IWMMXT__ builtin gcc definition is not a
reliable way to
determine whether mcpu=iwmmxt2 is turned on or not. It will break when
compile under pxa270
which does not support wabs with just mcpu=iwmmx on.

Keith

Signed-off-by: Keith Mok <ek9852@gmail.com>
---
diff --git a/Makefile.am b/Makefile.am
index da308a7..03a9bf2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -65,6 +65,7 @@ noinst_LTLIBRARIES += sbc/libsbc.la
 sbc_libsbc_la_SOURCES = sbc/sbc.h sbc/sbc.c sbc/sbc_math.h sbc/sbc_tables.h \
 			sbc/sbc_primitives.h sbc/sbc_primitives.c \
 			sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
+			sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
 			sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
 			sbc/sbc_primitives_armv6.h sbc/sbc_primitives_armv6.c

diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index f87fb5a..ad780d0 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -33,6 +33,7 @@

 #include "sbc_primitives.h"
 #include "sbc_primitives_mmx.h"
+#include "sbc_primitives_iwmmxt.h"
 #include "sbc_primitives_neon.h"
 #include "sbc_primitives_armv6.h"

@@ -544,6 +545,9 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
 #ifdef SBC_BUILD_WITH_ARMV6_SUPPORT
 	sbc_init_primitives_armv6(state);
 #endif
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+	sbc_init_primitives_iwmmxt(state);
+#endif
 #ifdef SBC_BUILD_WITH_NEON_SUPPORT
 	sbc_init_primitives_neon(state);
 #endif
diff --git a/sbc/sbc_primitives_iwmmxt.c b/sbc/sbc_primitives_iwmmxt.c
new file mode 100644
index 0000000..b988bb1
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.c
@@ -0,0 +1,599 @@
+/*
+ *
+ *  Bluetooth low-complexity, subband codec (SBC) library
+ *
+ *  Copyright (C) 2010 Keith Mok <ek9852@gmail.com>
+ *  Based on sbc_primitives_mmx.c
+ *
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include <stdint.h>
+#include <limits.h>
+#include "sbc.h"
+#include "sbc_math.h"
+#include "sbc_tables.h"
+
+#include "sbc_primitives_iwmmxt.h"
+
+/*
+ * IWMMXT optimizations
+ */
+
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+static inline void sbc_analyze_four_iwmmxt(const int16_t *in, int32_t *out,
+					const FIXED_T *consts)
+{
+	asm volatile (
+		"wldrd        wr0, [%0]\n"
+		"tbcstw       wr4, %2\n"
+		"wldrd        wr2, [%1]\n"
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr3, [%1, #8]\n"
+		"wmadds       wr0, wr2, wr0\n"
+		"wldrd        wr6, [%0, #16]\n"
+		"wmadds       wr1, wr3, wr1\n"
+		"wldrd        wr7, [%0, #24]\n"
+		"waddwss      wr0, wr0, wr4\n"
+		"wldrd        wr8, [%1, #16]\n"
+		"waddwss      wr1, wr1, wr4\n"
+		"wldrd        wr9, [%1, #24]\n"
+		"wmadds       wr6, wr8, wr6\n"
+		"wldrd        wr2, [%0, #32]\n"
+		"wmadds       wr7, wr9, wr7\n"
+		"wldrd        wr3, [%0, #40]\n"
+		"waddwss      wr0, wr6, wr0\n"
+		"wldrd        wr4, [%1, #32]\n"
+		"waddwss      wr1, wr7, wr1\n"
+		"wldrd        wr5, [%1, #40]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wldrd        wr6, [%0, #48]\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"wldrd        wr7, [%0, #56]\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"wldrd        wr8, [%1, #48]\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"wldrd        wr9, [%1, #56]\n"
+		"wmadds       wr6, wr8, wr6\n"
+		"wldrd        wr2, [%0, #64]\n"
+		"wmadds       wr7, wr9, wr7\n"
+		"wldrd        wr3, [%0, #72]\n"
+		"waddwss      wr0, wr6, wr0\n"
+		"wldrd        wr4, [%1, #64]\n"
+		"waddwss      wr1, wr7, wr1\n"
+		"wldrd        wr5, [%1, #72]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"\n"
+		"tmcr       wcgr0, %4\n"
+		"wsrawg       wr0, wr0, wcgr0\n"
+		"wldrd        wr4, [%1, #80]\n"
+		"wsrawg       wr1, wr1, wcgr0\n"
+		"wldrd        wr5, [%1, #88]\n"
+		"wpackwss     wr0, wr0, wr0\n"
+		"wldrd        wr6, [%1, #96]\n"
+		"wpackwss     wr1, wr1, wr1\n"
+		"wldrd        wr7, [%1, #104]\n"
+		"wmadds       wr2, wr5, wr0\n"
+		"wmadds       wr0, wr4, wr0\n"
+		"\n"
+		"wmadds       wr3, wr7, wr1\n"
+		"wmadds       wr1, wr6, wr1\n"
+		"waddwss      wr0, wr1, wr0\n"
+		"waddwss      wr2, wr3, wr2\n"
+		"\n"
+		"wstrd        wr0, [%3]\n"
+		"wstrd        wr2, [%3, #8]\n"
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED4_SCALE)
+		: "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+		  "wr8", "wr9", "wcgr0", "memory");
+#if 0
+	/* without pipeline and resultant latency consideration
+	 * keep it here for reference
+	 * since the latency optimizated code above is difficult to read */
+	asm volatile (
+		"tbcstw       wr4, %2\n"
+		"wldrd        wr0, [%0]\n"
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr2, [%1]\n"
+		"wldrd        wr3, [%1, #8]\n"
+		"wmadds       wr0, wr2, wr0\n"
+		"wmadds       wr1, wr3, wr1\n"
+		"waddwss      wr0, wr0, wr4\n"
+		"waddwss      wr1, wr1, wr4\n"
+		"\n"
+		"wldrd        wr2, [%0, #16]\n"
+		"wldrd        wr3, [%0, #24]\n"
+		"wldrd        wr4, [%1, #16]\n"
+		"wldrd        wr5, [%1, #24]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"\n"
+		"wldrd        wr2, [%0, #32]\n"
+		"wldrd        wr3, [%0, #40]\n"
+		"wldrd        wr4, [%1, #32]\n"
+		"wldrd        wr5, [%1, #40]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"\n"
+		"wldrd        wr2, [%0, #48]\n"
+		"wldrd        wr3, [%0, #56]\n"
+		"wldrd        wr4, [%1, #48]\n"
+		"wldrd        wr5, [%1, #56]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"\n"
+		"wldrd        wr2, [%0, #64]\n"
+		"wldrd        wr3, [%0, #72]\n"
+		"wldrd        wr4, [%1, #64]\n"
+		"wldrd        wr5, [%1, #72]\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"wmadds       wr3, wr5, wr3\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr1, wr3, wr1\n"
+		"\n"
+		"tmcr       wcgr0, %4\n"
+		"wsrawg       wr0, wr0, wcgr0\n"
+		"wsrawg       wr1, wr1, wcgr0\n"
+		"wpackwss     wr0, wr0, wr0\n"
+		"wpackwss     wr1, wr1, wr1\n"
+		"\n"
+		"wldrd        wr4, [%1, #80]\n"
+		"wldrd        wr5, [%1, #88]\n"
+		"wldrd        wr6, [%1, #96]\n"
+		"wldrd        wr7, [%1, #104]\n"
+		"wmadds       wr2, wr5, wr0\n"
+		"wmadds       wr0, wr4, wr0\n"
+		"\n"
+		"wmadds       wr3, wr7, wr1\n"
+		"wmadds       wr1, wr6, wr1\n"
+		"waddwss      wr0, wr1, wr0\n"
+		"waddwss      wr2, wr3, wr2\n"
+		"\n"
+		"wstrd        wr0, [%3]\n"
+		"wstrd        wr2, [%3, #8]\n"
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED4_SCALE)
+		: "memory");
+#endif
+}
+
+static inline void sbc_analyze_eight_iwmmxt(const int16_t *in, int32_t *out,
+							const FIXED_T *consts)
+{
+	asm volatile (
+		"wldrd        wr0, [%0]\n"
+		"tbcstw       wr15, %2\n"
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr2, [%0, #16]\n"
+		"wldrd        wr3, [%0, #24]\n"
+		"wldrd        wr4, [%1]\n"
+		"wldrd        wr5, [%1, #8]\n"
+		"wldrd        wr6, [%1, #16]\n"
+		"wldrd        wr7, [%1, #24]\n"
+		"wmadds       wr0, wr0, wr4\n"
+		"wldrd        wr8, [%1, #32]\n"
+		"wmadds       wr1, wr1, wr5\n"
+		"wldrd        wr9, [%1, #40]\n"
+		"wmadds       wr2, wr2, wr6\n"
+		"wldrd       wr10, [%1, #48]\n"
+		"wmadds       wr3, wr3, wr7\n"
+		"wldrd       wr11, [%1, #56]\n"
+		"waddwss      wr0, wr0, wr15\n"
+		"wldrd        wr4, [%0, #32]\n"
+		"waddwss      wr1, wr1, wr15\n"
+		"wldrd        wr5, [%0, #40]\n"
+		"waddwss      wr2, wr2, wr15\n"
+		"wldrd        wr6, [%0, #48]\n"
+		"waddwss      wr3, wr3, wr15\n"
+		"wldrd        wr7, [%0, #56]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wldrd       wr12, [%0, #64]\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wldrd       wr13, [%0, #72]\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wldrd       wr14, [%0, #80]\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"wldrd       wr15, [%0, #88]\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"wldrd        wr8, [%1, #64]\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"wldrd        wr9, [%1, #72]\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"wldrd       wr10, [%1, #80]\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"wldrd       wr11, [%1, #88]\n"
+		"wmadds      wr12, wr12, wr8\n"
+		"wldrd        wr4, [%0, #96]\n"
+		"wmadds      wr13, wr13, wr9\n"
+		"wldrd        wr5, [%0, #104]\n"
+		"wmadds      wr14, wr14, wr10\n"
+		"wldrd        wr6, [%0, #112]\n"
+		"wmadds      wr15, wr15, wr11\n"
+		"wldrd        wr7, [%0, #120]\n"
+		"waddwss      wr0, wr12, wr0\n"
+		"wldrd        wr8, [%1, #96]\n"
+		"waddwss      wr1, wr13, wr1\n"
+		"wldrd        wr9, [%1, #104]\n"
+		"waddwss      wr2, wr14, wr2\n"
+		"wldrd       wr10, [%1, #112]\n"
+		"waddwss      wr3, wr15, wr3\n"
+		"wldrd       wr11, [%1, #120]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wldrd       wr12, [%0, #128]\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wldrd       wr13, [%0, #136]\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wldrd       wr14, [%0, #144]\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"wldrd       wr15, [%0, #152]\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"wldrd        wr8, [%1, #128]\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"wldrd        wr9, [%1, #136]\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"wldrd       wr10, [%1, #144]\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"wldrd       wr11, [%1, #152]\n"
+		"wmadds      wr12, wr12, wr8\n"
+		"wmadds      wr13, wr13, wr9\n"
+		"wmadds      wr14, wr14, wr10\n"
+		"wmadds      wr15, wr15, wr11\n"
+		"waddwss      wr0, wr12, wr0\n"
+		"waddwss      wr1, wr13, wr1\n"
+		"waddwss      wr2, wr14, wr2\n"
+		"waddwss      wr3, wr15, wr3\n"
+		"\n"
+		"tmcr       wcgr0, %4\n"
+		"wsrawg       wr0, wr0, wcgr0\n"
+		"wsrawg       wr1, wr1, wcgr0\n"
+		"wsrawg       wr2, wr2, wcgr0\n"
+		"wsrawg       wr3, wr3, wcgr0\n"
+		"\n"
+		"wpackwss     wr0, wr0, wr0\n"
+		"wpackwss     wr1, wr1, wr1\n"
+		"wldrd        wr4, [%1, #160]\n"
+		"wpackwss     wr2, wr2, wr2\n"
+		"wldrd        wr5, [%1, #168]\n"
+		"wpackwss     wr3, wr3, wr3\n"
+		"wldrd        wr6, [%1, #192]\n"
+		"wmadds       wr4, wr4, wr0\n"
+		"wldrd        wr7, [%1, #200]\n"
+		"wmadds       wr5, wr5, wr0\n"
+		"wldrd        wr8, [%1, #224]\n"
+		"wmadds       wr6, wr6, wr1\n"
+		"wldrd        wr9, [%1, #232]\n"
+		"wmadds       wr7, wr7, wr1\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"wmadds       wr8, wr8, wr2\n"
+		"wldrd        wr6, [%1, #256]\n"
+		"wmadds       wr9, wr9, wr2\n"
+		"wldrd        wr7, [%1, #264]\n"
+		"waddwss      wr4, wr8, wr4\n"
+		"waddwss      wr5, wr9, wr5\n"
+		"wmadds       wr6, wr6, wr3\n"
+		"wmadds       wr7, wr7, wr3\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wstrd        wr4, [%3]\n"
+		"wstrd        wr5, [%3, #8]\n"
+		"\n"
+		"wldrd        wr6, [%1, #176]\n"
+		"wldrd        wr5, [%1, #184]\n"
+		"wmadds       wr5, wr5, wr0\n"
+		"wldrd        wr8, [%1, #208]\n"
+		"wmadds       wr0, wr6, wr0\n"
+		"wldrd        wr9, [%1, #216]\n"
+		"wmadds       wr9, wr9, wr1\n"
+		"wldrd        wr6, [%1, #240]\n"
+		"wmadds       wr1, wr8, wr1\n"
+		"wldrd        wr7, [%1, #248]\n"
+		"waddwss      wr0, wr1, wr0\n"
+		"waddwss      wr5, wr9, wr5\n"
+		"wmadds       wr7, wr7, wr2\n"
+		"wldrd        wr8, [%1, #272]\n"
+		"wmadds       wr2, wr6, wr2\n"
+		"wldrd        wr9, [%1, #280]\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"wmadds       wr9, wr9, wr3\n"
+		"wmadds       wr3, wr8, wr3\n"
+		"waddwss      wr0, wr3, wr0\n"
+		"waddwss      wr5, wr9, wr5\n"
+		"\n"
+		"wstrd        wr0, [%3, #16]\n"
+		"wstrd        wr5, [%3, #24]\n"
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED8_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED8_SCALE)
+		: "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+		  "wr8", "wr9", "wr10", "wr11", "wr12", "wr13", "wr14", "wr15",
+		  "wcgr0", "memory");
+#if 0
+	/* without pipeline and resultant latency consideration
+	 * keep it here for reference
+	 * since the latency optimizated code above is difficult to read */
+	asm volatile (
+		"tbcstw       wr8, %2\n"
+		"wldrd        wr0, [%0]\n"
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr2, [%0, #16]\n"
+		"wldrd        wr3, [%0, #24]\n"
+		"wldrd        wr4, [%1]\n"
+		"wldrd        wr5, [%1, #8]\n"
+		"wldrd        wr6, [%1, #16]\n"
+		"wldrd        wr7, [%1, #24]\n"
+		"wmadds       wr0, wr0, wr4\n"
+		"wmadds       wr1, wr1, wr5\n"
+		"wmadds       wr2, wr2, wr6\n"
+		"wmadds       wr3, wr3, wr7\n"
+		"waddwss      wr0, wr0, wr8\n"
+		"waddwss      wr1, wr1, wr8\n"
+		"waddwss      wr2, wr2, wr8\n"
+		"waddwss      wr3, wr3, wr8\n"
+		"\n"
+		"wldrd        wr4, [%0, #32]\n"
+		"wldrd        wr5, [%0, #40]\n"
+		"wldrd        wr6, [%0, #48]\n"
+		"wldrd        wr7, [%0, #56]\n"
+		"wldrd        wr8, [%1, #32]\n"
+		"wldrd        wr9, [%1, #40]\n"
+		"wldrd       wr10, [%1, #48]\n"
+		"wldrd       wr11, [%1, #56]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"\n"
+		"wldrd        wr4, [%0, #64]\n"
+		"wldrd        wr5, [%0, #72]\n"
+		"wldrd        wr6, [%0, #80]\n"
+		"wldrd        wr7, [%0, #88]\n"
+		"wldrd        wr8, [%1, #64]\n"
+		"wldrd        wr9, [%1, #72]\n"
+		"wldrd       wr10, [%1, #80]\n"
+		"wldrd       wr11, [%1, #88]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"\n"
+		"wldrd        wr4, [%0, #96]\n"
+		"wldrd        wr5, [%0, #104]\n"
+		"wldrd        wr6, [%0, #112]\n"
+		"wldrd        wr7, [%0, #120]\n"
+		"wldrd        wr8, [%1, #96]\n"
+		"wldrd        wr9, [%1, #104]\n"
+		"wldrd       wr10, [%1, #112]\n"
+		"wldrd       wr11, [%1, #120]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"\n"
+		"wldrd        wr4, [%0, #128]\n"
+		"wldrd        wr5, [%0, #136]\n"
+		"wldrd        wr6, [%0, #144]\n"
+		"wldrd        wr7, [%0, #152]\n"
+		"wldrd        wr8, [%1, #128]\n"
+		"wldrd        wr9, [%1, #136]\n"
+		"wldrd       wr10, [%1, #144]\n"
+		"wldrd       wr11, [%1, #152]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		"wmadds       wr5, wr5, wr9\n"
+		"wmadds       wr6, wr6, wr10\n"
+		"wmadds       wr7, wr7, wr11\n"
+		"waddwss      wr0, wr4, wr0\n"
+		"waddwss      wr1, wr5, wr1\n"
+		"waddwss      wr2, wr6, wr2\n"
+		"waddwss      wr3, wr7, wr3\n"
+		"\n"
+		"tmcr       wcgr0, %4\n"
+		"wsrawg       wr0, wr0, wcgr0\n"
+		"wsrawg       wr1, wr1, wcgr0\n"
+		"wsrawg       wr2, wr2, wcgr0\n"
+		"wsrawg       wr3, wr3, wcgr0\n"
+		"\n"
+		"wpackwss     wr0, wr0, wr0\n"
+		"wpackwss     wr1, wr1, wr1\n"
+		"wpackwss     wr2, wr2, wr2\n"
+		"wpackwss     wr3, wr3, wr3\n"
+		"\n"
+		"wldrd        wr4, [%1, #160]\n"
+		"wldrd        wr5, [%1, #168]\n"
+		"wmadds       wr4, wr4, wr0\n"
+		"wmadds       wr5, wr5, wr0\n"
+		"\n"
+		"wldrd        wr6, [%1, #192]\n"
+		"wldrd        wr7, [%1, #200]\n"
+		"wmadds       wr6, wr6, wr1\n"
+		"wmadds       wr7, wr7, wr1\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wldrd        wr6, [%1, #224]\n"
+		"wldrd        wr7, [%1, #232]\n"
+		"wmadds       wr6, wr6, wr2\n"
+		"wmadds       wr7, wr7, wr2\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wldrd        wr6, [%1, #256]\n"
+		"wldrd        wr7, [%1, #264]\n"
+		"wmadds       wr6, wr6, wr3\n"
+		"wmadds       wr7, wr7, wr3\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wstrd        wr4, [%3]\n"
+		"wstrd        wr5, [%3, #8]\n"
+		"\n"
+		"wldrd        wr4, [%1, #176]\n"
+		"wldrd        wr5, [%1, #184]\n"
+		"wmadds       wr5, wr5, wr0\n"
+		"wmadds       wr0, wr4, wr0\n"
+		"\n"
+		"wldrd        wr4, [%1, #208]\n"
+		"wldrd        wr7, [%1, #216]\n"
+		"wmadds       wr7, wr7, wr1\n"
+		"wmadds       wr1, wr4, wr1\n"
+		"waddwss      wr0, wr1, wr0\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wldrd        wr4, [%1, #240]\n"
+		"wldrd        wr7, [%1, #248]\n"
+		"wmadds       wr7, wr7, wr2\n"
+		"wmadds       wr2, wr4, wr2\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wldrd        wr4, [%1, #272]\n"
+		"wldrd        wr7, [%1, #280]\n"
+		"wmadds       wr7, wr7, wr3\n"
+		"wmadds       wr3, wr4, wr3\n"
+		"waddwss      wr0, wr3, wr0\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wstrd        wr0, [%3, #16]\n"
+		"wstrd        wr5, [%3, #24]\n"
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED8_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED8_SCALE)
+		: "memory");
+#endif
+}
+
+static inline void sbc_analyze_4b_4s_iwmmxt(int16_t *x, int32_t *out,
+						int out_stride)
+{
+	/* Analyze blocks */
+	sbc_analyze_four_iwmmxt(x + 12, out, analysis_consts_fixed4_simd_odd);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 8, out, analysis_consts_fixed4_simd_even);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 4, out, analysis_consts_fixed4_simd_odd);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 0, out, analysis_consts_fixed4_simd_even);
+}
+
+static inline void sbc_analyze_4b_8s_iwmmxt(int16_t *x, int32_t *out,
+						int out_stride)
+{
+	/* Analyze blocks */
+	sbc_analyze_eight_iwmmxt(x + 24, out, analysis_consts_fixed8_simd_odd);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 16, out, analysis_consts_fixed8_simd_even);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 8, out, analysis_consts_fixed8_simd_odd);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 0, out, analysis_consts_fixed8_simd_even);
+}
+
+static void sbc_calc_scalefactors_iwmmxt2(
+	int32_t sb_sample_f[16][2][8],
+	uint32_t scale_factor[2][8],
+	int blocks, int channels, int subbands)
+{
+	int ch, sb;
+	for (ch = 0; ch < channels; ch++) {
+		for (sb = 0; sb < subbands; sb += 2) {
+			int blk = blocks;
+			int32_t *in = &sb_sample_f[0][ch][sb];
+			/* For iwmmxt2, since we use wabs */
+			asm volatile (
+				"wldrd        wr1, [%[in]], %[inc]\n"
+				"tbcstw       wr0, %[c1]\n"
+				"wldrd        wr2, [%[in]], %[inc]\n"
+				"wldrd        wr3, [%[in]], %[inc]\n"
+				"wldrd        wr4, [%[in]], %[inc]\n"
+			"1:\n"
+				"wabsw        wr1, wr1\n"
+				"wabsw        wr2, wr2\n"
+				"wabsw        wr3, wr3\n"
+				"wabsw        wr4, wr4\n"
+				"wmaxuw       wr5, wr1, wr2\n"
+				"wldrd        wr1, [%[in]], %[inc]\n"
+				"wmaxuw       wr6, wr3, wr4\n"
+				"wldrd        wr2, [%[in]], %[inc]\n"
+				"wmaxuw       wr5, wr5, wr6\n"
+				"wldrd        wr3, [%[in]], %[inc]\n"
+				"wmaxuw       wr0, wr0, wr5\n"
+				"wldrd        wr4, [%[in]], %[inc]\n"
+				"subs         %[blk], %[blk], #4\n"
+				"bgt          1b\n"
+
+				"tmrrc        %0, %1, wr0\n"
+				"sub          %0, %0, #1\n"
+				"clz          %0, %0\n"
+				"rsb          %0, %0, %[c2]\n"
+				"str          %0, [%[out]]\n"
+
+				"sub          %1, %1, #1\n"
+				"clz          %1, %1\n"
+				"rsb          %1, %1, %[c2]\n"
+				"str          %1, [%[out], #4]\n"
+			: [in] "+r" (in), [blk] "+r" (blk)
+			: [inc] "i" ((char *) &sb_sample_f[1][0][0] -
+					(char *) &sb_sample_f[0][0][0]),
+				[out] "r" (&scale_factor[ch][sb]),
+				[c1] "r" ((1 << SCALE_OUT_BITS) + 1),
+				[c2] "i" (SCALE_OUT_BITS+1)
+			: "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6",
+			  "cc", "memory");
+		}
+	}
+}
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *state)
+{
+	state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_iwmmxt;
+	state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_iwmmxt;
+	state->sbc_calc_scalefactors = sbc_calc_scalefactors_iwmmxt2;
+	state->implementation_info = "IWMMXT";
+}
+
+#endif
diff --git a/sbc/sbc_primitives_iwmmxt.h b/sbc/sbc_primitives_iwmmxt.h
new file mode 100644
index 0000000..827d811
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.h
@@ -0,0 +1,38 @@
+/*
+ *
+ *  Bluetooth low-complexity, subband codec (SBC) library
+ *
+ *  Based on sbc_primitives_mmx.c
+ *
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __SBC_PRIMITIVES_IWMMXT_H
+#define __SBC_PRIMITIVES_IWMMXT_H
+
+#include "sbc_primitives.h"
+
+#if defined(__GNUC__) && defined(__IWMMXT__) && \
+		!defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15)
+
+#define SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *encoder_state);
+
+#endif
+
+#endif

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] Add iwmmxt optimization for sbc for pxa series cpu
  2010-11-12  7:35   ` [PATCH v2] " Keith Mok
@ 2010-11-12 13:22     ` Siarhei Siamashka
  2010-11-15  2:46       ` [PATCH v3] " Keith Mok
  0 siblings, 1 reply; 10+ messages in thread
From: Siarhei Siamashka @ 2010-11-12 13:22 UTC (permalink / raw)
  To: Keith Mok; +Cc: linux-bluetooth

[-- Attachment #1: Type: Text/Plain, Size: 5034 bytes --]

On Friday 12 November 2010 09:35:04 Keith Mok wrote:
> > Did you run some benchmarks with these optimizations to measure how much
> > they are helping?
> 
> Tested on Marvell PXA platform.
> == Before ==
> $ time ./sbcenc   -b53 -s8 -j  c.au  > /dev/null
> real    0m 0.41s
> user    0m 0.40s
> sys     0m 0.00s
> 
> == After ==
> $ time ./sbcenc   -b53 -s8 -j  c.au  > /dev/null
> real    0m 0.19s
> user    0m 0.17s
> sys     0m 0.02s

Thanks, this looks consistent with the results of optimizations on the other
platforms where the performance increases roughly twice after adding SIMD 
optimizations to the sbc analysis filter.

But maybe it's better to use a bit bigger test file, so that the total time 
increases to at least several seconds. With very small times, it's hard to say
whether it is an actual improvement or random noise. It may be ok for such a
huge performance improvement, but with less significant optimizations the 
precision of measurements may become a problem.

Also do you have oprofile available on PXA platform? It may provide a nice 
statistics about what functions are used and are the performance hot spots. 

> > Using back-to-back WLDRD instructions has some performance penalty
> 
> I rearrange the instructions and keep the original one as for reference in
> the block that comment out. Since the code is really difficult to read
> after interleaved.

Thanks, this looks like it really should run quite a bit faster than the
previous variant (based on my understanding of intel pdf files). 

I sometimes use different indentation levels in such cases in order to improve 
readability after instructions reordering, so that each logically independent
block of code has its own indentation level and it is still easily visible
after instructions reordering. For example, with the original code:

A1
A2
A3
A4
B1
B2
B3
B4

If the instructions need to be reordered in order to improve scheduling for the
cpu pipeline, then for example

A1
A2
  B1
A3
  B2
A4
  B3
  B4

looks much more readable to me than

A1
A2
B1
A3
B2
A4
B3
B4

With different indentation levels, one can still see the flow of instructions 
as independent streams. If different levels of indentation in inline assembly 
pass coding style test by checkpatch.pl script, then it should be fine.

Also I'm quite curious whether better instructions scheduling provide any clear
improvement, so some numbers comparing older and newer implementation would
be appreciated. I did not suggest that just for entertainment purposes ;) It
really should provide some practical benefit.

If you have time and want to make such a test, iwmmxt intrinsics could be also
tried, so that instructions scheduling and registers allocation becomes a
responsibility of the compiler. But my previous experiments with arm neon
intrinsics showed that the compiler does a very poor job and can't be trusted
to generate fast code. But maybe iwmmxt could be different or gcc could have
improved since than.

> > The MMX code was using PCMPGTD and the other instructions just because
> > MMX instruction set is very limited and did not have the needed
> > instructions. But you can use WABS and WMAX instructions to do this job
> > better. You can refer to the original C code and also to ARM NEON
> > optimizations to get some ideas about how to do this operation faster.
> 
> Changed as suggested.
> But got a question that the __IWMMXT__ builtin gcc definition is not a
> reliable way to
> determine whether mcpu=iwmmxt2 is turned on or not. It will break when
> compile under pxa270
> which does not support wabs with just mcpu=iwmmx on.

Well, as I said before, I'm not familiar with iwmmxt and pxa platform. And I
did not notice that there are actually several revisions of iwmmxt isa, my bad.
So looks like iwmmxt1 is just as restrictive as the original mmx and the direct 
conversion from mmx like you did before may be the right thing. For arm neon 
optimizations, the effect of using vector ABS/MAX instructions was just
about 1% of overall performance improvement. Not so much, but every little bit
helps. And if for iwmmxt it causes such backwards compatibility issues, then it
might be not worth it. It's up to you to decide.

I would still suggest to initially have just optimizations for 
sbc_analyze_four_iwmmxt/sbc_analyze_eight_iwmmxt in the first patch (or maybe 
in two patches). And then add optimization for sbc_calc_scalefactors in a
separate patch later.

Regarding the benchmarks and functions usage:
1. sbc_analyze_four_iwmmxt is important for 4 subbands case ('-s4' option for 
sbcenc)
2. sbc_analyze_eight_iwmmxt is important for 8 subbands case ('-s8' option for 
sbcenc)
3. sbc_calc_scalefactors is important for either mono audio, or when joint 
stereo is *not* used (sbcenc is run without '-j' option).

All of this is better to be benchmarked/tested separately.

-- 
Best regards,
Siarhei Siamashka

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] Add iwmmxt optimization for sbc for pxa series cpu
  2010-11-12 13:22     ` Siarhei Siamashka
@ 2010-11-15  2:46       ` Keith Mok
  2010-11-15 11:08         ` Siarhei Siamashka
  0 siblings, 1 reply; 10+ messages in thread
From: Keith Mok @ 2010-11-15  2:46 UTC (permalink / raw)
  To: Siarhei Siamashka; +Cc: linux-bluetooth

> I sometimes use different indentation levels in such cases in order to improve
> readability after instructions reordering, so that each logically independent
> block of code has its own indentation level and it is still easily visible
> after instructions reordering. For example, with the original code:
Thanks for the hints. I rearranged the code.


> Not so much, but every little bit
> helps. And if for iwmmxt it causes such backwards compatibility issues, then it
> might be not worth it. It's up to you to decide.
I removed the scale_factor optimization since from the result I
tested, it shows little help in performance.


> Regarding the benchmarks and functions usage:
> 1. sbc_analyze_four_iwmmxt is important for 4 subbands case ('-s4' option for
> sbcenc)
> 2. sbc_analyze_eight_iwmmxt is important for 8 subbands case ('-s8' option for
> sbcenc)
===  Before (4 bands) ====
$ time  ./sbcenc_orig  -s 4     long.au  > /dev/null
real    0m 2.44s
user    0m 2.39s
sys     0m 0.05s
===  After (4 bands) ====
$ time  ./sbcenc  -s 4     long.au  > /dev/null
real    0m 1.59s
user    0m 1.49s
sys     0m 0.10s


===  Before (8 bands) ====
$ time  ./sbcenc_orig   -s 8     long.au  > /dev/null
real    0m 4.05s
user    0m 3.98s
sys     0m 0.07s
===  After (8 bands) ====
$ time  ./sbcenc  -s 8     long.au  > /dev/null
real    0m 1.48s
user    0m 1.41s
sys     0m 0.06s


===  Before (a2dp usage) ====
$ time  ./sbcenc_orig   -b53 -s8 -j    long.au  > /dev/null
real    0m 4.51s
user    0m 4.41s
sys     0m 0.10s
===  After (a2dp usage) ====
$ time  ./sbcenc   -b53 -s8 -j    long.au  > /dev/null
real    0m 2.05s
user    0m 1.99s
sys     0m 0.06s


Keith


Signed-off-by: Keith Mok <ek9852@gmail.com>
---
 Makefile.am                 |    1 +
 sbc/sbc_primitives.c        |    4 +
 sbc/sbc_primitives_iwmmxt.c |  301 +++++++++++++++++++++++++++++++++++++++++++
 sbc/sbc_primitives_iwmmxt.h |   38 ++++++
 4 files changed, 344 insertions(+), 0 deletions(-)
 create mode 100644 sbc/sbc_primitives_iwmmxt.c
 create mode 100644 sbc/sbc_primitives_iwmmxt.h

diff --git a/Makefile.am b/Makefile.am
index da308a7..03a9bf2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -65,6 +65,7 @@ noinst_LTLIBRARIES += sbc/libsbc.la
 sbc_libsbc_la_SOURCES = sbc/sbc.h sbc/sbc.c sbc/sbc_math.h sbc/sbc_tables.h \
 			sbc/sbc_primitives.h sbc/sbc_primitives.c \
 			sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
+			sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
 			sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
 			sbc/sbc_primitives_armv6.h sbc/sbc_primitives_armv6.c

diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index f87fb5a..ad780d0 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -33,6 +33,7 @@

 #include "sbc_primitives.h"
 #include "sbc_primitives_mmx.h"
+#include "sbc_primitives_iwmmxt.h"
 #include "sbc_primitives_neon.h"
 #include "sbc_primitives_armv6.h"

@@ -544,6 +545,9 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
 #ifdef SBC_BUILD_WITH_ARMV6_SUPPORT
 	sbc_init_primitives_armv6(state);
 #endif
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+	sbc_init_primitives_iwmmxt(state);
+#endif
 #ifdef SBC_BUILD_WITH_NEON_SUPPORT
 	sbc_init_primitives_neon(state);
 #endif
diff --git a/sbc/sbc_primitives_iwmmxt.c b/sbc/sbc_primitives_iwmmxt.c
new file mode 100644
index 0000000..fc462d2
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.c
@@ -0,0 +1,301 @@
+/*
+ *
+ *  Bluetooth low-complexity, subband codec (SBC) library
+ *
+ *  Copyright (C) 2010 Keith Mok <ek9852@gmail.com>
+ *  Based on sbc_primitives_mmx.c
+ *
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include <stdint.h>
+#include <limits.h>
+#include "sbc.h"
+#include "sbc_math.h"
+#include "sbc_tables.h"
+
+#include "sbc_primitives_iwmmxt.h"
+
+/*
+ * IWMMXT optimizations
+ */
+
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+static inline void sbc_analyze_four_iwmmxt(const int16_t *in, int32_t *out,
+					const FIXED_T *consts)
+{
+	asm volatile (
+		"wldrd        wr0, [%0]\n"
+		"tbcstw       wr4, %2\n"
+		"wldrd        wr2, [%1]\n"
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr3, [%1, #8]\n"
+		"wmadds       wr0, wr2, wr0\n"
+		" wldrd       wr6, [%0, #16]\n"
+		"wmadds       wr1, wr3, wr1\n"
+		" wldrd       wr7, [%0, #24]\n"
+		"waddwss      wr0, wr0, wr4\n"
+		" wldrd       wr8, [%1, #16]\n"
+		"waddwss      wr1, wr1, wr4\n"
+		" wldrd       wr9, [%1, #24]\n"
+		" wmadds      wr6, wr8, wr6\n"
+		"  wldrd      wr2, [%0, #32]\n"
+		" wmadds      wr7, wr9, wr7\n"
+		"  wldrd      wr3, [%0, #40]\n"
+		" waddwss     wr0, wr6, wr0\n"
+		"  wldrd      wr4, [%1, #32]\n"
+		" waddwss     wr1, wr7, wr1\n"
+		"  wldrd      wr5, [%1, #40]\n"
+		"  wmadds     wr2, wr4, wr2\n"
+		"wldrd        wr6, [%0, #48]\n"
+		"  wmadds     wr3, wr5, wr3\n"
+		"wldrd        wr7, [%0, #56]\n"
+		"  waddwss    wr0, wr2, wr0\n"
+		"wldrd        wr8, [%1, #48]\n"
+		"  waddwss    wr1, wr3, wr1\n"
+		"wldrd        wr9, [%1, #56]\n"
+		"wmadds       wr6, wr8, wr6\n"
+		" wldrd       wr2, [%0, #64]\n"
+		"wmadds       wr7, wr9, wr7\n"
+		" wldrd       wr3, [%0, #72]\n"
+		"waddwss      wr0, wr6, wr0\n"
+		" wldrd       wr4, [%1, #64]\n"
+		"waddwss      wr1, wr7, wr1\n"
+		" wldrd       wr5, [%1, #72]\n"
+		" wmadds      wr2, wr4, wr2\n"
+		"tmcr       wcgr0, %4\n"
+		" wmadds      wr3, wr5, wr3\n"
+		" waddwss     wr0, wr2, wr0\n"
+		" waddwss     wr1, wr3, wr1\n"
+		"\n"
+		"wsrawg       wr0, wr0, wcgr0\n"
+		" wldrd       wr4, [%1, #80]\n"
+		"wsrawg       wr1, wr1, wcgr0\n"
+		" wldrd       wr5, [%1, #88]\n"
+		"wpackwss     wr0, wr0, wr0\n"
+		" wldrd       wr6, [%1, #96]\n"
+		"wpackwss     wr1, wr1, wr1\n"
+		"wmadds       wr2, wr5, wr0\n"
+		" wldrd       wr7, [%1, #104]\n"
+		"wmadds       wr0, wr4, wr0\n"
+		"\n"
+		" wmadds      wr3, wr7, wr1\n"
+		" wmadds      wr1, wr6, wr1\n"
+		" waddwss     wr2, wr3, wr2\n"
+		" waddwss     wr0, wr1, wr0\n"
+		"\n"
+		"wstrd        wr0, [%3]\n"
+		"wstrd        wr2, [%3, #8]\n"
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED4_SCALE)
+		: "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+		  "wr8", "wr9", "wcgr0", "memory");
+}
+
+static inline void sbc_analyze_eight_iwmmxt(const int16_t *in, int32_t *out,
+							const FIXED_T *consts)
+{
+	asm volatile (
+		"wldrd        wr0, [%0]\n"
+		"tbcstw       wr15, %2\n"
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr2, [%0, #16]\n"
+		"wldrd        wr3, [%0, #24]\n"
+		"wldrd        wr4, [%1]\n"
+		"wldrd        wr5, [%1, #8]\n"
+		"wldrd        wr6, [%1, #16]\n"
+		"wldrd        wr7, [%1, #24]\n"
+		"wmadds       wr0, wr0, wr4\n"
+		" wldrd       wr8, [%1, #32]\n"
+		"wmadds       wr1, wr1, wr5\n"
+		" wldrd       wr9, [%1, #40]\n"
+		"wmadds       wr2, wr2, wr6\n"
+		" wldrd      wr10, [%1, #48]\n"
+		"wmadds       wr3, wr3, wr7\n"
+		" wldrd      wr11, [%1, #56]\n"
+		"waddwss      wr0, wr0, wr15\n"
+		" wldrd       wr4, [%0, #32]\n"
+		"waddwss      wr1, wr1, wr15\n"
+		" wldrd       wr5, [%0, #40]\n"
+		"waddwss      wr2, wr2, wr15\n"
+		" wldrd       wr6, [%0, #48]\n"
+		"waddwss      wr3, wr3, wr15\n"
+		" wldrd       wr7, [%0, #56]\n"
+		" wmadds      wr4, wr4, wr8\n"
+		"  wldrd     wr12, [%0, #64]\n"
+		" wmadds      wr5, wr5, wr9\n"
+		"  wldrd     wr13, [%0, #72]\n"
+		" wmadds      wr6, wr6, wr10\n"
+		"  wldrd     wr14, [%0, #80]\n"
+		" wmadds      wr7, wr7, wr11\n"
+		"  wldrd     wr15, [%0, #88]\n"
+		" waddwss     wr0, wr4, wr0\n"
+		"  wldrd      wr8, [%1, #64]\n"
+		" waddwss     wr1, wr5, wr1\n"
+		"  wldrd      wr9, [%1, #72]\n"
+		" waddwss     wr2, wr6, wr2\n"
+		"  wldrd     wr10, [%1, #80]\n"
+		" waddwss     wr3, wr7, wr3\n"
+		"  wldrd     wr11, [%1, #88]\n"
+		"  wmadds    wr12, wr12, wr8\n"
+		"wldrd        wr4, [%0, #96]\n"
+		"  wmadds    wr13, wr13, wr9\n"
+		"wldrd        wr5, [%0, #104]\n"
+		"  wmadds    wr14, wr14, wr10\n"
+		"wldrd        wr6, [%0, #112]\n"
+		"  wmadds    wr15, wr15, wr11\n"
+		"wldrd        wr7, [%0, #120]\n"
+		"  waddwss    wr0, wr12, wr0\n"
+		"wldrd        wr8, [%1, #96]\n"
+		"  waddwss    wr1, wr13, wr1\n"
+		"wldrd        wr9, [%1, #104]\n"
+		"  waddwss    wr2, wr14, wr2\n"
+		"wldrd       wr10, [%1, #112]\n"
+		"  waddwss    wr3, wr15, wr3\n"
+		"wldrd       wr11, [%1, #120]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		" wldrd      wr12, [%0, #128]\n"
+		"wmadds       wr5, wr5, wr9\n"
+		" wldrd      wr13, [%0, #136]\n"
+		"wmadds       wr6, wr6, wr10\n"
+		" wldrd      wr14, [%0, #144]\n"
+		"wmadds       wr7, wr7, wr11\n"
+		" wldrd      wr15, [%0, #152]\n"
+		"waddwss      wr0, wr4, wr0\n"
+		" wldrd       wr8, [%1, #128]\n"
+		"waddwss      wr1, wr5, wr1\n"
+		" wldrd       wr9, [%1, #136]\n"
+		"waddwss      wr2, wr6, wr2\n"
+		" wldrd      wr10, [%1, #144]\n"
+		" waddwss     wr3, wr7, wr3\n"
+		" wldrd     wr11, [%1, #152]\n"
+		" wmadds     wr12, wr12, wr8\n"
+		"tmcr       wcgr0, %4\n"
+		" wmadds     wr13, wr13, wr9\n"
+		" wmadds     wr14, wr14, wr10\n"
+		" wmadds     wr15, wr15, wr11\n"
+		" waddwss     wr0, wr12, wr0\n"
+		" waddwss     wr1, wr13, wr1\n"
+		" waddwss     wr2, wr14, wr2\n"
+		" waddwss     wr3, wr15, wr3\n"
+		"\n"
+		"wsrawg       wr0, wr0, wcgr0\n"
+		"wsrawg       wr1, wr1, wcgr0\n"
+		"wsrawg       wr2, wr2, wcgr0\n"
+		"wsrawg       wr3, wr3, wcgr0\n"
+		"\n"
+		"wpackwss     wr0, wr0, wr0\n"
+		"wpackwss     wr1, wr1, wr1\n"
+		" wldrd       wr4, [%1, #160]\n"
+		"wpackwss     wr2, wr2, wr2\n"
+		" wldrd       wr5, [%1, #168]\n"
+		"wpackwss     wr3, wr3, wr3\n"
+		"  wldrd      wr6, [%1, #192]\n"
+		" wmadds      wr4, wr4, wr0\n"
+		"  wldrd      wr7, [%1, #200]\n"
+		" wmadds      wr5, wr5, wr0\n"
+		"   wldrd     wr8, [%1, #224]\n"
+		"  wmadds     wr6, wr6, wr1\n"
+		"   wldrd     wr9, [%1, #232]\n"
+		"  wmadds     wr7, wr7, wr1\n"
+		"  waddwss    wr4, wr6, wr4\n"
+		"  waddwss    wr5, wr7, wr5\n"
+		"   wmadds    wr8, wr8, wr2\n"
+		"wldrd        wr6, [%1, #256]\n"
+		"   wmadds    wr9, wr9, wr2\n"
+		"wldrd        wr7, [%1, #264]\n"
+		"waddwss      wr4, wr8, wr4\n"
+		"   waddwss   wr5, wr9, wr5\n"
+		"wmadds       wr6, wr6, wr3\n"
+		"wmadds       wr7, wr7, wr3\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wstrd        wr4, [%3]\n"
+		"wstrd        wr5, [%3, #8]\n"
+		"\n"
+		"wldrd        wr6, [%1, #176]\n"
+		"wldrd        wr5, [%1, #184]\n"
+		"wmadds       wr5, wr5, wr0\n"
+		"wldrd        wr8, [%1, #208]\n"
+		"wmadds       wr0, wr6, wr0\n"
+		"wldrd        wr9, [%1, #216]\n"
+		"wmadds       wr9, wr9, wr1\n"
+		"wldrd        wr6, [%1, #240]\n"
+		"wmadds       wr1, wr8, wr1\n"
+		"wldrd        wr7, [%1, #248]\n"
+		"waddwss      wr0, wr1, wr0\n"
+		"waddwss      wr5, wr9, wr5\n"
+		"wmadds       wr7, wr7, wr2\n"
+		"wldrd        wr8, [%1, #272]\n"
+		"wmadds       wr2, wr6, wr2\n"
+		"wldrd        wr9, [%1, #280]\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"wmadds       wr9, wr9, wr3\n"
+		"wmadds       wr3, wr8, wr3\n"
+		"waddwss      wr0, wr3, wr0\n"
+		"waddwss      wr5, wr9, wr5\n"
+		"\n"
+		"wstrd        wr0, [%3, #16]\n"
+		"wstrd        wr5, [%3, #24]\n"
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED8_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED8_SCALE)
+		: "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+		  "wr8", "wr9", "wr10", "wr11", "wr12", "wr13", "wr14", "wr15",
+		  "wcgr0", "memory");
+}
+
+static inline void sbc_analyze_4b_4s_iwmmxt(int16_t *x, int32_t *out,
+						int out_stride)
+{
+	/* Analyze blocks */
+	sbc_analyze_four_iwmmxt(x + 12, out, analysis_consts_fixed4_simd_odd);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 8, out, analysis_consts_fixed4_simd_even);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 4, out, analysis_consts_fixed4_simd_odd);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 0, out, analysis_consts_fixed4_simd_even);
+}
+
+static inline void sbc_analyze_4b_8s_iwmmxt(int16_t *x, int32_t *out,
+						int out_stride)
+{
+	/* Analyze blocks */
+	sbc_analyze_eight_iwmmxt(x + 24, out, analysis_consts_fixed8_simd_odd);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 16, out, analysis_consts_fixed8_simd_even);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 8, out, analysis_consts_fixed8_simd_odd);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 0, out, analysis_consts_fixed8_simd_even);
+}
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *state)
+{
+	state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_iwmmxt;
+	state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_iwmmxt;
+	state->implementation_info = "IWMMXT";
+}
+
+#endif
diff --git a/sbc/sbc_primitives_iwmmxt.h b/sbc/sbc_primitives_iwmmxt.h
new file mode 100644
index 0000000..827d811
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.h
@@ -0,0 +1,38 @@
+/*
+ *
+ *  Bluetooth low-complexity, subband codec (SBC) library
+ *
+ *  Based on sbc_primitives_mmx.c
+ *
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __SBC_PRIMITIVES_IWMMXT_H
+#define __SBC_PRIMITIVES_IWMMXT_H
+
+#include "sbc_primitives.h"
+
+#if defined(__GNUC__) && defined(__IWMMXT__) && \
+		!defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15)
+
+#define SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *encoder_state);
+
+#endif
+
+#endif
-- 
1.6.3.3

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] Add iwmmxt optimization for sbc for pxa series cpu
  2010-11-15  2:46       ` [PATCH v3] " Keith Mok
@ 2010-11-15 11:08         ` Siarhei Siamashka
  2010-11-18 13:05           ` Siarhei Siamashka
  0 siblings, 1 reply; 10+ messages in thread
From: Siarhei Siamashka @ 2010-11-15 11:08 UTC (permalink / raw)
  To: Keith Mok; +Cc: linux-bluetooth

[-- Attachment #1: Type: Text/Plain, Size: 2142 bytes --]

On Monday 15 November 2010 04:46:25 Keith Mok wrote:
> > I sometimes use different indentation levels in such cases in order to
> > improve readability after instructions reordering, so that each
> > logically independent block of code has its own indentation level and it
> > is still easily visible
> 
> > after instructions reordering. For example, with the original code:
> Thanks for the hints. I rearranged the code.

Thanks, now the assembly code looks ok to me. I also discovered that qemu
supports iwmmxt1 emulation just fine and also tried to test your optimizations
for correctness myself (with a script which tries different encoding paramaters 
for different audio samples and checks md5 checksums), no problems detected.

So if somebody else could check whether the other things are right (copyright
notices for example), then we are done with it.

> I removed the scale_factor optimization since from the result I
> tested, it shows little help in performance.

I guess after easily doubling performance by adding simd optimizations to the
sbc analysis filter, just roughly ~10% improvement (as measured for x86 and
arm neon) does not look particularly impressive anymore: 
http://git.kernel.org/?p=bluetooth/bluez.git;a=commit;h=95465b816f0ce7f0ec10a183ce7ff0c6f83d86eb
http://git.kernel.org/?p=bluetooth/bluez.git;a=commit;h=d049a9a2aec2b518e04f11ef0ecc355db8237291

But I still think that every little bit helps. Did you also get something like 
10% speedup, or was it even worse than that?

A bit more important in practice is the optimization for joint stereo scale 
factors calculation (because it is typically used for A2DP). And it provided
almost 20% of performance improvement for arm neon:
http://git.kernel.org/?p=bluetooth/bluez.git;a=commit;h=e1ea3e76c72d56041c30b317818e8d7b5a0c7350

So 'sbc_calc_scalefactors_j_iwmmxt' may be a nice addition too, optimized 
either as a whole for best performance (like in arm neon code), or just with
some small chunks of assembly like in 'sbc_calc_scalefactors_mmx' because it
is easier this way.

-- 
Best regards,
Siarhei Siamashka

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] Add iwmmxt optimization for sbc for pxa series cpu
  2010-11-15 11:08         ` Siarhei Siamashka
@ 2010-11-18 13:05           ` Siarhei Siamashka
  2010-11-18 13:31             ` Johan Hedberg
  2010-11-18 13:33             ` [PATCH] " Keith Mok
  0 siblings, 2 replies; 10+ messages in thread
From: Siarhei Siamashka @ 2010-11-18 13:05 UTC (permalink / raw)
  To: Keith Mok; +Cc: linux-bluetooth

On Monday 15 November 2010 13:08:19 Siarhei Siamashka wrote:
> On Monday 15 November 2010 04:46:25 Keith Mok wrote:
> > > I sometimes use different indentation levels in such cases in order to
> > > improve readability after instructions reordering, so that each
> > > logically independent block of code has its own indentation level and
> > > it is still easily visible
> > 
> > > after instructions reordering. For example, with the original code:
> > Thanks for the hints. I rearranged the code.
> 
> Thanks, now the assembly code looks ok to me. I also discovered that qemu
> supports iwmmxt1 emulation just fine and also tried to test your
> optimizations for correctness myself (with a script which tries different
> encoding paramaters for different audio samples and checks md5 checksums),
> no problems detected.
> 
> So if somebody else could check whether the other things are right
> (copyright notices for example), then we are done with it.

As nobody else has stepped in, I guess it's still my responsibility to provide
some further guidance even though I'm a very infrequent contributor myself.
Hopefully somebody will correct me if I'm wrong.

So please

1. Make a final patch in such a form that can be pushed to git repository 
without any modifications, it means that you need a clean commit message and 
not just some text intermixed with the parts and quotations of discussion from
this mailing list.
2. "Signed-off-by" header is not needed for the userspace parts of bluez.
3. All files must have copyright notices, even a small one like 
'sbc_primitives_iwmmxt.h'. And probably you should just replicate all the
copyright notices from the source files with sbc mmx optimizations and add your
own copyright on top.

Hopefully that should be enough to get your optimizations applied. Thanks.

-- 
Best regards,
Siarhei Siamashka

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] Add iwmmxt optimization for sbc for pxa series cpu
  2010-11-18 13:05           ` Siarhei Siamashka
@ 2010-11-18 13:31             ` Johan Hedberg
  2010-11-18 13:33             ` [PATCH] " Keith Mok
  1 sibling, 0 replies; 10+ messages in thread
From: Johan Hedberg @ 2010-11-18 13:31 UTC (permalink / raw)
  To: Siarhei Siamashka; +Cc: Keith Mok, linux-bluetooth

Hi Siarhei,

On Thu, Nov 18, 2010, Siarhei Siamashka wrote:
> 1. Make a final patch in such a form that can be pushed to git repository 
> without any modifications, it means that you need a clean commit message and 
> not just some text intermixed with the parts and quotations of discussion from
> this mailing list.
> 2. "Signed-off-by" header is not needed for the userspace parts of bluez.
> 3. All files must have copyright notices, even a small one like 
> 'sbc_primitives_iwmmxt.h'. And probably you should just replicate all the
> copyright notices from the source files with sbc mmx optimizations and add your
> own copyright on top.
> 
> Hopefully that should be enough to get your optimizations applied. Thanks.

Yep, those things would be needed before pushing upstream. Thanks for
reminding me about this patch. I had actually forgotten about it.

Johan

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH] Add iwmmxt optimization for sbc for pxa series cpu
  2010-11-18 13:05           ` Siarhei Siamashka
  2010-11-18 13:31             ` Johan Hedberg
@ 2010-11-18 13:33             ` Keith Mok
  2010-11-18 16:53               ` Johan Hedberg
  1 sibling, 1 reply; 10+ messages in thread
From: Keith Mok @ 2010-11-18 13:33 UTC (permalink / raw)
  To: Siarhei Siamashka; +Cc: linux-bluetooth

Add iwmmxt optimization for sbc for pxa series cpu.

Benchmarked on ARM PXA platform:
===  Before (4 bands) ====
$ time  ./sbcenc_orig  -s 4     long.au  > /dev/null
real    0m 2.44s
user    0m 2.39s
sys     0m 0.05s
===  After (4 bands) ====
$ time  ./sbcenc  -s 4     long.au  > /dev/null
real    0m 1.59s
user    0m 1.49s
sys     0m 0.10s

===  Before (8 bands) ====
$ time  ./sbcenc_orig   -s 8     long.au  > /dev/null
real    0m 4.05s
user    0m 3.98s
sys     0m 0.07s
===  After (8 bands) ====
$ time  ./sbcenc  -s 8     long.au  > /dev/null
real    0m 1.48s
user    0m 1.41s
sys     0m 0.06s

===  Before (a2dp usage) ====
$ time  ./sbcenc_orig   -b53 -s8 -j    long.au  > /dev/null
real    0m 4.51s
user    0m 4.41s
sys     0m 0.10s
===  After (a2dp usage) ====
$ time  ./sbcenc   -b53 -s8 -j    long.au  > /dev/null
real    0m 2.05s
user    0m 1.99s
sys     0m 0.06s

---
 Makefile.am                 |    1 +
 sbc/sbc_primitives.c        |    4 +
 sbc/sbc_primitives_iwmmxt.c |  304 +++++++++++++++++++++++++++++++++++++++++++
 sbc/sbc_primitives_iwmmxt.h |   42 ++++++
 4 files changed, 351 insertions(+), 0 deletions(-)
 create mode 100644 sbc/sbc_primitives_iwmmxt.c
 create mode 100644 sbc/sbc_primitives_iwmmxt.h

diff --git a/Makefile.am b/Makefile.am
index da308a7..03a9bf2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -65,6 +65,7 @@ noinst_LTLIBRARIES += sbc/libsbc.la
 sbc_libsbc_la_SOURCES = sbc/sbc.h sbc/sbc.c sbc/sbc_math.h sbc/sbc_tables.h \
 			sbc/sbc_primitives.h sbc/sbc_primitives.c \
 			sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
+			sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
 			sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
 			sbc/sbc_primitives_armv6.h sbc/sbc_primitives_armv6.c

diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index f87fb5a..ad780d0 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -33,6 +33,7 @@

 #include "sbc_primitives.h"
 #include "sbc_primitives_mmx.h"
+#include "sbc_primitives_iwmmxt.h"
 #include "sbc_primitives_neon.h"
 #include "sbc_primitives_armv6.h"

@@ -544,6 +545,9 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
 #ifdef SBC_BUILD_WITH_ARMV6_SUPPORT
 	sbc_init_primitives_armv6(state);
 #endif
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+	sbc_init_primitives_iwmmxt(state);
+#endif
 #ifdef SBC_BUILD_WITH_NEON_SUPPORT
 	sbc_init_primitives_neon(state);
 #endif
diff --git a/sbc/sbc_primitives_iwmmxt.c b/sbc/sbc_primitives_iwmmxt.c
new file mode 100644
index 0000000..213967e
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.c
@@ -0,0 +1,304 @@
+/*
+ *
+ *  Bluetooth low-complexity, subband codec (SBC) library
+ *
+ *  Copyright (C) 2010 Keith Mok <ek9852@gmail.com>
+ *  Copyright (C) 2008-2010  Nokia Corporation
+ *  Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ *  Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ *  Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include <stdint.h>
+#include <limits.h>
+#include "sbc.h"
+#include "sbc_math.h"
+#include "sbc_tables.h"
+
+#include "sbc_primitives_iwmmxt.h"
+
+/*
+ * IWMMXT optimizations
+ */
+
+#ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+static inline void sbc_analyze_four_iwmmxt(const int16_t *in, int32_t *out,
+					const FIXED_T *consts)
+{
+	asm volatile (
+		"wldrd        wr0, [%0]\n"
+		"tbcstw       wr4, %2\n"
+		"wldrd        wr2, [%1]\n"
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr3, [%1, #8]\n"
+		"wmadds       wr0, wr2, wr0\n"
+		" wldrd       wr6, [%0, #16]\n"
+		"wmadds       wr1, wr3, wr1\n"
+		" wldrd       wr7, [%0, #24]\n"
+		"waddwss      wr0, wr0, wr4\n"
+		" wldrd       wr8, [%1, #16]\n"
+		"waddwss      wr1, wr1, wr4\n"
+		" wldrd       wr9, [%1, #24]\n"
+		" wmadds      wr6, wr8, wr6\n"
+		"  wldrd      wr2, [%0, #32]\n"
+		" wmadds      wr7, wr9, wr7\n"
+		"  wldrd      wr3, [%0, #40]\n"
+		" waddwss     wr0, wr6, wr0\n"
+		"  wldrd      wr4, [%1, #32]\n"
+		" waddwss     wr1, wr7, wr1\n"
+		"  wldrd      wr5, [%1, #40]\n"
+		"  wmadds     wr2, wr4, wr2\n"
+		"wldrd        wr6, [%0, #48]\n"
+		"  wmadds     wr3, wr5, wr3\n"
+		"wldrd        wr7, [%0, #56]\n"
+		"  waddwss    wr0, wr2, wr0\n"
+		"wldrd        wr8, [%1, #48]\n"
+		"  waddwss    wr1, wr3, wr1\n"
+		"wldrd        wr9, [%1, #56]\n"
+		"wmadds       wr6, wr8, wr6\n"
+		" wldrd       wr2, [%0, #64]\n"
+		"wmadds       wr7, wr9, wr7\n"
+		" wldrd       wr3, [%0, #72]\n"
+		"waddwss      wr0, wr6, wr0\n"
+		" wldrd       wr4, [%1, #64]\n"
+		"waddwss      wr1, wr7, wr1\n"
+		" wldrd       wr5, [%1, #72]\n"
+		" wmadds      wr2, wr4, wr2\n"
+		"tmcr       wcgr0, %4\n"
+		" wmadds      wr3, wr5, wr3\n"
+		" waddwss     wr0, wr2, wr0\n"
+		" waddwss     wr1, wr3, wr1\n"
+		"\n"
+		"wsrawg       wr0, wr0, wcgr0\n"
+		" wldrd       wr4, [%1, #80]\n"
+		"wsrawg       wr1, wr1, wcgr0\n"
+		" wldrd       wr5, [%1, #88]\n"
+		"wpackwss     wr0, wr0, wr0\n"
+		" wldrd       wr6, [%1, #96]\n"
+		"wpackwss     wr1, wr1, wr1\n"
+		"wmadds       wr2, wr5, wr0\n"
+		" wldrd       wr7, [%1, #104]\n"
+		"wmadds       wr0, wr4, wr0\n"
+		"\n"
+		" wmadds      wr3, wr7, wr1\n"
+		" wmadds      wr1, wr6, wr1\n"
+		" waddwss     wr2, wr3, wr2\n"
+		" waddwss     wr0, wr1, wr0\n"
+		"\n"
+		"wstrd        wr0, [%3]\n"
+		"wstrd        wr2, [%3, #8]\n"
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED4_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED4_SCALE)
+		: "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+		  "wr8", "wr9", "wcgr0", "memory");
+}
+
+static inline void sbc_analyze_eight_iwmmxt(const int16_t *in, int32_t *out,
+							const FIXED_T *consts)
+{
+	asm volatile (
+		"wldrd        wr0, [%0]\n"
+		"tbcstw       wr15, %2\n"
+		"wldrd        wr1, [%0, #8]\n"
+		"wldrd        wr2, [%0, #16]\n"
+		"wldrd        wr3, [%0, #24]\n"
+		"wldrd        wr4, [%1]\n"
+		"wldrd        wr5, [%1, #8]\n"
+		"wldrd        wr6, [%1, #16]\n"
+		"wldrd        wr7, [%1, #24]\n"
+		"wmadds       wr0, wr0, wr4\n"
+		" wldrd       wr8, [%1, #32]\n"
+		"wmadds       wr1, wr1, wr5\n"
+		" wldrd       wr9, [%1, #40]\n"
+		"wmadds       wr2, wr2, wr6\n"
+		" wldrd      wr10, [%1, #48]\n"
+		"wmadds       wr3, wr3, wr7\n"
+		" wldrd      wr11, [%1, #56]\n"
+		"waddwss      wr0, wr0, wr15\n"
+		" wldrd       wr4, [%0, #32]\n"
+		"waddwss      wr1, wr1, wr15\n"
+		" wldrd       wr5, [%0, #40]\n"
+		"waddwss      wr2, wr2, wr15\n"
+		" wldrd       wr6, [%0, #48]\n"
+		"waddwss      wr3, wr3, wr15\n"
+		" wldrd       wr7, [%0, #56]\n"
+		" wmadds      wr4, wr4, wr8\n"
+		"  wldrd     wr12, [%0, #64]\n"
+		" wmadds      wr5, wr5, wr9\n"
+		"  wldrd     wr13, [%0, #72]\n"
+		" wmadds      wr6, wr6, wr10\n"
+		"  wldrd     wr14, [%0, #80]\n"
+		" wmadds      wr7, wr7, wr11\n"
+		"  wldrd     wr15, [%0, #88]\n"
+		" waddwss     wr0, wr4, wr0\n"
+		"  wldrd      wr8, [%1, #64]\n"
+		" waddwss     wr1, wr5, wr1\n"
+		"  wldrd      wr9, [%1, #72]\n"
+		" waddwss     wr2, wr6, wr2\n"
+		"  wldrd     wr10, [%1, #80]\n"
+		" waddwss     wr3, wr7, wr3\n"
+		"  wldrd     wr11, [%1, #88]\n"
+		"  wmadds    wr12, wr12, wr8\n"
+		"wldrd        wr4, [%0, #96]\n"
+		"  wmadds    wr13, wr13, wr9\n"
+		"wldrd        wr5, [%0, #104]\n"
+		"  wmadds    wr14, wr14, wr10\n"
+		"wldrd        wr6, [%0, #112]\n"
+		"  wmadds    wr15, wr15, wr11\n"
+		"wldrd        wr7, [%0, #120]\n"
+		"  waddwss    wr0, wr12, wr0\n"
+		"wldrd        wr8, [%1, #96]\n"
+		"  waddwss    wr1, wr13, wr1\n"
+		"wldrd        wr9, [%1, #104]\n"
+		"  waddwss    wr2, wr14, wr2\n"
+		"wldrd       wr10, [%1, #112]\n"
+		"  waddwss    wr3, wr15, wr3\n"
+		"wldrd       wr11, [%1, #120]\n"
+		"wmadds       wr4, wr4, wr8\n"
+		" wldrd      wr12, [%0, #128]\n"
+		"wmadds       wr5, wr5, wr9\n"
+		" wldrd      wr13, [%0, #136]\n"
+		"wmadds       wr6, wr6, wr10\n"
+		" wldrd      wr14, [%0, #144]\n"
+		"wmadds       wr7, wr7, wr11\n"
+		" wldrd      wr15, [%0, #152]\n"
+		"waddwss      wr0, wr4, wr0\n"
+		" wldrd       wr8, [%1, #128]\n"
+		"waddwss      wr1, wr5, wr1\n"
+		" wldrd       wr9, [%1, #136]\n"
+		"waddwss      wr2, wr6, wr2\n"
+		" wldrd      wr10, [%1, #144]\n"
+		" waddwss     wr3, wr7, wr3\n"
+		" wldrd     wr11, [%1, #152]\n"
+		" wmadds     wr12, wr12, wr8\n"
+		"tmcr       wcgr0, %4\n"
+		" wmadds     wr13, wr13, wr9\n"
+		" wmadds     wr14, wr14, wr10\n"
+		" wmadds     wr15, wr15, wr11\n"
+		" waddwss     wr0, wr12, wr0\n"
+		" waddwss     wr1, wr13, wr1\n"
+		" waddwss     wr2, wr14, wr2\n"
+		" waddwss     wr3, wr15, wr3\n"
+		"\n"
+		"wsrawg       wr0, wr0, wcgr0\n"
+		"wsrawg       wr1, wr1, wcgr0\n"
+		"wsrawg       wr2, wr2, wcgr0\n"
+		"wsrawg       wr3, wr3, wcgr0\n"
+		"\n"
+		"wpackwss     wr0, wr0, wr0\n"
+		"wpackwss     wr1, wr1, wr1\n"
+		" wldrd       wr4, [%1, #160]\n"
+		"wpackwss     wr2, wr2, wr2\n"
+		" wldrd       wr5, [%1, #168]\n"
+		"wpackwss     wr3, wr3, wr3\n"
+		"  wldrd      wr6, [%1, #192]\n"
+		" wmadds      wr4, wr4, wr0\n"
+		"  wldrd      wr7, [%1, #200]\n"
+		" wmadds      wr5, wr5, wr0\n"
+		"   wldrd     wr8, [%1, #224]\n"
+		"  wmadds     wr6, wr6, wr1\n"
+		"   wldrd     wr9, [%1, #232]\n"
+		"  wmadds     wr7, wr7, wr1\n"
+		"  waddwss    wr4, wr6, wr4\n"
+		"  waddwss    wr5, wr7, wr5\n"
+		"   wmadds    wr8, wr8, wr2\n"
+		"wldrd        wr6, [%1, #256]\n"
+		"   wmadds    wr9, wr9, wr2\n"
+		"wldrd        wr7, [%1, #264]\n"
+		"waddwss      wr4, wr8, wr4\n"
+		"   waddwss   wr5, wr9, wr5\n"
+		"wmadds       wr6, wr6, wr3\n"
+		"wmadds       wr7, wr7, wr3\n"
+		"waddwss      wr4, wr6, wr4\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"\n"
+		"wstrd        wr4, [%3]\n"
+		"wstrd        wr5, [%3, #8]\n"
+		"\n"
+		"wldrd        wr6, [%1, #176]\n"
+		"wldrd        wr5, [%1, #184]\n"
+		"wmadds       wr5, wr5, wr0\n"
+		"wldrd        wr8, [%1, #208]\n"
+		"wmadds       wr0, wr6, wr0\n"
+		"wldrd        wr9, [%1, #216]\n"
+		"wmadds       wr9, wr9, wr1\n"
+		"wldrd        wr6, [%1, #240]\n"
+		"wmadds       wr1, wr8, wr1\n"
+		"wldrd        wr7, [%1, #248]\n"
+		"waddwss      wr0, wr1, wr0\n"
+		"waddwss      wr5, wr9, wr5\n"
+		"wmadds       wr7, wr7, wr2\n"
+		"wldrd        wr8, [%1, #272]\n"
+		"wmadds       wr2, wr6, wr2\n"
+		"wldrd        wr9, [%1, #280]\n"
+		"waddwss      wr0, wr2, wr0\n"
+		"waddwss      wr5, wr7, wr5\n"
+		"wmadds       wr9, wr9, wr3\n"
+		"wmadds       wr3, wr8, wr3\n"
+		"waddwss      wr0, wr3, wr0\n"
+		"waddwss      wr5, wr9, wr5\n"
+		"\n"
+		"wstrd        wr0, [%3, #16]\n"
+		"wstrd        wr5, [%3, #24]\n"
+		:
+		: "r" (in), "r" (consts),
+			"r" (1 << (SBC_PROTO_FIXED8_SCALE - 1)), "r" (out),
+			"r" (SBC_PROTO_FIXED8_SCALE)
+		: "wr0", "wr1", "wr2", "wr3", "wr4", "wr5", "wr6", "wr7",
+		  "wr8", "wr9", "wr10", "wr11", "wr12", "wr13", "wr14", "wr15",
+		  "wcgr0", "memory");
+}
+
+static inline void sbc_analyze_4b_4s_iwmmxt(int16_t *x, int32_t *out,
+						int out_stride)
+{
+	/* Analyze blocks */
+	sbc_analyze_four_iwmmxt(x + 12, out, analysis_consts_fixed4_simd_odd);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 8, out, analysis_consts_fixed4_simd_even);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 4, out, analysis_consts_fixed4_simd_odd);
+	out += out_stride;
+	sbc_analyze_four_iwmmxt(x + 0, out, analysis_consts_fixed4_simd_even);
+}
+
+static inline void sbc_analyze_4b_8s_iwmmxt(int16_t *x, int32_t *out,
+						int out_stride)
+{
+	/* Analyze blocks */
+	sbc_analyze_eight_iwmmxt(x + 24, out, analysis_consts_fixed8_simd_odd);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 16, out, analysis_consts_fixed8_simd_even);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 8, out, analysis_consts_fixed8_simd_odd);
+	out += out_stride;
+	sbc_analyze_eight_iwmmxt(x + 0, out, analysis_consts_fixed8_simd_even);
+}
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *state)
+{
+	state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_iwmmxt;
+	state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_iwmmxt;
+	state->implementation_info = "IWMMXT";
+}
+
+#endif
diff --git a/sbc/sbc_primitives_iwmmxt.h b/sbc/sbc_primitives_iwmmxt.h
new file mode 100644
index 0000000..b535e68
--- /dev/null
+++ b/sbc/sbc_primitives_iwmmxt.h
@@ -0,0 +1,42 @@
+/*
+ *
+ *  Bluetooth low-complexity, subband codec (SBC) library
+ *
+ *  Copyright (C) 2010 Keith Mok <ek9852@gmail.com>
+ *  Copyright (C) 2008-2010  Nokia Corporation
+ *  Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ *  Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ *  Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __SBC_PRIMITIVES_IWMMXT_H
+#define __SBC_PRIMITIVES_IWMMXT_H
+
+#include "sbc_primitives.h"
+
+#if defined(__GNUC__) && defined(__IWMMXT__) && \
+		!defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15)
+
+#define SBC_BUILD_WITH_IWMMXT_SUPPORT
+
+void sbc_init_primitives_iwmmxt(struct sbc_encoder_state *encoder_state);
+
+#endif
+
+#endif
-- 
1.6.3.3

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] Add iwmmxt optimization for sbc for pxa series cpu
  2010-11-18 13:33             ` [PATCH] " Keith Mok
@ 2010-11-18 16:53               ` Johan Hedberg
  0 siblings, 0 replies; 10+ messages in thread
From: Johan Hedberg @ 2010-11-18 16:53 UTC (permalink / raw)
  To: Keith Mok; +Cc: Siarhei Siamashka, linux-bluetooth

Hi Keith,

On Thu, Nov 18, 2010, Keith Mok wrote:
> Add iwmmxt optimization for sbc for pxa series cpu.
> 
> Benchmarked on ARM PXA platform:
> ===  Before (4 bands) ====
> $ time  ./sbcenc_orig  -s 4     long.au  > /dev/null
> real    0m 2.44s
> user    0m 2.39s
> sys     0m 0.05s
> ===  After (4 bands) ====
> $ time  ./sbcenc  -s 4     long.au  > /dev/null
> real    0m 1.59s
> user    0m 1.49s
> sys     0m 0.10s
> 
> ===  Before (8 bands) ====
> $ time  ./sbcenc_orig   -s 8     long.au  > /dev/null
> real    0m 4.05s
> user    0m 3.98s
> sys     0m 0.07s
> ===  After (8 bands) ====
> $ time  ./sbcenc  -s 8     long.au  > /dev/null
> real    0m 1.48s
> user    0m 1.41s
> sys     0m 0.06s
> 
> ===  Before (a2dp usage) ====
> $ time  ./sbcenc_orig   -b53 -s8 -j    long.au  > /dev/null
> real    0m 4.51s
> user    0m 4.41s
> sys     0m 0.10s
> ===  After (a2dp usage) ====
> $ time  ./sbcenc   -b53 -s8 -j    long.au  > /dev/null
> real    0m 2.05s
> user    0m 1.99s
> sys     0m 0.06s
> 
> ---
>  Makefile.am                 |    1 +
>  sbc/sbc_primitives.c        |    4 +
>  sbc/sbc_primitives_iwmmxt.c |  304 +++++++++++++++++++++++++++++++++++++++++++
>  sbc/sbc_primitives_iwmmxt.h |   42 ++++++
>  4 files changed, 351 insertions(+), 0 deletions(-)
>  create mode 100644 sbc/sbc_primitives_iwmmxt.c
>  create mode 100644 sbc/sbc_primitives_iwmmxt.h

Pushed upstream. Thanks.

Johan

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2010-11-18 16:53 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-11-11  8:05 [PATCH] Add iwmmxt optimization for sbc for pxa series cpu Keith Mok
2010-11-11 11:46 ` Siarhei Siamashka
2010-11-12  7:35   ` [PATCH v2] " Keith Mok
2010-11-12 13:22     ` Siarhei Siamashka
2010-11-15  2:46       ` [PATCH v3] " Keith Mok
2010-11-15 11:08         ` Siarhei Siamashka
2010-11-18 13:05           ` Siarhei Siamashka
2010-11-18 13:31             ` Johan Hedberg
2010-11-18 13:33             ` [PATCH] " Keith Mok
2010-11-18 16:53               ` Johan Hedberg

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).