[RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor

All of lore.kernel.org
 help / color / mirror / Atom feed

* [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor
@ 2014-11-06 16:23 Mans Rullgard
  2014-11-06 17:08 ` David Daney
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Mans Rullgard @ 2014-11-06 16:23 UTC (permalink / raw)
  To: linux-mips

This is an adaptation of the optimised do_div() for ARM by
Nicolas Pitre implementing division by a constant using a
multiplication by the inverse.  Ideally, the compiler would
do this internally as it does for 32-bit operands, but it
doesn't.

This version of the code requires an assembler with support
for the DSP ASE syntax since accessing the hi/lo registers
sanely from inline asm is impossible without this.  Building
for a CPU without this extension still works, however.

It also does not protect against the WAR hazards for the
hi/lo registers present on CPUs prior to MIPS IV.

I have only tested it as far as booting and light use with
the BUG_ON enabled wihtout encountering any issues.

The inverse computation code is a straight copy from ARM,
so this should probably be moved to a shared location.

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 arch/mips/include/asm/div64.h | 178 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 174 insertions(+), 4 deletions(-)

diff --git a/arch/mips/include/asm/div64.h b/arch/mips/include/asm/div64.h
index dc5ea57..bcebfb1 100644
--- a/arch/mips/include/asm/div64.h
+++ b/arch/mips/include/asm/div64.h
@@ -9,12 +9,12 @@
 #ifndef __ASM_DIV64_H
 #define __ASM_DIV64_H
 
-#include <asm-generic/div64.h>
-
-#if BITS_PER_LONG == 64
-
 #include <linux/types.h>
 
+#if BITS_PER_LONG == 64
+
+#include <asm-generic/div64.h>
+
 /*
  * No traps on overflows for any of these...
  */
@@ -63,6 +63,176 @@
 	__mod32;							\
 })
 
+#elif __GNUC__ >= 4
+
+#include <asm/bug.h>
+
+extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
+
+/*
+ * If the divisor happens to be constant, we determine the appropriate
+ * inverse at compile time to turn the division into a few inline
+ * multiplications instead which is much faster.
+ * (It is unfortunate that gcc doesn't perform all this internally.)
+ */
+#define do_div(n, base)							\
+({									\
+	unsigned int __r, __b = (base);					\
+	if (likely(((n) >> 32) == 0)) {					\
+		__r = (u32)(n) % __b;					\
+		(n) = (u32)(n) / __b;					\
+	} else if (!__builtin_constant_p(__b) || __b == 0) {		\
+		/* non-constant divisor (or zero): slow path */		\
+		__r = __div64_32(&(n), __b);				\
+	} else if ((__b & (__b - 1)) == 0) {				\
+		/* Trivial: __b is constant and a power of 2 */		\
+		/* gcc does the right thing with this code.  */		\
+		__r = n;						\
+		__r &= (__b - 1);					\
+		n /= __b;						\
+	} else {							\
+		/* Multiply by inverse of __b: n/b = n*(p/b)/p       */	\
+		/* We rely on the fact that most of this code gets   */	\
+		/* optimized away at compile time due to constant    */	\
+		/* propagation and only a couple inline assembly     */	\
+		/* instructions should remain. Better avoid any      */	\
+		/* code construct that might prevent that.           */	\
+		unsigned long long __res, __x, __t, __m, __n = n;	\
+		unsigned int __c, __p, __z = 0;				\
+		/* preserve low part of n for reminder computation */	\
+		__r = __n;						\
+		/* determine number of bits to represent __b */		\
+		__p = 1 << __div64_fls(__b);				\
+		/* compute __m = ((__p << 64) + __b - 1) / __b */	\
+		__m = (~0ULL / __b) * __p;				\
+		__m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b;	\
+		/* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */	\
+		__x = ~0ULL / __b * __b - 1;				\
+		__res = (__m & 0xffffffff) * (__x & 0xffffffff);	\
+		__res >>= 32;						\
+		__res += (__m & 0xffffffff) * (__x >> 32);		\
+		__t = __res;						\
+		__res += (__x & 0xffffffff) * (__m >> 32);		\
+		__t = (__res < __t) ? (1ULL << 32) : 0;			\
+		__res = (__res >> 32) + __t;				\
+		__res += (__m >> 32) * (__x >> 32);			\
+		__res /= __p;						\
+		/* Now sanitize and optimize what we've got. */		\
+		if (~0ULL % (__b / (__b & -__b)) == 0) {		\
+			/* those cases can be simplified with: */	\
+			__n /= (__b & -__b);				\
+			__m = ~0ULL / (__b / (__b & -__b));		\
+			__p = 1;					\
+			__c = 1;					\
+		} else if (__res != __x / __b) {			\
+			/* We can't get away without a correction    */	\
+			/* to compensate for bit truncation errors.  */	\
+			/* To avoid it we'd need an additional bit   */	\
+			/* to represent __m which would overflow it. */	\
+			/* Instead we do m=p/b and n/b=(n*m+m)/p.    */	\
+			__c = 1;					\
+			/* Compute __m = (__p << 64) / __b */		\
+			__m = (~0ULL / __b) * __p;			\
+			__m += ((~0ULL % __b + 1) * __p) / __b;		\
+		} else {						\
+			/* Reduce __m/__p, and try to clear bit 31   */	\
+			/* of __m when possible otherwise that'll    */	\
+			/* need extra overflow handling later.       */	\
+			unsigned int __bits = -(__m & -__m);		\
+			__bits |= __m >> 32;				\
+			__bits = (~__bits) << 1;			\
+			/* If __bits == 0 then setting bit 31 is     */	\
+			/* unavoidable.  Simply apply the maximum    */	\
+			/* possible reduction in that case.          */	\
+			/* Otherwise the MSB of __bits indicates the */	\
+			/* best reduction we should apply.           */	\
+			if (!__bits) {					\
+				__p /= (__m & -__m);			\
+				__m /= (__m & -__m);			\
+			} else {					\
+				__p >>= __div64_fls(__bits);		\
+				__m >>= __div64_fls(__bits);		\
+			}						\
+			/* No correction needed. */			\
+			__c = 0;					\
+		}							\
+		/* Now we have a combination of 2 conditions:        */	\
+		/* 1) whether or not we need a correction (__c), and */	\
+		/* 2) whether or not there might be an overflow in   */	\
+		/*    the cross product (__m & ((1<<63) | (1<<31)))  */	\
+		/* Select the best insn combination to perform the   */	\
+		/* actual __m * __n / (__p << 64) operation.         */	\
+		if (!__c) {						\
+			__res = (u64)(u32)__m * (u32)__n >> 32;		\
+		} else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {	\
+			__res = (__m + (u64)(u32)__m * (u32)__n) >> 32;	\
+		} else {						\
+			__t = __m;					\
+			asm (	".set	push		\n"		\
+				".set	dsp		\n"		\
+				"maddu	%q2, %L3, %L4	\n"             \
+                                "mflo	%L0, %q2	\n"             \
+                                "mfhi	%M0, %q2	\n"             \
+				"sltu	%1,  %L0, %L3	\n"		\
+				"addu	%L0, %M0, %1	\n"		\
+				"sltu	%M0, %L0, %M3	\n"		\
+				".set	pop		\n"		\
+				: "=&r"(__res), "=&r"(__c), "+&ka"(__t) \
+				: "r"(__m), "r"(__n));			\
+		}							\
+		if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {		\
+			asm (	".set	push		\n"		\
+				".set	dsp		\n"		\
+				"maddu	%q0, %M2, %L3	\n"		\
+				"maddu	%q0, %L2, %M3	\n"		\
+				"mfhi	%1,  %q0	\n"		\
+				"mthi	$0,  %q0	\n"		\
+				"mtlo	%1,  %q0	\n"		\
+				"maddu	%q0, %M2, %M3	\n"		\
+				".set	pop		\n"		\
+				: "+&ka"(__res), "=&r"(__c)		\
+				: "r"(__m), "r"(__n));			\
+		} else {						\
+			asm (	".set	push		\n"		\
+				".set	dsp		\n"		\
+				"maddu	%q0, %M3, %L4	\n"		\
+				"mfhi	%2,  %q0	\n"		\
+				"maddu	%q0, %L3, %M4	\n"		\
+				"mfhi	%1,  %q0	\n"		\
+				"sltu	%2,  %1,  %2	\n"		\
+				"mtlo	%1,  %q0	\n"		\
+				"mthi	%2,  %q0	\n"		\
+				"maddu	%q0, %M3, %M4	\n"		\
+				".set	pop		\n"		\
+				: "+&ka"(__res), "=&r"(__z), "=&r"(__c) \
+				: "r"(__m), "r"(__n));			\
+		}							\
+		__res /= __p;						\
+		/* The reminder can be computed with 32-bit regs     */	\
+		/* only, and gcc is good at that.                    */	\
+		{							\
+			unsigned int __res0 = __res;			\
+			unsigned int __b0 = __b;			\
+			__r -= __res0 * __b0;				\
+		}							\
+		/* BUG_ON(__r >= __b || __res * __b + __r != n); */	\
+		n = __res;						\
+	}								\
+	__r;								\
+})
+
+/* our own fls implementation to make sure constant propagation is fine */
+#define __div64_fls(bits)						\
+({									\
+	unsigned int __left = (bits), __nr = 0;				\
+	if (__left & 0xffff0000) __nr += 16, __left >>= 16;		\
+	if (__left & 0x0000ff00) __nr +=  8, __left >>=  8;		\
+	if (__left & 0x000000f0) __nr +=  4, __left >>=  4;		\
+	if (__left & 0x0000000c) __nr +=  2, __left >>=  2;		\
+	if (__left & 0x00000002) __nr +=  1;				\
+	__nr;								\
+})
+
 #endif /* BITS_PER_LONG == 64 */
 
 #endif /* __ASM_DIV64_H */
-- 
2.0.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor
  2014-11-06 16:23 [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor Mans Rullgard
@ 2014-11-06 17:08 ` David Daney
  2014-11-06 17:42   ` Måns Rullgård
  2014-11-07  0:50 ` Ralf Baechle
  2015-01-11  5:05 ` Maciej W. Rozycki
  2 siblings, 1 reply; 10+ messages in thread
From: David Daney @ 2014-11-06 17:08 UTC (permalink / raw)
  To: Mans Rullgard; +Cc: linux-mips

On 11/06/2014 08:23 AM, Mans Rullgard wrote:
> This is an adaptation of the optimised do_div() for ARM by
> Nicolas Pitre implementing division by a constant using a
> multiplication by the inverse.  Ideally, the compiler would
> do this internally as it does for 32-bit operands, but it
> doesn't.
>
> This version of the code requires an assembler with support
> for the DSP ASE syntax since accessing the hi/lo registers
> sanely from inline asm is impossible without this.

It is easy to access hi/lo from inline asm.  It is true that it is 
difficult to use them as input/output registers.

You should use MFHI/MFLO and to move the results to some general purpose 
registers.  Then mention "hi", "lo" in the clobbers statement for the 
inline asm.

FWIW, it seems like you are missing the clobbers in your inline asm below.


>  Building
> for a CPU without this extension still works, however.
>
> It also does not protect against the WAR hazards for the
> hi/lo registers present on CPUs prior to MIPS IV.
>
> I have only tested it as far as booting and light use with
> the BUG_ON enabled wihtout encountering any issues.
>
> The inverse computation code is a straight copy from ARM,
> so this should probably be moved to a shared location.
>
> Signed-off-by: Mans Rullgard <mans@mansr.com>
> ---
>   arch/mips/include/asm/div64.h | 178 +++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 174 insertions(+), 4 deletions(-)
>
> diff --git a/arch/mips/include/asm/div64.h b/arch/mips/include/asm/div64.h
> index dc5ea57..bcebfb1 100644
> --- a/arch/mips/include/asm/div64.h
> +++ b/arch/mips/include/asm/div64.h
> @@ -9,12 +9,12 @@
>   #ifndef __ASM_DIV64_H
>   #define __ASM_DIV64_H
>
> -#include <asm-generic/div64.h>
> -
> -#if BITS_PER_LONG == 64
> -
>   #include <linux/types.h>
>
> +#if BITS_PER_LONG == 64
> +
> +#include <asm-generic/div64.h>
> +
>   /*
>    * No traps on overflows for any of these...
>    */
> @@ -63,6 +63,176 @@
>   	__mod32;							\
>   })
>
> +#elif __GNUC__ >= 4
> +
> +#include <asm/bug.h>
> +
> +extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
> +
> +/*
> + * If the divisor happens to be constant, we determine the appropriate
> + * inverse at compile time to turn the division into a few inline
> + * multiplications instead which is much faster.
> + * (It is unfortunate that gcc doesn't perform all this internally.)
> + */
> +#define do_div(n, base)							\
> +({									\
> +	unsigned int __r, __b = (base);					\
> +	if (likely(((n) >> 32) == 0)) {					\
> +		__r = (u32)(n) % __b;					\
> +		(n) = (u32)(n) / __b;					\
> +	} else if (!__builtin_constant_p(__b) || __b == 0) {		\
> +		/* non-constant divisor (or zero): slow path */		\
> +		__r = __div64_32(&(n), __b);				\
> +	} else if ((__b & (__b - 1)) == 0) {				\
> +		/* Trivial: __b is constant and a power of 2 */		\
> +		/* gcc does the right thing with this code.  */		\
> +		__r = n;						\
> +		__r &= (__b - 1);					\
> +		n /= __b;						\
> +	} else {							\
> +		/* Multiply by inverse of __b: n/b = n*(p/b)/p       */	\
> +		/* We rely on the fact that most of this code gets   */	\
> +		/* optimized away at compile time due to constant    */	\
> +		/* propagation and only a couple inline assembly     */	\
> +		/* instructions should remain. Better avoid any      */	\
> +		/* code construct that might prevent that.           */	\
> +		unsigned long long __res, __x, __t, __m, __n = n;	\
> +		unsigned int __c, __p, __z = 0;				\
> +		/* preserve low part of n for reminder computation */	\
> +		__r = __n;						\
> +		/* determine number of bits to represent __b */		\
> +		__p = 1 << __div64_fls(__b);				\
> +		/* compute __m = ((__p << 64) + __b - 1) / __b */	\
> +		__m = (~0ULL / __b) * __p;				\
> +		__m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b;	\
> +		/* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */	\
> +		__x = ~0ULL / __b * __b - 1;				\
> +		__res = (__m & 0xffffffff) * (__x & 0xffffffff);	\
> +		__res >>= 32;						\
> +		__res += (__m & 0xffffffff) * (__x >> 32);		\
> +		__t = __res;						\
> +		__res += (__x & 0xffffffff) * (__m >> 32);		\
> +		__t = (__res < __t) ? (1ULL << 32) : 0;			\
> +		__res = (__res >> 32) + __t;				\
> +		__res += (__m >> 32) * (__x >> 32);			\
> +		__res /= __p;						\
> +		/* Now sanitize and optimize what we've got. */		\
> +		if (~0ULL % (__b / (__b & -__b)) == 0) {		\
> +			/* those cases can be simplified with: */	\
> +			__n /= (__b & -__b);				\
> +			__m = ~0ULL / (__b / (__b & -__b));		\
> +			__p = 1;					\
> +			__c = 1;					\
> +		} else if (__res != __x / __b) {			\
> +			/* We can't get away without a correction    */	\
> +			/* to compensate for bit truncation errors.  */	\
> +			/* To avoid it we'd need an additional bit   */	\
> +			/* to represent __m which would overflow it. */	\
> +			/* Instead we do m=p/b and n/b=(n*m+m)/p.    */	\
> +			__c = 1;					\
> +			/* Compute __m = (__p << 64) / __b */		\
> +			__m = (~0ULL / __b) * __p;			\
> +			__m += ((~0ULL % __b + 1) * __p) / __b;		\
> +		} else {						\
> +			/* Reduce __m/__p, and try to clear bit 31   */	\
> +			/* of __m when possible otherwise that'll    */	\
> +			/* need extra overflow handling later.       */	\
> +			unsigned int __bits = -(__m & -__m);		\
> +			__bits |= __m >> 32;				\
> +			__bits = (~__bits) << 1;			\
> +			/* If __bits == 0 then setting bit 31 is     */	\
> +			/* unavoidable.  Simply apply the maximum    */	\
> +			/* possible reduction in that case.          */	\
> +			/* Otherwise the MSB of __bits indicates the */	\
> +			/* best reduction we should apply.           */	\
> +			if (!__bits) {					\
> +				__p /= (__m & -__m);			\
> +				__m /= (__m & -__m);			\
> +			} else {					\
> +				__p >>= __div64_fls(__bits);		\
> +				__m >>= __div64_fls(__bits);		\
> +			}						\
> +			/* No correction needed. */			\
> +			__c = 0;					\
> +		}							\
> +		/* Now we have a combination of 2 conditions:        */	\
> +		/* 1) whether or not we need a correction (__c), and */	\
> +		/* 2) whether or not there might be an overflow in   */	\
> +		/*    the cross product (__m & ((1<<63) | (1<<31)))  */	\
> +		/* Select the best insn combination to perform the   */	\
> +		/* actual __m * __n / (__p << 64) operation.         */	\
> +		if (!__c) {						\
> +			__res = (u64)(u32)__m * (u32)__n >> 32;		\
> +		} else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {	\
> +			__res = (__m + (u64)(u32)__m * (u32)__n) >> 32;	\
> +		} else {						\
> +			__t = __m;					\
> +			asm (	".set	push		\n"		\
> +				".set	dsp		\n"		\
> +				"maddu	%q2, %L3, %L4	\n"             \
> +                                "mflo	%L0, %q2	\n"             \
> +                                "mfhi	%M0, %q2	\n"             \
> +				"sltu	%1,  %L0, %L3	\n"		\
> +				"addu	%L0, %M0, %1	\n"		\
> +				"sltu	%M0, %L0, %M3	\n"		\
> +				".set	pop		\n"		\
> +				: "=&r"(__res), "=&r"(__c), "+&ka"(__t) \
> +				: "r"(__m), "r"(__n));			\
> +		}							\
> +		if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {		\
> +			asm (	".set	push		\n"		\
> +				".set	dsp		\n"		\
> +				"maddu	%q0, %M2, %L3	\n"		\
> +				"maddu	%q0, %L2, %M3	\n"		\
> +				"mfhi	%1,  %q0	\n"		\
> +				"mthi	$0,  %q0	\n"		\
> +				"mtlo	%1,  %q0	\n"		\
> +				"maddu	%q0, %M2, %M3	\n"		\
> +				".set	pop		\n"		\
> +				: "+&ka"(__res), "=&r"(__c)		\
> +				: "r"(__m), "r"(__n));			\
> +		} else {						\
> +			asm (	".set	push		\n"		\
> +				".set	dsp		\n"		\
> +				"maddu	%q0, %M3, %L4	\n"		\
> +				"mfhi	%2,  %q0	\n"		\
> +				"maddu	%q0, %L3, %M4	\n"		\
> +				"mfhi	%1,  %q0	\n"		\
> +				"sltu	%2,  %1,  %2	\n"		\
> +				"mtlo	%1,  %q0	\n"		\
> +				"mthi	%2,  %q0	\n"		\
> +				"maddu	%q0, %M3, %M4	\n"		\
> +				".set	pop		\n"		\
> +				: "+&ka"(__res), "=&r"(__z), "=&r"(__c) \
> +				: "r"(__m), "r"(__n));			\
> +		}							\
> +		__res /= __p;						\
> +		/* The reminder can be computed with 32-bit regs     */	\
> +		/* only, and gcc is good at that.                    */	\
> +		{							\
> +			unsigned int __res0 = __res;			\
> +			unsigned int __b0 = __b;			\
> +			__r -= __res0 * __b0;				\
> +		}							\
> +		/* BUG_ON(__r >= __b || __res * __b + __r != n); */	\
> +		n = __res;						\
> +	}								\
> +	__r;								\
> +})
> +
> +/* our own fls implementation to make sure constant propagation is fine */
> +#define __div64_fls(bits)						\
> +({									\
> +	unsigned int __left = (bits), __nr = 0;				\
> +	if (__left & 0xffff0000) __nr += 16, __left >>= 16;		\
> +	if (__left & 0x0000ff00) __nr +=  8, __left >>=  8;		\
> +	if (__left & 0x000000f0) __nr +=  4, __left >>=  4;		\
> +	if (__left & 0x0000000c) __nr +=  2, __left >>=  2;		\
> +	if (__left & 0x00000002) __nr +=  1;				\
> +	__nr;								\
> +})
> +
>   #endif /* BITS_PER_LONG == 64 */
>
>   #endif /* __ASM_DIV64_H */
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor
  2014-11-06 17:08 ` David Daney
@ 2014-11-06 17:42   ` Måns Rullgård
  0 siblings, 0 replies; 10+ messages in thread
From: Måns Rullgård @ 2014-11-06 17:42 UTC (permalink / raw)
  To: David Daney; +Cc: linux-mips

David Daney <ddaney.cavm@gmail.com> writes:

> On 11/06/2014 08:23 AM, Mans Rullgard wrote:
>> This is an adaptation of the optimised do_div() for ARM by
>> Nicolas Pitre implementing division by a constant using a
>> multiplication by the inverse.  Ideally, the compiler would
>> do this internally as it does for 32-bit operands, but it
>> doesn't.
>>
>> This version of the code requires an assembler with support
>> for the DSP ASE syntax since accessing the hi/lo registers
>> sanely from inline asm is impossible without this.
>
> It is easy to access hi/lo from inline asm.  It is true that it is
> difficult to use them as input/output registers.

Of course they can be accessed directly, but that tends to produce worse
code since the compiler then can't schedule the mfhi/mflo (or mthi/mtlo
on entry to the asm) around other instructions to avoid stalling.

> You should use MFHI/MFLO and to move the results to some general
> purpose registers.  Then mention "hi", "lo" in the clobbers statement
> for the inline asm.

In some cases, results in needless bouncing between hi/lo and gprs.
It's better to let the compiler decide if and when to retrieve the
values.

Moreover, using "ka" constraints lets the compiler use the additional
accumulator registers available on CPUs with the DSP ASE.

> FWIW, it seems like you are missing the clobbers in your inline asm below.

No clobbers are needed when using explicit input/output arguments.  In
fact, if hi/lo were marked as clobbered this would fail to compile as
these would then be unavailable as inputs or outputs.

Is there a compelling reason to support old assembler versions without
this syntax?  I'm not sure when this was added, but I see references to
it from 2005.

-- 
Måns Rullgård
mans@mansr.com

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor
  2014-11-06 16:23 [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor Mans Rullgard
  2014-11-06 17:08 ` David Daney
@ 2014-11-07  0:50 ` Ralf Baechle
  2014-11-07  2:20   ` Måns Rullgård
  2015-01-11  5:05 ` Maciej W. Rozycki
  2 siblings, 1 reply; 10+ messages in thread
From: Ralf Baechle @ 2014-11-07  0:50 UTC (permalink / raw)
  To: Mans Rullgard; +Cc: linux-mips

On Thu, Nov 06, 2014 at 04:23:18PM +0000, Mans Rullgard wrote:

> This is an adaptation of the optimised do_div() for ARM by
> Nicolas Pitre implementing division by a constant using a
> multiplication by the inverse.  Ideally, the compiler would
> do this internally as it does for 32-bit operands, but it
> doesn't.
> 
> This version of the code requires an assembler with support
> for the DSP ASE syntax since accessing the hi/lo registers
> sanely from inline asm is impossible without this.  Building
> for a CPU without this extension still works, however.
> 
> It also does not protect against the WAR hazards for the
> hi/lo registers present on CPUs prior to MIPS IV.
> 
> I have only tested it as far as booting and light use with
> the BUG_ON enabled wihtout encountering any issues.
> 
> The inverse computation code is a straight copy from ARM,
> so this should probably be moved to a shared location.

Can you explain why you need __div64_fls()?  There's __fls() which on
MIPS is carefully written to make use of the CLZ rsp. DCLZ instructions
where available; the fallback implementation is looking fairly similar
to your code.

MADD is named MAD on some older CPUs; yet other CPUs don't have it
at all.  I take it you tried to make GCC emit the instruction but it
doesn't?

  Ralf

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor
  2014-11-07  0:50 ` Ralf Baechle
@ 2014-11-07  2:20   ` Måns Rullgård
  2014-11-07 11:35     ` Ralf Baechle
  0 siblings, 1 reply; 10+ messages in thread
From: Måns Rullgård @ 2014-11-07  2:20 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: linux-mips

Ralf Baechle <ralf@linux-mips.org> writes:

> On Thu, Nov 06, 2014 at 04:23:18PM +0000, Mans Rullgard wrote:
>
>> This is an adaptation of the optimised do_div() for ARM by
>> Nicolas Pitre implementing division by a constant using a
>> multiplication by the inverse.  Ideally, the compiler would
>> do this internally as it does for 32-bit operands, but it
>> doesn't.
>> 
>> This version of the code requires an assembler with support
>> for the DSP ASE syntax since accessing the hi/lo registers
>> sanely from inline asm is impossible without this.  Building
>> for a CPU without this extension still works, however.
>> 
>> It also does not protect against the WAR hazards for the
>> hi/lo registers present on CPUs prior to MIPS IV.
>> 
>> I have only tested it as far as booting and light use with
>> the BUG_ON enabled wihtout encountering any issues.
>> 
>> The inverse computation code is a straight copy from ARM,
>> so this should probably be moved to a shared location.
>
> Can you explain why you need __div64_fls()?  There's __fls() which on
> MIPS is carefully written to make use of the CLZ rsp. DCLZ instructions
> where available; the fallback implementation is looking fairly similar
> to your code.

The regular __fls() doesn't necessarily evaluate at compile-time, which
is required here.  The normal __fls() could of course be amended to
bypass the CLZ instruction for constant arguments.

> MADD is named MAD on some older CPUs; yet other CPUs don't have it
> at all.  I take it you tried to make GCC emit the instruction but it
> doesn't?

GCC generates MADDU instructions only in the most trivial of cases.  For
instance, (x >> 32) * (u32)y with 64-bit x and y produces far from
optimal code.  In fact, looking at it again, I see it is even more
stupid than I thought, so there needs to be more asm, not less.

Reading the manuals more carefully, it appears that MADDU is only
reliably available starting with MIPS32 (btw, this information is
annoyingly hard to find).  Thus this code should be restricted to such
targets (which probably covers most current users) unless someone feels
like writing a version for these older CPUs.

-- 
Måns Rullgård
mans@mansr.com

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor
  2014-11-07  2:20   ` Måns Rullgård
@ 2014-11-07 11:35     ` Ralf Baechle
  2014-11-07 15:00       ` Måns Rullgård
  0 siblings, 1 reply; 10+ messages in thread
From: Ralf Baechle @ 2014-11-07 11:35 UTC (permalink / raw)
  To: Måns Rullgård; +Cc: linux-mips

On Fri, Nov 07, 2014 at 02:20:11AM +0000, Måns Rullgård wrote:
> Date:   Fri, 07 Nov 2014 02:20:11 +0000
> From: Måns Rullgård <mans@mansr.com>
> To: Ralf Baechle <ralf@linux-mips.org>
> Cc: linux-mips@linux-mips.org
> Subject: Re: [RFC PATCH] MIPS: optimise 32-bit do_div() with constant
>  divisor
> Content-Type: text/plain; charset=iso-8859-1
> 
> Ralf Baechle <ralf@linux-mips.org> writes:
> 
> > On Thu, Nov 06, 2014 at 04:23:18PM +0000, Mans Rullgard wrote:
> >
> >> This is an adaptation of the optimised do_div() for ARM by
> >> Nicolas Pitre implementing division by a constant using a
> >> multiplication by the inverse.  Ideally, the compiler would
> >> do this internally as it does for 32-bit operands, but it
> >> doesn't.
> >> 
> >> This version of the code requires an assembler with support
> >> for the DSP ASE syntax since accessing the hi/lo registers
> >> sanely from inline asm is impossible without this.  Building
> >> for a CPU without this extension still works, however.
> >> 
> >> It also does not protect against the WAR hazards for the
> >> hi/lo registers present on CPUs prior to MIPS IV.
> >> 
> >> I have only tested it as far as booting and light use with
> >> the BUG_ON enabled wihtout encountering any issues.
> >> 
> >> The inverse computation code is a straight copy from ARM,
> >> so this should probably be moved to a shared location.
> >
> > Can you explain why you need __div64_fls()?  There's __fls() which on
> > MIPS is carefully written to make use of the CLZ rsp. DCLZ instructions
> > where available; the fallback implementation is looking fairly similar
> > to your code.
> 
> The regular __fls() doesn't necessarily evaluate at compile-time, which
> is required here.  The normal __fls() could of course be amended to
> bypass the CLZ instruction for constant arguments.
> 
> > MADD is named MAD on some older CPUs; yet other CPUs don't have it
> > at all.  I take it you tried to make GCC emit the instruction but it
> > doesn't?
> 
> GCC generates MADDU instructions only in the most trivial of cases.  For
> instance, (x >> 32) * (u32)y with 64-bit x and y produces far from
> optimal code.  In fact, looking at it again, I see it is even more
> stupid than I thought, so there needs to be more asm, not less.
> 
> Reading the manuals more carefully, it appears that MADDU is only
> reliably available starting with MIPS32 (btw, this information is
> annoyingly hard to find).  Thus this code should be restricted to such
> targets (which probably covers most current users) unless someone feels
> like writing a version for these older CPUs.

I'm primarily concered about not enabling MADD where it's not available.
As for pre-MIPS32 processors - we can do the manual reading about where
to enable MAD(D) later.

As for access to hi/lo, I tried to explicitly put a variable in the lo
register.  Which sort of works for very simple cases but as expected it's
easy to get GCC to spill its RTL guts because it runs out of spill
registers.  It maybe can be made to work but I'd feel nervous about its
stability unless a GCC guru approved this method.

  Ralf

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor
  2014-11-07 11:35     ` Ralf Baechle
@ 2014-11-07 15:00       ` Måns Rullgård
  2014-11-07 18:35         ` David Daney
  0 siblings, 1 reply; 10+ messages in thread
From: Måns Rullgård @ 2014-11-07 15:00 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: linux-mips

Ralf Baechle <ralf@linux-mips.org> writes:

> On Fri, Nov 07, 2014 at 02:20:11AM +0000, Måns Rullgård wrote:
>> Date:   Fri, 07 Nov 2014 02:20:11 +0000
>> From: Måns Rullgård <mans@mansr.com>
>> To: Ralf Baechle <ralf@linux-mips.org>
>> Cc: linux-mips@linux-mips.org
>> Subject: Re: [RFC PATCH] MIPS: optimise 32-bit do_div() with constant
>>  divisor
>> Content-Type: text/plain; charset=iso-8859-1
>> 
>> Ralf Baechle <ralf@linux-mips.org> writes:
>> 
>> > On Thu, Nov 06, 2014 at 04:23:18PM +0000, Mans Rullgard wrote:
>> >
>> >> This is an adaptation of the optimised do_div() for ARM by
>> >> Nicolas Pitre implementing division by a constant using a
>> >> multiplication by the inverse.  Ideally, the compiler would
>> >> do this internally as it does for 32-bit operands, but it
>> >> doesn't.
>> >> 
>> >> This version of the code requires an assembler with support
>> >> for the DSP ASE syntax since accessing the hi/lo registers
>> >> sanely from inline asm is impossible without this.  Building
>> >> for a CPU without this extension still works, however.
>> >> 
>> >> It also does not protect against the WAR hazards for the
>> >> hi/lo registers present on CPUs prior to MIPS IV.
>> >> 
>> >> I have only tested it as far as booting and light use with
>> >> the BUG_ON enabled wihtout encountering any issues.
>> >> 
>> >> The inverse computation code is a straight copy from ARM,
>> >> so this should probably be moved to a shared location.
>> >
>> > Can you explain why you need __div64_fls()?  There's __fls() which on
>> > MIPS is carefully written to make use of the CLZ rsp. DCLZ instructions
>> > where available; the fallback implementation is looking fairly similar
>> > to your code.
>> 
>> The regular __fls() doesn't necessarily evaluate at compile-time, which
>> is required here.  The normal __fls() could of course be amended to
>> bypass the CLZ instruction for constant arguments.
>> 
>> > MADD is named MAD on some older CPUs; yet other CPUs don't have it
>> > at all.  I take it you tried to make GCC emit the instruction but it
>> > doesn't?
>> 
>> GCC generates MADDU instructions only in the most trivial of cases.  For
>> instance, (x >> 32) * (u32)y with 64-bit x and y produces far from
>> optimal code.  In fact, looking at it again, I see it is even more
>> stupid than I thought, so there needs to be more asm, not less.
>> 
>> Reading the manuals more carefully, it appears that MADDU is only
>> reliably available starting with MIPS32 (btw, this information is
>> annoyingly hard to find).  Thus this code should be restricted to such
>> targets (which probably covers most current users) unless someone feels
>> like writing a version for these older CPUs.
>
> I'm primarily concered about not enabling MADD where it's not available.
> As for pre-MIPS32 processors - we can do the manual reading about where
> to enable MAD(D) later.

A simple ifdef will take care of that.

> As for access to hi/lo, I tried to explicitly put a variable in the lo
> register.  Which sort of works for very simple cases but as expected it's
> easy to get GCC to spill its RTL guts because it runs out of spill
> registers.  It maybe can be made to work but I'd feel nervous about its
> stability unless a GCC guru approved this method.

The "x" constraint can be used to move a double-word to/from the hi/lo
registers.  On DSP targets, the "ka" constraint provides access to the
three additional hi/lo pairs while on a non-DSP targets it degenerates
to "x".  The "ka" constraint is available since gcc 4.3.0.  I see no
reason not to allow this extra flexibility.

-- 
Måns Rullgård
mans@mansr.com

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor
  2014-11-07 15:00       ` Måns Rullgård
@ 2014-11-07 18:35         ` David Daney
  2014-11-08  2:18           ` Måns Rullgård
  0 siblings, 1 reply; 10+ messages in thread
From: David Daney @ 2014-11-07 18:35 UTC (permalink / raw)
  To: Måns Rullgård; +Cc: Ralf Baechle, linux-mips

On 11/07/2014 07:00 AM, Måns Rullgård wrote:
[...]
>
>> As for access to hi/lo, I tried to explicitly put a variable in the lo
>> register.  Which sort of works for very simple cases but as expected it's
>> easy to get GCC to spill its RTL guts because it runs out of spill
>> registers.  It maybe can be made to work but I'd feel nervous about its
>> stability unless a GCC guru approved this method.
>
> The "x" constraint can be used to move a double-word to/from the hi/lo
> registers.  On DSP targets, the "ka" constraint provides access to the
> three additional hi/lo pairs while on a non-DSP targets it degenerates
> to "x".  The "ka" constraint is available since gcc 4.3.0.  I see no
> reason not to allow this extra flexibility.
>

What would the performance penalty be to hand code the assembly so that 
only mips32 instructions are used (i.e. no MADD), and transfers from 
hi/lo were all explicitly coded so that there were no hi/lo register 
constraints, but only clobbers of "hi", "lo"?

That would give you something usable on any 32-bit CPU with any version 
of GCC.

David Daney

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor
  2014-11-07 18:35         ` David Daney
@ 2014-11-08  2:18           ` Måns Rullgård
  0 siblings, 0 replies; 10+ messages in thread
From: Måns Rullgård @ 2014-11-08  2:18 UTC (permalink / raw)
  To: David Daney; +Cc: Ralf Baechle, linux-mips

David Daney <ddaney.cavm@gmail.com> writes:

> On 11/07/2014 07:00 AM, Måns Rullgård wrote:
> [...]
>>
>>> As for access to hi/lo, I tried to explicitly put a variable in the lo
>>> register.  Which sort of works for very simple cases but as expected it's
>>> easy to get GCC to spill its RTL guts because it runs out of spill
>>> registers.  It maybe can be made to work but I'd feel nervous about its
>>> stability unless a GCC guru approved this method.
>>
>> The "x" constraint can be used to move a double-word to/from the hi/lo
>> registers.  On DSP targets, the "ka" constraint provides access to the
>> three additional hi/lo pairs while on a non-DSP targets it degenerates
>> to "x".  The "ka" constraint is available since gcc 4.3.0.  I see no
>> reason not to allow this extra flexibility.
>
> What would the performance penalty be to hand code the assembly so
> that only mips32 instructions are used (i.e. no MADD), and transfers
> from hi/lo were all explicitly coded so that there were no hi/lo
> register constraints, but only clobbers of "hi", "lo"?

It's hard to say in absolute numbers.  However, the pre-mips32
equivalent of MADDU looks something like this:

        multu   $4, $5
        mflo    $6
        mfhi    $7
        addu    $8, $8, $6
        addu    $9, $9, $7
        sltu    $6, $8, $6
        addu    $9, $9, $6

-- 
Måns Rullgård
mans@mansr.com

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor
  2014-11-06 16:23 [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor Mans Rullgard
  2014-11-06 17:08 ` David Daney
  2014-11-07  0:50 ` Ralf Baechle
@ 2015-01-11  5:05 ` Maciej W. Rozycki
  2 siblings, 0 replies; 10+ messages in thread
From: Maciej W. Rozycki @ 2015-01-11  5:05 UTC (permalink / raw)
  To: Mans Rullgard; +Cc: linux-mips

On Thu, 6 Nov 2014, Mans Rullgard wrote:

> This is an adaptation of the optimised do_div() for ARM by
> Nicolas Pitre implementing division by a constant using a
> multiplication by the inverse.  Ideally, the compiler would
> do this internally as it does for 32-bit operands, but it
> doesn't.
> 
> This version of the code requires an assembler with support
> for the DSP ASE syntax since accessing the hi/lo registers
> sanely from inline asm is impossible without this.  Building
> for a CPU without this extension still works, however.

 Well it also requires MADDU that is not always there; only added with 
MIPS32/MIPS64 and scarcely present as a vendor extension beforehand.  
However...

> +		} else {						\
> +			__t = __m;					\
> +			asm (	".set	push		\n"		\
> +				".set	dsp		\n"		\
> +				"maddu	%q2, %L3, %L4	\n"             \
> +                                "mflo	%L0, %q2	\n"             \
> +                                "mfhi	%M0, %q2	\n"             \

[Some formatting issue here.]

> +				"sltu	%1,  %L0, %L3	\n"		\
> +				"addu	%L0, %M0, %1	\n"		\
> +				"sltu	%M0, %L0, %M3	\n"		\
> +				".set	pop		\n"		\
> +				: "=&r"(__res), "=&r"(__c), "+&ka"(__t) \
> +				: "r"(__m), "r"(__n));			\
> +		}							\
> +		if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {		\
> +			asm (	".set	push		\n"		\
> +				".set	dsp		\n"		\
> +				"maddu	%q0, %M2, %L3	\n"		\
> +				"maddu	%q0, %L2, %M3	\n"		\
> +				"mfhi	%1,  %q0	\n"		\
> +				"mthi	$0,  %q0	\n"		\
> +				"mtlo	%1,  %q0	\n"		\
> +				"maddu	%q0, %M2, %M3	\n"		\
> +				".set	pop		\n"		\
> +				: "+&ka"(__res), "=&r"(__c)		\
> +				: "r"(__m), "r"(__n));			\
> +		} else {						\
> +			asm (	".set	push		\n"		\
> +				".set	dsp		\n"		\
> +				"maddu	%q0, %M3, %L4	\n"		\
> +				"mfhi	%2,  %q0	\n"		\
> +				"maddu	%q0, %L3, %M4	\n"		\
> +				"mfhi	%1,  %q0	\n"		\
> +				"sltu	%2,  %1,  %2	\n"		\
> +				"mtlo	%1,  %q0	\n"		\
> +				"mthi	%2,  %q0	\n"		\
> +				"maddu	%q0, %M3, %M4	\n"		\
> +				".set	pop		\n"		\
> +				: "+&ka"(__res), "=&r"(__z), "=&r"(__c) \
> +				: "r"(__m), "r"(__n));			\
> +		}							\

 ... is there actually a need to implement these blocks as inline assembly 
code?  It looks to me like these are straightforward operations, GCC 
should be able to produce reasonable code easily.

 Plus using the extra DSP accumulators in the kernel is not allowed with 
our code structure as it stands, they are not saved/restored on a kernel 
entry/return and are not call-saved registers in the ABI, so their user 
values will get clobbered here.

  Maciej

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2015-01-11  5:05 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-11-06 16:23 [RFC PATCH] MIPS: optimise 32-bit do_div() with constant divisor Mans Rullgard
2014-11-06 17:08 ` David Daney
2014-11-06 17:42   ` Måns Rullgård
2014-11-07  0:50 ` Ralf Baechle
2014-11-07  2:20   ` Måns Rullgård
2014-11-07 11:35     ` Ralf Baechle
2014-11-07 15:00       ` Måns Rullgård
2014-11-07 18:35         ` David Daney
2014-11-08  2:18           ` Måns Rullgård
2015-01-11  5:05 ` Maciej W. Rozycki

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.