From: rdunlap@infradead.org (Randy Dunlap)
To: linux-snps-arc@lists.infradead.org
Subject: [PATCH] __div64_32: implement division by multiplication for 32-bit arches
Date: Thu, 29 Oct 2015 10:09:18 -0700 [thread overview]
Message-ID: <563252BE.9030506@infradead.org> (raw)
In-Reply-To: <1446072455-16074-1-git-send-email-abrodkin@synopsys.com>
typos (spellos):
On 10/28/15 15:47, Alexey Brodkin wrote:
> ---
> lib/div64.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++----------
> 1 file changed, 128 insertions(+), 25 deletions(-)
>
> diff --git a/lib/div64.c b/lib/div64.c
> index 62a698a..3055328 100644
> --- a/lib/div64.c
> +++ b/lib/div64.c
> @@ -23,37 +23,140 @@
> /* Not needed on 64bit architectures */
> #if BITS_PER_LONG == 32
>
> +
> +/*
> + * If the divisor happens to be constant, we determine the appropriate
> + * inverse at compile time to turn the division into a few inline
> + * multiplications instead which is much faster.
> + */
> uint32_t __attribute__((weak)) __div64_32(uint64_t *n, uint32_t base)
> {
> - uint64_t rem = *n;
> - uint64_t b = base;
> - uint64_t res, d = 1;
> - uint32_t high = rem >> 32;
> -
> - /* Reduce the thing a bit first */
> - res = 0;
> - if (high >= base) {
> - high /= base;
> - res = (uint64_t) high << 32;
> - rem -= (uint64_t) (high*base) << 32;
> - }
> + unsigned int __r, __b = base;
>
> - while ((int64_t)b > 0 && b < rem) {
> - b = b+b;
> - d = d+d;
> - }
> + if (!__builtin_constant_p(__b) || __b == 0) {
> + /* non-constant divisor (or zero): slow path */
> + uint64_t rem = *n;
> + uint64_t b = base;
> + uint64_t res, d = 1;
> + uint32_t high = rem >> 32;
> +
> + /* Reduce the thing a bit first */
> + res = 0;
> + if (high >= base) {
> + high /= base;
> + res = (uint64_t) high << 32;
> + rem -= (uint64_t) (high*base) << 32;
> + }
> +
> + while ((int64_t)b > 0 && b < rem) {
> + b = b+b;
> + d = d+d;
> + }
> +
> + do {
> + if (rem >= b) {
> + rem -= b;
> + res += d;
> + }
> + b >>= 1;
> + d >>= 1;
> + } while (d);
>
> - do {
> - if (rem >= b) {
> - rem -= b;
> - res += d;
> + *n = res;
> + __r = rem;
> + } else if ((__b & (__b - 1)) == 0) {
> + /*
> + * Trivial: __b is constant and a power of 2
> + * gcc does the right thing with this code.
> + * Even though code is the same as above but
> + * we make it visually as a separate path.
> + * Still only one of these branches will survive
> + * pre-processor stage, so let's leave it here.
> + */
> + __r = *n;
> + __r &= (__b - 1);
> + *n /= __b;
> + } else {
> + /* Start of preprocessor calculations */
> +
> + /*
> + * Multiply by inverse of __b: *n/b = *n*(p/b)/p
> + * We rely on the fact that most of this code gets
> + * optimized away at compile time due to constant
> + * propagation and only a couple inline assembly
> + * instructions should remain. Better avoid any
> + * code construct that might prevent that.
> + */
> + unsigned long long __res, __x, __t, __m, __n = *n;
> + unsigned int __p;
> + /* preserve low part of *n for reminder computation */
remainder
> + __r = __n;
> + /* determine number of bits to represent __b */
> + __p = 1 << __div64_fls(__b);
> + /* compute __m = ((__p << 64) + __b - 1) / __b */
> + __m = (~0ULL / __b) * __p;
> + __m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b;
> + /* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */
> + __x = ~0ULL / __b * __b - 1;
> + __res = (__m & 0xffffffff) * (__x & 0xffffffff);
> + __res >>= 32;
> + __res += (__m & 0xffffffff) * (__x >> 32);
> + __t = __res;
> + __res += (__x & 0xffffffff) * (__m >> 32);
> + __t = (__res < __t) ? (1ULL << 32) : 0;
> + __res = (__res >> 32) + __t;
> + __res += (__m >> 32) * (__x >> 32);
> + __res /= __p;
> + /* End of preprocessor calculations */
> +
> + /* Start of run-time calculations */
> + __res = (unsigned int)__m * (unsigned int)__n;
> + __res >>= 32;
> + __res += (unsigned int)__m * (__n >> 32);
> + __t = __res;
> + __res += (unsigned int)__n * (__m >> 32);
> + __t = (__res < __t) ? (1ULL << 32) : 0;
> + __res = (__res >> 32) + __t;
> + __res += (__m >> 32) * (__n >> 32);
> + __res /= __p;
> +
> + /*
> + * The reminder can be computed with 32-bit regs
remainder
> + * only, and gcc is good at that.
> + */
> + {
> + unsigned int __res0 = __res;
> + unsigned int __b0 = __b;
> +
> + __r -= __res0 * __b0;
> }
> - b >>= 1;
> - d >>= 1;
> - } while (d);
> + /* End of run-time calculations */
>
> - *n = res;
> - return rem;
> + *n = __res;
> + }
> + return __r;
> }
>
> EXPORT_SYMBOL(__div64_32);
--
~Randy
WARNING: multiple messages have this Message-ID (diff)
From: Randy Dunlap <rdunlap@infradead.org>
To: Alexey Brodkin <Alexey.Brodkin@synopsys.com>,
linux-kernel@vger.kernel.org
Cc: linux-snps-arc@lists.infradead.org,
Vineet Gupta <Vineet.Gupta1@synopsys.com>,
Ingo Molnar <mingo@elte.hu>,
Stephen Hemminger <shemminger@linux-foundation.org>,
"David S. Miller" <davem@davemloft.net>,
Nicolas Pitre <nico@cam.org>,
Russell King <rmk+kernel@arm.linux.org.uk>
Subject: Re: [PATCH] __div64_32: implement division by multiplication for 32-bit arches
Date: Thu, 29 Oct 2015 10:09:18 -0700 [thread overview]
Message-ID: <563252BE.9030506@infradead.org> (raw)
In-Reply-To: <1446072455-16074-1-git-send-email-abrodkin@synopsys.com>
typos (spellos):
On 10/28/15 15:47, Alexey Brodkin wrote:
> ---
> lib/div64.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++----------
> 1 file changed, 128 insertions(+), 25 deletions(-)
>
> diff --git a/lib/div64.c b/lib/div64.c
> index 62a698a..3055328 100644
> --- a/lib/div64.c
> +++ b/lib/div64.c
> @@ -23,37 +23,140 @@
> /* Not needed on 64bit architectures */
> #if BITS_PER_LONG == 32
>
> +
> +/*
> + * If the divisor happens to be constant, we determine the appropriate
> + * inverse at compile time to turn the division into a few inline
> + * multiplications instead which is much faster.
> + */
> uint32_t __attribute__((weak)) __div64_32(uint64_t *n, uint32_t base)
> {
> - uint64_t rem = *n;
> - uint64_t b = base;
> - uint64_t res, d = 1;
> - uint32_t high = rem >> 32;
> -
> - /* Reduce the thing a bit first */
> - res = 0;
> - if (high >= base) {
> - high /= base;
> - res = (uint64_t) high << 32;
> - rem -= (uint64_t) (high*base) << 32;
> - }
> + unsigned int __r, __b = base;
>
> - while ((int64_t)b > 0 && b < rem) {
> - b = b+b;
> - d = d+d;
> - }
> + if (!__builtin_constant_p(__b) || __b == 0) {
> + /* non-constant divisor (or zero): slow path */
> + uint64_t rem = *n;
> + uint64_t b = base;
> + uint64_t res, d = 1;
> + uint32_t high = rem >> 32;
> +
> + /* Reduce the thing a bit first */
> + res = 0;
> + if (high >= base) {
> + high /= base;
> + res = (uint64_t) high << 32;
> + rem -= (uint64_t) (high*base) << 32;
> + }
> +
> + while ((int64_t)b > 0 && b < rem) {
> + b = b+b;
> + d = d+d;
> + }
> +
> + do {
> + if (rem >= b) {
> + rem -= b;
> + res += d;
> + }
> + b >>= 1;
> + d >>= 1;
> + } while (d);
>
> - do {
> - if (rem >= b) {
> - rem -= b;
> - res += d;
> + *n = res;
> + __r = rem;
> + } else if ((__b & (__b - 1)) == 0) {
> + /*
> + * Trivial: __b is constant and a power of 2
> + * gcc does the right thing with this code.
> + * Even though code is the same as above but
> + * we make it visually as a separate path.
> + * Still only one of these branches will survive
> + * pre-processor stage, so let's leave it here.
> + */
> + __r = *n;
> + __r &= (__b - 1);
> + *n /= __b;
> + } else {
> + /* Start of preprocessor calculations */
> +
> + /*
> + * Multiply by inverse of __b: *n/b = *n*(p/b)/p
> + * We rely on the fact that most of this code gets
> + * optimized away at compile time due to constant
> + * propagation and only a couple inline assembly
> + * instructions should remain. Better avoid any
> + * code construct that might prevent that.
> + */
> + unsigned long long __res, __x, __t, __m, __n = *n;
> + unsigned int __p;
> + /* preserve low part of *n for reminder computation */
remainder
> + __r = __n;
> + /* determine number of bits to represent __b */
> + __p = 1 << __div64_fls(__b);
> + /* compute __m = ((__p << 64) + __b - 1) / __b */
> + __m = (~0ULL / __b) * __p;
> + __m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b;
> + /* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */
> + __x = ~0ULL / __b * __b - 1;
> + __res = (__m & 0xffffffff) * (__x & 0xffffffff);
> + __res >>= 32;
> + __res += (__m & 0xffffffff) * (__x >> 32);
> + __t = __res;
> + __res += (__x & 0xffffffff) * (__m >> 32);
> + __t = (__res < __t) ? (1ULL << 32) : 0;
> + __res = (__res >> 32) + __t;
> + __res += (__m >> 32) * (__x >> 32);
> + __res /= __p;
> + /* End of preprocessor calculations */
> +
> + /* Start of run-time calculations */
> + __res = (unsigned int)__m * (unsigned int)__n;
> + __res >>= 32;
> + __res += (unsigned int)__m * (__n >> 32);
> + __t = __res;
> + __res += (unsigned int)__n * (__m >> 32);
> + __t = (__res < __t) ? (1ULL << 32) : 0;
> + __res = (__res >> 32) + __t;
> + __res += (__m >> 32) * (__n >> 32);
> + __res /= __p;
> +
> + /*
> + * The reminder can be computed with 32-bit regs
remainder
> + * only, and gcc is good at that.
> + */
> + {
> + unsigned int __res0 = __res;
> + unsigned int __b0 = __b;
> +
> + __r -= __res0 * __b0;
> }
> - b >>= 1;
> - d >>= 1;
> - } while (d);
> + /* End of run-time calculations */
>
> - *n = res;
> - return rem;
> + *n = __res;
> + }
> + return __r;
> }
>
> EXPORT_SYMBOL(__div64_32);
--
~Randy
next prev parent reply other threads:[~2015-10-29 17:09 UTC|newest]
Thread overview: 48+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-10-28 22:47 [PATCH] __div64_32: implement division by multiplication for 32-bit arches Alexey Brodkin
2015-10-28 22:47 ` Alexey Brodkin
2015-10-28 23:32 ` Nicolas Pitre
2015-10-28 23:32 ` Nicolas Pitre
2015-10-29 7:34 ` Alexey Brodkin
2015-10-29 7:34 ` Alexey Brodkin
2015-10-30 1:26 ` Nicolas Pitre
2015-10-30 1:26 ` Nicolas Pitre
2015-10-30 5:41 ` Vineet Gupta
2015-10-30 5:41 ` Vineet Gupta
2015-10-30 12:41 ` Måns Rullgård
2015-10-30 12:41 ` Måns Rullgård
2015-10-30 12:40 ` Måns Rullgård
2015-10-30 12:40 ` Måns Rullgård
2015-10-30 15:17 ` Nicolas Pitre
2015-10-30 15:17 ` Nicolas Pitre
2015-10-30 15:54 ` Alexey Brodkin
2015-10-30 15:54 ` Alexey Brodkin
2015-10-30 16:55 ` Nicolas Pitre
2015-10-30 16:55 ` Nicolas Pitre
2015-10-30 17:45 ` Måns Rullgård
2015-10-30 17:45 ` Måns Rullgård
2015-11-04 23:46 ` Nicolas Pitre
2015-11-04 23:46 ` Nicolas Pitre
2015-11-04 23:48 ` Nicolas Pitre
2015-11-04 23:48 ` Nicolas Pitre
2015-11-05 3:13 ` Vineet Gupta
2015-11-05 3:13 ` Vineet Gupta
2015-11-05 5:06 ` Nicolas Pitre
2015-11-05 5:06 ` Nicolas Pitre
2015-11-04 23:49 ` Måns Rullgård
2015-11-04 23:49 ` Måns Rullgård
2015-10-30 14:28 ` Alexey Brodkin
2015-10-30 14:28 ` Alexey Brodkin
2015-10-29 0:36 ` kbuild test robot
2015-10-29 0:36 ` kbuild test robot
2015-10-29 12:52 ` Måns Rullgård
2015-10-29 12:52 ` Måns Rullgård
2015-10-29 13:05 ` Alexey Brodkin
2015-10-29 13:05 ` Alexey Brodkin
2015-10-29 13:37 ` Måns Rullgård
2015-10-29 13:37 ` Måns Rullgård
2015-10-29 13:31 ` Russell King - ARM Linux
2015-10-29 13:31 ` Russell King - ARM Linux
2015-10-29 14:32 ` Alexey Brodkin
2015-10-29 14:32 ` Alexey Brodkin
2015-10-29 17:09 ` Randy Dunlap [this message]
2015-10-29 17:09 ` Randy Dunlap
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=563252BE.9030506@infradead.org \
--to=rdunlap@infradead.org \
--cc=linux-snps-arc@lists.infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.