* [RFC] div64_64 support
@ 2007-02-24 1:05 Stephen Hemminger
2007-02-24 16:19 ` Sami Farin
2007-02-26 20:09 ` Jan Engelhardt
0 siblings, 2 replies; 57+ messages in thread
From: Stephen Hemminger @ 2007-02-24 1:05 UTC (permalink / raw)
To: linux-kernel, netdev
Since there are already two users of full 64 bit division in the kernel,
and others may be hiding out as well, add a full 64/64 bit divide.
Yes, this is expensive, but there are places where it is necessary.
It is not clear whether the scaling buys any advantage on 64 bit platforms,
so for them a full divide is done.
---
 include/asm-arm/div64.h      |    2 ++
 include/asm-generic/div64.h  |    8 ++++++++
 include/asm-m68k/div64.h     |    2 ++
 include/asm-mips/div64.h     |    8 ++++++++
 include/asm-um/div64.h       |    1 +
 include/asm-xtensa/div64.h   |    1 +
 lib/div64.c                  |   22 ++++++++++++++++++++++
 net/ipv4/tcp_cubic.c         |   22 ----------------------
 net/netfilter/xt_connbytes.c |   16 ----------------
 9 files changed, 44 insertions(+), 38 deletions(-)
--- linux-2.6.21-rc1.orig/include/asm-arm/div64.h 2007-02-23 16:44:41.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-arm/div64.h 2007-02-23 16:57:36.000000000 -0800
@@ -221,6 +221,8 @@
__nr; \
})
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
+
#endif
#endif
--- linux-2.6.21-rc1.orig/include/asm-generic/div64.h 2007-02-23 16:35:27.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-generic/div64.h 2007-02-23 16:56:40.000000000 -0800
@@ -30,9 +30,17 @@
__rem; \
})
+/*
+ * div64_64 - Divide two 64 bit numbers
+ */
+static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
+{
+ return dividend / divisor;
+}
#elif BITS_PER_LONG == 32
extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
/* The unnecessary pointer compare is there
* to check for type safety (n must be 64bit)
--- linux-2.6.21-rc1.orig/include/asm-m68k/div64.h 2007-02-23 16:45:20.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-m68k/div64.h 2007-02-23 16:56:27.000000000 -0800
@@ -23,4 +23,6 @@
__rem; \
})
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
+
#endif /* _M68K_DIV64_H */
--- linux-2.6.21-rc1.orig/include/asm-mips/div64.h 2007-02-23 16:45:25.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-mips/div64.h 2007-02-23 16:59:52.000000000 -0800
@@ -78,6 +78,9 @@
__quot = __quot << 32 | __low; \
(n) = __quot; \
__mod; })
+
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
+
#endif /* (_MIPS_SZLONG == 32) */
#if (_MIPS_SZLONG == 64)
@@ -101,6 +104,11 @@
(n) = __quot; \
__mod; })
+static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
+{
+ return dividend / divisor;
+}
+
#endif /* (_MIPS_SZLONG == 64) */
#endif /* _ASM_DIV64_H */
--- linux-2.6.21-rc1.orig/include/asm-um/div64.h 2007-02-23 16:45:37.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-um/div64.h 2007-02-23 16:58:29.000000000 -0800
@@ -3,4 +3,5 @@
#include "asm/arch/div64.h"
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
#endif
--- linux-2.6.21-rc1.orig/include/asm-xtensa/div64.h 2007-02-23 16:45:43.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-xtensa/div64.h 2007-02-23 16:58:04.000000000 -0800
@@ -16,4 +16,5 @@
n /= (unsigned int) base; \
__res; })
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
#endif
--- linux-2.6.21-rc1.orig/lib/div64.c 2007-02-23 16:50:40.000000000 -0800
+++ linux-2.6.21-rc1/lib/div64.c 2007-02-23 16:54:56.000000000 -0800
@@ -18,6 +18,7 @@
#include <linux/types.h>
#include <linux/module.h>
+#include <asm/bitops.h>
#include <asm/div64.h>
/* Not needed on 64bit architectures */
@@ -58,4 +59,25 @@
EXPORT_SYMBOL(__div64_32);
+/* Use scaling to do a full 64 bit division */
+uint64_t div64_64(uint64_t dividend, uint64_t divisor)
+{
+ uint32_t d = divisor;
+
+ if (divisor > 0xffffffffULL) {
+ unsigned int shift = fls(divisor >> 32);
+
+ d = divisor >> shift;
+ dividend >>= shift;
+ }
+
+ /* avoid 64 bit division if possible */
+ if (dividend >> 32)
+ do_div(dividend, d);
+ else
+ dividend = (uint32_t) dividend / d;
+
+ return dividend;
+}
+
#endif /* BITS_PER_LONG == 32 */
--- linux-2.6.21-rc1.orig/net/ipv4/tcp_cubic.c 2007-02-23 16:33:52.000000000 -0800
+++ linux-2.6.21-rc1/net/ipv4/tcp_cubic.c 2007-02-23 16:45:50.000000000 -0800
@@ -51,7 +51,6 @@
module_param(tcp_friendliness, int, 0644);
MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
-#include <asm/div64.h>
/* BIC TCP Parameters */
struct bictcp {
@@ -93,27 +92,6 @@
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
-/* 64bit divisor, dividend and result. dynamic precision */
-static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
-{
- u_int32_t d = divisor;
-
- if (divisor > 0xffffffffULL) {
- unsigned int shift = fls(divisor >> 32);
-
- d = divisor >> shift;
- dividend >>= shift;
- }
-
- /* avoid 64 bit division if possible */
- if (dividend >> 32)
- do_div(dividend, d);
- else
- dividend = (uint32_t) dividend / d;
-
- return dividend;
-}
-
/*
* calculate the cubic root of x using Newton-Raphson
*/
--- linux-2.6.21-rc1.orig/net/netfilter/xt_connbytes.c 2007-02-23 16:40:57.000000000 -0800
+++ linux-2.6.21-rc1/net/netfilter/xt_connbytes.c 2007-02-23 16:41:09.000000000 -0800
@@ -24,22 +24,6 @@
MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
MODULE_ALIAS("ipt_connbytes");
-/* 64bit divisor, dividend and result. dynamic precision */
-static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
-{
- u_int32_t d = divisor;
-
- if (divisor > 0xffffffffULL) {
- unsigned int shift = fls(divisor >> 32);
-
- d = divisor >> shift;
- dividend >>= shift;
- }
-
- do_div(dividend, d);
- return dividend;
-}
-
static int
match(const struct sk_buff *skb,
const struct net_device *in,
^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-24  1:05 [RFC] div64_64 support Stephen Hemminger
@ 2007-02-24 16:19 ` Sami Farin
  2007-02-26 20:09 ` Jan Engelhardt
  1 sibling, 0 replies; 57+ messages in thread
From: Sami Farin @ 2007-02-24 16:19 UTC (permalink / raw)
  To: linux-kernel, netdev

On Fri, Feb 23, 2007 at 17:05:27 -0800, Stephen Hemminger wrote:
> Since there are already two users of full 64 bit division in the kernel,
> and others may be hiding out as well, add a full 64/64 bit divide.
>
> Yes, this is expensive, but there are places where it is necessary.
> It is not clear whether the scaling buys any advantage on 64 bit platforms,
> so for them a full divide is done.

Still does not work after these fixes... how come?

WARNING: "div64_64" [net/netfilter/xt_connbytes.ko] undefined!
WARNING: "div64_64" [net/ipv4/tcp_cubic.ko] undefined!

--- linux-2.6.19/include/asm-i386/div64.h.bak	2006-11-29 23:57:37.000000000 +0200
+++ linux-2.6.19/include/asm-i386/div64.h	2007-02-24 16:24:55.822529880 +0200
@@ -45,4 +45,7 @@ div_ll_X_l_rem(long long divs, long div,
 	return dum2;
 }
+
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
+
 #endif
--- linux-2.6.19/lib/div64.c.bak	2007-02-24 16:10:03.686084000 +0200
+++ linux-2.6.19/lib/div64.c	2007-02-24 17:01:11.224517353 +0200
@@ -80,4 +80,6 @@ uint64_t div64_64(uint64_t dividend, uin
 	return dividend;
 }
+EXPORT_SYMBOL(div64_64);
+
 #endif /* BITS_PER_LONG == 32 */

-- 

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-02-24  1:05 [RFC] div64_64 support Stephen Hemminger
  2007-02-24 16:19 ` Sami Farin
@ 2007-02-26 20:09 ` Jan Engelhardt
  2007-02-26 21:28   ` Stephen Hemminger
  2007-02-26 22:31   ` Stephen Hemminger
  1 sibling, 2 replies; 57+ messages in thread
From: Jan Engelhardt @ 2007-02-26 20:09 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: linux-kernel, netdev

On Feb 23 2007 17:05, Stephen Hemminger wrote:
>
>Since there are already two users of full 64 bit division in the kernel,
>and others may be hiding out as well, add a full 64/64 bit divide.
>
>Yes, this is expensive, but there are places where it is necessary.
>It is not clear whether the scaling buys any advantage on 64 bit platforms,
>so for them a full divide is done.
>
>---
> include/asm-arm/div64.h      |    2 ++
> include/asm-generic/div64.h  |    8 ++++++++
> include/asm-m68k/div64.h     |    2 ++
> include/asm-mips/div64.h     |    8 ++++++++
> include/asm-um/div64.h       |    1 +
> include/asm-xtensa/div64.h   |    1 +
> lib/div64.c                  |   22 ++++++++++++++++++++++
> net/ipv4/tcp_cubic.c         |   22 ----------------------
> net/netfilter/xt_connbytes.c |   16 ----------------
> 9 files changed, 44 insertions(+), 38 deletions(-)

Actually, there is udivdi3 support in the kernel:

	./arch/arm26/lib/udivdi3.c
	./arch/sh/lib/udivdi3.c
	./arch/sparc/lib/udivdi3.S

Should not this be consolidated too?


Jan
-- 
ft: http://freshmeat.net/p/chaostables/

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-02-26 20:09 ` Jan Engelhardt
@ 2007-02-26 21:28   ` Stephen Hemminger
  2007-02-27  1:20     ` H. Peter Anvin
  2007-02-26 22:31   ` Stephen Hemminger
  1 sibling, 1 reply; 57+ messages in thread
From: Stephen Hemminger @ 2007-02-26 21:28 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: linux-kernel, netdev

On Mon, 26 Feb 2007 21:09:26 +0100 (MET)
Jan Engelhardt <jengelh@linux01.gwdg.de> wrote:

> On Feb 23 2007 17:05, Stephen Hemminger wrote:
> >
> >Since there are already two users of full 64 bit division in the kernel,
> >and others may be hiding out as well, add a full 64/64 bit divide.
> >
> >Yes, this is expensive, but there are places where it is necessary.
> >It is not clear whether the scaling buys any advantage on 64 bit platforms,
> >so for them a full divide is done.
> >
> >---
> > include/asm-arm/div64.h      |    2 ++
> > include/asm-generic/div64.h  |    8 ++++++++
> > include/asm-m68k/div64.h     |    2 ++
> > include/asm-mips/div64.h     |    8 ++++++++
> > include/asm-um/div64.h       |    1 +
> > include/asm-xtensa/div64.h   |    1 +
> > lib/div64.c                  |   22 ++++++++++++++++++++++
> > net/ipv4/tcp_cubic.c         |   22 ----------------------
> > net/netfilter/xt_connbytes.c |   16 ----------------
> > 9 files changed, 44 insertions(+), 38 deletions(-)
>
> Actually, there is udivdi3 support in the kernel:
>
> 	./arch/arm26/lib/udivdi3.c
> 	./arch/sh/lib/udivdi3.c
> 	./arch/sparc/lib/udivdi3.S
>
> should not this be consolidated too?

Hmm. Those are the GCC internal versions, which are picked up when
doing a divide in place. Do we want to allow general 64 bit divides in
the kernel to be easily used? It could cause sloppy slow code, but it
would look cleaner.

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-02-26 21:28 ` Stephen Hemminger
@ 2007-02-27  1:20   ` H. Peter Anvin
  2007-02-27  3:45     ` Segher Boessenkool
  0 siblings, 1 reply; 57+ messages in thread
From: H. Peter Anvin @ 2007-02-27 1:20 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Jan Engelhardt, linux-kernel, netdev

Stephen Hemminger wrote:
>
> Hmm. Those are the GCC internal versions, which are picked up when
> doing a divide in place. Do we want to allow general 64 bit divides in
> the kernel to be easily used? It could cause sloppy slow code, but it
> would look cleaner.
>

... and it would handle datatypes which may be architecture-dependent
a lot cleaner.

I thought the motivation for div64() was that a 64:32->32 divide could
be done a lot faster on a number of platforms (including the important
x86) than a generic 64:64->64 divide, but gcc doesn't handle the
devolution automatically -- there is no such libgcc function.

	-hpa

^ permalink raw reply	[flat|nested] 57+ messages in thread
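[A hedged sketch, not from the thread, of the 64:32->32 pattern hpa
describes, using the kernel's do_div() convention (helper name mine,
kernel context assumed):]

	#include <linux/types.h>
	#include <asm/div64.h>

	/* do_div(n, base): n is a u64 lvalue that is replaced by the
	 * quotient; the 32 bit remainder is returned.  On i386 this
	 * compiles down to "divl" instructions rather than a call to a
	 * __udivdi3 helper, which is the speed win hpa refers to. */
	static u32 avg_bytes_per_packet(u64 bytes, u32 pkts)
	{
		do_div(bytes, pkts);	/* bytes now holds the quotient */
		return (u32)bytes;	/* assumes the average fits in 32 bits */
	}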
* Re: [RFC] div64_64 support
  2007-02-27  1:20 ` H. Peter Anvin
@ 2007-02-27  3:45   ` Segher Boessenkool
  0 siblings, 0 replies; 57+ messages in thread
From: Segher Boessenkool @ 2007-02-27 3:45 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Jan Engelhardt, netdev, Stephen Hemminger, linux-kernel

> I thought the motivation for div64() was that a 64:32->32 divide could
> be done a lot faster on a number of platforms (including the important
> x86) than a generic 64:64->64 divide, but gcc doesn't handle the
> devolution automatically -- there is no such libgcc function.

That there's no such function in libgcc doesn't mean GCC cannot handle
this; libgcc is a bunch of library functions that are really needed for
generated code (because you really don't want to expand those functions
inline everywhere) -- you won't find an addsi3 in libgcc either.

There does exist a divmoddisi4, sort of.  It used to be defined in
three GCC targets, but commented out in all three.  The NS32k is long
gone.  For Vax, a comment says the machine insn for this isn't used
because it is just too slow.  And the i386 version is disabled because
it returns the wrong result on overflow (not the truncated 64-bit
result, required by the implicit cast to 32-bit, but the i386 arch
traps to the overflow handler).

The only way to express the semantics you want in (GNU) C is to use
asm() -- and that's exactly what div64() does :-)

Blame it on the C language, but not on GCC.  Not this time.


Segher

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-02-26 20:09 ` Jan Engelhardt
  2007-02-26 21:28 ` Stephen Hemminger
@ 2007-02-26 22:31 ` Stephen Hemminger
  2007-02-26 23:02   ` Jan Engelhardt
                     ` (2 more replies)
  1 sibling, 3 replies; 57+ messages in thread
From: Stephen Hemminger @ 2007-02-26 22:31 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: linux-kernel, netdev

Here is another way to handle the 64 bit divide case.
It allows full 64 bit divide by adding the support routine
GCC needs.

---
 arch/alpha/Kconfig           |    4 ++++
 arch/arm/Kconfig             |    4 ++++
 arch/arm26/Kconfig           |    4 ++++
 arch/avr32/Kconfig           |    4 ++++
 arch/cris/Kconfig            |    4 ++++
 arch/frv/Kconfig             |    4 ++++
 arch/h8300/Kconfig           |    4 ++++
 arch/i386/Kconfig            |    4 ++++
 arch/m32r/Kconfig            |    4 ++++
 arch/m68k/Kconfig            |    4 ++++
 arch/m68knommu/Kconfig       |    4 ++++
 arch/mips/Kconfig            |    4 ++++
 arch/parisc/Kconfig          |    4 ++++
 arch/powerpc/Kconfig         |    4 ++++
 arch/ppc/Kconfig             |    4 ++++
 arch/s390/Kconfig            |    4 ++++
 arch/sh64/Kconfig            |    4 ++++
 arch/v850/Kconfig            |    3 +++
 arch/xtensa/Kconfig          |    4 ++++
 lib/Makefile                 |    1 +
 lib/udivdi3.c                |   37 +++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_cubic.c         |   26 ++------------------------
 net/netfilter/xt_connbytes.c |   19 +------------------
 23 files changed, 116 insertions(+), 42 deletions(-)

--- pktgen.orig/net/ipv4/tcp_cubic.c	2007-02-26 13:40:08.000000000 -0800
+++ pktgen/net/ipv4/tcp_cubic.c	2007-02-26 14:30:00.000000000 -0800
@@ -51,7 +51,6 @@
 module_param(tcp_friendliness, int, 0644);
 MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
 
-#include <asm/div64.h>
 
 /* BIC TCP Parameters */
 struct bictcp {
@@ -93,27 +92,6 @@
 	tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
-/* 64bit divisor, dividend and result. dynamic precision */
-static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
-{
-	u_int32_t d = divisor;
-
-	if (divisor > 0xffffffffULL) {
-		unsigned int shift = fls(divisor >> 32);
-
-		d = divisor >> shift;
-		dividend >>= shift;
-	}
-
-	/* avoid 64 bit division if possible */
-	if (dividend >> 32)
-		do_div(dividend, d);
-	else
-		dividend = (uint32_t) dividend / d;
-
-	return dividend;
-}
-
 /*
  * calculate the cubic root of x using Newton-Raphson
  */
@@ -134,7 +112,7 @@
 	 */
 	do {
 		x1 = x;
-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
+		x = (2 * x + (u32) (a / x*x)) / 3;
 	} while (abs(x1 - x) > 1);
 
 	return x;
--- pktgen.orig/net/netfilter/xt_connbytes.c	2007-02-26 13:40:08.000000000 -0800
+++ pktgen/net/netfilter/xt_connbytes.c	2007-02-26 14:29:13.000000000 -0800
@@ -16,7 +16,6 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_connbytes.h>
 
-#include <asm/div64.h>
 #include <asm/bitops.h>
 
 MODULE_LICENSE("GPL");
@@ -24,22 +23,6 @@
 MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
 MODULE_ALIAS("ipt_connbytes");
 
-/* 64bit divisor, dividend and result. dynamic precision */
-static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
-{
-	u_int32_t d = divisor;
-
-	if (divisor > 0xffffffffULL) {
-		unsigned int shift = fls(divisor >> 32);
-
-		d = divisor >> shift;
-		dividend >>= shift;
-	}
-
-	do_div(dividend, d);
-	return dividend;
-}
-
 static int
 match(const struct sk_buff *skb,
       const struct net_device *in,
@@ -106,7 +89,7 @@
 			break;
 		}
 		if (pkts != 0)
-			what = div64_64(bytes, pkts);
+			what = bytes / pkts;
 		break;
 	}
--- pktgen.orig/lib/Makefile	2007-02-26 13:40:08.000000000 -0800
+++ pktgen/lib/Makefile	2007-02-26 14:17:31.000000000 -0800
@@ -28,6 +28,7 @@
 lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
+obj-$(CONFIG_GENERIC_UDIVDI3) += udivdi3.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
 obj-$(CONFIG_PLIST) += plist.o
 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
--- pktgen.orig/arch/alpha/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/alpha/Kconfig	2007-02-26 13:54:35.000000000 -0800
@@ -33,6 +33,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/arm/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/arm/Kconfig	2007-02-26 13:54:57.000000000 -0800
@@ -90,6 +90,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
--- pktgen.orig/arch/arm26/Kconfig	2007-02-26 13:48:46.000000000 -0800
+++ pktgen/arch/arm26/Kconfig	2007-02-26 13:55:24.000000000 -0800
@@ -49,6 +49,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
--- pktgen.orig/arch/avr32/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/avr32/Kconfig	2007-02-26 13:55:39.000000000 -0800
@@ -53,6 +53,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_BUST_SPINLOCK
 	bool
--- pktgen.orig/arch/cris/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/cris/Kconfig	2007-02-26 13:55:53.000000000 -0800
@@ -28,6 +28,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/frv/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/frv/Kconfig	2007-02-26 13:56:09.000000000 -0800
@@ -53,6 +53,10 @@
 	bool
 	default y
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 mainmenu "Fujitsu FR-V Kernel Configuration"
 
 source "init/Kconfig"
--- pktgen.orig/arch/h8300/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/h8300/Kconfig	2007-02-26 13:56:20.000000000 -0800
@@ -41,6 +41,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/i386/Kconfig	2007-02-26 14:01:59.000000000 -0800
+++ pktgen/arch/i386/Kconfig	2007-02-26 14:02:28.000000000 -0800
@@ -75,6 +75,10 @@
 	bool
 	default y
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config ARCH_MAY_HAVE_PC_FDC
 	bool
 	default y
--- pktgen.orig/arch/m32r/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/m32r/Kconfig	2007-02-26 13:56:43.000000000 -0800
@@ -229,6 +229,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/m68k/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/m68k/Kconfig	2007-02-26 13:56:56.000000000 -0800
@@ -25,6 +25,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
--- pktgen.orig/arch/m68knommu/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/m68knommu/Kconfig	2007-02-26 13:57:05.000000000 -0800
@@ -37,6 +37,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/mips/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/mips/Kconfig	2007-02-26 13:57:13.000000000 -0800
@@ -843,6 +843,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/parisc/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/parisc/Kconfig	2007-02-26 13:57:21.000000000 -0800
@@ -33,6 +33,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/powerpc/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/powerpc/Kconfig	2007-02-26 13:57:31.000000000 -0800
@@ -49,6 +49,10 @@
 	bool
 	default y if 64BIT
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
--- pktgen.orig/arch/ppc/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/ppc/Kconfig	2007-02-26 13:57:44.000000000 -0800
@@ -27,6 +27,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
--- pktgen.orig/arch/s390/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/s390/Kconfig	2007-02-26 13:57:53.000000000 -0800
@@ -34,6 +34,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
--- pktgen.orig/arch/sh64/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/sh64/Kconfig	2007-02-26 14:00:32.000000000 -0800
@@ -21,6 +21,10 @@
 	bool
 	default y
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/v850/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/v850/Kconfig	2007-02-26 13:59:29.000000000 -0800
@@ -19,6 +19,9 @@
 config RWSEM_XCHGADD_ALGORITHM
 	bool
 	default n
+config GENERIC_UDIVDI3
+	bool
+	default y
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/xtensa/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/xtensa/Kconfig	2007-02-26 13:59:45.000000000 -0800
@@ -26,6 +26,10 @@
 	bool
 	default y
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ pktgen/lib/udivdi3.c	2007-02-26 14:14:13.000000000 -0800
@@ -0,0 +1,37 @@
+/*
+ * Generic C version of full 64 bit by 64 bit division
+ * Extracted from version used by netfilter connection tracking
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * Code generated for this function might be very inefficient
+ * for some CPUs, can be overridden by linking arch-specific
+ * assembly versions such as arch/sparc/lib/udivdi.S
+ */
+#include <linux/types.h>
+#include <linux/module.h>
+#include <asm/div64.h>
+
+uint64_t __udivdi3(uint64_t dividend, uint64_t divisor)
+{
+	uint32_t d = divisor;
+
+	/* Scale divisor to 32 bits */
+	if (divisor > 0xffffffffULL) {
+		unsigned int shift = fls(divisor >> 32);
+
+		d = divisor >> shift;
+		dividend >>= shift;
+	}
+
+	/* avoid 64 bit division if possible */
+	if (dividend >> 32)
+		do_div(dividend, d);
+	else
+		dividend = (uint32_t) dividend / d;
+
+	return dividend;
+}
+EXPORT_SYMBOL(__udivdi3);

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-02-26 22:31 ` Stephen Hemminger
@ 2007-02-26 23:02   ` Jan Engelhardt
  2007-02-26 23:44     ` Stephen Hemminger
  2007-02-27  6:21   ` Dan Williams
  0 siblings, 1 reply; 57+ messages in thread
From: Jan Engelhardt @ 2007-02-26 23:02 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: linux-kernel, netdev

On Feb 26 2007 13:28, Stephen Hemminger wrote:
>>
>> ./arch/arm26/lib/udivdi3.c
>> ./arch/sh/lib/udivdi3.c
>> ./arch/sparc/lib/udivdi3.S
>>
>> should not this be consolidated too?
>
>Hmm. Those are the GCC internal versions, which are picked up when
>doing a divide in place. Do we want to allow general 64 bit divides in
>the kernel to be easily used? It could cause sloppy slow code, but it
>would look cleaner.

Then our reviewers should catch it, and if not, the janitors will
(/me winks at R.P.J.Day and trivial@).

>@@ -134,7 +112,7 @@
> 	 */
> 	do {
> 		x1 = x;
>-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
>+		x = (2 * x + (u32) (a / x*x)) / 3;

Eye see a bug.

Previously there was div64_64(a, x*x) which is equivalent to
(a)/(x*x), or just: a/(x^2). But now you do a/x*x, which is
equivalent to a*x/x (in the domain of real numbers). Furthermore,
a/x*x is a-(a%x), which does not even remotely match a/(x^2).

Please keep the math intact, thank you ;-)


Jan
-- 

^ permalink raw reply	[flat|nested] 57+ messages in thread
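[The precedence trap Jan points out is easy to demonstrate in
isolation; a minimal standalone snippet (example code, not from the
thread):]

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t a = 1000;
		uint32_t x = 10;

		/* C parses a / x * x as (a / x) * x, i.e. a - (a % x) */
		printf("a / x * x   = %llu\n",
		       (unsigned long long)(a / x * x));
		/* the parenthesized form is the a / x^2 the iteration needs */
		printf("a / (x * x) = %llu\n",
		       (unsigned long long)(a / (x * x)));
		return 0;
	}

[This prints 1000 and 10; only the second expression matches the old
div64_64(a, x*x).]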
* Re: [RFC] div64_64 support
  2007-02-26 23:02 ` Jan Engelhardt
@ 2007-02-26 23:44   ` Stephen Hemminger
  2007-02-27  0:05     ` Jan Engelhardt
  0 siblings, 1 reply; 57+ messages in thread
From: Stephen Hemminger @ 2007-02-26 23:44 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: linux-kernel, netdev

On Tue, 27 Feb 2007 00:02:50 +0100 (MET)
Jan Engelhardt <jengelh@linux01.gwdg.de> wrote:

> On Feb 26 2007 13:28, Stephen Hemminger wrote:
> >>
> >> ./arch/arm26/lib/udivdi3.c
> >> ./arch/sh/lib/udivdi3.c
> >> ./arch/sparc/lib/udivdi3.S
> >>
> >> should not this be consolidated too?
> >
> >Hmm. Those are the GCC internal versions, which are picked up when
> >doing a divide in place. Do we want to allow general 64 bit divides in
> >the kernel to be easily used? It could cause sloppy slow code, but it
> >would look cleaner.
>
> Then our reviewers should catch it, and if not, the janitors will
> (/me winks at R.P.J.Day and trivial@).
>
> >@@ -134,7 +112,7 @@
> > 	 */
> > 	do {
> > 		x1 = x;
> >-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
> >+		x = (2 * x + (u32) (a / x*x)) / 3;
>
> Eye see a bug.
>
> Previously there was div64_64(a, x*x) which is equivalent to
> (a)/(x*x), or just: a/(x^2). But now you do a/x*x, which is
> equivalent to a*x/x (in the domain of real numbers). Furthermore,
> a/x*x is a-(a%x), which does not even remotely match a/(x^2).
>
> Please keep the math intact, thank you ;-)

Been there, done that, don't want to repeat it...

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-02-26 23:44 ` Stephen Hemminger
@ 2007-02-27  0:05   ` Jan Engelhardt
  2007-02-27  0:07     ` Stephen Hemminger
  0 siblings, 1 reply; 57+ messages in thread
From: Jan Engelhardt @ 2007-02-27 0:05 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: linux-kernel, netdev

On Feb 26 2007 15:44, Stephen Hemminger wrote:
>> >-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
>> >+		x = (2 * x + (u32) (a / x*x)) / 3;
>>
>> Previously there was div64_64(a, x*x) which is equivalent to
>> (a)/(x*x), or just: a/(x^2). But now you do a/x*x, which is
>> equivalent to a*x/x (in the domain of real numbers). Furthermore,
>> a/x*x is a-(a%x), which does not even remotely match a/(x^2).
>>
>Been there, done that, don't want to repeat it...

I am sorry I don't quite follow.


Jan
-- 

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-02-27  0:05 ` Jan Engelhardt
@ 2007-02-27  0:07   ` Stephen Hemminger
  2007-02-27  0:14     ` Jan Engelhardt
  0 siblings, 1 reply; 57+ messages in thread
From: Stephen Hemminger @ 2007-02-27 0:07 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: linux-kernel, netdev

On Tue, 27 Feb 2007 01:05:26 +0100 (MET)
Jan Engelhardt <jengelh@linux01.gwdg.de> wrote:

> On Feb 26 2007 15:44, Stephen Hemminger wrote:
> >> >-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
> >> >+		x = (2 * x + (u32) (a / x*x)) / 3;
> >>
> >> Previously there was div64_64(a, x*x) which is equivalent to
> >> (a)/(x*x), or just: a/(x^2). But now you do a/x*x, which is
> >> equivalent to a*x/x (in the domain of real numbers). Furthermore,
> >> a/x*x is a-(a%x), which does not even remotely match a/(x^2).
> >>
> >Been there, done that, don't want to repeat it...
>
> I am sorry I don't quite follow.

Once before, a missing paren caused a TCP congestion window bug that
took 6 months before it was found...

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-02-27  0:07 ` Stephen Hemminger
@ 2007-02-27  0:14   ` Jan Engelhardt
  0 siblings, 0 replies; 57+ messages in thread
From: Jan Engelhardt @ 2007-02-27 0:14 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: linux-kernel, netdev

On Feb 26 2007 16:07, Stephen Hemminger wrote:
>> On Feb 26 2007 15:44, Stephen Hemminger wrote:
>> >> >-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
>> >> >+		x = (2 * x + (u32) (a / x*x)) / 3;
>> >>
>> >> Previously there was div64_64(a, x*x) which is equivalent to
>> >> (a)/(x*x), or just: a/(x^2). But now you do a/x*x, which is
>> >> equivalent to a*x/x (in the domain of real numbers). Furthermore,
>> >> a/x*x is a-(a%x), which does not even remotely match a/(x^2).
>> >>
>> >Been there, done that, don't want to repeat it...
>>
>> I am sorry I don't quite follow.
>
>Once before, a missing paren caused a TCP congestion window bug that
>took 6 months before it was found...

Hah, just as I expected.

|On Tue, 27 Feb 2007 00:02:50 +0100 (MET), Jan Engelhardt wrote:
|>Then our reviewers should catch it, and if not, the janitors will.


Jan
-- 

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-02-26 22:31 ` Stephen Hemminger
  2007-02-26 23:02   ` Jan Engelhardt
@ 2007-02-27  6:21   ` Dan Williams
  2007-03-03  2:31   ` Andi Kleen
  0 siblings, 0 replies; 57+ messages in thread
From: Dan Williams @ 2007-02-27 6:21 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jan Engelhardt, linux-kernel, netdev, Nicolas Pitre,
	Russell King - ARM Linux

On 2/26/07, Stephen Hemminger <shemminger@linux-foundation.org> wrote:
> Here is another way to handle the 64 bit divide case.
> It allows full 64 bit divide by adding the support routine
> GCC needs.
>
<snip>

I know ARM already went through the process of removing __udivdi3
support:

http://www.arm.linux.org.uk/developer/patches/viewpatch.php?id=2723/2

Copying Russell and Nicolas as a heads up.

--
Dan

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-02-26 22:31 ` Stephen Hemminger
  2007-02-26 23:02   ` Jan Engelhardt
  2007-02-27  6:21   ` Dan Williams
@ 2007-03-03  2:31   ` Andi Kleen
  2007-03-05 23:57     ` Stephen Hemminger
  2 siblings, 1 reply; 57+ messages in thread
From: Andi Kleen @ 2007-03-03 2:31 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Jan Engelhardt, linux-kernel, netdev

Stephen Hemminger <shemminger@linux-foundation.org> writes:

> Here is another way to handle the 64 bit divide case.
> It allows full 64 bit divide by adding the support routine
> GCC needs.

Not supplying that was intentional by Linus so that people think twice
(or more often) before using such expensive operations. A plain /
looks too innocent.

Is it really needed by CUBIC anyways? It uses it for getting the cubic
root, but the algorithm recommended by Hacker's Delight (great book)
doesn't use any divisions at all. Probably better to use a better
algorithm without divisions.

-Andi

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-03-03  2:31 ` Andi Kleen
@ 2007-03-05 23:57   ` Stephen Hemminger
  2007-03-06  0:25     ` David Miller
  2007-03-06 13:34     ` [RFC] div64_64 support Andi Kleen
  0 siblings, 2 replies; 57+ messages in thread
From: Stephen Hemminger @ 2007-03-05 23:57 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Jan Engelhardt, linux-kernel, netdev

On 03 Mar 2007 03:31:52 +0100
Andi Kleen <andi@firstfloor.org> wrote:

> Stephen Hemminger <shemminger@linux-foundation.org> writes:
>
> > Here is another way to handle the 64 bit divide case.
> > It allows full 64 bit divide by adding the support routine
> > GCC needs.
>
> Not supplying that was intentional by Linus so that people think twice
> (or more often) before using such expensive operations. A plain /
> looks too innocent.
>
> Is it really needed by CUBIC anyways? It uses it for getting the cubic
> root, but the algorithm recommended by Hacker's Delight (great book)
> doesn't use any divisions at all. Probably better to use a better
> algorithm without divisions.

I tried the code from Hacker's Delight.
It is cool, but performance is CPU (and data) dependent:

Average # of usecs per operation:

              Hacker     Newton
Pentium 3       68.6  <    90.4
T2050           98.6  >    92.0
U1400          450    >   415
Xeon            70    <    90
Xeon (newer)    71    <    78
EM64T           21.8  <    24.6
AMD64           23.4  <    32.0

It might be worth the change for code size reduction though.

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-03-05 23:57 ` Stephen Hemminger
@ 2007-03-06  0:25   ` David Miller
  2007-03-06 13:36     ` Andi Kleen
  2007-03-06 14:04     ` [RFC] div64_64 support II Andi Kleen
  1 sibling, 2 replies; 57+ messages in thread
From: David Miller @ 2007-03-06 0:25 UTC (permalink / raw)
  To: shemminger; +Cc: andi, jengelh, linux-kernel, netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Mon, 5 Mar 2007 15:57:14 -0800

> I tried the code from Hacker's Delight.
> It is cool, but performance is CPU (and data) dependent:
>
> Average # of usecs per operation:

Interesting results.

The problem with these algorithms that trade off one or more
multiplies in order to avoid a divide is that they don't give anything
and often lose when both multiplies and divides are emulated in
software.

This is particularly true in this cube-root case from Hacker's
Delight, because it's using 3 multiplies per iteration in place of one
divide per iteration.

Actually, sorry, there is only one real multiply in there since the
other two can be computed using addition and shifts.

Another thing is that the non-Hacker's Delight version iterates
differently for different input values, so the input value space is
very important to consider when comparing these two pieces of code.

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-03-06  0:25 ` David Miller
@ 2007-03-06 13:36   ` Andi Kleen
  0 siblings, 0 replies; 57+ messages in thread
From: Andi Kleen @ 2007-03-06 13:36 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, andi, jengelh, linux-kernel, netdev

On Mon, Mar 05, 2007 at 04:25:51PM -0800, David Miller wrote:
> Another thing is that the non-Hacker's Delight version iterates
> differently for different input values, so the input value space is
> very important to consider when comparing these two pieces of code.

I did some stochastic testing on my version. It gave 1 off for < 1% of
the arguments. Probably not an issue.

Besides it actually works for >2^43 @)

-Andi

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support II
  2007-03-06  0:25 ` David Miller
  2007-03-06 13:36   ` Andi Kleen
@ 2007-03-06 14:04   ` Andi Kleen
  2007-03-06 17:43     ` Dagfinn Ilmari Mannsåker
  2007-03-06 18:48     ` H. Peter Anvin
  1 sibling, 2 replies; 57+ messages in thread
From: Andi Kleen @ 2007-03-06 14:04 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, andi, jengelh, linux-kernel, netdev

> The problem with these algorithms that trade off one or more
> multiplies in order to avoid a divide is that they don't give anything
> and often lose when both multiplies and divides are emulated in
> software.

Actually on rereading this: is there really any Linux port that
emulates multiplies in software? I thought that was only done on
really small microcontrollers or smart cards; but anything 32bit+
that runs Linux should have hardware multiply, shouldn't it?

-Andi

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support II
  2007-03-06 14:04 ` [RFC] div64_64 support II Andi Kleen
@ 2007-03-06 17:43   ` Dagfinn Ilmari Mannsåker
  2007-03-06 18:25     ` David Miller
  0 siblings, 1 reply; 57+ messages in thread
From: Dagfinn Ilmari Mannsåker @ 2007-03-06 17:43 UTC (permalink / raw)
  To: Andi Kleen; +Cc: David Miller, shemminger, jengelh, linux-kernel, netdev

Andi Kleen <andi@firstfloor.org> writes:

> Actually on rereading this: is there really any Linux port that
> emulates multiplies in software? I thought that was only done on
> really small microcontrollers or smart cards; but anything 32bit+
> that runs Linux should have hardware multiply, shouldn't it?

SPARCv7 (sun4/sun4c) doesn't have hardware mul/div. This includes
SparcStation 1, 1+, 2, SLC, ELC, IPC and IPX.

-- 
ilmari
"A disappointingly low fraction of the human race is, at any given
time, on fire." - Stig Sandbeck Mathisen

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support II
  2007-03-06 17:43 ` Dagfinn Ilmari Mannsåker
@ 2007-03-06 18:25   ` David Miller
  0 siblings, 0 replies; 57+ messages in thread
From: David Miller @ 2007-03-06 18:25 UTC (permalink / raw)
  To: ilmari; +Cc: andi, shemminger, jengelh, linux-kernel, netdev

From: ilmari@ilmari.org (Dagfinn Ilmari Mannsåker)
Date: Tue, 06 Mar 2007 18:43:14 +0100

> Andi Kleen <andi@firstfloor.org> writes:
>
> > Actually on rereading this: is there really any Linux port that
> > emulates multiplies in software? I thought that was only done on
> > really small microcontrollers or smart cards; but anything 32bit+
> > that runs Linux should have hardware multiply, shouldn't it?
>
> SPARCv7 (sun4/sun4c) doesn't have hardware mul/div. This includes
> SparcStation 1, 1+, 2, SLC, ELC, IPC and IPX.

Right, and the cypress sun4m parts don't do multiply/divide in
hardware either.

I believe the Alpha does divide in software too, see
arch/alpha/lib/divide.S

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support II
  2007-03-06 14:04 ` [RFC] div64_64 support II Andi Kleen
  2007-03-06 17:43   ` Dagfinn Ilmari Mannsåker
@ 2007-03-06 18:48   ` H. Peter Anvin
  1 sibling, 0 replies; 57+ messages in thread
From: H. Peter Anvin @ 2007-03-06 18:48 UTC (permalink / raw)
  To: Andi Kleen; +Cc: David Miller, shemminger, jengelh, linux-kernel, netdev

Andi Kleen wrote:
>> The problem with these algorithms that trade off one or more
>> multiplies in order to avoid a divide is that they don't give anything
>> and often lose when both multiplies and divides are emulated in
>> software.
>
> Actually on rereading this: is there really any Linux port that
> emulates multiplies in software? I thought that was only done on
> really small microcontrollers or smart cards; but anything 32bit+
> that runs Linux should have hardware multiply, shouldn't it?

SPARC < v8 does multiplies using an MSTEP instruction.

	-hpa

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-03-05 23:57 ` Stephen Hemminger
  2007-03-06  0:25   ` David Miller
@ 2007-03-06 13:34   ` Andi Kleen
  2007-03-06 14:19     ` Eric Dumazet
  1 sibling, 1 reply; 57+ messages in thread
From: Andi Kleen @ 2007-03-06 13:34 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Andi Kleen, Jan Engelhardt, linux-kernel, netdev

On Mon, Mar 05, 2007 at 03:57:14PM -0800, Stephen Hemminger wrote:
> On 03 Mar 2007 03:31:52 +0100
> Andi Kleen <andi@firstfloor.org> wrote:
>
> > Stephen Hemminger <shemminger@linux-foundation.org> writes:
> >
> > > Here is another way to handle the 64 bit divide case.
> > > It allows full 64 bit divide by adding the support routine
> > > GCC needs.
> >
> > Not supplying that was intentional by Linus so that people think twice
> > (or more often) before using such expensive operations. A plain /
> > looks too innocent.
> >
> > Is it really needed by CUBIC anyways? It uses it for getting the cubic
> > root, but the algorithm recommended by Hacker's Delight (great book)
> > doesn't use any divisions at all. Probably better to use a better
> > algorithm without divisions.
>
> I tried the code from Hacker's Delight.
> It is cool, but performance is CPU (and data) dependent:

I did too. My experiences were mixed: on 32bit it was slower, on 64bit
faster on average. Strangely the 32bit version ran faster again without
-fomit-frame-pointer, so it's likely some funny interaction with 32bit
long long code generation. The difference is never more than 100 cycles
so it shouldn't be a big issue either way.

For some input arguments (<1% in my testing) it also gave an answer 1
off from the existing code, but I don't think that's a problem.

But more importantly during testing I found that the cubic code gives
a division by zero for input arguments >2^43. If you have a system with
>16TB of memory this could actually be a remotely exploitable bug :)

I still think it's a good idea to switch to the new function,
especially since it's shorter code. Here's the patch. Note I didn't
verify it with real large window TCP operations; only unit testing.

-Andi

Use Hacker's Delight cube root algorithm in cubic TCP

Shorter code and fixes a theoretically remotely exploitable bug.

Signed-off-by: Andi Kleen <ak@suse.de>

Index: linux-2.6.21-rc1-net/net/ipv4/tcp_cubic.c
===================================================================
--- linux-2.6.21-rc1-net.orig/net/ipv4/tcp_cubic.c
+++ linux-2.6.21-rc1-net/net/ipv4/tcp_cubic.c
@@ -93,51 +93,24 @@ static void bictcp_init(struct sock *sk)
 	tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
-/* 64bit divisor, dividend and result. dynamic precision */
-static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+static u32 cubic_root(u64 x)
 {
-	u_int32_t d = divisor;
-
-	if (divisor > 0xffffffffULL) {
-		unsigned int shift = fls(divisor >> 32);
-
-		d = divisor >> shift;
-		dividend >>= shift;
-	}
-
-	/* avoid 64 bit division if possible */
-	if (dividend >> 32)
-		do_div(dividend, d);
-	else
-		dividend = (uint32_t) dividend / d;
-
-	return dividend;
-}
-
-/*
- * calculate the cubic root of x using Newton-Raphson
- */
-static u32 cubic_root(u64 a)
-{
-	u32 x, x1;
-
-	/* Initial estimate is based on:
-	 * cbrt(x) = exp(log(x) / 3)
-	 */
-	x = 1u << (fls64(a)/3);
-
-	/*
-	 * Iteration based on:
-	 *                         2
-	 * x    = ( 2 * x  +  a / x  ) / 3
-	 *  k+1          k         k
-	 */
-	do {
-		x1 = x;
-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
-	} while (abs(x1 - x) > 1);
-
-	return x;
+	int s;
+	u32 y;
+	u64 b;
+	u64 bs;
+
+	y = 0;
+	for (s = 63; s >= 0; s -= 3) {
+		y = 2 * y;
+		b = 3 * y * (y+1) + 1;
+		bs = b << s;
+		if (x >= bs && (b == (bs>>s))) {	/* avoid overflow */
+			x -= bs;
+			y++;
+		}
+	}
+	return y;
 }
 
 /*

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-03-06 13:34 ` [RFC] div64_64 support Andi Kleen
@ 2007-03-06 14:19   ` Eric Dumazet
  2007-03-06 14:45     ` Andi Kleen
  0 siblings, 1 reply; 57+ messages in thread
From: Eric Dumazet @ 2007-03-06 14:19 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Stephen Hemminger, Jan Engelhardt, linux-kernel, netdev

On Tuesday 06 March 2007 14:34, Andi Kleen wrote:

> -	return x;
> +	int s;
> +	u32 y;
> +	u64 b;
> +	u64 bs;
> +
> +	y = 0;
> +	for (s = 63; s >= 0; s -= 3) {
> +		y = 2 * y;
> +		b = 3 * y * (y+1) + 1;
> +		bs = b << s;
> +		if (x >= bs && (b == (bs>>s))) {	/* avoid overflow */
> +			x -= bs;
> +			y++;
> +		}
> +	}
> +	return y;
> }
>

Andi

<rant>
Let me see... You throw code like that and expect someone to actually
understand it in one year, and be able to correct a bug ?
</rant>

Please add something, a URL or even better a nice explanation, per
favor...

Thank you

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-03-06 14:19 ` Eric Dumazet
@ 2007-03-06 14:45   ` Andi Kleen
  2007-03-06 15:10     ` Roland Kuhn
  2007-03-06 18:50     ` [RFC] div64_64 support H. Peter Anvin
  0 siblings, 2 replies; 57+ messages in thread
From: Andi Kleen @ 2007-03-06 14:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andi Kleen, Stephen Hemminger, Jan Engelhardt, linux-kernel, netdev

> <rant>
> Let me see... You throw code like that and expect someone to actually
> understand it in one year, and be able to correct a bug ?

To be honest I don't expect any bugs in this function.

> </rant>
>
> Please add something, a URL or even better a nice explanation, per
> favor...

It's straight out of Hacker's Delight, which is referenced in the
commit log.

-Andi

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-03-06 14:45 ` Andi Kleen
@ 2007-03-06 15:10   ` Roland Kuhn
  2007-03-06 18:29     ` Stephen Hemminger
  0 siblings, 1 reply; 57+ messages in thread
From: Roland Kuhn @ 2007-03-06 15:10 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Eric Dumazet, Stephen Hemminger, Jan Engelhardt, linux-kernel, netdev

[-- Attachment #1.1: Type: text/plain, Size: 1341 bytes --]

Hi Andi!

On 6 Mar 2007, at 15:45, Andi Kleen wrote:

>> <rant>
>> Let me see... You throw code like that and expect someone to actually
>> understand it in one year, and be able to correct a bug ?
>
> To be honest I don't expect any bugs in this function.
>
>> </rant>
>>
>> Please add something, a URL or even better a nice explanation,
>> per favor...
>
> It's straight out of Hacker's Delight, which is referenced in the
> commit log.
>

And it's pretty neat, too. Hint: (y+1)**3 = y**3 + 3*y**2 + 3*y + 1.
The algorithm is exactly the same as for calculating the cubic root on
paper, digit by digit. I found that algo in the school notebook of my
grandpa (late 1920s), a pity that it's not taught anymore... pocket
calculators _do_ have downsides ;-)

Ciao,
Roland

-- 
TU Muenchen, Physik-Department E18, James-Franck-Str., 85748 Garching
Telefon 089/289-12575; Telefax 089/289-12570
--
CERN office: 892-1-D23 phone: +41 22 7676540 mobile: +41 76 487 4482
--
Any society that would give up a little liberty to gain a little
security will deserve neither and lose both.  - Benjamin Franklin
-----BEGIN GEEK CODE BLOCK-----
Version: 3.12
GS/CS/M/MU d-(++) s:+ a-> C+++ UL++++ P+++ L+++ E(+) W+ !N K- w--- M+
!V Y+ PGP++ t+(++) 5 R+ tv-- b+ DI++ e+++>++++ h---- y+++
------END GEEK CODE BLOCK------

[-- Attachment #1.2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 4324 bytes --]

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 186 bytes --]

^ permalink raw reply	[flat|nested] 57+ messages in thread
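[Spelling out Roland's hint as comments on the algorithm quoted above;
a hedged restatement, not code from the thread.  One precaution of
mine: the multiply is widened to 64 bits, since 3*y*(y+1) no longer
fits in 32 bits once y grows past ~37000:]

	#include <stdint.h>

	/* Digit-by-digit binary cube root.  Invariant: y is the cube
	 * root of the input bits consumed so far, and x holds the
	 * remaining difference.  Appending a 1 bit means trying
	 * y' = 2y + 1, and since
	 *	(2y + 1)^3 = (2y)^3 + 3*(2y)*(2y + 1) + 1
	 * the extra cost over plain doubling is b = 3*y*(y+1) + 1,
	 * computed after y has been doubled -- Roland's identity. */
	static uint32_t cbrt64(uint64_t x)
	{
		uint32_t y = 0;
		int s;

		for (s = 63; s >= 0; s -= 3) {
			uint64_t b, bs;

			y = 2 * y;			/* append a 0 digit */
			b = 3 * (uint64_t)y * (y + 1) + 1;
			bs = b << s;
			/* take the 1 digit if its cube still fits;
			 * the second test rejects an overflowed shift */
			if (x >= bs && b == (bs >> s)) {
				x -= bs;
				y++;
			}
		}
		return y;
	}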
* Re: [RFC] div64_64 support
  2007-03-06 15:10 ` Roland Kuhn
@ 2007-03-06 18:29   ` Stephen Hemminger
  2007-03-06 19:48     ` Andi Kleen
                       ` (2 more replies)
  0 siblings, 3 replies; 57+ messages in thread
From: Stephen Hemminger @ 2007-03-06 18:29 UTC (permalink / raw)
  To: Roland Kuhn
  Cc: Andi Kleen, Eric Dumazet, Jan Engelhardt, linux-kernel, netdev

Don't count the existing Newton-Raphson out. It turns out that to get
enough precision for 32 bits, only 4 iterations are needed. By
unrolling those, it gets much better timing.

Slightly gross test program (with the original cubic wraparound bug
fixed):

---
/* Test and measure perf of cube root algorithms. */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>	/* for sleep(); missing in the original posting */
#include <math.h>

#ifdef __x86_64

#define rdtscll(val) do { \
	unsigned int __a,__d; \
	asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
	(val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)

# define do_div(n,base) ({ \
	uint32_t __base = (base); \
	uint32_t __rem; \
	__rem = ((uint64_t)(n)) % __base; \
	(n) = ((uint64_t)(n)) / __base; \
	__rem; \
})

/**
 * __ffs - find first bit in word.
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 */
static __inline__ unsigned long __ffs(unsigned long word)
{
	__asm__("bsfq %1,%0"
		:"=r" (word)
		:"rm" (word));
	return word;
}

/*
 * __fls: find last bit set.
 * @word: The word to search
 *
 * Undefined if no zero exists, so code should check against ~0UL first.
 */
static inline unsigned long __fls(unsigned long word)
{
	__asm__("bsrq %1,%0"
		:"=r" (word)
		:"rm" (word));
	return word;
}

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz (man ffs).
 */
static __inline__ int ffs(int x)
{
	int r;

	__asm__("bsfl %1,%0\n\t"
		"cmovzl %2,%0"
		: "=r" (r) : "rm" (x), "r" (-1));
	return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs.
 */
static inline int fls(int x)
{
	int r;

	__asm__("bsrl %1,%0\n\t"
		"cmovzl %2,%0"
		: "=&r" (r) : "rm" (x), "rm" (-1));
	return r+1;
}

/**
 * fls64 - find last bit set in 64 bit word
 * @x: the word to search
 *
 * This is defined the same way as fls.
 */
static inline int fls64(uint64_t x)
{
	if (x == 0)
		return 0;
	return __fls(x) + 1;
}

static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	return dividend / divisor;
}

#elif __i386

#define rdtscll(val) \
	__asm__ __volatile__("rdtsc" : "=A" (val))

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz() (man ffs).
 */
static inline int ffs(int x)
{
	int r;

	__asm__("bsfl %1,%0\n\t"
		"jnz 1f\n\t"
		"movl $-1,%0\n"
		"1:" : "=r" (r) : "rm" (x));
	return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs().
 */
static inline int fls(int x)
{
	int r;

	__asm__("bsrl %1,%0\n\t"
		"jnz 1f\n\t"
		"movl $-1,%0\n"
		"1:" : "=r" (r) : "rm" (x));
	return r+1;
}

static inline int fls64(uint64_t x)
{
	uint32_t h = x >> 32;
	if (h)
		return fls(h) + 32;
	return fls(x);
}

#define do_div(n,base) ({ \
	unsigned long __upper, __low, __high, __mod, __base; \
	__base = (base); \
	asm("":"=a" (__low), "=d" (__high):"A" (n)); \
	__upper = __high; \
	if (__high) { \
		__upper = __high % (__base); \
		__high = __high / (__base); \
	} \
	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
	asm("":"=A" (n):"a" (__low),"d" (__high)); \
	__mod; \
})

/* 64bit divisor, dividend and result. dynamic precision */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	uint32_t d = divisor;

	if (divisor > 0xffffffffULL) {
		unsigned int shift = fls(divisor >> 32);

		d = divisor >> shift;
		dividend >>= shift;
	}

	/* avoid 64 bit division if possible */
	if (dividend >> 32)
		do_div(dividend, d);
	else
		dividend = (uint32_t) dividend / d;

	return dividend;
}
#endif

/* Andi Kleen's version */
uint32_t acbrt(uint64_t x)
{
	uint32_t y = 0;
	int s;

	for (s = 63; s >= 0; s -= 3) {
		uint64_t b, bs;

		y = 2 * y;
		b = 3 * y * (y+1) + 1;
		bs = b << s;
		if (x >= bs && (b == (bs>>s))) {	/* avoid overflow */
			x -= bs;
			y++;
		}
	}
	return y;
}

/* My version of hacker's delight */
uint32_t hcbrt(uint64_t x)
{
	int s = 60;
	uint32_t y = 0;

	do {
		uint64_t b;

		y = 2*y;
		b = (uint64_t)(3*y*(y + 1) + 1) << s;
		s = s - 3;
		if (x >= b) {
			x = x - b;
			y = y + 1;
		}
	} while(s >= 0);

	return y;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ocubic(uint64_t a)
{
	uint32_t x, x1;

	/* Initial estimate is based on:
	 * cbrt(x) = exp(log(x) / 3)
	 */
	x = 1u << (fls64(a)/3);

	/*
	 * Iteration based on:
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	do {
		uint64_t x2;

		x2 = x;
		x2 *= x;
		x1 = x;
		x = (2 * x + div64_64(a, x2)) / 3;
	} while (abs(x1 - x) > 1);

	return x;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ncubic(uint64_t a)
{
	uint64_t x;

	/* Initial estimate is based on:
	 * cbrt(x) = exp(log(x) / 3)
	 */
	x = 1u << (fls64(a)/3);

	/* Converges in 3 iterations to > 32 bits */
	x = (2 * x + div64_64(a, x*x)) / 3;
	x = (2 * x + div64_64(a, x*x)) / 3;
	x = (2 * x + div64_64(a, x*x)) / 3;

	return x;
}

static const struct cbrt {
	uint64_t in;
	uint32_t result;
} cases[] = {
	{1, 1}, {2, 1}, {3, 1}, {4, 1}, {5, 1}, {6, 1}, {7, 1},
	{8, 2}, {9, 2}, {10, 2}, {11, 2}, {12, 2}, {13, 2}, {14, 2},
	{15, 2}, {16, 2}, {17, 2}, {18, 2}, {19, 2}, {20, 2}, {21, 2},
	{22, 2}, {23, 2}, {24, 2}, {25, 2}, {26, 2},
	{27, 3}, {28, 3}, {29, 3}, {30, 3}, {31, 3}, {32, 3}, {33, 3},
	{34, 3}, {35, 3}, {36, 3}, {37, 3}, {38, 3}, {39, 3}, {40, 3},
	{99, 4}, {100, 4}, {101, 4},
	{ 125ull, 5 }, { 216ull, 6 }, { 343ull, 7 }, { 512ull, 8 },
	{ 1000ull, 10 }, { 1331ull, 11 }, { 8000ull, 20 }, { 9261ull, 21 },
	{32767, 31}, {32768, 32}, {32769, 32},
	{ 64000ull, 40 }, { 68921ull, 41 },
	{ 512000ull, 80 }, { 531441ull, 81 },
	{ 1000000ull, 100 }, { 1030301ull, 101 },
	{ 4096000ull, 160 }, { 4173281ull, 161 },
	{ 16387064ull, 254 }, { 16581375ull, 255 },
	{ 16777216ull, 256 }, { 16974593ull, 257 },
	{ 131096512ull, 508 }, { 131872229ull, 509 },
	{ 132651000ull, 510 }, { 133432831ull, 511 },
	{ 134217728ull, 512 }, { 135005697ull, 513 },
	{ 1000000000ull, 1000 }, { 1003003001ull, 1001 },
	{ 1006012008ull, 1002 }, { 1009027027ull, 1003 },
	{ 1061208000ull, 1020 }, { 1064332261ull, 1021 },
	{ 1067462648ull, 1022 }, { 1070599167ull, 1023 },
	{1073741823, 1023}, {1073741824, 1024}, {1073741825, 1024},
	{~0, 2097151},
	/* 100 random values */
	{ 7749363893351949254ull, 1978891}, { 7222815480849057907ull, 1933016},
	{ 8408462745175416063ull, 2033475}, { 3091884191388096748ull, 1456826},
	{ 2562019500164152525ull, 1368340}, { 4403210617922443179ull, 1639041},
	{ 3364542905362882299ull, 1498449}, { 8782769017716072774ull, 2063211},
	{ 5863405773976003266ull, 1803225}, { 1306053050111174648ull, 1093084},
	{ 150346236956174824ull, 531737}, { 1265737889039205261ull, 1081719},
	{ 1445109530774087002ull, 1130577}, { 1197105577171186275ull, 1061803},
	{ 9213452462461015967ull, 2096399}, { 4730966302945445786ull, 1678739},
	{ 5650605098630667570ull, 1781141}, { 5880381756353009591ull, 1804963},
	{ 4552499520046621784ull, 1657359}, { 2697991130065918298ull, 1392131},
	{ 4858364911220984157ull, 1693674}, { 3691457481531040535ull, 1545489},
	{ 2613117305472506601ull, 1377377}, { 7449943749836318932ull, 1953069},
	{ 643378865959570610ull, 863287}, { 4851450802679832774ull, 1692871},
	{ 1772859812839988916ull, 1210295}, { 8210946489571640849ull, 2017426},
	{ 591875965497384322ull, 839608}, { 4221553402965100097ull, 1616183},
	{ 2197744667347238205ull, 1300146}, { 8321400714356781191ull, 2026432},
	{ 2459557415995497961ull, 1349850}, { 3460673533926954145ull, 1512586},
	{ 4727304344741345505ull, 1678306}, { 4903203917250634599ull, 1698869},
	{ 4036494370831490817ull, 1592214}, { 8585205035691420311ull, 2047624},
	{ 2622143824319236828ull, 1378961}, { 5902762718897731478ull, 1807250},
	{ 6344401509618197560ull, 1851243}, { 4059247793194552874ull, 1595200},
	{ 7648030174294342832ull, 1970228}, { 2111858627070002939ull, 1282985},
	{ 3231502273651985583ull, 1478432}, { 8821862535190318932ull, 2066268},
	{ 6062559696943389464ull, 1823414}, { 4054224670122353756ull, 1594541},
	{ 3674929609692563482ull, 1543179}, { 6310802012126231363ull, 1847969},
	{ 4450190829039920890ull, 1644849}, { 8764531173541462842ull, 2061782},
	{ 1361923252301505833ull, 1108453}, { 5912924843615600614ull, 1808287},
	{ 5714768882048811324ull, 1787857}, { 7249589769047033748ull, 1935401},
	{ 4123157012528828376ull, 1603528}, { 1729687638268160097ull, 1200390},
	{ 5132287771298228729ull, 1724925}, { 1564349257200314043ull, 1160854},
	{ 951586254223522969ull, 983594}, { 4569664949094662293ull, 1659439},
	{ 9082730968228181483ull, 2086437}, { 6312891027251024051ull, 1848173},
	{ 6915415788559031791ull, 1905194}, { 2713150456497618688ull, 1394733},
	{ 5390954890749602465ull, 1753430}, { 1405547745908296421ull, 1120164},
	{ 1157301728707637259ull, 1049902}, { 1513573187112042448ull, 1148156},
	{ 687416080475161159ull, 882551}, { 484496930861389501ull, 785411},
	{ 1625256440396143907ull, 1175729}, { 7358388240824901288ull, 1945035},
	{ 6055730836615196283ull, 1822729}, { 5897962221937294789ull, 1806760},
	{ 862205218853780339ull, 951780}, { 4798091009445823173ull, 1686641},
	{ 644772714391937867ull, 863910}, { 4255852691293155171ull, 1620549},
	{ 5287931004512034672ull, 1742188}, { 479051048987854372ull, 782457},
	{ 9223312736680112286ull, 2097147}, { 8208392001457969628ull, 2017217},
	{ 9203071384420047828ull, 2095612}, { 8029313043584389618ull, 2002439},
	{ 38384068872053008ull, 337326}, { 5477688516749455419ull, 1762784},
	{ 1504622508868036557ull, 1145888}, { 8421184723110053200ull, 2034500},
	{ 3312070181890020423ull, 1490618}, { 5344298403762143580ull, 1748357},
	{ 6340030040222269807ull, 1850818}, { 4895839553118470425ull, 1698018},
	{ 2806627376195262363ull, 1410570}, { 5321619225005368821ull, 1745880},
	{ 6897323351052656353ull, 1903532}, { 326700202259382556ull, 688731},
	{ 7685269066741890339ull, 1973420}, { 8054506481558450217ull, 2004531},
};
#define NCASES (sizeof(cases)/sizeof(cases[0]))

static double ticks_per_usec;

static void show(const char *func, uint64_t sum, uint64_t sq,
		 unsigned long long mx, unsigned long err)
{
	double mean, std;

	mean = (double) sum / ticks_per_usec / NCASES;
	std = sqrtl( (double) sq / ticks_per_usec / NCASES - mean * mean);

	printf("%-10s %8llu %8.2f %8.2f %8.2f %lu\n", func,
	       (unsigned long long) sum / NCASES,
	       mean, std, (double) mx / ticks_per_usec, err);
}

int main(int argc, char **argv)
{
	int i;
	uint32_t c;
	unsigned long long start, end, t, mx;
	unsigned long long err, sum, sum_sq;

	rdtscll(start);
	sleep(2);
	rdtscll(end);
	ticks_per_usec = (double) (end - start) / 2000000.;

	printf("Function  clocks  mean(us)  max(us)  std(us)  total error\n");

#define DOTEST(func) \
	if (func(27) != 3) printf("ouch\n"); \
	sum = sum_sq = mx = 0; \
	for (err = 0, i = 0; i < NCASES; i++) { \
		rdtscll(start); \
		c = func(cases[i].in); \
		rdtscll(end); \
		t = end - start; sum += t; sum_sq += t*t; \
		if (t > mx) mx = t; \
		err += abs((long) cases[i].result - c); \
	} \
	show(#func, sum, sum_sq, mx, err);

	DOTEST(ocubic);
	DOTEST(ncubic);
	DOTEST(acbrt);
	DOTEST(hcbrt);

	return 0;
}

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
2007-03-06 18:29 ` Stephen Hemminger
@ 2007-03-06 19:48 ` Andi Kleen
2007-03-06 20:04 ` Stephen Hemminger
2007-03-06 21:53 ` Sami Farin
2007-03-06 21:58 ` [RFC] div64_64 support David Miller
2 siblings, 1 reply; 57+ messages in thread
From: Andi Kleen @ 2007-03-06 19:48 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Roland Kuhn, Andi Kleen, Eric Dumazet, Jan Engelhardt, linux-kernel, netdev

On Tue, Mar 06, 2007 at 10:29:41AM -0800, Stephen Hemminger wrote:
> Don't count the existing Newton-Raphson out. It turns out that to get enough
> precision for 32 bits, only 4 iterations are needed. By unrolling those, it
> gets much better timing.

But did you fix the >2^43 bug too?

SGI has already shipped 10TB Altixen, so it's not entirely theoretical.

-Andi

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
2007-03-06 19:48 ` Andi Kleen
@ 2007-03-06 20:04 ` Stephen Hemminger
0 siblings, 0 replies; 57+ messages in thread
From: Stephen Hemminger @ 2007-03-06 20:04 UTC (permalink / raw)
To: Andi Kleen
Cc: Roland Kuhn, Andi Kleen, Eric Dumazet, Jan Engelhardt, linux-kernel, netdev

On Tue, 6 Mar 2007 20:48:41 +0100
Andi Kleen <andi@firstfloor.org> wrote:

> On Tue, Mar 06, 2007 at 10:29:41AM -0800, Stephen Hemminger wrote:
> > Don't count the existing Newton-Raphson out. It turns out that to get enough
> > precision for 32 bits, only 4 iterations are needed. By unrolling those, it
> > gets much better timing.
>
> But did you fix the >2^43 bug too?

It was caused by not doing x^2 in 64 bit.

>
> SGI has already shipped 10TB Altixen, so it's not entirely theoretical.
>
> -Andi
>

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 57+ messages in thread
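A minimal sketch of the overflow being referred to, assuming the pre-fix
code squared a 32-bit x: the product x*x is then computed modulo 2^32, so
once the Newton iterate exceeds 65535 the squared term wraps and the
division goes wrong (for inputs near 2^64 it can even wrap to zero).
Promoting one operand to 64 bits is the fix:

	uint32_t x = 70000;                 /* plausible Newton iterate */
	uint32_t wrong = x * x;             /* 605032704 = 4900000000 mod 2^32 */
	uint64_t right = (uint64_t)x * x;   /* 4900000000, the true square */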
* Re: [RFC] div64_64 support
2007-03-06 18:29 ` Stephen Hemminger
2007-03-06 19:48 ` Andi Kleen
@ 2007-03-06 21:53 ` Sami Farin
2007-03-06 22:24 ` Sami Farin
2007-03-06 21:58 ` [RFC] div64_64 support David Miller
2 siblings, 1 reply; 57+ messages in thread
From: Sami Farin @ 2007-03-06 21:53 UTC (permalink / raw)
To: linux-kernel, netdev

On Tue, Mar 06, 2007 at 10:29:41 -0800, Stephen Hemminger wrote:
> Don't count the existing Newton-Raphson out. It turns out that to get enough
> precision for 32 bits, only 4 iterations are needed. By unrolling those, it
> gets much better timing.
>
> Slightly gross test program (with original cubic wraparound bug fixed).
...
> {~0, 2097151},
       ^^^^^^^ this should be 2642245.

Without a serializing instruction before rdtsc, and with only one loop,
I do not get very accurate results (104 for ncubic, > 1000 for others).

#define rdtscll_serialize(val) \
	__asm__ __volatile__("movl $0, %%eax\n\tcpuid\n\trdtsc\n" : "=A" (val) : : "ebx", "ecx")

Here are Pentium D timings for 1000 loops.

~0, 2097151
Function     clocks  mean(us)  max(us)  std(us)  total error
ocubic          912     0.306   20.317    0.730       545101
ncubic          777     0.261   14.799    0.486       576263
acbrt          1168     0.392   21.681    0.547       547562
hcbrt           827     0.278   15.244    0.387         2410

~0, 2642245
Function     clocks  mean(us)  max(us)  std(us)  total error
ocubic          908     0.305   20.210    0.656            7
ncubic          775     0.260   14.792    0.550        31169
acbrt          1176     0.395   22.017    0.970         2468
hcbrt           826     0.278   15.326    0.670       547504

And I found a bug in gcc-4.1.2: it gave 0 for ncubic results
when doing the 1000-loop test... gcc-4.0.3 works.

-- 

^ permalink raw reply	[flat|nested] 57+ messages in thread
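The measurement pattern this enables, shown as a sketch (cpuid is a
serializing instruction, so rdtsc cannot be reordered around the code
under test; in-flight instructions are drained first):

	unsigned long long t0, t1;
	uint32_t v;

	rdtscll_serialize(t0);
	v = ncubic(x);               /* code under test */
	rdtscll_serialize(t1);
	/* t1 - t0 = elapsed cycles, including the (roughly constant)
	 * cpuid overhead, which cancels when comparing functions */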
* Re: [RFC] div64_64 support 2007-03-06 21:53 ` Sami Farin @ 2007-03-06 22:24 ` Sami Farin 2007-03-07 16:11 ` Chuck Ebbert 2007-03-08 18:23 ` asm volatile [Was: [RFC] div64_64 support] Sami Farin 0 siblings, 2 replies; 57+ messages in thread From: Sami Farin @ 2007-03-06 22:24 UTC (permalink / raw) To: linux-kernel, netdev On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote: ... > And I found bug in gcc-4.1.2, it gave 0 for ncubic results > when doing 1000 loops test... gcc-4.0.3 works. Found it. --- cbrt-test.c~ 2007-03-07 00:20:54.735248105 +0200 +++ cbrt-test.c 2007-03-07 00:21:03.964864343 +0200 @@ -209,7 +209,7 @@ __asm__("bsrl %1,%0\n\t" "cmovzl %2,%0" - : "=&r" (r) : "rm" (x), "rm" (-1)); + : "=&r" (r) : "rm" (x), "rm" (-1) : "memory"); return r+1; } Now Linux 2.6 does not have "memory" in fls, maybe it causes some gcc funnies some people are seeing. -- ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support 2007-03-06 22:24 ` Sami Farin @ 2007-03-07 16:11 ` Chuck Ebbert 2007-03-07 18:32 ` Sami Farin 2007-03-08 18:23 ` asm volatile [Was: [RFC] div64_64 support] Sami Farin 1 sibling, 1 reply; 57+ messages in thread From: Chuck Ebbert @ 2007-03-07 16:11 UTC (permalink / raw) To: Sami Farin; +Cc: linux-kernel, netdev Sami Farin wrote: > On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote: > ... >> And I found bug in gcc-4.1.2, it gave 0 for ncubic results >> when doing 1000 loops test... gcc-4.0.3 works. > > Found it. > > --- cbrt-test.c~ 2007-03-07 00:20:54.735248105 +0200 > +++ cbrt-test.c 2007-03-07 00:21:03.964864343 +0200 > @@ -209,7 +209,7 @@ > > __asm__("bsrl %1,%0\n\t" > "cmovzl %2,%0" > - : "=&r" (r) : "rm" (x), "rm" (-1)); > + : "=&r" (r) : "rm" (x), "rm" (-1) : "memory"); > return r+1; > } > > Now Linux 2.6 does not have "memory" in fls, maybe it causes > some gcc funnies some people are seeing. Can you post the difference in the generated code with that change? ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support 2007-03-07 16:11 ` Chuck Ebbert @ 2007-03-07 18:32 ` Sami Farin 0 siblings, 0 replies; 57+ messages in thread From: Sami Farin @ 2007-03-07 18:32 UTC (permalink / raw) To: linux-kernel, netdev [-- Attachment #1: Type: text/plain, Size: 6133 bytes --] On Wed, Mar 07, 2007 at 11:11:49 -0500, Chuck Ebbert wrote: > Sami Farin wrote: > > On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote: > > ... > >> And I found bug in gcc-4.1.2, it gave 0 for ncubic results > >> when doing 1000 loops test... gcc-4.0.3 works. > > > > Found it. > > > > --- cbrt-test.c~ 2007-03-07 00:20:54.735248105 +0200 > > +++ cbrt-test.c 2007-03-07 00:21:03.964864343 +0200 > > @@ -209,7 +209,7 @@ > > > > __asm__("bsrl %1,%0\n\t" > > "cmovzl %2,%0" > > - : "=&r" (r) : "rm" (x), "rm" (-1)); > > + : "=&r" (r) : "rm" (x), "rm" (-1) : "memory"); > > return r+1; > > } > > > > Now Linux 2.6 does not have "memory" in fls, maybe it causes > > some gcc funnies some people are seeing. > > Can you post the difference in the generated code with that change? Fun.. looks when not using "memory" gcc does not even bother calling ncubic() 666 times. So it gets better timings ( 42/666=0 ) =) --- cbrt-test-no_memory.s 2007-03-07 20:22:27.838466385 +0200 +++ cbrt-test-using_memory.s 2007-03-07 20:22:38.237013197 +0200 ... main: leal 4(%esp), %ecx andl $-16, %esp pushl -4(%ecx) pushl %ebp pushl %edi pushl %esi pushl %ebx pushl %ecx - subl $136, %esp + subl $152, %esp movl $.LC0, (%esp) call puts xorl %edx, %edx movl $27, %eax call ncubic cmpl $3, %eax - je .L83 + je .L87 movl $.LC1, (%esp) call puts -.L83: - xorl %eax, %eax - xorl %edi, %edi - movl %eax, 88(%esp) +.L87: xorl %eax, %eax - xorl %esi, %esi + xorl %ebp, %ebp movl %eax, 92(%esp) xorl %eax, %eax - xorl %ebp, %ebp + xorl %edi, %edi movl %eax, 96(%esp) xorl %eax, %eax + xorl %esi, %esi movl %eax, 100(%esp) xorl %eax, %eax movl %eax, 104(%esp) xorl %eax, %eax movl %eax, 108(%esp) - movl %edi, 112(%esp) - movl %esi, 116(%esp) - .p2align 4,,15 -.L84: + xorl %eax, %eax + movl %eax, 112(%esp) + movl %ebp, 116(%esp) + movl %edi, 120(%esp) + movl %esi, 124(%esp) +.L88: #APP movl $0, %eax cpuid rdtsc #NO_APP movl %eax, 56(%esp) movl %edx, 60(%esp) #APP movl $0, %eax cpuid rdtsc #NO_APP movl %eax, %esi movl %edx, %edi subl 56(%esp), %esi sbbl 60(%esp), %edi cmpl $0, %edi ja .L66 cmpl $999, %esi - jbe .L84 + jbe .L88 .L66: + movl 92(%esp), %edx + leal (%edx,%edx,2), %eax + movl cases+4(,%eax,4), %edi + movl cases(,%eax,4), %esi + movl %edi, %edx + movl %esi, %eax + call ncubic #APP movl $0, %eax cpuid rdtsc #NO_APP - movl %eax, %esi - movl %edx, %edi + movl $666, %ebx + movl %eax, 128(%esp) + movl %edx, 132(%esp) + .p2align 4,,15 +.L67: + movl %esi, %eax + movl %edi, %edx + call ncubic + decl %ebx + movl %eax, %ebp + jne .L67 #APP movl $0, %eax cpuid rdtsc #NO_APP - subl %esi, %eax + subl 128(%esp), %eax movl $666, %ebx - sbbl %edi, %edx - xorl %ecx, %ecx movl %ebx, 8(%esp) + sbbl 132(%esp), %edx + xorl %ecx, %ecx movl %ecx, 12(%esp) movl %eax, (%esp) movl %edx, 4(%esp) call __udivdi3 - addl %eax, 104(%esp) + addl %eax, 112(%esp) movl %edx, %ecx movl %eax, %ebx movl %edx, %esi - adcl %edx, 108(%esp) + adcl %edx, 116(%esp) imull %eax, %ecx mull %ebx addl %ecx, %ecx movl %eax, 56(%esp) addl %ecx, %edx movl 56(%esp), %eax - addl %eax, 112(%esp) + addl %eax, 120(%esp) movl %edx, 60(%esp) movl 60(%esp), %edx - adcl %edx, 116(%esp) - cmpl %esi, 92(%esp) - ja .L67 - jb .L68 - cmpl %ebx, 88(%esp) - jae .L67 -.L68: - movl %ebx, 88(%esp) - movl %esi, 92(%esp) -.L67: - leal 
(%ebp,%ebp,2), %ebx - sall $2, %ebx - movl cases+4(%ebx), %edx - movl cases(%ebx), %eax - call ncubic - movl cases+8(%ebx), %edx - subl %eax, %edx - movl %edx, %eax - sarl $31, %eax - xorl %eax, %edx - subl %eax, %edx - movl %edx, %ecx - sarl $31, %ecx - addl %edx, 96(%esp) - adcl %ecx, 100(%esp) - incl %ebp - cmpl $183, %ebp - jbe .L84 - movl 108(%esp), %eax - fildll 104(%esp) - testl %eax, %eax - js .L85 + adcl %edx, 124(%esp) + cmpl %esi, 100(%esp) + ja .L69 + jb .L70 + cmpl %ebx, 96(%esp) + jae .L69 .L70: - fstpl 120(%esp) + movl %ebx, 96(%esp) + movl %esi, 100(%esp) +.L69: + movl 92(%esp), %edx + leal (%edx,%edx,2), %eax + movl cases+8(,%eax,4), %eax + subl %ebp, %eax + movl %eax, %ecx + sarl $31, %ecx + xorl %ecx, %eax + subl %ecx, %eax + cltd + addl %eax, 104(%esp) + adcl %edx, 108(%esp) + incl 92(%esp) + cmpl $183, 92(%esp) + jbe .L88 movl 116(%esp), %eax - fldl 120(%esp) + fildll 112(%esp) + testl %eax, %eax + js .L89 +.L72: + fstpl 136(%esp) + movl 124(%esp), %eax + fldl 136(%esp) fdivl .LC7 testl %eax, %eax flds .LC4 fdivr %st, %st(1) - fildll 112(%esp) - js .L86 -.L71: - fstpl 120(%esp) - fldl 120(%esp) + fildll 120(%esp) + js .L90 +.L73: + fstpl 136(%esp) + fldl 136(%esp) fdivl .LC7 fdivp %st, %st(1) fld %st(1) fmul %st(2), %st fsubrp %st, %st(1) fld %st(0) fsqrt fucomi %st(0), %st - jp .L88 - je .L89 -.L88: + jp .L92 + je .L93 +.L92: fstp %st(0) fstpl (%esp) fstpl 64(%esp) call sqrt fldl 64(%esp) fxch %st(1) -.L72: - movl 96(%esp), %eax - movl 100(%esp), %edx - fildll 88(%esp) +.L74: + movl 104(%esp), %eax + movl 108(%esp), %edx + fildll 96(%esp) movl %eax, 40(%esp) - movl 92(%esp), %eax + movl 100(%esp), %eax movl %edx, 44(%esp) testl %eax, %eax - js .L87 -.L73: - fstpl 120(%esp) - movl 104(%esp), %eax + js .L91 +.L75: + fstpl 136(%esp) + movl 112(%esp), %eax movl $184, %ebp - fldl 120(%esp) + fldl 136(%esp) xorl %edi, %edi movl $.LC5, %esi fdivl .LC7 - movl 108(%esp), %edx + movl 116(%esp), %edx movl %ebp, 8(%esp) movl %edi, 12(%esp) movl %eax, (%esp) movl %edx, 4(%esp) fstpl 32(%esp) fstpl 24(%esp) fstpl 16(%esp) call __udivdi3 movl %esi, 4(%esp) movl $.LC6, (%esp) movl %eax, 8(%esp) movl %edx, 12(%esp) call printf - addl $136, %esp + addl $152, %esp xorl %eax, %eax popl %ecx popl %ebx popl %esi popl %edi popl %ebp leal -4(%ecx), %esp ret -.L89: +.L93: fstp %st(1) + jmp .L74 +.L89: + fadds .LC2 jmp .L72 -.L85: +.L91: fadds .LC2 - jmp .L70 -.L87: + jmp .L75 +.L90: fadds .LC2 jmp .L73 -.L86: - fadds .LC2 - jmp .L71 .size main, .-main .section .rodata .align 32 .type cases, @object .size cases, 2208 cases: ... -- [-- Attachment #2: cbrt-test.c --] [-- Type: text/plain, Size: 11463 bytes --] /* Don't count the existing Newton-Raphson out. It turns out that to get enough precision for 32 bits, only 4 iterations are needed. By unrolling those, it gets much better timing. Slightly gross test program (with original cubic wraparound bug fixed). --- */ /* Test and measure perf of cube root algorithms. */ #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <math.h> #define MEMCLOBBER 1 /** * ffs - find first bit set * @x: the word to search * * This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). 
*/ static inline int ffs(int x) { int r; __asm__("bsfl %1,%0\n\t" "cmovzl %2,%0" #if MEMCLOBBER : "=&r" (r) : "rm" (x), "r" (-1) : "memory"); #else : "=&r" (r) : "rm" (x), "r" (-1)); #endif return r+1; } /** * fls - find last bit set * @x: the word to search * * This is defined the same way as ffs. */ static inline int fls(int x) { int r; __asm__("bsrl %1,%0\n\t" "cmovzl %2,%0" #if MEMCLOBBER : "=&r" (r) : "rm" (x), "r" (-1) : "memory" ); #else : "=&r" (r) : "rm" (x), "r" (-1)); #endif return r+1; } #ifdef __x86_64 #define rdtscll(val) do { \ unsigned int __a,__d; \ asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \ (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \ } while(0) # define do_div(n,base) ({ \ uint32_t __base = (base); \ uint32_t __rem; \ __rem = ((uint64_t)(n)) % __base; \ (n) = ((uint64_t)(n)) / __base; \ __rem; \ }) /** * __ffs - find first bit in word. * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ static inline unsigned long __ffs(unsigned long word) { __asm__("bsfq %1,%0" :"=r" (word) :"rm" (word)); return word; } /* * __fls: find last bit set. * @word: The word to search * * Undefined if no zero exists, so code should check against ~0UL first. */ static inline unsigned long __fls(unsigned long word) { __asm__("bsrq %1,%0" :"=r" (word) :"rm" (word)); return word; } /** * fls64 - find last bit set in 64 bit word * @x: the word to search * * This is defined the same way as fls. */ static inline int fls64(uint64_t x) { if (x == 0) return 0; return __fls(x) + 1; } static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor) { return dividend / divisor; } #elif __i386 #define rdtscll_start(val) \ __asm__ __volatile__("movl $0, %%eax\n\tcpuid\n\trdtsc\n" : "=A" (val) : : "ebx", "ecx") #define rdtscll(val) \ __asm__ __volatile__("rdtsc" : "=A" (val)) static inline int fls64(uint64_t x) { uint32_t h = x >> 32; if (h) return fls(h) + 32; return fls(x); } #define do_div(n,base) ({ \ unsigned long __upper, __low, __high, __mod, __base; \ __base = (base); \ asm("":"=a" (__low), "=d" (__high):"A" (n)); \ __upper = __high; \ if (__high) { \ __upper = __high % (__base); \ __high = __high / (__base); \ } \ asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \ asm("":"=A" (n):"a" (__low),"d" (__high)); \ __mod; \ }) /* 64bit divisor, dividend and result. 
dynamic precision */ static uint64_t div64_64(uint64_t dividend, uint64_t divisor) { uint32_t d = divisor; if (divisor > 0xffffffffULL) { unsigned int shift = fls(divisor >> 32); d = divisor >> shift; dividend >>= shift; } /* avoid 64 bit division if possible */ if (dividend >> 32) do_div(dividend, d); else dividend = (uint32_t) dividend / d; return dividend; } #endif /* Andi Kleen's version */ uint32_t acbrt(uint64_t x) { uint32_t y = 0; int s; for (s = 63; s >= 0; s -= 3) { uint64_t b, bs; y = 2 * y; b = 3 * y * (y+1) + 1; bs = b << s; if (x >= bs && (b == (bs>>s))) { /* avoid overflow */ x -= bs; y++; } } return y; } /* My version of hacker's delight */ uint32_t hcbrt(uint64_t x) { int s = 60; uint32_t y = 0; do { uint64_t b; y = 2*y; b = (uint64_t)(3*y*(y + 1) + 1) << s; s = s - 3; if (x >= b) { x = x - b; y = y + 1; } } while(s >= 0); return y; } /* calculate the cubic root of x using Newton-Raphson */ static uint32_t ocubic(uint64_t a) { uint32_t x, x1; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << (fls64(a)/3); /* * Iteration based on: * 2 * x = ( 2 * x + a / x ) / 3 * k+1 k k */ do { uint64_t x2; x2 = x; x2 *= x; x1 = x; x = (2 * x + div64_64(a, x2)) / 3; } while (abs(x1 - x) > 1); return x; } /* calculate the cubic root of x using Newton-Raphson */ static uint32_t ncubic(uint64_t a) { uint64_t x; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << (fls64(a)/3); /* Converges in 3 iterations to > 32 bits */ x = (2 * x + div64_64(a, x*x)) / 3; x = (2 * x + div64_64(a, x*x)) / 3; x = (2 * x + div64_64(a, x*x)) / 3; return x; } static const struct cbrt { uint64_t in; uint32_t result; } cases[] = { {1, 1}, {2, 1}, {3, 1}, {4, 1}, {5, 1}, {6, 1}, {7, 1}, {8, 2}, {9, 2}, {10, 2}, {11, 2}, {12, 2}, {13, 2}, {14, 2}, {15, 2}, {16, 2}, {17, 2}, {18, 2}, {19, 2}, {20, 2}, {21, 2}, {22, 2}, {23, 2}, {24, 2}, {25, 2}, {26, 2}, {27, 3}, {28, 3}, {29, 3}, {30, 3}, {31, 3}, {32, 3}, {33, 3}, {34, 3}, {35, 3}, {36, 3}, {37, 3}, {38, 3}, {39, 3}, {40, 3}, {99, 4}, {100, 4}, {101, 4}, { 125ull, 5 }, { 216ull, 6 }, { 343ull, 7 }, { 512ull, 8 }, { 1000ull, 10 }, { 1331ull, 11 }, { 8000ull, 20 }, { 9261ull, 21 }, {32767, 31}, {32768, 32}, {32769, 32}, { 64000ull, 40 }, { 68921ull, 41 }, { 512000ull, 80 }, { 531441ull, 81 }, { 1000000ull, 100 }, { 1030301ull, 101 }, { 4096000ull, 160 }, { 4173281ull, 161 }, { 16387064ull, 254 }, { 16581375ull, 255 }, { 16777216ull, 256 }, { 16974593ull, 257 }, { 131096512ull, 508 }, { 131872229ull, 509 }, { 132651000ull, 510 }, { 133432831ull, 511 }, { 134217728ull, 512 }, { 135005697ull, 513 }, { 1000000000ull, 1000 }, { 1003003001ull, 1001 }, { 1006012008ull, 1002 }, { 1009027027ull, 1003 }, { 1061208000ull, 1020 }, { 1064332261ull, 1021 }, { 1067462648ull, 1022 }, { 1070599167ull, 1023 }, {1073741823, 1023}, {1073741824, 1024}, {1073741825, 1024}, {~0, 2642245}, /* 100 random values */ { 7749363893351949254ull, 1978891}, { 7222815480849057907ull, 1933016}, { 8408462745175416063ull, 2033475}, { 3091884191388096748ull, 1456826}, { 2562019500164152525ull, 1368340}, { 4403210617922443179ull, 1639041}, { 3364542905362882299ull, 1498449}, { 8782769017716072774ull, 2063211}, { 5863405773976003266ull, 1803225}, { 1306053050111174648ull, 1093084}, { 150346236956174824ull, 531737}, { 1265737889039205261ull, 1081719}, { 1445109530774087002ull, 1130577}, { 1197105577171186275ull, 1061803}, { 9213452462461015967ull, 2096399}, { 4730966302945445786ull, 1678739}, { 5650605098630667570ull, 1781141}, { 5880381756353009591ull, 1804963}, 
{ 4552499520046621784ull, 1657359}, { 2697991130065918298ull, 1392131}, { 4858364911220984157ull, 1693674}, { 3691457481531040535ull, 1545489}, { 2613117305472506601ull, 1377377}, { 7449943749836318932ull, 1953069}, { 643378865959570610ull, 863287}, { 4851450802679832774ull, 1692871}, { 1772859812839988916ull, 1210295}, { 8210946489571640849ull, 2017426}, { 591875965497384322ull, 839608}, { 4221553402965100097ull, 1616183}, { 2197744667347238205ull, 1300146}, { 8321400714356781191ull, 2026432}, { 2459557415995497961ull, 1349850}, { 3460673533926954145ull, 1512586}, { 4727304344741345505ull, 1678306}, { 4903203917250634599ull, 1698869}, { 4036494370831490817ull, 1592214}, { 8585205035691420311ull, 2047624}, { 2622143824319236828ull, 1378961}, { 5902762718897731478ull, 1807250}, { 6344401509618197560ull, 1851243}, { 4059247793194552874ull, 1595200}, { 7648030174294342832ull, 1970228}, { 2111858627070002939ull, 1282985}, { 3231502273651985583ull, 1478432}, { 8821862535190318932ull, 2066268}, { 6062559696943389464ull, 1823414}, { 4054224670122353756ull, 1594541}, { 3674929609692563482ull, 1543179}, { 6310802012126231363ull, 1847969}, { 4450190829039920890ull, 1644849}, { 8764531173541462842ull, 2061782}, { 1361923252301505833ull, 1108453}, { 5912924843615600614ull, 1808287}, { 5714768882048811324ull, 1787857}, { 7249589769047033748ull, 1935401}, { 4123157012528828376ull, 1603528}, { 1729687638268160097ull, 1200390}, { 5132287771298228729ull, 1724925}, { 1564349257200314043ull, 1160854}, { 951586254223522969ull, 983594}, { 4569664949094662293ull, 1659439}, { 9082730968228181483ull, 2086437}, { 6312891027251024051ull, 1848173}, { 6915415788559031791ull, 1905194}, { 2713150456497618688ull, 1394733}, { 5390954890749602465ull, 1753430}, { 1405547745908296421ull, 1120164}, { 1157301728707637259ull, 1049902}, { 1513573187112042448ull, 1148156}, { 687416080475161159ull, 882551}, { 484496930861389501ull, 785411}, { 1625256440396143907ull, 1175729}, { 7358388240824901288ull, 1945035}, { 6055730836615196283ull, 1822729}, { 5897962221937294789ull, 1806760}, { 862205218853780339ull, 951780}, { 4798091009445823173ull, 1686641}, { 644772714391937867ull, 863910}, { 4255852691293155171ull, 1620549}, { 5287931004512034672ull, 1742188}, { 479051048987854372ull, 782457}, { 9223312736680112286ull, 2097147}, { 8208392001457969628ull, 2017217}, { 9203071384420047828ull, 2095612}, { 8029313043584389618ull, 2002439}, { 38384068872053008ull, 337326}, { 5477688516749455419ull, 1762784}, { 1504622508868036557ull, 1145888}, { 8421184723110053200ull, 2034500}, { 3312070181890020423ull, 1490618}, { 5344298403762143580ull, 1748357}, { 6340030040222269807ull, 1850818}, { 4895839553118470425ull, 1698018}, { 2806627376195262363ull, 1410570}, { 5321619225005368821ull, 1745880}, { 6897323351052656353ull, 1903532}, { 326700202259382556ull, 688731}, { 7685269066741890339ull, 1973420}, { 8054506481558450217ull, 2004531}, }; #define NCASES (sizeof(cases)/sizeof(cases[0])) static double ticks_per_usec = 2979.0; static void show(const char *func, uint64_t sum, uint64_t sq, unsigned long long mx, unsigned long long err) { double mean, std; mean = (double) sum / ticks_per_usec / NCASES ; std = sqrtl( (double) sq / ticks_per_usec / NCASES - mean * mean); printf("%-10s %8llu %8.3f %8.3f %8.3f %11llu\n", func, (unsigned long long) sum / NCASES, mean, std, (double) mx / ticks_per_usec, err); } int main(int argc, char **argv) { int i; uint32_t c; unsigned long long start, end, t, mx; unsigned long long err, sum, sum_sq, cerr; int loops; #if 0 
rdtscll(start); sleep(2); rdtscll(end); ticks_per_usec = (double) (end - start) / 2000000.; #endif printf("Function clocks mean(us) max(us) std(us) total error\n"); #define DOTEST(func) \ if (func(27) != 3) printf("ouch\n"); \ sum = sum_sq = mx = 0; \ for (err = 0, i = 0; i < NCASES; i++) { \ do { \ rdtscll_start(start); \ rdtscll_start(end); \ } while ((end - start) < 1000); \ c = func(cases[i].in); \ loops = 667; \ rdtscll_start(start); \ while (--loops > 0) c = func(cases[i].in); \ rdtscll_start(end); \ t = (end - start) / 666; sum += t; sum_sq += t*t; \ if (t > mx) mx = t; \ cerr = abs((long) cases[i].result - c); \ err += cerr; \ } \ show(#func, sum, sum_sq, mx, err); //DOTEST(ocubic); DOTEST(ncubic); //DOTEST(acbrt); //DOTEST(hcbrt); return 0; } ^ permalink raw reply [flat|nested] 57+ messages in thread
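What the diff above shows, restated as a sketch: when the asm is treated
as a pure function of its register inputs (no "memory" clobber, no
volatile), gcc is entitled to collapse the 666 identical calls in the
timing loop into one:

	loops = 667;
	rdtscll_start(start);
	while (--loops > 0)
		c = func(cases[i].in);   /* without "memory": hoisted to a
					  * single call, the loop just spins */
	rdtscll_start(end);

With the clobber, gcc must assume each call may observe different memory,
so it performs one real call per iteration -- which is what the benchmark
intends to measure.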
* Re: asm volatile [Was: [RFC] div64_64 support]
2007-03-06 22:24 ` Sami Farin
2007-03-07 16:11 ` Chuck Ebbert
@ 2007-03-08 18:23 ` Sami Farin
2007-03-08 22:01 ` asm volatile David Miller
1 sibling, 1 reply; 57+ messages in thread
From: Sami Farin @ 2007-03-08 18:23 UTC (permalink / raw)
To: linux-kernel, netdev

On Wed, Mar 07, 2007 at 00:24:35 +0200, Sami Farin wrote:
> On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
> ...
> > And I found a bug in gcc-4.1.2: it gave 0 for ncubic results
> > when doing the 1000-loop test... gcc-4.0.3 works.
>
> Found it.
>
> --- cbrt-test.c~	2007-03-07 00:20:54.735248105 +0200
> +++ cbrt-test.c	2007-03-07 00:21:03.964864343 +0200
> @@ -209,7 +209,7 @@
>
>  	__asm__("bsrl %1,%0\n\t"
>  		"cmovzl %2,%0"
> -		: "=&r" (r) : "rm" (x), "rm" (-1));
> +		: "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
>  	return r+1;
>  }
>
> Now Linux 2.6 does not have "memory" in fls, maybe it causes
> some gcc funnies some people are seeing.

It also works without "memory" if I do "__asm__ volatile".

Why do some functions have volatile and others not in include/asm-*/*.h?

-- 

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: asm volatile
2007-03-08 18:23 ` asm volatile [Was: [RFC] div64_64 support] Sami Farin
@ 2007-03-08 22:01 ` David Miller
0 siblings, 0 replies; 57+ messages in thread
From: David Miller @ 2007-03-08 22:01 UTC (permalink / raw)
To: 7atbggg02; +Cc: linux-kernel, netdev

From: Sami Farin <7atbggg02@sneakemail.com>
Date: Thu, 8 Mar 2007 20:23:57 +0200

> On Wed, Mar 07, 2007 at 00:24:35 +0200, Sami Farin wrote:
> > On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
> > ...
> > > And I found a bug in gcc-4.1.2: it gave 0 for ncubic results
> > > when doing the 1000-loop test... gcc-4.0.3 works.
> >
> > Found it.
> >
> > --- cbrt-test.c~	2007-03-07 00:20:54.735248105 +0200
> > +++ cbrt-test.c	2007-03-07 00:21:03.964864343 +0200
> > @@ -209,7 +209,7 @@
> >
> >  	__asm__("bsrl %1,%0\n\t"
> >  		"cmovzl %2,%0"
> > -		: "=&r" (r) : "rm" (x), "rm" (-1));
> > +		: "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
> >  	return r+1;
> >  }
> >
> > Now Linux 2.6 does not have "memory" in fls, maybe it causes
> > some gcc funnies some people are seeing.
>
> It also works without "memory" if I do "__asm__ volatile".
>
> Why do some functions have volatile and others not in include/asm-*/*.h?

"volatile" is really only needed if there is some side effect
that cannot be expressed to gcc which makes ordering over the
asm wrt. other pieces of code important.

But in this case it should absolutely not be needed.  It's simply
computing an integer result from some inputs and some values in
memory.

GCC should see perfectly fine what memory is read by the asm and
therefore what ordering constraints there are wrt. writes to the same
memory location.

^ permalink raw reply	[flat|nested] 57+ messages in thread
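A sketch contrasting the two situations described here, reusing asm
already seen in this thread (i386, for illustration only):

	/* Side effect gcc cannot see: the counter changes on every
	 * execution, so volatile is needed to keep gcc from merging
	 * or reordering the reads. */
	static inline unsigned long long rdtsc_demo(void)
	{
		unsigned long long t;
		__asm__ __volatile__("rdtsc" : "=A" (t));
		return t;
	}

	/* Pure computation: the result depends only on the inputs,
	 * so no volatile is warranted -- gcc may legitimately CSE
	 * duplicate calls, which is exactly what a benchmark loop
	 * does not want. */
	static inline int fls_demo(int x)
	{
		int r;
		__asm__("bsrl %1,%0\n\t"
			"cmovzl %2,%0"
			: "=&r" (r) : "rm" (x), "rm" (-1));
		return r + 1;
	}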
* Re: [RFC] div64_64 support 2007-03-06 18:29 ` Stephen Hemminger 2007-03-06 19:48 ` Andi Kleen 2007-03-06 21:53 ` Sami Farin @ 2007-03-06 21:58 ` David Miller 2007-03-06 22:47 ` [PATCH] tcp_cubic: faster cube root Stephen Hemminger 2 siblings, 1 reply; 57+ messages in thread From: David Miller @ 2007-03-06 21:58 UTC (permalink / raw) To: shemminger; +Cc: rkuhn, andi, dada1, jengelh, linux-kernel, netdev From: Stephen Hemminger <shemminger@linux-foundation.org> Date: Tue, 6 Mar 2007 10:29:41 -0800 > /* calculate the cubic root of x using Newton-Raphson */ > static uint32_t ncubic(uint64_t a) > { > uint64_t x; > > /* Initial estimate is based on: > * cbrt(x) = exp(log(x) / 3) > */ > x = 1u << (fls64(a)/3); > > /* Converges in 3 iterations to > 32 bits */ > > x = (2 * x + div64_64(a, x*x)) / 3; > x = (2 * x + div64_64(a, x*x)) / 3; > x = (2 * x + div64_64(a, x*x)) / 3; > > return x; > } Indeed that will be the fastest variant for cpus with hw integer division. I did a quick sparc64 port, here is what I got: Function clocks mean(us) max(us) std(us) total error ocubic 529 0.35 15.16 0.66 545101 ncubic 498 0.33 12.83 0.36 576263 acbrt 427 0.28 11.04 0.33 547562 hcbrt 393 0.26 10.18 0.47 2410 ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH] tcp_cubic: faster cube root 2007-03-06 21:58 ` [RFC] div64_64 support David Miller @ 2007-03-06 22:47 ` Stephen Hemminger 2007-03-06 22:58 ` cube root benchmark code Stephen Hemminger 2007-03-07 4:20 ` [PATCH] tcp_cubic: faster cube root David Miller 0 siblings, 2 replies; 57+ messages in thread From: Stephen Hemminger @ 2007-03-06 22:47 UTC (permalink / raw) To: David Miller; +Cc: rkuhn, andi, dada1, jengelh, linux-kernel, netdev The Newton-Raphson method is quadratically convergent so only a small fixed number of steps are necessary. Therefore it is faster to unroll the loop. Since div64_64 is no longer inline it won't cause code explosion. Also fixes a bug that can occur if x^2 was bigger than 32 bits. Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org> --- net/ipv4/tcp_cubic.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) --- net-2.6.22.orig/net/ipv4/tcp_cubic.c 2007-03-06 12:24:34.000000000 -0800 +++ net-2.6.22/net/ipv4/tcp_cubic.c 2007-03-06 14:43:37.000000000 -0800 @@ -96,23 +96,17 @@ */ static u32 cubic_root(u64 a) { - u32 x, x1; + u64 x; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << (fls64(a)/3); - /* - * Iteration based on: - * 2 - * x = ( 2 * x + a / x ) / 3 - * k+1 k k - */ - do { - x1 = x; - x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3; - } while (abs(x1 - x) > 1); + /* converges to 32 bits in 3 iterations */ + x = (2 * x + div64_64(a, x*x)) / 3; + x = (2 * x + div64_64(a, x*x)) / 3; + x = (2 * x + div64_64(a, x*x)) / 3; return x; } ^ permalink raw reply [flat|nested] 57+ messages in thread
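A worked example of the unrolled steps, taking a = 10^9 (whose exact
cube root is 1000): fls64(10^9) = 30, so the seed is
x = 1 << (30/3) = 1024, and with integer division

	x = (2*1024 + 10^9 / 1024^2) / 3 = (2048 + 953) / 3 = 1000
	x = (2*1000 + 10^9 / 1000^2) / 3 = (2000 + 1000) / 3 = 1000
	x = 1000                                  /* already stable */

Quadratic convergence means the error roughly squares away on each step,
so three steps suffice even though the seed 2^(fls64(a)/3) can be off by
nearly a factor of two.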
* cube root benchmark code 2007-03-06 22:47 ` [PATCH] tcp_cubic: faster cube root Stephen Hemminger @ 2007-03-06 22:58 ` Stephen Hemminger 2007-03-07 6:08 ` Update to " Willy Tarreau 2007-03-07 4:20 ` [PATCH] tcp_cubic: faster cube root David Miller 1 sibling, 1 reply; 57+ messages in thread From: Stephen Hemminger @ 2007-03-06 22:58 UTC (permalink / raw) To: David Miller; +Cc: rkuhn, andi, dada1, jengelh, linux-kernel, netdev Here is a better version of the benchmark code. It has the original code used in 2.4 version of Cubic for comparison ----------------------------------------------------------- /* Test and measure perf of cube root algorithms. */ #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <math.h> #include <unistd.h> #ifdef __x86_64 #define rdtscll(val) do { \ unsigned int __a,__d; \ asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \ (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \ } while(0) # define do_div(n,base) ({ \ uint32_t __base = (base); \ uint32_t __rem; \ __rem = ((uint64_t)(n)) % __base; \ (n) = ((uint64_t)(n)) / __base; \ __rem; \ }) /** * __ffs - find first bit in word. * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ static __inline__ unsigned long __ffs(unsigned long word) { __asm__("bsfq %1,%0" :"=r" (word) :"rm" (word)); return word; } /* * __fls: find last bit set. * @word: The word to search * * Undefined if no zero exists, so code should check against ~0UL first. */ static inline unsigned long __fls(unsigned long word) { __asm__("bsrq %1,%0" :"=r" (word) :"rm" (word)); return word; } /** * ffs - find first bit set * @x: the word to search * * This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ static __inline__ int ffs(int x) { int r; __asm__("bsfl %1,%0\n\t" "cmovzl %2,%0" : "=r" (r) : "rm" (x), "r" (-1)); return r+1; } /** * fls - find last bit set * @x: the word to search * * This is defined the same way as ffs. */ static inline int fls(int x) { int r; __asm__("bsrl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "rm" (-1)); return r+1; } /** * fls64 - find last bit set in 64 bit word * @x: the word to search * * This is defined the same way as fls. */ static inline int fls64(uint64_t x) { if (x == 0) return 0; return __fls(x) + 1; } static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor) { return dividend / divisor; } #elif __i386 #define rdtscll(val) \ __asm__ __volatile__("rdtsc" : "=A" (val)) /** * ffs - find first bit set * @x: the word to search * * This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz() (man ffs). */ static inline int ffs(int x) { int r; __asm__("bsfl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); return r+1; } /** * fls - find last bit set * @x: the word to search * * This is defined the same way as ffs(). 
*/ static inline int fls(int x) { int r; __asm__("bsrl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); return r+1; } static inline int fls64(uint64_t x) { uint32_t h = x >> 32; if (h) return fls(h) + 32; return fls(x); } #define do_div(n,base) ({ \ unsigned long __upper, __low, __high, __mod, __base; \ __base = (base); \ asm("":"=a" (__low), "=d" (__high):"A" (n)); \ __upper = __high; \ if (__high) { \ __upper = __high % (__base); \ __high = __high / (__base); \ } \ asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \ asm("":"=A" (n):"a" (__low),"d" (__high)); \ __mod; \ }) /* 64bit divisor, dividend and result. dynamic precision */ static uint64_t div64_64(uint64_t dividend, uint64_t divisor) { uint32_t d = divisor; if (divisor > 0xffffffffULL) { unsigned int shift = fls(divisor >> 32); d = divisor >> shift; dividend >>= shift; } /* avoid 64 bit division if possible */ if (dividend >> 32) do_div(dividend, d); else dividend = (uint32_t) dividend / d; return dividend; } #endif /* Andi Kleen's version */ uint32_t acbrt(uint64_t x) { uint32_t y = 0; int s; for (s = 63; s >= 0; s -= 3) { uint64_t b, bs; y = 2 * y; b = 3 * y * (y+1) + 1; bs = b << s; if (x >= bs && (b == (bs>>s))) { /* avoid overflow */ x -= bs; y++; } } return y; } /* My version of hacker's delight */ uint32_t hcbrt(uint64_t x) { int s = 60; uint32_t y = 0; do { uint64_t b; y = 2*y; b = (uint64_t)(3*y*(y + 1) + 1) << s; s = s - 3; if (x >= b) { x = x - b; y = y + 1; } } while(s >= 0); return y; } /* calculate the cubic root of x using Newton-Raphson */ static uint32_t ocubic(uint64_t a) { uint32_t x, x1; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << (fls64(a)/3); /* * Iteration based on: * 2 * x = ( 2 * x + a / x ) / 3 * k+1 k k */ do { x1 = x; x = (2 * x + div64_64(a, (uint64_t)x * x)) / 3; } while (abs(x1 - x) > 1); return x; } /* calculate the cubic root of x using Newton-Raphson */ static uint32_t ncubic(uint64_t a) { uint64_t x; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << (fls64(a)/3); /* Converges in 3 iterations to > 32 bits */ x = (2 * x + div64_64(a, x*x)) / 3; x = (2 * x + div64_64(a, x*x)) / 3; x = (2 * x + div64_64(a, x*x)) / 3; return x; } /* 65536 times the cubic root of 0, 1, 2, 3, 4, 5, 6, 7*/ static uint64_t bictcp_table[8] = {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367}; /* calculate the cubic root of x the basic idea is that x can be expressed as i*8^j so cubic_root(x) = cubic_root(i)*2^j in the following code, x is i, and y is 2^j because of integer calculation, there are errors in calculation so finally use binary search to find out the exact solution*/ static uint32_t bictcp(uint64_t x) { uint64_t y, app, target, start, end, mid, start_diff, end_diff; if (x == 0) return 0; target = x; /*first estimate lower and upper bound*/ y = 1; while (x >= 8){ x = (x >> 3); y = (y << 1); } start = (y*bictcp_table[x])>>16; if (x==7) end = (y<<1); else end = (y*bictcp_table[x+1]+65535)>>16; /*binary search for more accurate one*/ while (start < end-1) { mid = (start+end) >> 1; app = mid*mid*mid; if (app < target) start = mid; else if (app > target) end = mid; else return mid; } /*find the most accurate one from start and end*/ app = start*start*start; if (app < target) start_diff = target - app; else start_diff = app - target; app = end*end*end; if (app < target) end_diff = target - app; else end_diff = app - target; return (start_diff < end_diff) ? 
start : end; } #define NCASES 1000 static uint64_t cases[NCASES]; static double results[NCASES]; static double ticks_per_usec; static unsigned long long start, end; static void dotest(const char *name, uint32_t (*func)(uint64_t)) { int i; unsigned long long t, mx = 0, sum = 0, sum_sq = 0; double mean, std, err = 0; for (i = 0; i < NCASES; i++) { uint64_t x = cases[i]; uint32_t v; rdtscll(start); v = (*func)(x); rdtscll(end); t = end - start; if (t > mx) mx = t; sum += t; sum_sq += t*t; err += fabs(((double) v - results[i]) / results[i]); } mean = (double) sum / ticks_per_usec / NCASES ; std = sqrtl( (double) sum_sq / ticks_per_usec / NCASES - mean * mean); printf("%-10s %8llu %8.2f %8.2f %8.2f %.03f%%\n", name, (unsigned long long) sum / NCASES, mean, std, (double) mx / ticks_per_usec, err * 100./ NCASES); } int main(int argc, char **argv) { uint64_t x; int i; printf("Calibrating\n"); rdtscll(start); sleep(2); rdtscll(end); ticks_per_usec = (double) (end - start) / 2000000.; for (i = 0; i < 63; i++) cases[i] = 1ull << i; x = ~0; while (x != 0) { cases[i++] = x; x >>= 1; } x = ~0; while (x != 0) { cases[i++] = x; x <<= 1; } while (i < NCASES) cases[i++] = (uint64_t) random() * (uint64_t) random(); for (i = 0; i < NCASES; i++) results[i] = cbrt((double)cases[i]); printf("Function clocks mean(us) max(us) std(us) Avg error\n"); #define DOTEST(x) dotest(#x, x) DOTEST(bictcp); DOTEST(ocubic); DOTEST(ncubic); DOTEST(acbrt); DOTEST(hcbrt); return 0; } ^ permalink raw reply [flat|nested] 57+ messages in thread
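For anyone rerunning this, a plausible build line (file name hypothetical;
the program uses cbrt() and sqrtl() from libm, so -lm is required):

	$ gcc -O2 -o cbrt-bench cbrt-bench.c -lm
	$ ./cbrt-bench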
* Update to cube root benchmark code 2007-03-06 22:58 ` cube root benchmark code Stephen Hemminger @ 2007-03-07 6:08 ` Willy Tarreau 2007-03-08 1:07 ` [PATCH] tcp_cubic: use 32 bit math Stephen Hemminger 0 siblings, 1 reply; 57+ messages in thread From: Willy Tarreau @ 2007-03-07 6:08 UTC (permalink / raw) To: Stephen Hemminger Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev Hi Stephen, Thanks for this code, it's easy to experiment with it. Let me propose this simple update with a variation on your ncubic() function. I noticed that all intermediate results were far below 32 bits, so I did a new version which is 30% faster on my athlon with the same results. This is because we only use x and a/x^2 in the function, with x very close to cbrt(a). So a/x^2 is very close to cbrt(a) which is at most 22 bits. So we only use the 32 lower bits of the result of div64_64(), and all intermediate computations can be done on 32 bits (including multiplies and divides). willy@pcw:~$ ./bictcp Calibrating Function clocks mean(us) max(us) std(us) Avg error bictcp 1085 0.70 28.19 2.30 0.172% ocubic 869 0.56 22.76 1.23 0.274% ncubic 637 0.41 16.29 1.41 0.247% ncubic32 435 0.28 11.18 1.03 0.247% acbrt 824 0.53 21.03 0.85 0.275% hcbrt 547 0.35 13.96 0.42 1.580% I also tried to improve a bit by checking for early convergence and returning before last divide, but it is worthless because it almost never happens so it does not make the code any faster. Here's the code. I think that it would be fine if we merged this version since it's supposed to behave better on most 32 bits machines. Best regards, Willy /* Here is a better version of the benchmark code. It has the original code used in 2.4 version of Cubic for comparison ----------------------------------------------------------- */ /* Test and measure perf of cube root algorithms. */ #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <math.h> #include <unistd.h> #ifdef __x86_64 #define rdtscll(val) do { \ unsigned int __a,__d; \ asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \ (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \ } while(0) # define do_div(n,base) ({ \ uint32_t __base = (base); \ uint32_t __rem; \ __rem = ((uint64_t)(n)) % __base; \ (n) = ((uint64_t)(n)) / __base; \ __rem; \ }) /** * __ffs - find first bit in word. * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ static __inline__ unsigned long __ffs(unsigned long word) { __asm__("bsfq %1,%0" :"=r" (word) :"rm" (word)); return word; } /* * __fls: find last bit set. * @word: The word to search * * Undefined if no zero exists, so code should check against ~0UL first. */ static inline unsigned long __fls(unsigned long word) { __asm__("bsrq %1,%0" :"=r" (word) :"rm" (word)); return word; } /** * ffs - find first bit set * @x: the word to search * * This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ static __inline__ int ffs(int x) { int r; __asm__("bsfl %1,%0\n\t" "cmovzl %2,%0" : "=r" (r) : "rm" (x), "r" (-1)); return r+1; } /** * fls - find last bit set * @x: the word to search * * This is defined the same way as ffs. */ static inline int fls(int x) { int r; __asm__("bsrl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "rm" (-1)); return r+1; } /** * fls64 - find last bit set in 64 bit word * @x: the word to search * * This is defined the same way as fls. 
*/ static inline int fls64(uint64_t x) { if (x == 0) return 0; return __fls(x) + 1; } static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor) { return dividend / divisor; } #elif __i386 #define rdtscll(val) \ __asm__ __volatile__("rdtsc" : "=A" (val)) /** * ffs - find first bit set * @x: the word to search * * This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz() (man ffs). */ static inline int ffs(int x) { int r; __asm__("bsfl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); return r+1; } /** * fls - find last bit set * @x: the word to search * * This is defined the same way as ffs(). */ static inline int fls(int x) { int r; __asm__("bsrl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); return r+1; } static inline int fls64(uint64_t x) { uint32_t h = x >> 32; if (h) return fls(h) + 32; return fls(x); } #define do_div(n,base) ({ \ unsigned long __upper, __low, __high, __mod, __base; \ __base = (base); \ asm("":"=a" (__low), "=d" (__high):"A" (n)); \ __upper = __high; \ if (__high) { \ __upper = __high % (__base); \ __high = __high / (__base); \ } \ asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \ asm("":"=A" (n):"a" (__low),"d" (__high)); \ __mod; \ }) /* 64bit divisor, dividend and result. dynamic precision */ static uint64_t div64_64(uint64_t dividend, uint64_t divisor) { uint32_t d = divisor; if (divisor > 0xffffffffULL) { unsigned int shift = fls(divisor >> 32); d = divisor >> shift; dividend >>= shift; } /* avoid 64 bit division if possible */ if (dividend >> 32) do_div(dividend, d); else dividend = (uint32_t) dividend / d; return dividend; } #endif /* Andi Kleen's version */ uint32_t acbrt(uint64_t x) { uint32_t y = 0; int s; for (s = 63; s >= 0; s -= 3) { uint64_t b, bs; y = 2 * y; b = 3 * y * (y+1) + 1; bs = b << s; if (x >= bs && (b == (bs>>s))) { /* avoid overflow */ x -= bs; y++; } } return y; } /* My version of hacker's delight */ uint32_t hcbrt(uint64_t x) { int s = 60; uint32_t y = 0; do { uint64_t b; y = 2*y; b = (uint64_t)(3*y*(y + 1) + 1) << s; s = s - 3; if (x >= b) { x = x - b; y = y + 1; } } while(s >= 0); return y; } /* calculate the cubic root of x using Newton-Raphson */ static uint32_t ocubic(uint64_t a) { uint32_t x, x1; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << (fls64(a)/3); /* * Iteration based on: * 2 * x = ( 2 * x + a / x ) / 3 * k+1 k k */ do { x1 = x; x = (2 * x + div64_64(a, (uint64_t)x * x)) / 3; } while (abs(x1 - x) > 1); return x; } /* calculate the cubic root of x using Newton-Raphson */ static uint32_t ncubic(uint64_t a) { uint64_t x; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << (fls64(a)/3); /* Converges in 3 iterations to > 32 bits */ x = (2 * x + div64_64(a, x*x)) / 3; x = (2 * x + div64_64(a, x*x)) / 3; x = (2 * x + div64_64(a, x*x)) / 3; return x; } /* calculate the cubic root of x using Newton-Raphson */ static uint32_t ncubic32(uint64_t a) { uint32_t x; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << (fls64(a)/3); /* Converges in 3 iterations to > 32 bits */ /* We can do 32bit maths here : * x ~= cbrt(a) so (a/x^2) ~= cbrt(a) which is about 22 bits max */ x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3; x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3; x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3; return x; } /* 65536 times the cubic root of 0, 
1, 2, 3, 4, 5, 6, 7*/ static uint64_t bictcp_table[8] = {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367}; /* calculate the cubic root of x the basic idea is that x can be expressed as i*8^j so cubic_root(x) = cubic_root(i)*2^j in the following code, x is i, and y is 2^j because of integer calculation, there are errors in calculation so finally use binary search to find out the exact solution*/ static uint32_t bictcp(uint64_t x) { uint64_t y, app, target, start, end, mid, start_diff, end_diff; if (x == 0) return 0; target = x; /*first estimate lower and upper bound*/ y = 1; while (x >= 8){ x = (x >> 3); y = (y << 1); } start = (y*bictcp_table[x])>>16; if (x==7) end = (y<<1); else end = (y*bictcp_table[x+1]+65535)>>16; /*binary search for more accurate one*/ while (start < end-1) { mid = (start+end) >> 1; app = mid*mid*mid; if (app < target) start = mid; else if (app > target) end = mid; else return mid; } /*find the most accurate one from start and end*/ app = start*start*start; if (app < target) start_diff = target - app; else start_diff = app - target; app = end*end*end; if (app < target) end_diff = target - app; else end_diff = app - target; return (start_diff < end_diff) ? start : end; } #define NCASES 1000 static uint64_t cases[NCASES]; static double results[NCASES]; static double ticks_per_usec; static unsigned long long start, end; static void dotest(const char *name, uint32_t (*func)(uint64_t)) { int i; unsigned long long t, mx = 0, sum = 0, sum_sq = 0; double mean, std, err = 0; for (i = 0; i < NCASES; i++) { uint64_t x = cases[i]; uint32_t v; rdtscll(start); v = (*func)(x); rdtscll(end); t = end - start; if (t > mx) mx = t; sum += t; sum_sq += t*t; err += fabs(((double) v - results[i]) / results[i]); } mean = (double) sum / ticks_per_usec / NCASES ; std = sqrtl( (double) sum_sq / ticks_per_usec / NCASES - mean * mean); printf("%-10s %8llu %8.2f %8.2f %8.2f %.03f%%\n", name, (unsigned long long) sum / NCASES, mean, std, (double) mx / ticks_per_usec, err * 100./ NCASES); } int main(int argc, char **argv) { uint64_t x; int i; printf("Calibrating\n"); rdtscll(start); sleep(2); rdtscll(end); ticks_per_usec = (double) (end - start) / 2000000.; for (i = 0; i < 63; i++) cases[i] = 1ull << i; x = ~0; while (x != 0) { cases[i++] = x; x >>= 1; } x = ~0; while (x != 0) { cases[i++] = x; x <<= 1; } while (i < NCASES) cases[i++] = (uint64_t) random() * (uint64_t) random(); for (i = 0; i < NCASES; i++) results[i] = cbrt((double)cases[i]); printf("Function clocks mean(us) max(us) std(us) Avg error\n"); #define DOTEST(x) dotest(#x, x) DOTEST(bictcp); DOTEST(ocubic); DOTEST(ncubic); DOTEST(ncubic32); DOTEST(acbrt); DOTEST(hcbrt); return 0; } ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH] tcp_cubic: use 32 bit math
2007-03-07 6:08 ` Update to " Willy Tarreau
@ 2007-03-08 1:07 ` Stephen Hemminger
2007-03-08 2:55 ` David Miller
0 siblings, 1 reply; 57+ messages in thread
From: Stephen Hemminger @ 2007-03-08 1:07 UTC (permalink / raw)
To: Willy Tarreau, David Miller
Cc: rkuhn, andi, dada1, jengelh, linux-kernel, netdev

The basic calculation has to be done in 32 bits to avoid doing a
64 bit divide by 3. The value x is at most 22 bits, so the full
64 bits are only needed for x^2.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

---
 net/ipv4/tcp_cubic.c |    8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

--- net-2.6.22.orig/net/ipv4/tcp_cubic.c	2007-03-07 15:51:37.000000000 -0800
+++ net-2.6.22/net/ipv4/tcp_cubic.c	2007-03-07 17:06:02.000000000 -0800
@@ -96,7 +96,7 @@
  */
 static u32 cubic_root(u64 a)
 {
-	u64 x;
+	u32 x;
 
 	/* Initial estimate is based on:
 	 * cbrt(x) = exp(log(x) / 3)
@@ -104,9 +104,9 @@
 	x = 1u << (fls64(a)/3);
 
 	/* converges to 32 bits in 3 iterations */
-	x = (2 * x + div64_64(a, x*x)) / 3;
-	x = (2 * x + div64_64(a, x*x)) / 3;
-	x = (2 * x + div64_64(a, x*x)) / 3;
+	x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
+	x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
+	x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
 
 	return x;
 }

^ permalink raw reply	[flat|nested] 57+ messages in thread
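A quick check of the 22-bit claim: the largest possible input is
a = 2^64 - 1, and cbrt(2^64) = 2^(64/3) ~= 2^21.33, so the true root --
and hence the quotient a / x^2, which approaches it -- always fits in
22 bits. Only the square x*x, which can reach about 2^43, needs the
64-bit type, and that is exactly where the casts above sit.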
* Re: [PATCH] tcp_cubic: use 32 bit math 2007-03-08 1:07 ` [PATCH] tcp_cubic: use 32 bit math Stephen Hemminger @ 2007-03-08 2:55 ` David Miller 2007-03-08 3:10 ` Stephen Hemminger 0 siblings, 1 reply; 57+ messages in thread From: David Miller @ 2007-03-08 2:55 UTC (permalink / raw) To: shemminger; +Cc: w, rkuhn, andi, dada1, jengelh, linux-kernel, netdev From: Stephen Hemminger <shemminger@linux-foundation.org> Date: Wed, 7 Mar 2007 17:07:31 -0800 > The basic calculation has to be done in 32 bits to avoid > doing 64 bit divide by 3. The value x is only 22bits max > so only need full 64 bits only for x^2. > > Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org> Applied, thanks Stephen. What about Willy Tarreau's supposedly even faster variant? Or does this incorporate that set of improvements? ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] tcp_cubic: use 32 bit math 2007-03-08 2:55 ` David Miller @ 2007-03-08 3:10 ` Stephen Hemminger 2007-03-08 3:51 ` David Miller 2007-03-08 4:16 ` [PATCH] tcp_cubic: use 32 bit math Willy Tarreau 0 siblings, 2 replies; 57+ messages in thread From: Stephen Hemminger @ 2007-03-08 3:10 UTC (permalink / raw) To: David Miller; +Cc: w, rkuhn, andi, dada1, jengelh, linux-kernel, netdev David Miller wrote: > From: Stephen Hemminger <shemminger@linux-foundation.org> > Date: Wed, 7 Mar 2007 17:07:31 -0800 > > >> The basic calculation has to be done in 32 bits to avoid >> doing 64 bit divide by 3. The value x is only 22bits max >> so only need full 64 bits only for x^2. >> >> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org> >> > > Applied, thanks Stephen. > > What about Willy Tarreau's supposedly even faster variant? > Or does this incorporate that set of improvements? > That's what this is: x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3; ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] tcp_cubic: use 32 bit math 2007-03-08 3:10 ` Stephen Hemminger @ 2007-03-08 3:51 ` David Miller 2007-03-10 11:48 ` Willy Tarreau 2007-03-08 4:16 ` [PATCH] tcp_cubic: use 32 bit math Willy Tarreau 1 sibling, 1 reply; 57+ messages in thread From: David Miller @ 2007-03-08 3:51 UTC (permalink / raw) To: shemminger; +Cc: w, rkuhn, andi, dada1, jengelh, linux-kernel, netdev From: Stephen Hemminger <shemminger@linux-foundation.org> Date: Wed, 07 Mar 2007 19:10:47 -0800 > David Miller wrote: > > What about Willy Tarreau's supposedly even faster variant? > > Or does this incorporate that set of improvements? > > > That's what this is: > x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3; Great, thanks for the clarification. ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] tcp_cubic: use 32 bit math 2007-03-08 3:51 ` David Miller @ 2007-03-10 11:48 ` Willy Tarreau 2007-03-12 21:11 ` Stephen Hemminger 0 siblings, 1 reply; 57+ messages in thread From: Willy Tarreau @ 2007-03-10 11:48 UTC (permalink / raw) To: David Miller Cc: shemminger, rkuhn, andi, dada1, jengelh, linux-kernel, netdev On Wed, Mar 07, 2007 at 07:51:35PM -0800, David Miller wrote: > From: Stephen Hemminger <shemminger@linux-foundation.org> > Date: Wed, 07 Mar 2007 19:10:47 -0800 > > > David Miller wrote: > > > What about Willy Tarreau's supposedly even faster variant? > > > Or does this incorporate that set of improvements? > > > > > That's what this is: > > x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3; > > Great, thanks for the clarification. Oh BTW, I have a newer version with a first approximation of the cbrt() before the div64_64, which allows us to reduce from 3 div64 to only 2 div64. This results in a version which is twice as fast as the initial one (ncubic), but with slightly less accuracy (0.286% compared to 0.247). But I see that other functions such as hcbrt() had a 1.5% avg error, so I think this is not dramatic. Also, I managed to remove all other divides, to be kind with CPUs having a slow divide instruction or no divide at all. Since we compute on limited range (22 bits), we can multiply then shift right. It shows me even slightly better time on pentium-m and athlon, with a slightly higher avg error (0.297% compared to 0.286%), and slightly smaller code. I just have to clean experiments from my code to provide a patch. David, Stephen, are you interested ? $ ./bictcp fls(0)=0, fls(1)=1, fls(256)=9 Calibrating Function clocks mean(us) max(us) std(us) Avg error bictcp 936 0.61 24.28 1.99 0.172% ocubic 886 0.57 23.51 3.18 0.274% ncubic 644 0.42 16.59 2.18 0.247% ncubic32 444 0.29 11.47 1.50 0.247% ncubic32_1 444 0.29 11.56 1.88 0.238% ncubic32b3 337 0.22 8.67 0.88 0.286% ncubic_ndiv3 329 0.21 8.46 0.69 0.297% acbrt 707 0.46 18.05 0.80 0.275% hcbrt 644 0.42 16.44 0.51 1.580% Regards, Willy ^ permalink raw reply [flat|nested] 57+ messages in thread
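A sketch of the multiply-and-shift idea for the divides by 3 (the
constants here are illustrative; the constants in Willy's actual code
may differ). For d = 3 and x < 2^23 -- which covers the ~22-bit range
used here -- the multiplier m = ceil(2^24 / 3) = 5592406 gives an exact
quotient:

	/* x / 3 for any x < 2^23, without a divide instruction */
	static inline uint32_t div3_small(uint32_t x)
	{
		return ((uint64_t)x * 5592406u) >> 24;
	}

One widening multiply and a shift replace the division, which pays off
on CPUs with a slow or missing hardware divider.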
* Re: [PATCH] tcp_cubic: use 32 bit math 2007-03-10 11:48 ` Willy Tarreau @ 2007-03-12 21:11 ` Stephen Hemminger 2007-03-13 20:50 ` Willy Tarreau 0 siblings, 1 reply; 57+ messages in thread From: Stephen Hemminger @ 2007-03-12 21:11 UTC (permalink / raw) To: Willy Tarreau Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev On Sat, 10 Mar 2007 12:48:26 +0100 Willy Tarreau <w@1wt.eu> wrote: > On Wed, Mar 07, 2007 at 07:51:35PM -0800, David Miller wrote: > > From: Stephen Hemminger <shemminger@linux-foundation.org> > > Date: Wed, 07 Mar 2007 19:10:47 -0800 > > > > > David Miller wrote: > > > > What about Willy Tarreau's supposedly even faster variant? > > > > Or does this incorporate that set of improvements? > > > > > > > That's what this is: > > > x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3; > > > > Great, thanks for the clarification. > > Oh BTW, I have a newer version with a first approximation of the > cbrt() before the div64_64, which allows us to reduce from 3 div64 > to only 2 div64. This results in a version which is twice as fast > as the initial one (ncubic), but with slightly less accuracy (0.286% > compared to 0.247). But I see that other functions such as hcbrt() > had a 1.5% avg error, so I think this is not dramatic. Ignore my hcbrt() it was a less accurate version of andi's stuff. > Also, I managed to remove all other divides, to be kind with CPUs > having a slow divide instruction or no divide at all. Since we compute > on limited range (22 bits), we can multiply then shift right. It shows > me even slightly better time on pentium-m and athlon, with a slightly > higher avg error (0.297% compared to 0.286%), and slightly smaller > code. What does the code look like? > I just have to clean experiments from my code to provide a patch. > David, Stephen, are you interested ? > > $ ./bictcp > fls(0)=0, fls(1)=1, fls(256)=9 > Calibrating > Function clocks mean(us) max(us) std(us) Avg error > bictcp 936 0.61 24.28 1.99 0.172% > ocubic 886 0.57 23.51 3.18 0.274% > ncubic 644 0.42 16.59 2.18 0.247% > ncubic32 444 0.29 11.47 1.50 0.247% > ncubic32_1 444 0.29 11.56 1.88 0.238% > ncubic32b3 337 0.22 8.67 0.88 0.286% > ncubic_ndiv3 329 0.21 8.46 0.69 0.297% > acbrt 707 0.46 18.05 0.80 0.275% > hcbrt 644 0.42 16.44 0.51 1.580% > > > Regards, > Willy > -- Stephen Hemminger <shemminger@linux-foundation.org> ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] tcp_cubic: use 32 bit math
2007-03-12 21:11 ` Stephen Hemminger
@ 2007-03-13 20:50 ` Willy Tarreau
2007-03-21 18:54 ` Stephen Hemminger
0 siblings, 1 reply; 57+ messages in thread
From: Willy Tarreau @ 2007-03-13 20:50 UTC (permalink / raw)
To: Stephen Hemminger
Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

Hi Stephen,

On Mon, Mar 12, 2007 at 02:11:56PM -0700, Stephen Hemminger wrote:
> > Oh BTW, I have a newer version with a first approximation of the
> > cbrt() before the div64_64, which allows us to reduce from 3 div64
> > to only 2 div64. This results in a version which is twice as fast
> > as the initial one (ncubic), but with slightly less accuracy (0.286%
> > compared to 0.247). But I see that other functions such as hcbrt()
> > had a 1.5% avg error, so I think this is not dramatic.
>
> Ignore my hcbrt() it was a less accurate version of andi's stuff.

OK.

> > Also, I managed to remove all other divides, to be kind with CPUs
> > having a slow divide instruction or no divide at all. Since we compute
> > on limited range (22 bits), we can multiply then shift right. It shows
> > me even slightly better time on pentium-m and athlon, with a slightly
> > higher avg error (0.297% compared to 0.286%), and slightly smaller
> > code.
>
> What does the code look like?

Well, I have cleaned it a little bit, there were more comments and
ifdefs than code ! I've appended it to the end of this mail. I have
changed it a bit, because I noticed that integer divide precision was
so coarse that there were other possibilities to play with the bits.

I have experimented with combinations of several methods :

  - replace integer divides with multiplies/shifts where possible.

  - compensation for divide imprecision by adding/removing small values
    before/after them. Often, the integer result of 1/(x*(x-1)) is
    closer to (float)1/(float)x^2 than 1/(x*x). This is because the
    divide always truncates the result.

  - use direct result lookup for small values. Small inputs give small
    outputs which have very few moving bits. Many different values fit
    in a 32bit integer, so we use a shift offset to look up the value.
    I used this in an fls function I wrote a while ago, that I should
    also post because it is up to twice as fast as the kernel's.
    Sometimes it seems faster to look up from memory, sometimes it is
    faster from an immediate value. Maybe more visible differences
    would show up on RISC CPUs where loading a 32 bit immediate needs
    two instructions. I don't know yet, I've not tested on my sparc yet.

  - use small lookup tables (64 bytes) with 6 bit inputs and at least
    as many on output. We only look up the 6 MSB and return the 2-3 MSB
    of the result.

  - iterative search and manual refinement of the lookup tables for
    best accuracy. The avg error rate can easily be halved this way.

I have duplicated and tried several functions with 0, 1, 2 and 3
divides. Several of them offer better accuracy than what we currently
have, in fewer cycles. Others offer faster results (up to 5 times) with
slightly less accuracy.

There is one function which is not to be used, but is just here for
comparison (ncubic_0div). It does no divide but has an awful avg error.
But one which is interesting is ncubic_tab0. It does not use any divide
at all, not even any div64. It shows a 0.6% avg error, which I'm not
sure is acceptable or not. It is 6.7 times faster than the initial
ncubic() with less accuracy, and 4 times smaller. I suspect that it can
differ even more on architectures which have no divide instruction.
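A sketch of the 6-bit table idea, with hypothetical names (the real
tables are hand-tuned; this one is merely computed at startup to show
the mechanism):

	#include <stdint.h>
	#include <math.h>

	static uint8_t cbrt6[64];	/* cbrt(i) in 2.6 fixed point */

	static void cbrt6_init(void)
	{
		int i;
		for (i = 0; i < 64; i++)
			cbrt6[i] = (uint8_t)(cbrt((double)i) * 64.0 + 0.5);
	}

	/* Crude first estimate of cbrt(a): shift a down by a multiple
	 * of 3 bits so it indexes the table, then scale the table
	 * value back up, since cbrt(a * 8^k) = cbrt(a) * 2^k. */
	static uint32_t cbrt_seed(uint64_t a)
	{
		unsigned int k = 0;
		while (a >= 64) {
			a >>= 3;
			k++;
		}
		return ((uint32_t)cbrt6[a] << k) >> 6;
	}

For example, a = 2^63 shifts down to 8 with k = 20, and
cbrt6[8] = 128 gives (128 << 20) >> 6 = 2^21, which is exact. A seed
this good leaves the Newton iteration very little work to do.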
Is 0.6% avg error rate is too much, ncubic_tab1() uses one single div64 and is twice slower (still nearly 3 times faster than ncubic). It show 0.195% avg error, which is better than initial ncubic. I think that it is a good tradeoff. If best accuracy is an absolute requirement, then I have a variation of ncubic (ncubic_3div) which does 0.17% in 2/3 of the time (compared to 0.247%), and which is slightly smaller. I have also added a "size" column, indicating approximative function size, provided that the compiler does not reorder the code. On gcc 3.4, it's OK, but 4.1 returns garbage. That does not matter, it's just a rough estimate anyway. Here are the results classed by speed : /* Sample output on a Pentium-M 600 MHz : Function clocks mean(us) max(us) std(us) Avg err size ncubic_tab0 79 0.66 7.20 1.04 0.613% 160 ncubic_0div 84 0.70 7.64 1.57 4.521% 192 ncubic_1div 178 1.48 16.27 1.81 0.443% 336 ncubic_tab1 179 1.49 16.34 1.85 0.195% 320 ncubic_ndiv3 263 2.18 24.04 3.59 0.250% 512 ncubic_2div 270 2.24 24.70 2.77 0.187% 512 ncubic32_1 359 2.98 32.81 3.59 0.238% 544 ncubic_3div 361 2.99 33.08 3.79 0.170% 656 ncubic32 364 3.02 33.29 3.51 0.247% 544 ncubic 529 4.39 48.39 4.92 0.247% 720 hcbrt 539 4.47 49.25 5.98 1.580% 96 ocubic 732 4.93 61.83 7.22 0.274% 320 acbrt 842 6.98 76.73 8.55 0.275% 192 bictcp 1032 6.95 86.30 9.04 0.172% 768 And now by avg error : ncubic_3div 361 2.99 33.08 3.79 0.170% 656 bictcp 1032 6.95 86.30 9.04 0.172% 768 ncubic_2div 270 2.24 24.70 2.77 0.187% 512 ncubic_tab1 179 1.49 16.34 1.85 0.195% 320 ncubic32_1 359 2.98 32.81 3.59 0.238% 544 ncubic 529 4.39 48.39 4.92 0.247% 720 ncubic32 364 3.02 33.29 3.51 0.247% 544 ncubic_ndiv3 263 2.18 24.04 3.59 0.250% 512 ocubic 732 4.93 61.83 7.22 0.274% 320 acbrt 842 6.98 76.73 8.55 0.275% 192 ncubic_1div 178 1.48 16.27 1.81 0.443% 336 ncubic_tab0 79 0.66 7.20 1.04 0.613% 160 hcbrt 539 4.47 49.25 5.98 1.580% 96 ncubic_0div 84 0.70 7.64 1.57 4.521% 192 And here comes the code. I have tried to document it a bit, as much as can be done on experimentation code. It is often easier to use a pencil and paper to understand how the bits move. Regards, Willy /* Here is a better version of the benchmark code. It has the original code used in 2.4 version of Cubic for comparison ----------------------------------------------------------- */ /* Test and measure perf of cube root algorithms. */ #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <math.h> #include <unistd.h> #ifdef __x86_64 #define rdtscll(val) do { \ unsigned int __a,__d; \ asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \ (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \ } while(0) # define do_div(n,base) ({ \ uint32_t __base = (base); \ uint32_t __rem; \ __rem = ((uint64_t)(n)) % __base; \ (n) = ((uint64_t)(n)) / __base; \ __rem; \ }) /** * __ffs - find first bit in word. * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ static __inline__ unsigned long __ffs(unsigned long word) { __asm__("bsfq %1,%0" :"=r" (word) :"rm" (word)); return word; } /* * __fls: find last bit set. * @word: The word to search * * Undefined if no zero exists, so code should check against ~0UL first. */ static inline unsigned long __fls(unsigned long word) { __asm__("bsrq %1,%0" :"=r" (word) :"rm" (word)); return word; } /** * ffs - find first bit set * @x: the word to search * * This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). 
*/ static __inline__ int ffs(int x) { int r; __asm__("bsfl %1,%0\n\t" "cmovzl %2,%0" : "=r" (r) : "rm" (x), "r" (-1)); return r+1; } /** * fls - find last bit set * @x: the word to search * * This is defined the same way as ffs. */ static inline int fls(int x) { int r; __asm__("bsrl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "rm" (-1)); return r+1; } /** * fls64 - find last bit set in 64 bit word * @x: the word to search * * This is defined the same way as fls. */ static inline int fls64(uint64_t x) { if (x == 0) return 0; return __fls(x) + 1; } static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor) { return dividend / divisor; } #elif __i386 #define rdtscll(val) \ __asm__ __volatile__("rdtsc" : "=A" (val)) /** * ffs - find first bit set * @x: the word to search * * This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz() (man ffs). */ static inline int ffs(int x) { int r; __asm__("bsfl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); return r+1; } /** * fls - find last bit set * @x: the word to search * * This is defined the same way as ffs(). */ static inline int fls(int x) { int r; __asm__("bsrl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); return r+1; } static inline int fls64(uint64_t x) { uint32_t h = x >> 32; if (h) return fls(h) + 32; return fls(x); } #define do_div(n,base) ({ \ unsigned long __upper, __low, __high, __mod, __base; \ __base = (base); \ asm("":"=a" (__low), "=d" (__high):"A" (n)); \ __upper = __high; \ if (__high) { \ __upper = __high % (__base); \ __high = __high / (__base); \ } \ asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \ asm("":"=A" (n):"a" (__low),"d" (__high)); \ __mod; \ }) /* 64bit divisor, dividend and result. 
dynamic precision */ static uint64_t div64_64(uint64_t dividend, uint64_t divisor) { uint32_t d = divisor; if (divisor > 0xffffffffULL) { unsigned int shift = fls(divisor >> 32); d = divisor >> shift; dividend >>= shift; } /* avoid 64 bit division if possible */ if (dividend >> 32) do_div(dividend, d); else dividend = (uint32_t) dividend / d; return dividend; } /* this one only works when the result is below 32 bits */ static uint32_t div64_64_32(uint64_t dividend, uint64_t divisor) { uint32_t d = divisor; if (divisor > 0xffffffffULL) { unsigned int shift = fls(divisor >> 32); d = divisor >> shift; dividend >>= shift; } /* avoid 64 bit division if possible */ if (dividend >> 32) do_div(dividend, d); else dividend = (uint32_t) dividend / d; return dividend; } #endif /* Andi Kleen's version */ uint32_t acbrt(uint64_t x) { uint32_t y = 0; int s; for (s = 63; s >= 0; s -= 3) { uint64_t b, bs; y = 2 * y; b = 3 * y * (y+1) + 1; bs = b << s; if (x >= bs && (b == (bs>>s))) { /* avoid overflow */ x -= bs; y++; } } return y; } uint32_t end_acbrt() { } /* My version of hacker's delight */ uint32_t hcbrt(uint64_t x) { int s = 60; uint32_t y = 0; do { uint64_t b; y = 2*y; b = (uint64_t)(3*y*(y + 1) + 1) << s; s = s - 3; if (x >= b) { x = x - b; y = y + 1; } } while(s >= 0); return y; } uint32_t end_hcbrt() { } /* calculate the cubic root of x using Newton-Raphson */ static uint32_t ocubic(uint64_t a) { uint32_t x, x1; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << (fls64(a)/3); /* * Iteration based on: * 2 * x = ( 2 * x + a / x ) / 3 * k+1 k k */ do { x1 = x; x = (2 * x + div64_64(a, (uint64_t)x * x)) / 3; } while (abs(x1 - x) > 1); return x; } static uint32_t end_ocubic() { } /* calculate the cubic root of x using Newton-Raphson */ static uint32_t ncubic(uint64_t a) { uint64_t x; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << (fls64(a)/3); /* Converges in 3 iterations to > 32 bits */ x = (2 * x + div64_64(a, x*x)) / 3; x = (2 * x + div64_64(a, x*x)) / 3; x = (2 * x + div64_64(a, x*x)) / 3; return x; } static uint32_t end_ncubic() { } /* calculate the cubic root of x using Newton-Raphson. * Avg err ~= 0.247% */ static uint32_t ncubic32(uint64_t a) { uint32_t x; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << (fls64(a)/3); /* Converges in 3 iterations to > 32 bits */ /* We can do 32bit maths here : * x ~= cbrt(a) so (a/x^2) ~= cbrt(a) which is about 22 bits max */ x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3; x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3; x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3; return x; } static uint32_t end_ncubic32() { } /* calculate the cubic root of x using Newton-Raphson - small refinement. * Avg err ~= 0.238% */ static uint32_t ncubic32_1(uint64_t a) { uint32_t x; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << ((fls64(a)+1)/3); /* Converges in 3 iterations to > 32 bits */ /* We can do 32bit maths here : * x ~= cbrt(a) so (a/x^2) ~= cbrt(a) which is about 22 bits max */ x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3; x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3; x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3; return x; } static uint32_t end_ncubic32_1() { } int man_adj; #define ALIGN64 __attribute__ ((aligned (8))) /* calculate the cubic root of x using Newton-Raphson with less divides. 
* Avg err ~= 0.250% */ static uint32_t ncubic_ndiv3(uint64_t a) { uint32_t x; uint32_t b; uint32_t y; /* * For low values (between 2 and 63), we use a direct mapping of the * input divided by 4 (between 0 and 15) to the output between 1 and 4. * Those 4 values can be stored as two bits if we store the result * minus 1, which constitute 32 bits for the 16 values. * We use a uint32_t for this, which we shift right by a/4. * * a / 4 = 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 * a_max = 63 59 55 51 47 43 39 35 31 27 23 19 15 11 07 03 * a_min = 60 56 52 48 44 40 36 32 28 24 20 16 12 08 04 00 * cbrt(a) = 4 4 4 4 4 3 3 3 3 3 3 3 2 2 2 1 * bits = 11 11 11 11 11 10 10 10 10 10 10 10 01 01 01 00 * = 0xFFEAAA54UL */ ALIGN64 static uint32_t cbrt_x_4m1 = 0xFFEAAA54UL; /* For higher values, we use an initial estimation based on this fact : * cbrt(x) = exp(log(x) / 3) * and : * cbrt(x * y) = cbrt(x) * cbrt(y) * * So for a block of 3 input bits, we can get 1 output bit, and for * 6 input bits, we get 2 output bits (3 in fact due to rounding up). * So we have to operate on 3bit boundaries, and check the highest * 6 bits to provide 2 to 3 bits on output. * * Let's consider n the value so that we have between 4 and 6 MSB * between bits 3n and 3n+5. We will set the output bits between * n and n+2. * * We have 64 possible values for the 6 MSB. But since a cbrt() * output changes slower than its input, we can easily focus on * the 4 MSB only. This means 16 values. Just like above for the * small values, we can store 16 * 2 bits in a uint32_t. * * The little difference is that we are not seeking exact output * result, but the closest value to the exact output, to improve * convergence speed. To achieve this, we start with same values * as above, and iteratively refine them by hand so that the average * error reaches its minimum. * * Theorical map value is 0xFFEAAA64UL. * Experimental best value is 0xFFAFAA94UL. * * In order to find blocks of 3 bits aligned on 3n bits boundaries, * we have to shift right by multiples of 3 bits. We want to avoid * a costly divide, so we instead multiply by 84 and shift right by 8, * as this returns the same values for inputs below and 64. * * For the results, we still have to divide by 3 multiple times. We * know the result as well as intermediate values are less than 2^22, * so we can use the same principle with shifted arithmetics : * * 341/1024 < 1/3 < 342/1024 * * Rouding errors on integer maths generally can be compensated for by * small offset adjustments before divides. Some have been added after * experimentations to provide better accuracy. * */ ALIGN64 static uint32_t cbrt_4msb2lsb = 0xFFAFAA94UL; /* cbrt([0..63]/4)-1 */ b = fls64(a); if (b <= 1) return b; else if (b < 7) { /* a in [0..63] */ uint32_t bits; bits = (uint8_t)a >> 1; bits &= ~1; return 1 + ((cbrt_x_4m1 >> bits) & 0x03); } /* We will shift a right by 3n bits, to retrieve bits 3n..3n+5. * We want (b - 1) / 3 for b between 7 and 64 so that we have a * a bit shift count between 2 and 21 inclusive. 
*/ b = (b * 84) >> 8; y = (a >> (b * 3 - 1)) << 1; x = 1 + ((cbrt_4msb2lsb >> y) & 3); x = (x << b) >> 1; /* x += 6; // provides slightly better accuracy (0.246% vs 0.250%) */ x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)(x - 1))); x = ((x * 344) >> 10); /* = x/2.977 ~= x/3 */ x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)(x - 1))); x = ((x * 341) >> 10); /* = x/3.003 ~= x/3 */ return x; } static uint32_t end_ncubic_ndiv3() { } /* We use this constant to extract the 3 highest bits of <a> and shift them * 2 bits left in order to provide a 4bit-aligned shift pointer to an uint32_t. */ #define VAL_3BIT_TO_SHIFT4(a,b) (((a) >> ((b) * 3)) << 2) /* calculate the cubic root of x using Newton-Raphson in 3 divides after first * approximation. * * Avg err ~= 0.170% */ static uint32_t ncubic_3div(uint64_t a) { uint32_t x; uint32_t b; uint32_t shift; /* * For large values, 3 bits inputs are enough. * * We store 4*cbrt(8*x)-1 for x in [0..7] * * x | cbrt | 4*cbrt | int | int | * range | range | approx | val.| -3 | * -------+-------------+--------+-----+-----+ * 56..63 | 3.83 - 3.98 | 15.66 | 16 | 13 | * 48..55 | 3.63 - 3.80 | 14.93 | 15 | 12 | * 40..47 | 3.42 - 3.61 | 14.12 | 14 | 11 | * 32..39 | 3.17 - 3.39 | 13.21 | 13 | 10 | * 24..31 | 2.88 - 3.14 | 12.15 | 12 | 9 | * 16..23 | 2.51 - 2.84 | 10.86 | 11 | 8 | * 08..15 | 2.00 - 2.46 | 9.16 | 9 | 6 | * 00..07 | 1.00 - 1.91 | 6.35 | 6 | 3 | * * We can store (4*cbrt(x)-3) in 4 bits for x in [0..63]. * So we use the following map : 0xDCBA9863UL */ ALIGN64 static uint32_t cbrt_3msb_4lsb = 0xDCBA9863UL; b = fls64(a); if (b <= 1) return b; if (b < 7) { /* a in [0..63] */ uint32_t bits; bits = (uint8_t)a >> 1; bits &= ~1; return 1 + ((0xFFEAAA54UL >> bits) & 0x03); } /* We want (b - 1) / 3 for b between 7 and 64 so that we have a * a bit shift count between 2 and 21 inclusive. */ b = (b * 84) >> 8; /* We want the highest 3bit block from 'a' */ x = ((0xDCBA9863UL >> VAL_3BIT_TO_SHIFT4(a, b)) & 0x0F) + 3; x = (x << b) >> 3; x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)); x = ((x * 348) >> 10); // = x/2.94. Gives best precision here. x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)); x = ((x * 352) >> 10); // = x/2.91. Gives best precision here x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)(x - 1))); x = ((x * 341) >> 10); // = x/3.003 ~= x/3 return x; } static uint32_t end_ncubic_3div() { } /* calculate the cubic root of x using Newton-Raphson in 2 divides after first * approximation. * * Avg err ~= 0.187% */ static uint32_t ncubic_2div(uint64_t a) { uint32_t x; uint32_t b; uint32_t shift; /* * For large values, 3 bits inputs are enough. * * We store 4*cbrt(8*x)-1 for x in [0..7] * * x | cbrt | 4*cbrt | int | int | * range | range | approx | val.| -3 | * -------+-------------+--------+-----+-----+ * 56..63 | 3.83 - 3.98 | 15.66 | 16 | 13 | * 48..55 | 3.63 - 3.80 | 14.93 | 15 | 12 | * 40..47 | 3.42 - 3.61 | 14.12 | 14 | 11 | * 32..39 | 3.17 - 3.39 | 13.21 | 13 | 10 | * 24..31 | 2.88 - 3.14 | 12.15 | 12 | 9 | * 16..23 | 2.51 - 2.84 | 10.86 | 11 | 8 | * 08..15 | 2.00 - 2.46 | 9.16 | 9 | 6 | * 00..07 | 1.00 - 1.91 | 6.35 | 6 | 3 | * * We can store (4*cbrt(x)-3) in 4 bits for x in [0..63]. 
* So we use the following map : 0xDCBA9863UL */ ALIGN64 static uint32_t cbrt_3msb_4lsb = 0xDCBA9863UL; b = fls64(a); if (b <= 1) return b; else if (b < 7) { /* a in [0..63] */ uint32_t bits; bits = (uint8_t)a >> 1; bits &= ~1; return 1 + ((0xFFEAAA54UL >> bits) & 0x03); } /* We want (b - 1) / 3 for b between 7 and 64 so that we have a * a bit shift count between 2 and 21 inclusive. */ b = (b * 84) >> 8; /* We want the highest 3bit block from 'a' */ shift = (a >> (b * 3)) << 2; /* shift is 0..28 now. */ x = ((/*cbrt_3msb_4lsb*/0xDCBA9863UL >> shift) & 0x0F) + 3; x = (x << b) >> 3; x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)); x = ((x * 352) >> 10); // = x/2.91. Gives best precision here. x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)(x - 1))); x = ((x * 341) >> 10); // = x/3.003 ~= x/3 return x; } static uint32_t end_ncubic_2div() { } /* calculate the cubic root of x using Newton-Raphson in a single divide after * first approximation. * * Avg err ~= 0.443% */ static uint32_t ncubic_1div(uint64_t a) { uint32_t x; uint32_t b; uint32_t shift; /* * For large values, 3 bits inputs are enough. * * We store 4*cbrt(8*x)-1 for x in [0..7] * * x | cbrt | 4*cbrt | int | int | * range | range | approx | val.| -3 | * -------+-------------+--------+-----+-----+ * 56..63 | 3.83 - 3.98 | 15.66 | 16 | 13 | * 48..55 | 3.63 - 3.80 | 14.93 | 15 | 12 | * 40..47 | 3.42 - 3.61 | 14.12 | 14 | 11 | * 32..39 | 3.17 - 3.39 | 13.21 | 13 | 10 | * 24..31 | 2.88 - 3.14 | 12.15 | 12 | 9 | * 16..23 | 2.51 - 2.84 | 10.86 | 11 | 8 | * 08..15 | 2.00 - 2.46 | 9.16 | 9 | 6 | * 00..07 | 1.00 - 1.91 | 6.35 | 6 | 3 | * * We can store (4*cbrt(x)-3) in 4 bits for x in [0..63]. * So we use the following map : 0xDCBA9863UL */ ALIGN64 static uint32_t cbrt_3msb_4lsb = 0xDCBA9863UL; b = fls64(a); if (b < 7) { if (b <= 1) return b; /* a in [0..63] */ uint32_t bits; bits = (uint8_t)a >> 1; bits &= ~1; return 1 + ((0xFFEAAA54UL >> bits) & 0x03); } /* We want (b - 1) / 3 for b between 7 and 64 so that we have a * a bit shift count between 2 and 21 inclusive. */ b = (b * 84) >> 8; /* We want the highest 3bit block from 'a' */ x = ((/*cbrt_3msb_4lsb*/0xDCBA9863UL >> VAL_3BIT_TO_SHIFT4(a, b)) & 0x0F) + 3; x = (x << b) >> 3; /* one divide is enough to get a value within 0.4% */ x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x * (uint64_t)(x - 1))); x = ((x * 341) >> 10); return x; } static uint32_t end_ncubic_1div() { } /* calculate the cubic root of x using only an approximation. * Avg err ~= 4.521% */ static uint32_t ncubic_0div(uint64_t a) { uint32_t x; uint32_t b; /* * For large values, 3 bits inputs are enough. * * We store 4*cbrt(8*x)-1 for x in [0..7] * * x | cbrt | 4*cbrt | int | int | * range | range | approx | val.| -3 | * -------+-------------+--------+-----+-----+ * 56..63 | 3.83 - 3.98 | 15.66 | 16 | 13 | * 48..55 | 3.63 - 3.80 | 14.93 | 15 | 12 | * 40..47 | 3.42 - 3.61 | 14.12 | 14 | 11 | * 32..39 | 3.17 - 3.39 | 13.21 | 13 | 10 | * 24..31 | 2.88 - 3.14 | 12.15 | 12 | 9 | * 16..23 | 2.51 - 2.84 | 10.86 | 11 | 8 | * 08..15 | 2.00 - 2.46 | 9.16 | 9 | 6 | * 00..07 | 1.00 - 1.91 | 6.35 | 6 | 3 | * * We can store (4*cbrt(x)-3) in 4 bits for x in [0..63]. 
* So we use the following map : 0xDCBA9863UL */ ALIGN64 static uint32_t cbrt_3msb_4lsb = 0xDCBA9863UL; b = fls64(a); if (b < 7) { if (b <= 1) return b; /* a in [0..63] */ uint32_t bits; bits = (uint8_t)a >> 1; bits &= ~1; return 1 + ((0xFFEAAA54UL >> bits) & 0x03); } /* We want (b - 1) / 3 for b between 7 and 64 so that we have a * a bit shift count between 2 and 21 inclusive. */ b = (b * 84) >> 8; /* We want the highest 3bit block from 'a' */ x = ((/*cbrt_3msb_4lsb*/0xDCBA9863UL >> VAL_3BIT_TO_SHIFT4(a, b)) & 0x0F) + 3; x = (x << b) >> 3; return x; } static uint32_t end_ncubic_0div() { } /* calculate the cubic root of x using table lookups only. * Avg err ~= 0.613% */ static uint32_t ncubic_tab0(uint64_t a) { uint32_t b; uint32_t shift; /* * cbrt(x) MSB values for x MSB values in [0..63]. * Precomputed then refined by hand - Willy Tarreau * * For x in [0..63], * v = cbrt(x << 18) - 1 * cbrt(x) = (v[x] + 1) >> 6 */ static uint8_t v[] = { /* 0x00 */ 0, 63, 63, 63, 127, 127, 127, 127, /* 0x08 */ 129, 135, 139, 143, 147, 152, 155, 160, /* 0x10 */ 161, 165, 168, 172, 174, 177, 179, 182, /* 0x18 */ 184, 188, 190, 193, 195, 197, 199, 202, /* 0x20 */ 203, 205, 207, 209, 212, 213, 215, 217, /* 0x28 */ 219, 221, 222, 224, 226, 227, 229, 231, /* 0x30 */ 232, 234, 236, 237, 239, 240, 241, 243, /* 0x38 */ 244, 246, 247, 249, 251, 252, 253, 255, }; b = fls64(a); if (b < 7) /* a in [0..63] */ return (v[(uint32_t)a] + 31) >> 6; b = ((b * 84) >> 8) - 1; shift = (a >> (b * 3)); return ((uint32_t)(v[shift] + 1) << b) >> 6; } static uint32_t end_ncubic_tab0() { } /* calculate the cubic root of x using a table lookup followed by one * Newton-Raphson iteration. * Avg err ~= 0.195% */ static uint32_t ncubic_tab1(uint64_t a) { uint32_t x; uint32_t b; uint32_t z; uint32_t shift; /* * cbrt(x) MSB values for x MSB values in [0..63]. * Precomputed then refined by hand - Willy Tarreau * * For x in [0..63], * v = cbrt(x << 18) - 1 * cbrt(x) = (v[x] + 10) >> 6 */ static uint8_t v[] = { /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118, /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156, /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179, /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199, /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215, /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229, /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242, /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254, }; b = fls64(a); if (b < 7) { /* a in [0..63] */ return ((uint32_t)v[(uint32_t)a] + 35) >> 6; } b = ((b * 84) >> 8) - 1; shift = (a >> (b * 3)); x = ((uint32_t)(((uint32_t)v[shift] + 10) << b)) >> 6; /* one divide is enough to get a value within 0.19% */ x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x * (uint64_t)(x - 1))); x = ((x * 341) >> 10); return x; } static uint32_t end_ncubic_tab1() { } /* 65536 times the cubic root of 0, 1, 2, 3, 4, 5, 6, 7*/ static uint64_t bictcp_table[8] = {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367}; /* calculate the cubic root of x the basic idea is that x can be expressed as i*8^j so cubic_root(x) = cubic_root(i)*2^j in the following code, x is i, and y is 2^j because of integer calculation, there are errors in calculation so finally use binary search to find out the exact solution. 
Avg err ~= 0.172% */ static uint32_t bictcp(uint64_t x) { uint64_t y, app, target, start, end, mid, start_diff, end_diff; if (x == 0) return 0; target = x; /*first estimate lower and upper bound*/ y = 1; while (x >= 8){ x = (x >> 3); y = (y << 1); } start = (y*bictcp_table[x])>>16; if (x==7) end = (y<<1); else end = (y*bictcp_table[x+1]+65535)>>16; /*binary search for more accurate one*/ while (start < end-1) { mid = (start+end) >> 1; app = mid*mid*mid; if (app < target) start = mid; else if (app > target) end = mid; else return mid; } /*find the most accurate one from start and end*/ app = start*start*start; if (app < target) start_diff = target - app; else start_diff = app - target; app = end*end*end; if (app < target) end_diff = target - app; else end_diff = app - target; return (start_diff < end_diff) ? start : end; } static uint32_t end_bictcp() { } #define NCASES 1000 static uint64_t cases[NCASES]; static double results[NCASES]; static double ticks_per_usec; static unsigned long long start, end; static void dotest(const char *name, uint32_t (*func)(uint64_t), int size) { int i; unsigned long long t, mx = 0, sum = 0, sum_sq = 0; double mean, std, err = 0; for (i = 0; i < NCASES; i++) { uint64_t x = cases[i]; uint32_t v; rdtscll(start); v = (*func)(x); rdtscll(end); t = end - start; if (t > mx) mx = t; sum += t; sum_sq += t*t; err += fabs(((double) v - results[i]) / results[i]); } mean = (double) sum / ticks_per_usec / NCASES ; std = sqrtl( (double) sum_sq / ticks_per_usec / NCASES - mean * mean); printf("%-15s %8llu %8.2f %8.2f %8.2f %.03f%% %4d\n", name, (unsigned long long) sum / NCASES, mean, std, (double) mx / ticks_per_usec, err * 100./ NCASES, size); } int main(int argc, char **argv) { uint64_t x; int i; printf("Calibrating\n"); rdtscll(start); sleep(2); rdtscll(end); ticks_per_usec = (double) (end - start) / 2000000.; for (i = 0; i < 63; i++) cases[i] = 1ull << i; x = ~0; while (x != 0) { cases[i++] = x; x >>= 1; } x = ~0; while (x != 0) { cases[i++] = x; x <<= 1; } while (i < NCASES) cases[i++] = (uint64_t) random() * (uint64_t) random(); for (i = 0; i < NCASES; i++) results[i] = cbrt((double)cases[i]); printf("Function clocks mean(us) max(us) std(us) Avg err size\n"); #define DOTEST(x) dotest(#x, x, end_##x-x) DOTEST(bictcp); DOTEST(ocubic); DOTEST(ncubic); DOTEST(ncubic32); DOTEST(ncubic32_1); DOTEST(ncubic_ndiv3); DOTEST(ncubic_3div); DOTEST(ncubic_2div); DOTEST(ncubic_1div); DOTEST(ncubic_0div); DOTEST(ncubic_tab1); DOTEST(ncubic_tab0); //for (man_adj = 0; man_adj < 16; man_adj++) { // printf("%02d : ", man_adj); // DOTEST(ncubic_tab1); //} DOTEST(acbrt); DOTEST(hcbrt); return 0; } ^ permalink raw reply [flat|nested] 57+ messages in thread
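[Two fixed-point reciprocals do the heavy lifting above: (b * 84) >> 8 stands in for (b - 1) / 3 on the bit index, and x * 341 >> 10 sits just under x / 3 on the 22-bit intermediate values. A standalone check of both claims -- a sketch, not part of Willy's post; the loop bounds merely follow the comments above:]

/* Verify the multiply/shift substitutes for "/ 3" used above:
 *   (b * 84) >> 8   == (b - 1) / 3  exactly, for every b in [1..64]
 *   (x * 341) >> 10 ~= x / 3: never above it, < 0.5% low for x >= 1024
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t b, x;

    for (b = 1; b <= 64; b++)               /* bit-index divide */
        assert(((b * 84) >> 8) == (b - 1) / 3);

    for (x = 1024; x < (1u << 23); x++) {   /* value divide */
        uint32_t a341 = (x * 341) >> 10;    /* 341/1024 < 1/3 < 342/1024 */

        assert(a341 <= x / 3);
        assert((x / 3 - a341) * 1000 < 5 * (x / 3));  /* < 0.5% short */
    }
    printf("multiply/shift identities hold\n");
    return 0;
}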
* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-13 20:50 ` Willy Tarreau
@ 2007-03-21 18:54 ` Stephen Hemminger
  2007-03-21 19:15   ` Willy Tarreau
  0 siblings, 1 reply; 57+ messages in thread
From: Stephen Hemminger @ 2007-03-21 18:54 UTC (permalink / raw)
To: Willy Tarreau
Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

On Tue, 13 Mar 2007 21:50:20 +0100
Willy Tarreau <w@1wt.eu> wrote:

> Hi Stephen,
>
> On Mon, Mar 12, 2007 at 02:11:56PM -0700, Stephen Hemminger wrote:
> > > Oh BTW, I have a newer version with a first approximation of the
> > > cbrt() before the div64_64, which allows us to reduce from 3 div64
> > > to only 2 div64. This results in a version which is twice as fast
> > > as the initial one (ncubic), but with slightly less accuracy (0.286%
> > > compared to 0.247%). But I see that other functions such as hcbrt()
> > > had a 1.5% avg error, so I think this is not dramatic.
> >
> > Ignore my hcbrt(); it was a less accurate version of Andi's stuff.
>
> OK.
>
> > > Also, I managed to remove all other divides, to be kind to CPUs
> > > having a slow divide instruction or no divide at all. Since we compute
> > > on a limited range (22 bits), we can multiply then shift right. It shows
> > > me even slightly better times on pentium-m and athlon, with a slightly
> > > higher avg error (0.297% compared to 0.286%), and slightly smaller
> > > code.
> >
> > What does the code look like?
>
> Well, I have cleaned it a little bit; there were more comments and ifdefs
> than code! I've appended it to the end of this mail.
>
> I have changed it a bit, because I noticed that integer divide precision
> was so coarse that there were other possibilities to play with the bits.
>
> I have experimented with combinations of several methods :
>
>   - replace integer divides with multiplies/shifts where possible.
>
>   - compensation for divide imprecisions by adding/removing small
>     values before/after them. Often, the integer result of 1/(x*(x-1))
>     is closer to (float)1/(float)x^2 than 1/(x*x). This is because
>     the divide always truncates the result.
>
>   - use direct result lookup for small values. Small inputs give small
>     outputs which have very few moving bits. Many different values fit
>     in a 32bit integer, so we use a shift offset to look up the value.
>     I used this in an fls function I wrote a while ago, which I should
>     also post because it is up to twice as fast as the kernel's.
>     Sometimes it seems faster to look up from memory, sometimes it
>     is faster from an immediate value. Maybe more visible differences
>     would show up on RISC CPUs where loading a 32 bit immediate needs
>     two instructions. I don't know yet; I've not tested on my sparc
>     yet.
>
>   - use small lookup tables (64 bytes) with 6 bits of input and at least
>     as many on output. We only look up the 6 MSB and return the 2-3 MSB
>     of the result.
>
>   - iterative search and manual refinement of the lookup tables for best
>     accuracy. The avg error rate can easily be halved this way.
>
> I have duplicated and tried several functions with 0, 1, 2 and 3 divides.
> Several of them offer better accuracy than what we currently have, in
> fewer cycles. Others offer faster results (up to 5 times) with slightly
> less accuracy.
>
> There is one function which is not to be used, but is just here for
> comparison (ncubic_0div). It does no divide but has an awful avg error.
>
> But one which is interesting is ncubic_tab0. It does not use any
> divide at all, not even any div64. It shows a 0.6% avg error, which I'm
> not sure is acceptable or not. It is 6.7 times faster than the initial
> ncubic() with less accuracy, and 4 times smaller. I suspect that it can
> differ even more on architectures which have no divide instruction.
>
> If a 0.6% avg error rate is too much, ncubic_tab1() uses one single div64
> and is twice as slow (still nearly 3 times faster than ncubic). It shows
> 0.195% avg error, which is better than the initial ncubic. I think that
> it is a good tradeoff.
>
> If best accuracy is an absolute requirement, then I have a variation of
> ncubic (ncubic_3div) which gets 0.17% (compared to 0.247%) in 2/3 of the
> time, and which is slightly smaller.
>
> I have also added a "size" column, indicating the approximate function
> size, provided that the compiler does not reorder the code. On gcc 3.4
> it's OK, but 4.1 returns garbage. That does not matter, it's just a
> rough estimate anyway.
>
> Here are the results classed by speed :
>
> /* Sample output on a Pentium-M 600 MHz :
>
> Function        clocks  mean(us)  max(us)  std(us)  Avg err  size
> ncubic_tab0         79      0.66     7.20     1.04   0.613%   160
> ncubic_0div         84      0.70     7.64     1.57   4.521%   192
> ncubic_1div        178      1.48    16.27     1.81   0.443%   336
> ncubic_tab1        179      1.49    16.34     1.85   0.195%   320
> ncubic_ndiv3       263      2.18    24.04     3.59   0.250%   512
> ncubic_2div        270      2.24    24.70     2.77   0.187%   512
> ncubic32_1         359      2.98    32.81     3.59   0.238%   544
> ncubic_3div        361      2.99    33.08     3.79   0.170%   656
> ncubic32           364      3.02    33.29     3.51   0.247%   544
> ncubic             529      4.39    48.39     4.92   0.247%   720
> hcbrt              539      4.47    49.25     5.98   1.580%    96
> ocubic             732      4.93    61.83     7.22   0.274%   320
> acbrt              842      6.98    76.73     8.55   0.275%   192
> bictcp            1032      6.95    86.30     9.04   0.172%   768
>
> And now by avg error :
>
> ncubic_3div        361      2.99    33.08     3.79   0.170%   656
> bictcp            1032      6.95    86.30     9.04   0.172%   768
> ncubic_2div        270      2.24    24.70     2.77   0.187%   512
> ncubic_tab1        179      1.49    16.34     1.85   0.195%   320
> ncubic32_1         359      2.98    32.81     3.59   0.238%   544
> ncubic             529      4.39    48.39     4.92   0.247%   720
> ncubic32           364      3.02    33.29     3.51   0.247%   544
> ncubic_ndiv3       263      2.18    24.04     3.59   0.250%   512
> ocubic             732      4.93    61.83     7.22   0.274%   320
> acbrt              842      6.98    76.73     8.55   0.275%   192
> ncubic_1div        178      1.48    16.27     1.81   0.443%   336
> ncubic_tab0         79      0.66     7.20     1.04   0.613%   160
> hcbrt              539      4.47    49.25     5.98   1.580%    96
> ncubic_0div         84      0.70     7.64     1.57   4.521%   192
>
> And here comes the code. I have tried to document it a bit, as much
> as can be done on experimentation code. It is often easier to use
> a pencil and paper to understand how the bits move.
>
> Regards,
> Willy

The following version of div64_64 is faster because do_div is already
optimized for the 32 bit case. I get the following results on a ULV
Core Solo (i.e. a slow current processor) and on a 64-bit Core Duo.
ncubic_tab1 seems like the best (no additional error and about as fast).

ULV Core Solo

Function        clocks  mean(us)  max(us)  std(us)  Avg err   size
ncubic_tab0        192     11.24    45.10    15.28   0.450%  -2262
ncubic_0div        201     11.77    47.23    27.40   3.357%  -2404
ncubic_1div        324     19.02    76.32    25.82   0.189%  -2567
ncubic_tab1        326     19.13    76.73    23.71   0.043%  -2059
ncubic_2div        456     26.72   108.92   493.16   0.028%  -2790
ncubic_ndiv3       463     27.15   133.37  1889.39   0.104%  -3344
ncubic32           549     32.18   130.59   508.97   0.041%  -3794
ncubic32_1         574     33.66   138.32   548.48   0.029%  -3604
ncubic_3div        581     34.04   140.24   608.55   0.018%  -3050
ncubic             733     42.92   173.35   523.19   0.041%    299
ocubic            1046     61.25   283.68  3305.65   0.027%  -2232
acbrt             1149     67.32   284.91  1941.55   0.029%    168
bictcp            1663     97.41   394.29   604.86   0.017%    628

Core 2 Duo

Function        clocks  mean(us)  max(us)  std(us)  Avg err   size
ncubic_0div         74      0.03     1.60     0.07   3.357%  -2101
ncubic_tab0         74      0.03     1.60     0.04   0.450%  -2029
ncubic_1div        142      0.07     3.11     1.05   0.189%  -2195
ncubic_tab1        144      0.07     3.18     1.02   0.043%  -1638
ncubic_2div        216      0.10     4.74     1.07   0.028%  -2326
ncubic_ndiv3       219      0.10     4.76     1.04   0.104%  -2709
ncubic32           269      0.13     5.87     1.13   0.041%  -1500
ncubic32_1         272      0.13     5.92     1.10   0.029%  -2881
ncubic             273      0.13     5.96     1.13   0.041%  -1763
ncubic_3div        290      0.14     6.32     1.01   0.018%  -2499
acbrt              430      0.20     9.42     1.18   0.029%     77
ocubic             444      0.21     9.82     1.82   0.027%  -1924
bictcp             549      0.26    12.06     1.68   0.017%    236

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 57+ messages in thread
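[The precision cost of the divisor truncation in div64_64 is easy to probe in userspace, where a native 64/64 divide is available to compare against: only divisor bits below the top 31 are discarded, so the quotient stays within roughly a few parts in 2^31 of exact, plus the usual truncation. A quick sketch, not from the thread; approx_div64_64() mirrors the logic Stephen describes, the test loop is illustrative:]

/* Probe the worst-case relative error of the divisor-truncating
 * div64_64 against the exact quotient (exact here thanks to the
 * compiler's native 64/64 "/" -- the very thing the kernel avoids). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int fls32(uint32_t x)
{
    int r = 0;
    while (x) {
        x >>= 1;
        r++;
    }
    return r;
}

static uint64_t approx_div64_64(uint64_t dividend, uint64_t divisor)
{
    uint32_t high = divisor >> 32;

    if (high) {
        int shift = fls32(high);
        divisor >>= shift;          /* now fits in 32 bits */
        dividend >>= shift;
    }
    return dividend / (uint32_t)divisor;
}

int main(void)
{
    double worst = 0;
    int i;

    srandom(1);
    for (i = 0; i < 1000000; i++) {
        uint64_t a = ((uint64_t)random() << 32) ^ random();
        uint64_t b = ((((uint64_t)random() << 32) ^ random()) | 1);
        uint64_t exact = a / b;
        uint64_t approx = approx_div64_64(a, b);
        uint64_t diff = approx > exact ? approx - exact : exact - approx;

        if (exact && (double)diff / exact > worst)
            worst = (double)diff / exact;
    }
    printf("worst relative error seen: %.10f\n", worst);
    return 0;
}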
* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-21 18:54 ` Stephen Hemminger
@ 2007-03-21 19:15 ` Willy Tarreau
  2007-03-21 19:58   ` Stephen Hemminger
  2007-03-21 20:15   ` [PATCH 1/2] div64_64 optimization Stephen Hemminger
  0 siblings, 2 replies; 57+ messages in thread
From: Willy Tarreau @ 2007-03-21 19:15 UTC (permalink / raw)
To: Stephen Hemminger
Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

Hi Stephen,

On Wed, Mar 21, 2007 at 11:54:19AM -0700, Stephen Hemminger wrote:
> On Tue, 13 Mar 2007 21:50:20 +0100
> Willy Tarreau <w@1wt.eu> wrote:

[...] (cut my boring part)

> > Here are the results classed by speed :
> >
> > /* Sample output on a Pentium-M 600 MHz :
> >
> > Function        clocks  mean(us)  max(us)  std(us)  Avg err  size
> > ncubic_tab0         79      0.66     7.20     1.04   0.613%   160
> > ncubic_0div         84      0.70     7.64     1.57   4.521%   192
> > ncubic_1div        178      1.48    16.27     1.81   0.443%   336
> > ncubic_tab1        179      1.49    16.34     1.85   0.195%   320
> > ncubic_ndiv3       263      2.18    24.04     3.59   0.250%   512
> > ncubic_2div        270      2.24    24.70     2.77   0.187%   512
> > ncubic32_1         359      2.98    32.81     3.59   0.238%   544
> > ncubic_3div        361      2.99    33.08     3.79   0.170%   656
> > ncubic32           364      3.02    33.29     3.51   0.247%   544
> > ncubic             529      4.39    48.39     4.92   0.247%   720
> > hcbrt              539      4.47    49.25     5.98   1.580%    96
> > ocubic             732      4.93    61.83     7.22   0.274%   320
> > acbrt              842      6.98    76.73     8.55   0.275%   192
> > bictcp            1032      6.95    86.30     9.04   0.172%   768

[...]

> The following version of div64_64 is faster because do_div is already
> optimized for the 32 bit case.

Cool, this is interesting because I first wanted to optimize it but did
not find how to start with this. You seem to get very good results. BTW,
you did not append your changes.

However, one thing I do not understand is why your avg error is about 1/3
below the original one. Was there a precision bug in the original div64_64,
or did you extend the values used in the test ?

Or perhaps you used -ffast-math to build, and the original cbrt() is less
precise in that case ?

> I get the following results on a ULV Core Solo (i.e. a slow current
> processor) and on a 64-bit Core Duo. ncubic_tab1 seems like the best
> (no additional error and about as fast).

OK. It was the one I preferred too, unless tab0's avg error was acceptable.

> ULV Core Solo
>
> Function        clocks  mean(us)  max(us)  std(us)  Avg err   size
> ncubic_tab0        192     11.24    45.10    15.28   0.450%  -2262
> ncubic_0div        201     11.77    47.23    27.40   3.357%  -2404
> ncubic_1div        324     19.02    76.32    25.82   0.189%  -2567
> ncubic_tab1        326     19.13    76.73    23.71   0.043%  -2059
> ncubic_2div        456     26.72   108.92   493.16   0.028%  -2790
> ncubic_ndiv3       463     27.15   133.37  1889.39   0.104%  -3344
> ncubic32           549     32.18   130.59   508.97   0.041%  -3794
> ncubic32_1         574     33.66   138.32   548.48   0.029%  -3604
> ncubic_3div        581     34.04   140.24   608.55   0.018%  -3050
> ncubic             733     42.92   173.35   523.19   0.041%    299
> ocubic            1046     61.25   283.68  3305.65   0.027%  -2232
> acbrt             1149     67.32   284.91  1941.55   0.029%    168
> bictcp            1663     97.41   394.29   604.86   0.017%    628
>
> Core 2 Duo
>
> Function        clocks  mean(us)  max(us)  std(us)  Avg err   size
> ncubic_0div         74      0.03     1.60     0.07   3.357%  -2101
> ncubic_tab0         74      0.03     1.60     0.04   0.450%  -2029
> ncubic_1div        142      0.07     3.11     1.05   0.189%  -2195
> ncubic_tab1        144      0.07     3.18     1.02   0.043%  -1638
> ncubic_2div        216      0.10     4.74     1.07   0.028%  -2326
> ncubic_ndiv3       219      0.10     4.76     1.04   0.104%  -2709
> ncubic32           269      0.13     5.87     1.13   0.041%  -1500
> ncubic32_1         272      0.13     5.92     1.10   0.029%  -2881
> ncubic             273      0.13     5.96     1.13   0.041%  -1763
> ncubic_3div        290      0.14     6.32     1.01   0.018%  -2499
> acbrt              430      0.20     9.42     1.18   0.029%     77
> ocubic             444      0.21     9.82     1.82   0.027%  -1924
> bictcp             549      0.26    12.06     1.68   0.017%    236

Thanks,
Willy

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-21 19:15 ` Willy Tarreau
@ 2007-03-21 19:58 ` Stephen Hemminger
  0 siblings, 0 replies; 57+ messages in thread
From: Stephen Hemminger @ 2007-03-21 19:58 UTC (permalink / raw)
To: Willy Tarreau
Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

On Wed, 21 Mar 2007 20:15:37 +0100
Willy Tarreau <w@1wt.eu> wrote:

> Hi Stephen,
>
> On Wed, Mar 21, 2007 at 11:54:19AM -0700, Stephen Hemminger wrote:
> > On Tue, 13 Mar 2007 21:50:20 +0100
> > Willy Tarreau <w@1wt.eu> wrote:
>
> [...] (cut my boring part)
>
> > > Here are the results classed by speed :
> > >
> > > /* Sample output on a Pentium-M 600 MHz :
> > >
> > > Function        clocks  mean(us)  max(us)  std(us)  Avg err  size
> > > ncubic_tab0         79      0.66     7.20     1.04   0.613%   160
> > > ncubic_0div         84      0.70     7.64     1.57   4.521%   192
> > > ncubic_1div        178      1.48    16.27     1.81   0.443%   336
> > > ncubic_tab1        179      1.49    16.34     1.85   0.195%   320
> > > ncubic_ndiv3       263      2.18    24.04     3.59   0.250%   512
> > > ncubic_2div        270      2.24    24.70     2.77   0.187%   512
> > > ncubic32_1         359      2.98    32.81     3.59   0.238%   544
> > > ncubic_3div        361      2.99    33.08     3.79   0.170%   656
> > > ncubic32           364      3.02    33.29     3.51   0.247%   544
> > > ncubic             529      4.39    48.39     4.92   0.247%   720
> > > hcbrt              539      4.47    49.25     5.98   1.580%    96
> > > ocubic             732      4.93    61.83     7.22   0.274%   320
> > > acbrt              842      6.98    76.73     8.55   0.275%   192
> > > bictcp            1032      6.95    86.30     9.04   0.172%   768
>
> [...]
>
> > The following version of div64_64 is faster because do_div is already
> > optimized for the 32 bit case.

/* 64bit divisor, dividend and result. dynamic precision */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	uint32_t high, d;

	high = divisor >> 32;
	if (high) {
		unsigned int shift = fls(high);

		d = divisor >> shift;
		dividend >>= shift;
	} else
		d = divisor;

	do_div(dividend, d);

	return dividend;
}

> Cool, this is interesting because I first wanted to optimize it but did
> not find how to start with this. You seem to get very good results. BTW,
> you did not append your changes.
>
> However, one thing I do not understand is why your avg error is about 1/3
> below the original one. Was there a precision bug in the original div64_64,
> or did you extend the values used in the test ?
>
> Or perhaps you used -ffast-math to build, and the original cbrt() is less
> precise in that case ?

No, but I did use -mtune=pentium-m on the ULV.

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 57+ messages in thread
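[Stephen's point is that the architecture do_div() implementations already special-case a 32-bit dividend, so div64_64() only has to shrink the divisor. The do_div() contract itself is easy to miss: it divides its 64-bit lvalue in place and evaluates to the remainder. A userspace stand-in of the generic form used earlier in the benchmark code, for illustration only:]

/* Userspace stand-in for the kernel's generic do_div(): divides the
 * 64-bit lvalue n by a 32-bit base in place and evaluates to the
 * remainder.  GCC statement-expression, as in the kernel headers. */
#include <stdint.h>
#include <stdio.h>

#define do_div(n, base) ({                          \
    uint32_t __base = (base);                       \
    uint32_t __rem = ((uint64_t)(n)) % __base;      \
    (n) = ((uint64_t)(n)) / __base;                 \
    __rem;                                          \
})

int main(void)
{
    uint64_t n = 1000000007ULL;
    uint32_t rem = do_div(n, 10);   /* n becomes the quotient */

    printf("n=%llu rem=%u\n", (unsigned long long)n, rem); /* 100000000, 7 */
    return 0;
}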
* [PATCH 1/2] div64_64 optimization
  2007-03-21 19:15 ` Willy Tarreau
  2007-03-21 19:58   ` Stephen Hemminger
@ 2007-03-21 20:15 ` Stephen Hemminger
  2007-03-21 20:17   ` [PATCH 2/2] tcp: cubic optimization Stephen Hemminger
  2007-03-22 19:11   ` [PATCH 1/2] div64_64 optimization David Miller
  1 sibling, 2 replies; 57+ messages in thread
From: Stephen Hemminger @ 2007-03-21 20:15 UTC (permalink / raw)
To: Willy Tarreau, David Miller
Cc: rkuhn, andi, dada1, jengelh, linux-kernel, netdev

Minor optimization of div64_64. do_div() already does optimization
for the case of a 32 by 32 divide, so there is no need to do it here.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

--- net-2.6.22.orig/lib/div64.c	2007-03-21 12:03:59.000000000 -0700
+++ net-2.6.22/lib/div64.c	2007-03-21 12:04:46.000000000 -0700
@@ -61,20 +61,18 @@
 /* 64bit divisor, dividend and result. dynamic precision */
 uint64_t div64_64(uint64_t dividend, uint64_t divisor)
 {
-	uint32_t d = divisor;
+	uint32_t high, d;
 
-	if (divisor > 0xffffffffULL) {
-		unsigned int shift = fls(divisor >> 32);
+	high = divisor >> 32;
+	if (high) {
+		unsigned int shift = fls(high);
 		d = divisor >> shift;
 		dividend >>= shift;
-	}
+	} else
+		d = divisor;
 
-	/* avoid 64 bit division if possible */
-	if (dividend >> 32)
-		do_div(dividend, d);
-	else
-		dividend = (uint32_t) dividend / d;
+	do_div(dividend, d);
 
 	return dividend;
 }

^ permalink raw reply	[flat|nested] 57+ messages in thread
* [PATCH 2/2] tcp: cubic optimization
  2007-03-21 20:15 ` [PATCH 1/2] div64_64 optimization Stephen Hemminger
@ 2007-03-21 20:17 ` Stephen Hemminger
  2007-03-22 19:11   ` David Miller
  0 siblings, 1 reply; 57+ messages in thread
From: Stephen Hemminger @ 2007-03-21 20:17 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Willy Tarreau, David Miller, rkuhn, andi, dada1, jengelh,
	linux-kernel, netdev

Use Willy's work in optimizing the cube root by having a table for
small values.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

--- net-2.6.22.orig/net/ipv4/tcp_cubic.c	2007-03-21 12:57:11.000000000 -0700
+++ net-2.6.22/net/ipv4/tcp_cubic.c	2007-03-21 13:04:59.000000000 -0700
@@ -91,23 +91,51 @@
 	tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
-/*
- * calculate the cubic root of x using Newton-Raphson
+/* calculate the cubic root of x using a table lookup followed by one
+ * Newton-Raphson iteration.
+ * Avg err ~= 0.195%
  */
 static u32 cubic_root(u64 a)
 {
-	u32 x;
-
-	/* Initial estimate is based on:
-	 * cbrt(x) = exp(log(x) / 3)
+	u32 x, b, shift;
+	/*
+	 * cbrt(x) MSB values for x MSB values in [0..63].
+	 * Precomputed then refined by hand - Willy Tarreau
+	 *
+	 * For x in [0..63],
+	 *   v = cbrt(x << 18) - 1
+	 *   cbrt(x) = (v[x] + 10) >> 6
 	 */
-	x = 1u << (fls64(a)/3);
+	static const u8 v[] = {
+		/* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
+		/* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
+		/* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
+		/* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
+		/* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
+		/* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
+		/* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
+		/* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
+	};
+
+	b = fls64(a);
+	if (b < 7) {
+		/* a in [0..63] */
+		return ((u32)v[(u32)a] + 35) >> 6;
+	}
+
+	b = ((b * 84) >> 8) - 1;
+	shift = (a >> (b * 3));
 
-	/* converges to 32 bits in 3 iterations */
-	x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
-	x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
-	x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
+	x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;
+
+	/*
+	 * Newton-Raphson iteration
+	 *                          2
+	 * x    = ( 2 * x  +  a / x  ) / 3
+	 *  k+1          k         k
+	 */
+	x = (2 * x + (u32)div64_64(a, (u64)x * (u64)(x - 1)));
+	x = ((x * 341) >> 10);
 
 	return x;
 }

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 57+ messages in thread
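[Out of tree, the merged cubic_root() can be spot-checked with nothing more than typedef stubs and a native divide; perfect cubes should come back exact. A sketch -- the stubs and test values are illustrative, the function body is repeated verbatim from the patch above:]

/* Userspace spot-check of the merged cubic_root(). */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;
typedef uint32_t u32;
typedef uint8_t u8;

static int fls64(u64 x)             /* portable stand-in */
{
    int r = 0;
    while (x) {
        x >>= 1;
        r++;
    }
    return r;
}

static u64 div64_64(u64 dividend, u64 divisor)
{
    return dividend / divisor;      /* native divide in userspace */
}

static u32 cubic_root(u64 a)        /* body as in the patch above */
{
    u32 x, b, shift;
    static const u8 v[] = {
        /* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
        /* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
        /* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
        /* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
        /* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
        /* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
        /* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
        /* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
    };

    b = fls64(a);
    if (b < 7) {
        /* a in [0..63] */
        return ((u32)v[(u32)a] + 35) >> 6;
    }

    b = ((b * 84) >> 8) - 1;
    shift = (a >> (b * 3));

    x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;

    /* one Newton-Raphson iteration */
    x = (2 * x + (u32)div64_64(a, (u64)x * (u64)(x - 1)));
    x = ((x * 341) >> 10);

    return x;
}

int main(void)
{
    printf("%u %u %u\n",            /* expect 2 100 1024 */
           cubic_root(8), cubic_root(1000000), cubic_root(1ULL << 30));
    return 0;
}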
* Re: [PATCH 2/2] tcp: cubic optimization
  2007-03-21 20:17 ` [PATCH 2/2] tcp: cubic optimization Stephen Hemminger
@ 2007-03-22 19:11 ` David Miller
  0 siblings, 0 replies; 57+ messages in thread
From: David Miller @ 2007-03-22 19:11 UTC (permalink / raw)
To: shemminger; +Cc: w, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 21 Mar 2007 13:17:21 -0700

> Use Willy's work in optimizing the cube root by having a table for
> small values.
>
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

Also applied to net-2.6.22, thanks Stephen.

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [PATCH 1/2] div64_64 optimization
  2007-03-21 20:15 ` [PATCH 1/2] div64_64 optimization Stephen Hemminger
  2007-03-21 20:17   ` [PATCH 2/2] tcp: cubic optimization Stephen Hemminger
@ 2007-03-22 19:11 ` David Miller
  1 sibling, 0 replies; 57+ messages in thread
From: David Miller @ 2007-03-22 19:11 UTC (permalink / raw)
To: shemminger; +Cc: w, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 21 Mar 2007 13:15:32 -0700

> Minor optimization of div64_64. do_div() already does optimization
> for the case of a 32 by 32 divide, so there is no need to do it here.
>
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

Applied to net-2.6.22.

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-08  3:10 ` Stephen Hemminger
  2007-03-08  3:51   ` David Miller
@ 2007-03-08  4:16 ` Willy Tarreau
  1 sibling, 0 replies; 57+ messages in thread
From: Willy Tarreau @ 2007-03-08 4:16 UTC (permalink / raw)
To: Stephen Hemminger
Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

On Wed, Mar 07, 2007 at 07:10:47PM -0800, Stephen Hemminger wrote:
> David Miller wrote:
> > From: Stephen Hemminger <shemminger@linux-foundation.org>
> > Date: Wed, 7 Mar 2007 17:07:31 -0800
> >
> > > The basic calculation has to be done in 32 bits to avoid
> > > doing a 64 bit divide by 3. The value x is only 22 bits max,
> > > so the full 64 bits are only needed for x^2.
> > >
> > > Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
> >
> > Applied, thanks Stephen.
> >
> > What about Willy Tarreau's supposedly even faster variant?
> > Or does this incorporate that set of improvements?
>
> That's what this is:
> x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3;

Confirmed, it's the same. BTW, has someone tested on a 64bit system
whether it brings any difference ?

Willy

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [PATCH] tcp_cubic: faster cube root
  2007-03-06 22:47 ` [PATCH] tcp_cubic: faster cube root Stephen Hemminger
  2007-03-06 22:58   ` cube root benchmark code Stephen Hemminger
@ 2007-03-07  4:20 ` David Miller
  2007-03-07 12:12   ` Andi Kleen
  1 sibling, 1 reply; 57+ messages in thread
From: David Miller @ 2007-03-07 4:20 UTC (permalink / raw)
To: shemminger; +Cc: rkuhn, andi, dada1, jengelh, linux-kernel, netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Tue, 6 Mar 2007 14:47:06 -0800

> The Newton-Raphson method is quadratically convergent, so only a
> small fixed number of steps is necessary. Therefore it is faster
> to unroll the loop. Since div64_64 is no longer inline it won't
> cause code explosion.
>
> Also fixes a bug that can occur if x^2 was bigger than 32 bits.
>
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

Applied, thanks Stephen.

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [PATCH] tcp_cubic: faster cube root
  2007-03-07  4:20 ` [PATCH] tcp_cubic: faster cube root David Miller
@ 2007-03-07 12:12 ` Andi Kleen
  2007-03-07 19:33   ` David Miller
  0 siblings, 1 reply; 57+ messages in thread
From: Andi Kleen @ 2007-03-07 12:12 UTC (permalink / raw)
To: David Miller
Cc: shemminger, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

On Tue, Mar 06, 2007 at 08:20:52PM -0800, David Miller wrote:
> From: Stephen Hemminger <shemminger@linux-foundation.org>
> Date: Tue, 6 Mar 2007 14:47:06 -0800
>
> > The Newton-Raphson method is quadratically convergent, so only a
> > small fixed number of steps is necessary. Therefore it is faster
> > to unroll the loop. Since div64_64 is no longer inline it won't
> > cause code explosion.
> >
> > Also fixes a bug that can occur if x^2 was bigger than 32 bits.
> >
> > Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
>
> Applied, thanks Stephen.

Well, that still needs the ugly div64_64 function. At least my goal was
to eliminate that, not make it faster (I don't see any evidence that this
function is particularly performance critical). Do you prefer to keep
div64_64?

-Andi

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [PATCH] tcp_cubic: faster cube root
  2007-03-07 12:12 ` Andi Kleen
@ 2007-03-07 19:33 ` David Miller
  0 siblings, 0 replies; 57+ messages in thread
From: David Miller @ 2007-03-07 19:33 UTC (permalink / raw)
To: andi; +Cc: shemminger, rkuhn, dada1, jengelh, linux-kernel, netdev

From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 7 Mar 2007 13:12:46 +0100

> Well, that still needs the ugly div64_64 function. At least my goal was
> to eliminate that, not make it faster (I don't see any evidence that this
> function is particularly performance critical). Do you prefer to keep
> div64_64?

I really have no issues with it.

^ permalink raw reply	[flat|nested] 57+ messages in thread
* Re: [RFC] div64_64 support
  2007-03-06 14:45 ` Andi Kleen
  2007-03-06 15:10   ` Roland Kuhn
@ 2007-03-06 18:50 ` H. Peter Anvin
  1 sibling, 0 replies; 57+ messages in thread
From: H. Peter Anvin @ 2007-03-06 18:50 UTC (permalink / raw)
To: Andi Kleen
Cc: Eric Dumazet, Stephen Hemminger, Jan Engelhardt, linux-kernel, netdev

Andi Kleen wrote:
>> <rant>
>> Let me see... You throw code like that and expect someone to actually
>> understand it in one year, and be able to correct a bug ?
>
> To be honest I don't expect any bugs in this function.
>
>> </rant>
>>
>> Please add something, a URL or even better a nice explanation, per favor...
>
> It's straight out of Hacker's Delight, which is referenced in the commit
> log.

Referencing it in a comment would have been a better idea.

	-hpa

^ permalink raw reply	[flat|nested] 57+ messages in thread
end of thread, other threads: [~2007-03-22 19:11 UTC | newest]

Thread overview: 57+ messages
2007-02-24  1:05 [RFC] div64_64 support Stephen Hemminger
2007-02-24 16:19 ` Sami Farin
2007-02-26 20:09 ` Jan Engelhardt
2007-02-26 21:28   ` Stephen Hemminger
2007-02-27  1:20     ` H. Peter Anvin
2007-02-27  3:45       ` Segher Boessenkool
2007-02-26 22:31   ` Stephen Hemminger
2007-02-26 23:02     ` Jan Engelhardt
2007-02-26 23:44       ` Stephen Hemminger
2007-02-27  0:05         ` Jan Engelhardt
2007-02-27  0:07           ` Stephen Hemminger
2007-02-27  0:14             ` Jan Engelhardt
2007-02-27  6:21               ` Dan Williams
2007-03-03  2:31 ` Andi Kleen
2007-03-05 23:57   ` Stephen Hemminger
2007-03-06  0:25     ` David Miller
2007-03-06 13:36       ` Andi Kleen
2007-03-06 14:04         ` [RFC] div64_64 support II Andi Kleen
2007-03-06 17:43           ` Dagfinn Ilmari Mannsåker
2007-03-06 18:25             ` David Miller
2007-03-06 18:48               ` H. Peter Anvin
2007-03-06 13:34 ` [RFC] div64_64 support Andi Kleen
2007-03-06 14:19   ` Eric Dumazet
2007-03-06 14:45     ` Andi Kleen
2007-03-06 15:10       ` Roland Kuhn
2007-03-06 18:29         ` Stephen Hemminger
2007-03-06 19:48           ` Andi Kleen
2007-03-06 20:04             ` Stephen Hemminger
2007-03-06 21:53               ` Sami Farin
2007-03-06 22:24                 ` Sami Farin
2007-03-07 16:11                   ` Chuck Ebbert
2007-03-07 18:32                     ` Sami Farin
2007-03-08 18:23                       ` asm volatile [Was: [RFC] div64_64 support] Sami Farin
2007-03-08 22:01                         ` asm volatile David Miller
2007-03-06 21:58           ` [RFC] div64_64 support David Miller
2007-03-06 22:47             ` [PATCH] tcp_cubic: faster cube root Stephen Hemminger
2007-03-06 22:58               ` cube root benchmark code Stephen Hemminger
2007-03-07  6:08                 ` Update to cube root benchmark code Willy Tarreau
2007-03-08  1:07                   ` [PATCH] tcp_cubic: use 32 bit math Stephen Hemminger
2007-03-08  2:55                     ` David Miller
2007-03-08  3:10                       ` Stephen Hemminger
2007-03-08  3:51                         ` David Miller
2007-03-10 11:48                           ` Willy Tarreau
2007-03-12 21:11                             ` Stephen Hemminger
2007-03-13 20:50                               ` Willy Tarreau
2007-03-21 18:54                                 ` Stephen Hemminger
2007-03-21 19:15                                   ` Willy Tarreau
2007-03-21 19:58                                     ` Stephen Hemminger
2007-03-21 20:15                                     ` [PATCH 1/2] div64_64 optimization Stephen Hemminger
2007-03-21 20:17                                       ` [PATCH 2/2] tcp: cubic optimization Stephen Hemminger
2007-03-22 19:11                                         ` David Miller
2007-03-22 19:11                                       ` [PATCH 1/2] div64_64 optimization David Miller
2007-03-08  4:16                       ` [PATCH] tcp_cubic: use 32 bit math Willy Tarreau
2007-03-07  4:20               ` [PATCH] tcp_cubic: faster cube root David Miller
2007-03-07 12:12                 ` Andi Kleen
2007-03-07 19:33                   ` David Miller
2007-03-06 18:50         ` [RFC] div64_64 support H. Peter Anvin