All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64()
@ 2011-12-13 14:56 David Howells
  2011-12-13 14:57 ` [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case David Howells
                   ` (4 more replies)
  0 siblings, 5 replies; 19+ messages in thread
From: David Howells @ 2011-12-13 14:56 UTC (permalink / raw)
  To: tglx, mingo; +Cc: x86, linux-arch, David Howells

fls(N), ffs(N) and fls64(N) can be optimised on x86_64.  Currently they use a
CMOV instruction after the BSR/BSF to set the destination register to -1 if the
value to be scanned was 0 (in which case BSR/BSF set the Z flag).

Instead, according to the AMD64 specification, we can make use of the fact that
BSR/BSF doesn't modify its output register if its input is 0.  By preloading
the output with -1 and incrementing the result, we achieve the desired result
without the need for a conditional check.

The Intel x86_64 specification, however, says that the result of BSR/BSF in
such a case is undefined.  That said, when queried, one of the Intel CPU
architects said that the behaviour on all Intel CPUs is that:

 (1) with BSRQ/BSFQ, the 64-bit destination register is written with its
     original value if the source is 0, thus, in essence, giving the effect we
     want.  And,

 (2) with BSRL/BSFL, the lower half of the 64-bit destination register is
     written with its original value if the source is 0, and the upper half is
     cleared, thus giving us the effect we want (we return a 4-byte int).

Further, it was indicated that they (Intel) are unlikely to get away with
changing the behaviour.


It might be possible to optimise the 32-bit versions of these functions, but
there's a lot more variation, and so the effective non-destructive property of
BSRL/BSRF cannot be relied on.


I have benchmarked these functions on my Core2 Duo test machine using the
following program:

	#include <stdlib.h>
	#include <stdio.h>

	#ifndef __x86_64__
	#error
	#endif

	#define PAGE_SHIFT 12

	typedef unsigned long long __u64, u64;
	typedef unsigned int __u32, u32;
	#define noinline	__attribute__((noinline))

	static __always_inline int fls64(__u64 x)
	{
		long bitpos = -1;

		asm("bsrq %1,%0"
		    : "+r" (bitpos)
		    : "rm" (x));
		return bitpos + 1;
	}

	static inline unsigned long __fls(unsigned long word)
	{
		asm("bsr %1,%0"
		    : "=r" (word)
		    : "rm" (word));
		return word;
	}
	static __always_inline int old_fls64(__u64 x)
	{
		if (x == 0)
			return 0;
		return __fls(x) + 1;
	}

	static noinline // __attribute__((const))
	int old_get_order(unsigned long size)
	{
		int order;

		size = (size - 1) >> (PAGE_SHIFT - 1);
		order = -1;
		do {
			size >>= 1;
			order++;
		} while (size);
		return order;
	}

	static inline __attribute__((const))
	int get_order_old_fls64(unsigned long size)
	{
		int order;
		size--;
		size >>= PAGE_SHIFT;
		order = old_fls64(size);
		return order;
	}

	static inline __attribute__((const))
	int get_order(unsigned long size)
	{
		int order;
		size--;
		size >>= PAGE_SHIFT;
		order = fls64(size);
		return order;
	}

	unsigned long prevent_optimise_out;

	static noinline unsigned long test_old_get_order(void)
	{
		unsigned long n, total = 0;
		long rep, loop;

		for (rep = 1000000; rep > 0; rep--) {
			for (loop = 0; loop <= 16384; loop += 4) {
				n = 1UL << loop;
				total += old_get_order(n);
			}
		}
		return total;
	}

	static noinline unsigned long test_get_order_old_fls64(void)
	{
		unsigned long n, total = 0;
		long rep, loop;

		for (rep = 1000000; rep > 0; rep--) {
			for (loop = 0; loop <= 16384; loop += 4) {
				n = 1UL << loop;
				total += get_order_old_fls64(n);
			}
		}
		return total;
	}

	static noinline unsigned long test_get_order(void)
	{
		unsigned long n, total = 0;
		long rep, loop;

		for (rep = 1000000; rep > 0; rep--) {
			for (loop = 0; loop <= 16384; loop += 4) {
				n = 1UL << loop;
				total += get_order(n);
			}
		}
		return total;
	}

	int main(int argc, char **argv)
	{
		unsigned long total;

		switch (argc) {
		case 1:  total = test_old_get_order();		break;
		case 2:  total = test_get_order_old_fls64();	break;
		default: total = test_get_order();		break;
		}
		prevent_optimise_out = total;
		return 0;
	}

This allows me to test the use of the old fls64() implementation and the new
fls64() implementation and also to contrast these to the out-of-line loop-based
implementation of get_order().  The results were:

	warthog>time ./get_order
	real    1m37.191s
	user    1m36.313s
	sys     0m0.861s
	warthog>time ./get_order x
	real    0m16.892s
	user    0m16.586s
	sys     0m0.287s
	warthog>time ./get_order x x
	real    0m7.731s
	user    0m7.727s
	sys     0m0.002s

Using the current upstream fls64() as a basis for an inlined get_order() [the
second result above] is much faster than using the current out-of-line
loop-based get_order() [the first result above].

Using my optimised inline fls64()-based get_order() [the third result above]
is even faster still.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 arch/x86/include/asm/bitops.h |   57 +++++++++++++++++++++++++++++++++++++++--
 1 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 1775d6e..9558432 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -395,10 +395,21 @@ static inline unsigned long __fls(unsigned long word)
 static inline int ffs(int x)
 {
 	int r;
-#ifdef CONFIG_X86_CMOV
+
+#if BITS_PER_LONG == 64
+	/* AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
+	 * dest reg is undefined if x==0, but their CPU architect says its
+	 * value is written to set it to the same as before, except that the
+	 * top 32-bits will be cleared.
+	 */
+	long tmp = -1;
+	asm("bsfl %1,%0"
+	    : "=r" (r)
+	    : "rm" (x), "0" (tmp));
+#elif defined(CONFIG_X86_CMOV)
 	asm("bsfl %1,%0\n\t"
 	    "cmovzl %2,%0"
-	    : "=r" (r) : "rm" (x), "r" (-1));
+	    : "=&r" (r) : "rm" (x), "r" (-1));
 #else
 	asm("bsfl %1,%0\n\t"
 	    "jnz 1f\n\t"
@@ -422,7 +433,18 @@ static inline int ffs(int x)
 static inline int fls(int x)
 {
 	int r;
-#ifdef CONFIG_X86_CMOV
+
+#if BITS_PER_LONG == 64
+	/* AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
+	 * dest reg is undefined if x==0, but their CPU architect says its
+	 * value is written to set it to the same as before, except that the
+	 * top 32-bits will be cleared.
+	 */
+	long tmp = -1;
+	asm("bsrl %1,%0"
+	    : "=r" (r)
+	    : "rm" (x), "0" (tmp));
+#elif defined(CONFIG_X86_CMOV)
 	asm("bsrl %1,%0\n\t"
 	    "cmovzl %2,%0"
 	    : "=&r" (r) : "rm" (x), "rm" (-1));
@@ -434,6 +456,33 @@ static inline int fls(int x)
 #endif
 	return r + 1;
 }
+
+/**
+ * fls64 - find last set bit in a 64-bit word
+ * @x: the word to search
+ *
+ * This is defined in a similar way as the libc and compiler builtin
+ * ffsll, but returns the position of the most significant set bit.
+ *
+ * fls64(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 64.
+ */
+#if BITS_PER_LONG == 64
+static __always_inline int fls64(__u64 x)
+{
+	long bitpos = -1;
+
+	/* AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
+	 * dest reg is undefined if x==0, but their CPU architect says its
+	 * value is written to set it to the same as before.
+	 */
+	asm("bsrq %1,%0"
+	    : "+r" (bitpos)
+	    : "rm" (x));
+	return bitpos + 1;
+}
+#endif
 #endif /* __KERNEL__ */
 
 #undef ADDR
@@ -452,7 +501,9 @@ static inline int fls(int x)
 
 #endif /* __KERNEL__ */
 
+#if BITS_PER_LONG == 32
 #include <asm-generic/bitops/fls64.h>
+#endif
 
 #ifdef __KERNEL__
 

^ permalink raw reply related	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2012-02-20 21:22 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-12-13 14:56 [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64() David Howells
2011-12-13 14:57 ` [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case David Howells
2011-12-19 18:45   ` H. Peter Anvin
2011-12-19 18:47     ` H. Peter Anvin
2011-12-20 15:09     ` Arnd Bergmann
2011-12-20 15:39       ` David Howells
2012-02-20 21:22         ` H. Peter Anvin
2011-12-20 16:09       ` hpanvin@gmail.com
2011-12-13 14:57 ` [PATCH 3/3] Optimise get_order() David Howells
2011-12-15  0:03 ` [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64() Linus Torvalds
2011-12-15  0:35   ` David Howells
2011-12-15 18:12   ` H. Peter Anvin
2011-12-15 21:29 ` H. Peter Anvin
2011-12-15 22:43   ` Arnd Bergmann
2011-12-15 23:58     ` H. Peter Anvin
2011-12-16 13:57       ` Arnd Bergmann
2011-12-16 15:27         ` H. Peter Anvin
2011-12-15 23:01   ` David Howells
2011-12-15 23:26 ` [tip:x86/asm] x86_64, asm: " tip-bot for David Howells

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.