[PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64()

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64()
@ 2011-12-13 14:56 David Howells
  2011-12-13 14:57 ` [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case David Howells
                   ` (4 more replies)
  0 siblings, 5 replies; 19+ messages in thread
From: David Howells @ 2011-12-13 14:56 UTC (permalink / raw)
  To: tglx, mingo; +Cc: x86, linux-arch, David Howells

fls(N), ffs(N) and fls64(N) can be optimised on x86_64.  Currently they use a
CMOV instruction after the BSR/BSF to set the destination register to -1 if the
value to be scanned was 0 (in which case BSR/BSF set the Z flag).

Instead, according to the AMD64 specification, we can make use of the fact that
BSR/BSF doesn't modify its output register if its input is 0.  By preloading
the output with -1 and incrementing the result, we achieve the desired result
without the need for a conditional check.

The Intel x86_64 specification, however, says that the result of BSR/BSF in
such a case is undefined.  That said, when queried, one of the Intel CPU
architects said that the behaviour on all Intel CPUs is that:

 (1) with BSRQ/BSFQ, the 64-bit destination register is written with its
     original value if the source is 0, thus, in essence, giving the effect we
     want.  And,

 (2) with BSRL/BSFL, the lower half of the 64-bit destination register is
     written with its original value if the source is 0, and the upper half is
     cleared, thus giving us the effect we want (we return a 4-byte int).

Further, it was indicated that they (Intel) are unlikely to get away with
changing the behaviour.


It might be possible to optimise the 32-bit versions of these functions, but
there's a lot more variation, and so the effective non-destructive property of
BSRL/BSRF cannot be relied on.


I have benchmarked these functions on my Core2 Duo test machine using the
following program:

	#include <stdlib.h>
	#include <stdio.h>

	#ifndef __x86_64__
	#error
	#endif

	#define PAGE_SHIFT 12

	typedef unsigned long long __u64, u64;
	typedef unsigned int __u32, u32;
	#define noinline	__attribute__((noinline))

	static __always_inline int fls64(__u64 x)
	{
		long bitpos = -1;

		asm("bsrq %1,%0"
		    : "+r" (bitpos)
		    : "rm" (x));
		return bitpos + 1;
	}

	static inline unsigned long __fls(unsigned long word)
	{
		asm("bsr %1,%0"
		    : "=r" (word)
		    : "rm" (word));
		return word;
	}
	static __always_inline int old_fls64(__u64 x)
	{
		if (x == 0)
			return 0;
		return __fls(x) + 1;
	}

	static noinline // __attribute__((const))
	int old_get_order(unsigned long size)
	{
		int order;

		size = (size - 1) >> (PAGE_SHIFT - 1);
		order = -1;
		do {
			size >>= 1;
			order++;
		} while (size);
		return order;
	}

	static inline __attribute__((const))
	int get_order_old_fls64(unsigned long size)
	{
		int order;
		size--;
		size >>= PAGE_SHIFT;
		order = old_fls64(size);
		return order;
	}

	static inline __attribute__((const))
	int get_order(unsigned long size)
	{
		int order;
		size--;
		size >>= PAGE_SHIFT;
		order = fls64(size);
		return order;
	}

	unsigned long prevent_optimise_out;

	static noinline unsigned long test_old_get_order(void)
	{
		unsigned long n, total = 0;
		long rep, loop;

		for (rep = 1000000; rep > 0; rep--) {
			for (loop = 0; loop <= 16384; loop += 4) {
				n = 1UL << loop;
				total += old_get_order(n);
			}
		}
		return total;
	}

	static noinline unsigned long test_get_order_old_fls64(void)
	{
		unsigned long n, total = 0;
		long rep, loop;

		for (rep = 1000000; rep > 0; rep--) {
			for (loop = 0; loop <= 16384; loop += 4) {
				n = 1UL << loop;
				total += get_order_old_fls64(n);
			}
		}
		return total;
	}

	static noinline unsigned long test_get_order(void)
	{
		unsigned long n, total = 0;
		long rep, loop;

		for (rep = 1000000; rep > 0; rep--) {
			for (loop = 0; loop <= 16384; loop += 4) {
				n = 1UL << loop;
				total += get_order(n);
			}
		}
		return total;
	}

	int main(int argc, char **argv)
	{
		unsigned long total;

		switch (argc) {
		case 1:  total = test_old_get_order();		break;
		case 2:  total = test_get_order_old_fls64();	break;
		default: total = test_get_order();		break;
		}
		prevent_optimise_out = total;
		return 0;
	}

This allows me to test the use of the old fls64() implementation and the new
fls64() implementation and also to contrast these to the out-of-line loop-based
implementation of get_order().  The results were:

	warthog>time ./get_order
	real    1m37.191s
	user    1m36.313s
	sys     0m0.861s
	warthog>time ./get_order x
	real    0m16.892s
	user    0m16.586s
	sys     0m0.287s
	warthog>time ./get_order x x
	real    0m7.731s
	user    0m7.727s
	sys     0m0.002s

Using the current upstream fls64() as a basis for an inlined get_order() [the
second result above] is much faster than using the current out-of-line
loop-based get_order() [the first result above].

Using my optimised inline fls64()-based get_order() [the third result above]
is even faster still.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 arch/x86/include/asm/bitops.h |   57 +++++++++++++++++++++++++++++++++++++++--
 1 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 1775d6e..9558432 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -395,10 +395,21 @@ static inline unsigned long __fls(unsigned long word)
 static inline int ffs(int x)
 {
 	int r;
-#ifdef CONFIG_X86_CMOV
+
+#if BITS_PER_LONG == 64
+	/* AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
+	 * dest reg is undefined if x==0, but their CPU architect says its
+	 * value is written to set it to the same as before, except that the
+	 * top 32-bits will be cleared.
+	 */
+	long tmp = -1;
+	asm("bsfl %1,%0"
+	    : "=r" (r)
+	    : "rm" (x), "0" (tmp));
+#elif defined(CONFIG_X86_CMOV)
 	asm("bsfl %1,%0\n\t"
 	    "cmovzl %2,%0"
-	    : "=r" (r) : "rm" (x), "r" (-1));
+	    : "=&r" (r) : "rm" (x), "r" (-1));
 #else
 	asm("bsfl %1,%0\n\t"
 	    "jnz 1f\n\t"
@@ -422,7 +433,18 @@ static inline int ffs(int x)
 static inline int fls(int x)
 {
 	int r;
-#ifdef CONFIG_X86_CMOV
+
+#if BITS_PER_LONG == 64
+	/* AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
+	 * dest reg is undefined if x==0, but their CPU architect says its
+	 * value is written to set it to the same as before, except that the
+	 * top 32-bits will be cleared.
+	 */
+	long tmp = -1;
+	asm("bsrl %1,%0"
+	    : "=r" (r)
+	    : "rm" (x), "0" (tmp));
+#elif defined(CONFIG_X86_CMOV)
 	asm("bsrl %1,%0\n\t"
 	    "cmovzl %2,%0"
 	    : "=&r" (r) : "rm" (x), "rm" (-1));
@@ -434,6 +456,33 @@ static inline int fls(int x)
 #endif
 	return r + 1;
 }
+
+/**
+ * fls64 - find last set bit in a 64-bit word
+ * @x: the word to search
+ *
+ * This is defined in a similar way as the libc and compiler builtin
+ * ffsll, but returns the position of the most significant set bit.
+ *
+ * fls64(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 64.
+ */
+#if BITS_PER_LONG == 64
+static __always_inline int fls64(__u64 x)
+{
+	long bitpos = -1;
+
+	/* AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
+	 * dest reg is undefined if x==0, but their CPU architect says its
+	 * value is written to set it to the same as before.
+	 */
+	asm("bsrq %1,%0"
+	    : "+r" (bitpos)
+	    : "rm" (x));
+	return bitpos + 1;
+}
+#endif
 #endif /* __KERNEL__ */
 
 #undef ADDR
@@ -452,7 +501,9 @@ static inline int fls(int x)
 
 #endif /* __KERNEL__ */
 
+#if BITS_PER_LONG == 32
 #include <asm-generic/bitops/fls64.h>
+#endif
 
 #ifdef __KERNEL__
 

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case
  2011-12-13 14:56 [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64() David Howells
@ 2011-12-13 14:57 ` David Howells
  2011-12-19 18:45   ` H. Peter Anvin
  2011-12-13 14:57 ` [PATCH 3/3] Optimise get_order() David Howells
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 19+ messages in thread
From: David Howells @ 2011-12-13 14:57 UTC (permalink / raw)
  To: tglx, mingo; +Cc: x86, linux-arch, David Howells

Adjust the comment on get_order() to note that the result of passing a size of
0 results in an undefined value.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 include/asm-generic/getorder.h |   23 ++++++++++++++++++++++-
 1 files changed, 22 insertions(+), 1 deletions(-)

diff --git a/include/asm-generic/getorder.h b/include/asm-generic/getorder.h
index 67e7245..76e9687 100644
--- a/include/asm-generic/getorder.h
+++ b/include/asm-generic/getorder.h
@@ -5,7 +5,28 @@
 
 #include <linux/compiler.h>
 
-/* Pure 2^n version of get_order */
+/**
+ * get_order - Determine the allocation order of a memory size
+ * @size: The size for which to get the order
+ *
+ * Determine the allocation order of a particular sized block of memory.  This
+ * is on a logarithmic scale, where:
+ *
+ *	0 -> 2^0 * PAGE_SIZE and below
+ *	1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1
+ *	2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1
+ *	3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1
+ *	4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1
+ *	...
+ *
+ * The order returned is used to find the smallest allocation granule required
+ * to hold an object of the specified size.
+ *
+ * The result is undefined if the size is 0.
+ *
+ * This function may be used to initialise variables with compile time
+ * evaluations of constants.
+ */
 static inline __attribute_const__ int get_order(unsigned long size)
 {
 	int order;

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case
  2011-12-13 14:57 ` [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case David Howells
@ 2011-12-19 18:45   ` H. Peter Anvin
  2011-12-19 18:47     ` H. Peter Anvin
  2011-12-20 15:09     ` Arnd Bergmann
  0 siblings, 2 replies; 19+ messages in thread
From: H. Peter Anvin @ 2011-12-19 18:45 UTC (permalink / raw)
  To: David Howells, Arnd Bergmann; +Cc: tglx, mingo, x86, linux-arch

Hi Arnd,

Do you want to take the 2/3 and 3/3 patches in this series (since they
are asm-generic) or should I?

	-hpa

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case
  2011-12-19 18:45   ` H. Peter Anvin
@ 2011-12-19 18:47     ` H. Peter Anvin
  2011-12-20 15:09     ` Arnd Bergmann
  1 sibling, 0 replies; 19+ messages in thread
From: H. Peter Anvin @ 2011-12-19 18:47 UTC (permalink / raw)
  To: David Howells, Arnd Bergmann; +Cc: tglx, mingo, x86, linux-arch

On 12/19/2011 10:45 AM, H. Peter Anvin wrote:
> Hi Arnd,
> 
> Do you want to take the 2/3 and 3/3 patches in this series (since they
> are asm-generic) or should I?
> 

If you want to take them:

Acked-by: H. Peter Anvin <hpa@zytor.com>

	-hpa

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case
  2011-12-19 18:45   ` H. Peter Anvin
  2011-12-19 18:47     ` H. Peter Anvin
@ 2011-12-20 15:09     ` Arnd Bergmann
  2011-12-20 15:39       ` David Howells
  2011-12-20 16:09       ` hpanvin@gmail.com
  1 sibling, 2 replies; 19+ messages in thread
From: Arnd Bergmann @ 2011-12-20 15:09 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: David Howells, tglx, mingo, x86, linux-arch

On Monday 19 December 2011, H. Peter Anvin wrote:
> Hi Arnd,
> 
> Do you want to take the 2/3 and 3/3 patches in this series (since they
> are asm-generic) or should I?
> 

I generally prefer asm-generic patches to go through the trees that are
most interested in them (with my ack), which keeps the series close together
and lets me not maintain a separate tree for asm-generic most of the time.

Please put the entire series into -tip.

Acked-by: Arnd Bergmann <arnd@arndb.de>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case
  2011-12-20 15:09     ` Arnd Bergmann
@ 2011-12-20 15:39       ` David Howells
  2012-02-20 21:22         ` H. Peter Anvin
  2011-12-20 16:09       ` hpanvin@gmail.com
  1 sibling, 1 reply; 19+ messages in thread
From: David Howells @ 2011-12-20 15:39 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: dhowells, H. Peter Anvin, tglx, mingo, x86, linux-arch

Arnd Bergmann <arnd@arndb.de> wrote:

> I generally prefer asm-generic patches to go through the trees that are
> most interested in them (with my ack), which keeps the series close together
> and lets me not maintain a separate tree for asm-generic most of the time.

Should I resubmit the latter two patches detached from the first with the
Acked-bys added?

David

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case
  2011-12-20 15:39       ` David Howells
@ 2012-02-20 21:22         ` H. Peter Anvin
  0 siblings, 0 replies; 19+ messages in thread
From: H. Peter Anvin @ 2012-02-20 21:22 UTC (permalink / raw)
  To: David Howells; +Cc: Arnd Bergmann, tglx, mingo, x86, linux-arch

On 12/20/2011 07:39 AM, David Howells wrote:
> Arnd Bergmann <arnd@arndb.de> wrote:
> 
>> I generally prefer asm-generic patches to go through the trees that are
>> most interested in them (with my ack), which keeps the series close together
>> and lets me not maintain a separate tree for asm-generic most of the time.
> 
> Should I resubmit the latter two patches detached from the first with the
> Acked-bys added?
> 
> David

[Resurrecting an old thread from before hpa 2.0]

Yes please, I never received them...

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case
  2011-12-20 15:09     ` Arnd Bergmann
  2011-12-20 15:39       ` David Howells
@ 2011-12-20 16:09       ` hpanvin@gmail.com
  1 sibling, 0 replies; 19+ messages in thread
From: hpanvin@gmail.com @ 2011-12-20 16:09 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: David Howells, tglx, mingo, x86, linux-arch

Will do.

Arnd Bergmann <arnd@arndb.de> wrote:

>On Monday 19 December 2011, H. Peter Anvin wrote:
>> Hi Arnd,
>> 
>> Do you want to take the 2/3 and 3/3 patches in this series (since
>they
>> are asm-generic) or should I?
>> 
>
>I generally prefer asm-generic patches to go through the trees that are
>most interested in them (with my ack), which keeps the series close
>together
>and lets me not maintain a separate tree for asm-generic most of the
>time.
>
>Please put the entire series into -tip.
>
>Acked-by: Arnd Bergmann <arnd@arndb.de>

-- 
Sent from my Android phone with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 3/3] Optimise get_order()
  2011-12-13 14:56 [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64() David Howells
  2011-12-13 14:57 ` [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case David Howells
@ 2011-12-13 14:57 ` David Howells
  2011-12-15  0:03 ` [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64() Linus Torvalds
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 19+ messages in thread
From: David Howells @ 2011-12-13 14:57 UTC (permalink / raw)
  To: tglx, mingo; +Cc: x86, linux-arch, David Howells

Optimise get_order() to use bit scanning instructions if such exist rather than
a loop.  Also, make it possible to use get_order() in static initialisations
too by building it on top of ilog2() in the constant parameter case.

This has been tested for i386 and x86_64 using the following userspace program,
and for FRV by making appropriate substitutions for fls() and fls64().  It will
abort if the case for get_order() deviates from the original except for the
order of 0, for which get_order() produces an undefined result.  This program
tests both dynamic and static parameters.

	#include <stdlib.h>
	#include <stdio.h>

	#ifdef __x86_64__
	#define BITS_PER_LONG 64
	#else
	#define BITS_PER_LONG 32
	#endif

	#define PAGE_SHIFT 12

	typedef unsigned long long __u64, u64;
	typedef unsigned int __u32, u32;
	#define noinline	__attribute__((noinline))

	static inline int fls(int x)
	{
		int bitpos = -1;

		asm("bsrl %1,%0"
		    : "+r" (bitpos)
		    : "rm" (x));
		return bitpos + 1;
	}

	static __always_inline int fls64(__u64 x)
	{
	#if BITS_PER_LONG == 64
		long bitpos = -1;

		asm("bsrq %1,%0"
		    : "+r" (bitpos)
		    : "rm" (x));
		return bitpos + 1;
	#else
		__u32 h = x >> 32, l = x;
		int bitpos = -1;

		asm("bsrl	%1,%0	\n"
		    "subl	%2,%0	\n"
		    "bsrl	%3,%0	\n"
		    : "+r" (bitpos)
		    : "rm" (l), "i"(32), "rm" (h));

		return bitpos + 33;
	#endif
	}

	static inline __attribute__((const))
	int __ilog2_u32(u32 n)
	{
		return fls(n) - 1;
	}

	static inline __attribute__((const))
	int __ilog2_u64(u64 n)
	{
		return fls64(n) - 1;
	}

	extern __attribute__((const, noreturn))
	int ____ilog2_NaN(void);

	#define ilog2(n)				\
	(						\
		__builtin_constant_p(n) ? (		\
			(n) < 1 ? ____ilog2_NaN() :	\
			(n) & (1ULL << 63) ? 63 :	\
			(n) & (1ULL << 62) ? 62 :	\
			(n) & (1ULL << 61) ? 61 :	\
			(n) & (1ULL << 60) ? 60 :	\
			(n) & (1ULL << 59) ? 59 :	\
			(n) & (1ULL << 58) ? 58 :	\
			(n) & (1ULL << 57) ? 57 :	\
			(n) & (1ULL << 56) ? 56 :	\
			(n) & (1ULL << 55) ? 55 :	\
			(n) & (1ULL << 54) ? 54 :	\
			(n) & (1ULL << 53) ? 53 :	\
			(n) & (1ULL << 52) ? 52 :	\
			(n) & (1ULL << 51) ? 51 :	\
			(n) & (1ULL << 50) ? 50 :	\
			(n) & (1ULL << 49) ? 49 :	\
			(n) & (1ULL << 48) ? 48 :	\
			(n) & (1ULL << 47) ? 47 :	\
			(n) & (1ULL << 46) ? 46 :	\
			(n) & (1ULL << 45) ? 45 :	\
			(n) & (1ULL << 44) ? 44 :	\
			(n) & (1ULL << 43) ? 43 :	\
			(n) & (1ULL << 42) ? 42 :	\
			(n) & (1ULL << 41) ? 41 :	\
			(n) & (1ULL << 40) ? 40 :	\
			(n) & (1ULL << 39) ? 39 :	\
			(n) & (1ULL << 38) ? 38 :	\
			(n) & (1ULL << 37) ? 37 :	\
			(n) & (1ULL << 36) ? 36 :	\
			(n) & (1ULL << 35) ? 35 :	\
			(n) & (1ULL << 34) ? 34 :	\
			(n) & (1ULL << 33) ? 33 :	\
			(n) & (1ULL << 32) ? 32 :	\
			(n) & (1ULL << 31) ? 31 :	\
			(n) & (1ULL << 30) ? 30 :	\
			(n) & (1ULL << 29) ? 29 :	\
			(n) & (1ULL << 28) ? 28 :	\
			(n) & (1ULL << 27) ? 27 :	\
			(n) & (1ULL << 26) ? 26 :	\
			(n) & (1ULL << 25) ? 25 :	\
			(n) & (1ULL << 24) ? 24 :	\
			(n) & (1ULL << 23) ? 23 :	\
			(n) & (1ULL << 22) ? 22 :	\
			(n) & (1ULL << 21) ? 21 :	\
			(n) & (1ULL << 20) ? 20 :	\
			(n) & (1ULL << 19) ? 19 :	\
			(n) & (1ULL << 18) ? 18 :	\
			(n) & (1ULL << 17) ? 17 :	\
			(n) & (1ULL << 16) ? 16 :	\
			(n) & (1ULL << 15) ? 15 :	\
			(n) & (1ULL << 14) ? 14 :	\
			(n) & (1ULL << 13) ? 13 :	\
			(n) & (1ULL << 12) ? 12 :	\
			(n) & (1ULL << 11) ? 11 :	\
			(n) & (1ULL << 10) ? 10 :	\
			(n) & (1ULL <<  9) ?  9 :	\
			(n) & (1ULL <<  8) ?  8 :	\
			(n) & (1ULL <<  7) ?  7 :	\
			(n) & (1ULL <<  6) ?  6 :	\
			(n) & (1ULL <<  5) ?  5 :	\
			(n) & (1ULL <<  4) ?  4 :	\
			(n) & (1ULL <<  3) ?  3 :	\
			(n) & (1ULL <<  2) ?  2 :	\
			(n) & (1ULL <<  1) ?  1 :	\
			(n) & (1ULL <<  0) ?  0 :	\
			____ilog2_NaN()			\
					   ) :		\
		(sizeof(n) <= 4) ?			\
		__ilog2_u32(n) :			\
		__ilog2_u64(n)				\
	 )

	static noinline __attribute__((const))
	int old_get_order(unsigned long size)
	{
		int order;

		size = (size - 1) >> (PAGE_SHIFT - 1);
		order = -1;
		do {
			size >>= 1;
			order++;
		} while (size);
		return order;
	}

	static noinline __attribute__((const))
	int __get_order(unsigned long size)
	{
		int order;
		size--;
		size >>= PAGE_SHIFT;
	#if BITS_PER_LONG == 32
		order = fls(size);
	#else
		order = fls64(size);
	#endif
		return order;
	}

	#define get_order(n)						\
	(								\
		__builtin_constant_p(n) ? (				\
			(n == 0UL) ? BITS_PER_LONG - PAGE_SHIFT :	\
			((n < (1UL << PAGE_SHIFT)) ? 0 :		\
			 ilog2((n) - 1) - PAGE_SHIFT + 1)		\
		) :							\
		__get_order(n)						\
	)

	#define order(N) \
		{ (1UL << N) - 1,	get_order((1UL << N) - 1)	},	\
		{ (1UL << N),		get_order((1UL << N))		},	\
		{ (1UL << N) + 1,	get_order((1UL << N) + 1)	}

	struct order {
		unsigned long n, order;
	};

	static const struct order order_table[] = {
		order(0),
		order(1),
		order(2),
		order(3),
		order(4),
		order(5),
		order(6),
		order(7),
		order(8),
		order(9),
		order(10),
		order(11),
		order(12),
		order(13),
		order(14),
		order(15),
		order(16),
		order(17),
		order(18),
		order(19),
		order(20),
		order(21),
		order(22),
		order(23),
		order(24),
		order(25),
		order(26),
		order(27),
		order(28),
		order(29),
		order(30),
		order(31),
	#if BITS_PER_LONG == 64
		order(32),
		order(33),
		order(34),
		order(35),
	#endif
		{ 0x2929 }
	};

	void check(int loop, unsigned long n)
	{
		unsigned long old, new;

		printf("[%2d]: %09lx | ", loop, n);

		old = old_get_order(n);
		new = get_order(n);

		printf("%3ld, %3ld\n", old, new);
		if (n != 0 && old != new)
			abort();
	}

	int main(int argc, char **argv)
	{
		const struct order *p;
		unsigned long n;
		int loop;

		for (loop = 0; loop <= BITS_PER_LONG - 1; loop++) {
			n = 1UL << loop;
			check(loop, n - 1);
			check(loop, n);
			check(loop, n + 1);
		}

		for (p = order_table; p->n != 0x2929; p++) {
			unsigned long old, new;

			old = old_get_order(p->n);
			new = p->order;
			printf("%09lx\t%3ld, %3ld\n", p->n, old, new);
			if (p->n != 0 && old != new)
				abort();
		}

		return 0;
	}

Disassembling the x86_64 version of the above code shows:

	0000000000400510 <old_get_order>:
	  400510:       48 83 ef 01             sub    $0x1,%rdi
	  400514:       b8 ff ff ff ff          mov    $0xffffffff,%eax
	  400519:       48 c1 ef 0b             shr    $0xb,%rdi
	  40051d:       0f 1f 00                nopl   (%rax)
	  400520:       83 c0 01                add    $0x1,%eax
	  400523:       48 d1 ef                shr    %rdi
	  400526:       75 f8                   jne    400520 <old_get_order+0x10>
	  400528:       f3 c3                   repz retq
	  40052a:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

	0000000000400530 <__get_order>:
	  400530:       48 83 ef 01             sub    $0x1,%rdi
	  400534:       48 c7 c0 ff ff ff ff    mov    $0xffffffffffffffff,%rax
	  40053b:       48 c1 ef 0c             shr    $0xc,%rdi
	  40053f:       48 0f bd c7             bsr    %rdi,%rax
	  400543:       83 c0 01                add    $0x1,%eax
	  400546:       c3                      retq
	  400547:       66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
	  40054e:       00 00

As can be seen, the new __get_order() function is simpler than the
old_get_order() function.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 include/asm-generic/getorder.h |   40 ++++++++++++++++++++++++++++------------
 1 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/include/asm-generic/getorder.h b/include/asm-generic/getorder.h
index 76e9687..e0fb4bf 100644
--- a/include/asm-generic/getorder.h
+++ b/include/asm-generic/getorder.h
@@ -4,6 +4,25 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/compiler.h>
+#include <linux/log2.h>
+
+/*
+ * Runtime evaluation of get_order()
+ */
+static inline __attribute_const__
+int __get_order(unsigned long size)
+{
+	int order;
+
+	size--;
+	size >>= PAGE_SHIFT;
+#if BITS_PER_LONG == 32
+	order = fls(size);
+#else
+	order = fls64(size);
+#endif
+	return order;
+}
 
 /**
  * get_order - Determine the allocation order of a memory size
@@ -27,18 +46,15 @@
  * This function may be used to initialise variables with compile time
  * evaluations of constants.
  */
-static inline __attribute_const__ int get_order(unsigned long size)
-{
-	int order;
-
-	size = (size - 1) >> (PAGE_SHIFT - 1);
-	order = -1;
-	do {
-		size >>= 1;
-		order++;
-	} while (size);
-	return order;
-}
+#define get_order(n)						\
+(								\
+	__builtin_constant_p(n) ? (				\
+		(n == 0UL) ? BITS_PER_LONG - PAGE_SHIFT :	\
+		((n < (1UL << PAGE_SHIFT)) ? 0 :		\
+		 ilog2((n) - 1) - PAGE_SHIFT + 1)		\
+	) :							\
+	__get_order(n)						\
+)
 
 #endif	/* __ASSEMBLY__ */
 

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64()
  2011-12-13 14:56 [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64() David Howells
  2011-12-13 14:57 ` [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case David Howells
  2011-12-13 14:57 ` [PATCH 3/3] Optimise get_order() David Howells
@ 2011-12-15  0:03 ` Linus Torvalds
  2011-12-15  0:35   ` David Howells
  2011-12-15 18:12   ` H. Peter Anvin
  2011-12-15 21:29 ` H. Peter Anvin
  2011-12-15 23:26 ` [tip:x86/asm] x86_64, asm: " tip-bot for David Howells
  4 siblings, 2 replies; 19+ messages in thread
From: Linus Torvalds @ 2011-12-15  0:03 UTC (permalink / raw)
  To: David Howells; +Cc: tglx, mingo, x86, linux-arch

On Tue, Dec 13, 2011 at 6:56 AM, David Howells <dhowells@redhat.com> wrote:
>
> It might be possible to optimise the 32-bit versions of these functions, but
> there's a lot more variation, and so the effective non-destructive property of
> BSRL/BSRF cannot be relied on.

I'm pretty sure that we had some chips that actually do write random
stuff to the destination register when the input is zero.

We relied on the "doesn't change the destination" at some point *long*
ago, and it turned out not to work, but I can't remember what chip it
was.

I'm nervous about making this change even on x86-64 unless we add big
comments about the old 32-bit change. Can somebody find the historic
thing and a comment about which chip it was?

                     Linus

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64()
  2011-12-15  0:03 ` [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64() Linus Torvalds
@ 2011-12-15  0:35   ` David Howells
  2011-12-15 18:12   ` H. Peter Anvin
  1 sibling, 0 replies; 19+ messages in thread
From: David Howells @ 2011-12-15  0:35 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: dhowells, tglx, mingo, x86, linux-arch

Linus Torvalds <torvalds@linux-foundation.org> wrote:

> I'm nervous about making this change even on x86-64 unless we add big
> comments about the old 32-bit change.

Are you happy with patches 2 and 3?  I can always split the set into two as
the 2nd and 3rd are really independent of the 1st.

David

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64()
  2011-12-15  0:03 ` [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64() Linus Torvalds
  2011-12-15  0:35   ` David Howells
@ 2011-12-15 18:12   ` H. Peter Anvin
  1 sibling, 0 replies; 19+ messages in thread
From: H. Peter Anvin @ 2011-12-15 18:12 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: David Howells, tglx, mingo, x86, linux-arch

On 12/14/2011 04:03 PM, Linus Torvalds wrote:
> 
> I'm pretty sure that we had some chips that actually do write random
> stuff to the destination register when the input is zero.
> 
> We relied on the "doesn't change the destination" at some point *long*
> ago, and it turned out not to work, but I can't remember what chip it
> was.
> 
> I'm nervous about making this change even on x86-64 unless we add big
> comments about the old 32-bit change. Can somebody find the historic
> thing and a comment about which chip it was?
> 

The ones I know of are some steppings of the 486.

	-hpa

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64()
  2011-12-13 14:56 [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64() David Howells
                   ` (2 preceding siblings ...)
  2011-12-15  0:03 ` [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64() Linus Torvalds
@ 2011-12-15 21:29 ` H. Peter Anvin
  2011-12-15 22:43   ` Arnd Bergmann
  2011-12-15 23:01   ` David Howells
  2011-12-15 23:26 ` [tip:x86/asm] x86_64, asm: " tip-bot for David Howells
  4 siblings, 2 replies; 19+ messages in thread
From: H. Peter Anvin @ 2011-12-15 21:29 UTC (permalink / raw)
  To: David Howells; +Cc: tglx, mingo, x86, linux-arch

On 12/13/2011 06:56 AM, David Howells wrote:
>  
> +#if BITS_PER_LONG == 32
>  #include <asm-generic/bitops/fls64.h>
> +#endif
>  

This is outside __KERNEL__, and thus ends up changing what is exported
to userspace (specifically, fls64.h won't be included for 64-bit
non-__KERNEL__ anymore.)  Is this a bug?

	-hpa

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64()
  2011-12-15 21:29 ` H. Peter Anvin
@ 2011-12-15 22:43   ` Arnd Bergmann
  2011-12-15 23:58     ` H. Peter Anvin
  2011-12-15 23:01   ` David Howells
  1 sibling, 1 reply; 19+ messages in thread
From: Arnd Bergmann @ 2011-12-15 22:43 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: David Howells, tglx, mingo, x86, linux-arch

On Thursday 15 December 2011 13:29:07 H. Peter Anvin wrote:
> On 12/13/2011 06:56 AM, David Howells wrote:
> >  
> > +#if BITS_PER_LONG == 32
> >  #include <asm-generic/bitops/fls64.h>
> > +#endif
> >  
> 
> This is outside __KERNEL__, and thus ends up changing what is exported
> to userspace (specifically, fls64.h won't be included for 64-bit
> non-__KERNEL__ anymore.)  Is this a bug?

Not sure, but I think it's a bug to use BITS_PER_LONG rather than
__BITS_PER_LONG outside of __KERNEL__.

	Arnd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64()
  2011-12-15 22:43   ` Arnd Bergmann
@ 2011-12-15 23:58     ` H. Peter Anvin
  2011-12-16 13:57       ` Arnd Bergmann
  0 siblings, 1 reply; 19+ messages in thread
From: H. Peter Anvin @ 2011-12-15 23:58 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: David Howells, tglx, mingo, x86, linux-arch

On 12/15/2011 02:43 PM, Arnd Bergmann wrote:
> On Thursday 15 December 2011 13:29:07 H. Peter Anvin wrote:
>> On 12/13/2011 06:56 AM, David Howells wrote:
>>>  
>>> +#if BITS_PER_LONG == 32
>>>  #include <asm-generic/bitops/fls64.h>
>>> +#endif
>>>  
>>
>> This is outside __KERNEL__, and thus ends up changing what is exported
>> to userspace (specifically, fls64.h won't be included for 64-bit
>> non-__KERNEL__ anymore.)  Is this a bug?
> 
> Not sure, but I think it's a bug to use BITS_PER_LONG rather than
> __BITS_PER_LONG outside of __KERNEL__.
> 

Yes, it's one of many issues with this stuff.  After tracking down
things a bit further it looks like this was simply a conversion error in
checkin:

d57594c203b1 bitops: use __fls for fls64 on 64-bit archs

	-hpa

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64()
  2011-12-15 23:58     ` H. Peter Anvin
@ 2011-12-16 13:57       ` Arnd Bergmann
  2011-12-16 15:27         ` H. Peter Anvin
  0 siblings, 1 reply; 19+ messages in thread
From: Arnd Bergmann @ 2011-12-16 13:57 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: David Howells, tglx, mingo, x86, linux-arch

On Thursday 15 December 2011, H. Peter Anvin wrote:
> On 12/15/2011 02:43 PM, Arnd Bergmann wrote:
> > On Thursday 15 December 2011 13:29:07 H. Peter Anvin wrote:
> >> On 12/13/2011 06:56 AM, David Howells wrote:
> >>>  
> >>> +#if BITS_PER_LONG == 32
> >>>  #include <asm-generic/bitops/fls64.h>
> >>> +#endif
> >>>  
> >>
> >> This is outside __KERNEL__, and thus ends up changing what is exported
> >> to userspace (specifically, fls64.h won't be included for 64-bit
> >> non-__KERNEL__ anymore.)  Is this a bug?
> > 
> > Not sure, but I think it's a bug to use BITS_PER_LONG rather than
> > __BITS_PER_LONG outside of __KERNEL__.
> > 
> 
> Yes, it's one of many issues with this stuff.  After tracking down
> things a bit further it looks like this was simply a conversion error in
> checkin:
> 
> d57594c203b1 bitops: use __fls for fls64 on 64-bit archs

Hmm, I also looked a bit closer and noticed that we don't actually
export any bitops headers to user space, so any use of __KERNEL__ in
those headers is bogus, but I don't think there is an actual bug here.

	Arnd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64()
  2011-12-16 13:57       ` Arnd Bergmann
@ 2011-12-16 15:27         ` H. Peter Anvin
  0 siblings, 0 replies; 19+ messages in thread
From: H. Peter Anvin @ 2011-12-16 15:27 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: David Howells, tglx, mingo, x86, linux-arch

On 12/16/2011 05:57 AM, Arnd Bergmann wrote:
>>
>> Yes, it's one of many issues with this stuff.  After tracking down
>> things a bit further it looks like this was simply a conversion error in
>> checkin:
>>
>> d57594c203b1 bitops: use __fls for fls64 on 64-bit archs
> 
> Hmm, I also looked a bit closer and noticed that we don't actually
> export any bitops headers to user space, so any use of __KERNEL__ in
> those headers is bogus, but I don't think there is an actual bug here.
> 

That whole thing probably wants to be cleaned up, then.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64()
  2011-12-15 21:29 ` H. Peter Anvin
  2011-12-15 22:43   ` Arnd Bergmann
@ 2011-12-15 23:01   ` David Howells
  1 sibling, 0 replies; 19+ messages in thread
From: David Howells @ 2011-12-15 23:01 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: dhowells, tglx, mingo, x86, linux-arch, heukelum

H. Peter Anvin <hpa@zytor.com> wrote:

> > +#if BITS_PER_LONG == 32
> >  #include <asm-generic/bitops/fls64.h>
> > +#endif
> >  
> 
> This is outside __KERNEL__, and thus ends up changing what is exported
> to userspace (specifically, fls64.h won't be included for 64-bit
> non-__KERNEL__ anymore.)  Is this a bug?

It moved in the following commit:

	commit d57594c203b1e7b54373080a797f0cbfa4aade68
	Author: Alexander van Heukelum <heukelum@mailshack.com>
	Date:   Sat Mar 15 18:32:36 2008 +0100

	    bitops: use __fls for fls64 on 64-bit archs

	    Use __fls for fls64 on 64-bit archs. The implementation for
	    64-bit archs is moved from x86_64 to asm-generic.

	    Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
	    Signed-off-by: Ingo Molnar <mingo@elte.hu>

So I think it's a bug.

David

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [tip:x86/asm] x86_64, asm: Optimise fls(), ffs() and fls64()
  2011-12-13 14:56 [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64() David Howells
                   ` (3 preceding siblings ...)
  2011-12-15 21:29 ` H. Peter Anvin
@ 2011-12-15 23:26 ` tip-bot for David Howells
  4 siblings, 0 replies; 19+ messages in thread
From: tip-bot for David Howells @ 2011-12-15 23:26 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, dhowells, torvalds, tglx, hpa

Commit-ID:  ca3d30cc02f780f68771087040ce935add6ba2b7
Gitweb:     http://git.kernel.org/tip/ca3d30cc02f780f68771087040ce935add6ba2b7
Author:     David Howells <dhowells@redhat.com>
AuthorDate: Tue, 13 Dec 2011 14:56:54 +0000
Committer:  H. Peter Anvin <hpa@linux.intel.com>
CommitDate: Thu, 15 Dec 2011 15:16:49 -0800

x86_64, asm: Optimise fls(), ffs() and fls64()

fls(N), ffs(N) and fls64(N) can be optimised on x86_64.  Currently they use a
CMOV instruction after the BSR/BSF to set the destination register to -1 if the
value to be scanned was 0 (in which case BSR/BSF set the Z flag).

Instead, according to the AMD64 specification, we can make use of the fact that
BSR/BSF doesn't modify its output register if its input is 0.  By preloading
the output with -1 and incrementing the result, we achieve the desired result
without the need for a conditional check.

The Intel x86_64 specification, however, says that the result of BSR/BSF in
such a case is undefined.  That said, when queried, one of the Intel CPU
architects said that the behaviour on all Intel CPUs is that:

 (1) with BSRQ/BSFQ, the 64-bit destination register is written with its
     original value if the source is 0, thus, in essence, giving the effect we
     want.  And,

 (2) with BSRL/BSFL, the lower half of the 64-bit destination register is
     written with its original value if the source is 0, and the upper half is
     cleared, thus giving us the effect we want (we return a 4-byte int).

Further, it was indicated that they (Intel) are unlikely to get away with
changing the behaviour.

It might be possible to optimise the 32-bit versions of these functions, but
there's a lot more variation, and so the effective non-destructive property of
BSRL/BSRF cannot be relied on.

[ hpa: specifically, some 486 chips are known to NOT have this property. ]

I have benchmarked these functions on my Core2 Duo test machine using the
following program:

	#include <stdlib.h>
	#include <stdio.h>

	#ifndef __x86_64__
	#error
	#endif

	#define PAGE_SHIFT 12

	typedef unsigned long long __u64, u64;
	typedef unsigned int __u32, u32;
	#define noinline	__attribute__((noinline))

	static __always_inline int fls64(__u64 x)
	{
		long bitpos = -1;

		asm("bsrq %1,%0"
		    : "+r" (bitpos)
		    : "rm" (x));
		return bitpos + 1;
	}

	static inline unsigned long __fls(unsigned long word)
	{
		asm("bsr %1,%0"
		    : "=r" (word)
		    : "rm" (word));
		return word;
	}
	static __always_inline int old_fls64(__u64 x)
	{
		if (x == 0)
			return 0;
		return __fls(x) + 1;
	}

	static noinline // __attribute__((const))
	int old_get_order(unsigned long size)
	{
		int order;

		size = (size - 1) >> (PAGE_SHIFT - 1);
		order = -1;
		do {
			size >>= 1;
			order++;
		} while (size);
		return order;
	}

	static inline __attribute__((const))
	int get_order_old_fls64(unsigned long size)
	{
		int order;
		size--;
		size >>= PAGE_SHIFT;
		order = old_fls64(size);
		return order;
	}

	static inline __attribute__((const))
	int get_order(unsigned long size)
	{
		int order;
		size--;
		size >>= PAGE_SHIFT;
		order = fls64(size);
		return order;
	}

	unsigned long prevent_optimise_out;

	static noinline unsigned long test_old_get_order(void)
	{
		unsigned long n, total = 0;
		long rep, loop;

		for (rep = 1000000; rep > 0; rep--) {
			for (loop = 0; loop <= 16384; loop += 4) {
				n = 1UL << loop;
				total += old_get_order(n);
			}
		}
		return total;
	}

	static noinline unsigned long test_get_order_old_fls64(void)
	{
		unsigned long n, total = 0;
		long rep, loop;

		for (rep = 1000000; rep > 0; rep--) {
			for (loop = 0; loop <= 16384; loop += 4) {
				n = 1UL << loop;
				total += get_order_old_fls64(n);
			}
		}
		return total;
	}

	static noinline unsigned long test_get_order(void)
	{
		unsigned long n, total = 0;
		long rep, loop;

		for (rep = 1000000; rep > 0; rep--) {
			for (loop = 0; loop <= 16384; loop += 4) {
				n = 1UL << loop;
				total += get_order(n);
			}
		}
		return total;
	}

	int main(int argc, char **argv)
	{
		unsigned long total;

		switch (argc) {
		case 1:  total = test_old_get_order();		break;
		case 2:  total = test_get_order_old_fls64();	break;
		default: total = test_get_order();		break;
		}
		prevent_optimise_out = total;
		return 0;
	}

This allows me to test the use of the old fls64() implementation and the new
fls64() implementation and also to contrast these to the out-of-line loop-based
implementation of get_order().  The results were:

	warthog>time ./get_order
	real    1m37.191s
	user    1m36.313s
	sys     0m0.861s
	warthog>time ./get_order x
	real    0m16.892s
	user    0m16.586s
	sys     0m0.287s
	warthog>time ./get_order x x
	real    0m7.731s
	user    0m7.727s
	sys     0m0.002s

Using the current upstream fls64() as a basis for an inlined get_order() [the
second result above] is much faster than using the current out-of-line
loop-based get_order() [the first result above].

Using my optimised inline fls64()-based get_order() [the third result above]
is even faster still.

[ hpa: changed the selection of 32 vs 64 bits to use CONFIG_X86_64
  instead of comparing BITS_PER_LONG, updated comments, rebased manually
  on top of 83d99df7c4bf x86, bitops: Move fls64.h inside __KERNEL__ ]

Signed-off-by: David Howells <dhowells@redhat.com>
Link: http://lkml.kernel.org/r/20111213145654.14362.39868.stgit@warthog.procyon.org.uk
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/bitops.h |   67 +++++++++++++++++++++++++++++++++++++---
 1 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 4a6235b..b97596e 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -397,10 +397,25 @@ static inline unsigned long __fls(unsigned long word)
 static inline int ffs(int x)
 {
 	int r;
-#ifdef CONFIG_X86_CMOV
+
+#ifdef CONFIG_X86_64
+	/*
+	 * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
+	 * dest reg is undefined if x==0, but their CPU architect says its
+	 * value is written to set it to the same as before, except that the
+	 * top 32 bits will be cleared.
+	 *
+	 * We cannot do this on 32 bits because at the very least some
+	 * 486 CPUs did not behave this way.
+	 */
+	long tmp = -1;
+	asm("bsfl %1,%0"
+	    : "=r" (r)
+	    : "rm" (x), "0" (tmp));
+#elif defined(CONFIG_X86_CMOV)
 	asm("bsfl %1,%0\n\t"
 	    "cmovzl %2,%0"
-	    : "=r" (r) : "rm" (x), "r" (-1));
+	    : "=&r" (r) : "rm" (x), "r" (-1));
 #else
 	asm("bsfl %1,%0\n\t"
 	    "jnz 1f\n\t"
@@ -424,7 +439,22 @@ static inline int ffs(int x)
 static inline int fls(int x)
 {
 	int r;
-#ifdef CONFIG_X86_CMOV
+
+#ifdef CONFIG_X86_64
+	/*
+	 * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
+	 * dest reg is undefined if x==0, but their CPU architect says its
+	 * value is written to set it to the same as before, except that the
+	 * top 32 bits will be cleared.
+	 *
+	 * We cannot do this on 32 bits because at the very least some
+	 * 486 CPUs did not behave this way.
+	 */
+	long tmp = -1;
+	asm("bsrl %1,%0"
+	    : "=r" (r)
+	    : "rm" (x), "0" (tmp));
+#elif defined(CONFIG_X86_CMOV)
 	asm("bsrl %1,%0\n\t"
 	    "cmovzl %2,%0"
 	    : "=&r" (r) : "rm" (x), "rm" (-1));
@@ -437,6 +467,35 @@ static inline int fls(int x)
 	return r + 1;
 }
 
+/**
+ * fls64 - find last set bit in a 64-bit word
+ * @x: the word to search
+ *
+ * This is defined in a similar way as the libc and compiler builtin
+ * ffsll, but returns the position of the most significant set bit.
+ *
+ * fls64(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 64.
+ */
+#ifdef CONFIG_X86_64
+static __always_inline int fls64(__u64 x)
+{
+	long bitpos = -1;
+	/*
+	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
+	 * dest reg is undefined if x==0, but their CPU architect says its
+	 * value is written to set it to the same as before.
+	 */
+	asm("bsrq %1,%0"
+	    : "+r" (bitpos)
+	    : "rm" (x));
+	return bitpos + 1;
+}
+#else
+#include <asm-generic/bitops/fls64.h>
+#endif
+
 #include <asm-generic/bitops/find.h>
 
 #include <asm-generic/bitops/sched.h>
@@ -447,8 +506,6 @@ static inline int fls(int x)
 
 #include <asm-generic/bitops/const_hweight.h>
 
-#include <asm-generic/bitops/fls64.h>
-
 #include <asm-generic/bitops/le.h>
 
 #include <asm-generic/bitops/ext2-atomic-setbit.h>

^ permalink raw reply related	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2012-02-20 21:22 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-12-13 14:56 [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64() David Howells
2011-12-13 14:57 ` [PATCH 2/3] Adjust the comment on get_order() to describe the size==0 case David Howells
2011-12-19 18:45   ` H. Peter Anvin
2011-12-19 18:47     ` H. Peter Anvin
2011-12-20 15:09     ` Arnd Bergmann
2011-12-20 15:39       ` David Howells
2012-02-20 21:22         ` H. Peter Anvin
2011-12-20 16:09       ` hpanvin@gmail.com
2011-12-13 14:57 ` [PATCH 3/3] Optimise get_order() David Howells
2011-12-15  0:03 ` [PATCH 1/3] X86_64: Optimise fls(), ffs() and fls64() Linus Torvalds
2011-12-15  0:35   ` David Howells
2011-12-15 18:12   ` H. Peter Anvin
2011-12-15 21:29 ` H. Peter Anvin
2011-12-15 22:43   ` Arnd Bergmann
2011-12-15 23:58     ` H. Peter Anvin
2011-12-16 13:57       ` Arnd Bergmann
2011-12-16 15:27         ` H. Peter Anvin
2011-12-15 23:01   ` David Howells
2011-12-15 23:26 ` [tip:x86/asm] x86_64, asm: " tip-bot for David Howells

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.