[PATCH] x86: Optimize find_next_(zero_)bit for small constant-size bitmaps

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Alexander van Heukelum <heukelum@mailshack.com>
To: Ingo Molnar <mingo@elte.hu>
Cc: Alexander van Heukelum <heukelum@fastmail.fm>,
	Andi Kleen <andi@firstfloor.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	"H. Peter Anvin" <hpa@zytor.com>,
	LKML <linux-kernel@vger.kernel.org>
Subject: [PATCH] x86: Optimize find_next_(zero_)bit for small constant-size bitmaps
Date: Tue, 11 Mar 2008 16:17:19 +0100	[thread overview]
Message-ID: <20080311151719.GA15313@mailshack.com> (raw)
In-Reply-To: <20080311095631.GT25110@elte.hu>

x86: Optimize find_next_(zero_)bit for small constant-size bitmaps.

This moves an optimization for searching constant-sized small 
bitmaps form x86_64-specific to generic code.

On an i386 defconfig (the x86#testing one), the size of vmlinux hardly
changes with this applied. I have observed only four places where this
optimization avoids a call into find_next_bit:

In the functions return_unused_surplus_pages, alloc_fresh_huge_page,
and adjust_pool_surplus, this patch avoids a call for a 1-bit bitmap.
In __next_cpu a call is avoided for a 32-bit bitmap. That's it.

On x86_64, 52 locations are optimized with a minimal increase in
code size:

Current #testing defconfig: 
	146 x bsf, 27 x find_next_*bit
   text    data     bss     dec     hex filename
   5392637  846592  724424 6963653  6a41c5 vmlinux

After removing the x86_64 specific optimization for find_next_*bit:
	94 x bsf, 79 x find_next_*bit
   text    data     bss     dec     hex filename
   5392358  846592  724424 6963374  6a40ae vmlinux

After this patch (making the optimization generic):
	146 x bsf, 27 x find_next_*bit
   text    data     bss     dec     hex filename
   5392396  846592  724424 6963412  6a40d4 vmlinux

---

Hi Ingo,

I have now tested (in user-space) the one-long-versions of the 
find_next_(zero_)bit in the patch. They give the expected
results.

> > +#ifdef CONFIG_GENERIC_FIND_NEXT_BIT
> > +unsigned long __find_next_bit(const unsigned long *addr,
> > +		unsigned long size, unsigned long offset);
> > +
> > +static __always_inline unsigned long
> > +find_next_bit(const unsigned long *addr, unsigned long size,
> > +		unsigned long offset)
> 
> > +#endif /* CONFIG_GENERIC_FIND_NEXT_BIT */
> 
> there should be an #else here i think, which maps find_next_bit to 
> __find_next_bit / etc.

No, that is intentional: architectures are expected to either set
CONFIG_GENERIC_FIND_NEXT_BIT, or provide their own implementation
of find_next_bit.

An alternative would be to have all architectures providing
__find_next_bit/__find_next_zero_bit and do the optimization
unconditionally.

> > +unsigned long __find_next_bit(const unsigned long *addr,
> > +		unsigned long size, unsigned long offset);
> 
> should be explicitly "extern", so that we see that it's a prototype 
> declaration.

ok.

This version runs fine on qemu.

Greetings,
	Alexander

diff --git a/include/asm-x86/bitops.h b/include/asm-x86/bitops.h
index 7fbf93a..1a23ce1 100644
--- a/include/asm-x86/bitops.h
+++ b/include/asm-x86/bitops.h
@@ -312,12 +312,6 @@ static int test_bit(int nr, const volatile unsigned long *addr);
 
 #undef ADDR
 
-unsigned long find_next_bit(const unsigned long *addr,
-		unsigned long size, unsigned long offset);
-unsigned long find_next_zero_bit(const unsigned long *addr,
-		unsigned long size, unsigned long offset);
-
-
 #ifdef CONFIG_X86_32
 # include "bitops_32.h"
 #else
diff --git a/include/asm-x86/bitops_64.h b/include/asm-x86/bitops_64.h
index 40bf0f8..87e1a17 100644
--- a/include/asm-x86/bitops_64.h
+++ b/include/asm-x86/bitops_64.h
@@ -20,21 +20,11 @@ static inline long __scanbit(unsigned long val, unsigned long max)
   (__scanbit(*(unsigned long *)addr,(size))) : \
   find_first_bit(addr,size)))
 
-#define find_next_bit(addr,size,off) \
-((__builtin_constant_p(size) && (size) <= BITS_PER_LONG ? 	  \
-  ((off) + (__scanbit((*(unsigned long *)addr) >> (off),(size)-(off)))) : \
-	find_next_bit(addr,size,off)))
-
 #define find_first_zero_bit(addr,size) \
 ((__builtin_constant_p(size) && (size) <= BITS_PER_LONG ? \
   (__scanbit(~*(unsigned long *)addr,(size))) : \
   	find_first_zero_bit(addr,size)))
 	
-#define find_next_zero_bit(addr,size,off) \
-((__builtin_constant_p(size) && (size) <= BITS_PER_LONG ? 	  \
-  ((off)+(__scanbit(~(((*(unsigned long *)addr)) >> (off)),(size)-(off)))) : \
-	find_next_zero_bit(addr,size,off)))
-
 static inline void set_bit_string(unsigned long *bitmap, unsigned long i, 
 				  int len) 
 { 
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 69c1edb..15360f9 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -72,4 +72,81 @@ static inline unsigned fls_long(unsigned long l)
 	return fls64(l);
 }
 
+#ifdef __KERNEL__
+#ifdef CONFIG_GENERIC_FIND_NEXT_BIT
+extern unsigned long __find_next_bit(const unsigned long *addr,
+		unsigned long size, unsigned long offset);
+
+/**
+ * find_next_bit - find the next set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ */
+static __always_inline unsigned long
+find_next_bit(const unsigned long *addr, unsigned long size,
+		unsigned long offset)
+{
+	unsigned long value;
+
+	/* Avoid a function call if the bitmap size is a constant */
+	/* and not bigger than BITS_PER_LONG. */
+
+	/* insert a sentinel so that __ffs returns size if there */
+	/* are no set bits in the bitmap */
+	if (__builtin_constant_p(size) && (size < BITS_PER_LONG)) {
+		value = (*addr) & ((~0ul) << offset);
+		value |= (1ul << size);
+		return __ffs(value);
+	}
+
+	/* the result of __ffs(0) is undefined, so it needs to be */
+	/* handled separately */
+	if (__builtin_constant_p(size) && (size == BITS_PER_LONG)) {
+		value = (*addr) & ((~0ul) << offset);
+		return (value == 0) ? BITS_PER_LONG : __ffs(value);
+	}
+
+	/* size is not constant or too big */
+	return __find_next_bit(addr, size, offset);
+}
+
+extern unsigned long __find_next_zero_bit(const unsigned long *addr,
+		unsigned long size, unsigned long offset);
+
+/**
+ * find_next_zero_bit - find the next cleared bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ */
+static __always_inline unsigned long
+find_next_zero_bit(const unsigned long *addr, unsigned long size,
+		unsigned long offset)
+{
+	unsigned long value;
+
+	/* Avoid a function call if the bitmap size is a constant */
+	/* and not bigger than BITS_PER_LONG. */
+
+	/* insert a sentinel so that __ffs returns size if there */
+	/* are no set bits in the bitmap */
+	if (__builtin_constant_p(size) && (size < BITS_PER_LONG)) {
+		value = (~(*addr)) & ((~0ul) << offset);
+		value |= (1ul << size);
+		return __ffs(value);
+	}
+
+	/* the result of __ffs(0) is undefined, so it needs to be */
+	/* handled separately */
+	if (__builtin_constant_p(size) && (size == BITS_PER_LONG)) {
+		value = (~(*addr)) & ((~0ul) << offset);
+		return (value == 0) ? BITS_PER_LONG : __ffs(value);
+	}
+
+	/* size is not constant or too big */
+	return __find_next_zero_bit(addr, size, offset);
+}
+#endif /* CONFIG_GENERIC_FIND_NEXT_BIT */
+#endif /* __KERNEL__ */
 #endif
diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
index 5820e07..ce94c4c 100644
--- a/lib/find_next_bit.c
+++ b/lib/find_next_bit.c
@@ -15,17 +15,12 @@
 #include <asm/byteorder.h>
 
 #define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
-#undef find_next_bit
-#undef find_next_zero_bit
-
-/**
- * find_next_bit - find the next set bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
+
+/*
+ * Find the next set bit in a memory region.
  */
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
-		unsigned long offset)
+unsigned long __find_next_bit(const unsigned long *addr,
+		unsigned long size, unsigned long offset)
 {
 	const unsigned long *p = addr + BITOP_WORD(offset);
 	unsigned long result = offset & ~(BITS_PER_LONG-1);
@@ -62,15 +57,14 @@ found_first:
 found_middle:
 	return result + __ffs(tmp);
 }
-
-EXPORT_SYMBOL(find_next_bit);
+EXPORT_SYMBOL(__find_next_bit);
 
 /*
  * This implementation of find_{first,next}_zero_bit was stolen from
  * Linus' asm-alpha/bitops.h.
  */
-unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
-		unsigned long offset)
+unsigned long __find_next_zero_bit(const unsigned long *addr,
+		unsigned long size, unsigned long offset)
 {
 	const unsigned long *p = addr + BITOP_WORD(offset);
 	unsigned long result = offset & ~(BITS_PER_LONG-1);
@@ -107,8 +101,7 @@ found_first:
 found_middle:
 	return result + ffz(tmp);
 }
-
-EXPORT_SYMBOL(find_next_zero_bit);
+EXPORT_SYMBOL(__find_next_zero_bit);
 
 #ifdef __BIG_ENDIAN

next prev parent reply	other threads:[~2008-03-11 15:20 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-03-09 20:01 [PATCH] x86: Change x86 to use generic find_next_bit Alexander van Heukelum
2008-03-09 20:10 ` Ingo Molnar
2008-03-09 21:03   ` Andi Kleen
2008-03-09 21:32     ` Andi Kleen
2008-03-09 21:13   ` Alexander van Heukelum
2008-03-10  6:29     ` Ingo Molnar
2008-03-09 20:11 ` Ingo Molnar
2008-03-09 20:31   ` Alexander van Heukelum
2008-03-09 20:51     ` Ingo Molnar
2008-03-09 21:29       ` Andi Kleen
2008-03-10 23:17       ` [RFC/PATCH] x86: Optimize find_next_(zero_)bit for small constant-size bitmaps Alexander van Heukelum
2008-03-11  9:56         ` Ingo Molnar
2008-03-11 15:17           ` Alexander van Heukelum [this message]
2008-03-11 15:22             ` [RFC] non-x86: " Alexander van Heukelum
2008-03-11 15:23             ` [PATCH] x86: " Ingo Molnar
2008-03-09 20:28 ` [PATCH] x86: Change x86 to use generic find_next_bit Andi Kleen
2008-03-09 21:31 ` Andi Kleen
2008-03-13 12:44 ` Aneesh Kumar K.V
2008-03-13 14:27   ` Alexander van Heukelum

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:7fbf93a dfblob:1a23ce1 dfblob:40bf0f8 dfblob:87e1a17
dfblob:69c1edb dfblob:15360f9 dfblob:5820e07 dfblob:ce94c4c )
 OR (
bs:"[PATCH] x86: Optimize find_next_(zero_)bit for small constant-size bitmaps" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20080311151719.GA15313@mailshack.com \
    --to=heukelum@mailshack.com \
    --cc=andi@firstfloor.org \
    --cc=heukelum@fastmail.fm \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.