All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Optimize bitmap_weight
@ 2012-05-11 14:10 Akinobu Mita
  2012-05-11 22:48 ` Andrew Morton
  2012-05-25 22:57 ` Akinobu Mita
  0 siblings, 2 replies; 6+ messages in thread
From: Akinobu Mita @ 2012-05-11 14:10 UTC (permalink / raw)
  To: linux-kernel, akpm; +Cc: Akinobu Mita

The current implementation of bitmap_weight simply evaluates the
population count for each long word of the array, and adds.

The subsection "Counting 1-bits in an Array" in the revisions of
the book 'Hacker's Delight' explains more superior methods than
the naive method.

http://www.hackersdelight.org/revisions.pdf
http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt

My benchmark results on Intel Core i3 CPU with 32-bit kernel
showed 50% faster for 8192 bits bitmap.  However, it is not faster
for small bitmap (< BITS_PER_LONG * 8) than the naive method.
So if the bitmap size is known to be small at compile time,
use the naive method.

/*
 * userspace bitmap_weight() benchmark program
 */
extern int __bitmap_weight(const unsigned long *bitmap, int bits);
extern int __bitmap_weight_fast(const unsigned long *bitmap, int bits);

int main(int argc, char **argv)
{
	int i;
	struct timeval start[2], stop[2], diff[2];
	unsigned int bits;
	unsigned long bitmap[1024 * 1024];
	int n, m;
	int iterations;

	if (argc != 4) {
		fprintf(stderr, "Usage: %s SEED BITS ITERATIONS\n", argv[0]);
		return -1;
	}

	srand(atoi(argv[1]));
	bits = atoi(argv[2]) % (sizeof(bitmap) * 8);
	iterations = atoi(argv[3]);

	for (i = 0; i < sizeof(bitmap); i++)
		((unsigned char *)bitmap)[i] = rand();

	/* dummy call */
	gettimeofday(&start[0], NULL);
	n = __bitmap_weight(bitmap, bits);
	gettimeofday(&stop[0], NULL);
	gettimeofday(&start[1], NULL);
	m = __bitmap_weight_fast(bitmap, bits);
	gettimeofday(&stop[1], NULL);

	if (n != m) {
		fprintf(stderr, "Oops %d != %d (%d)\n", n, m, atoi(argv[1]));
		return -1;
	}

	gettimeofday(&start[0], NULL);
	for (i = 0; i < iterations; i++)
		n += __bitmap_weight(bitmap, bits);
	gettimeofday(&stop[0], NULL);

	gettimeofday(&start[1], NULL);
	for (i = 0; i < iterations; i++)
		m += __bitmap_weight_fast(bitmap, bits);
	gettimeofday(&stop[1], NULL);

	if (n != m)
		return -1;

	timersub(&stop[0], &start[0], &diff[0]);
	timersub(&stop[1], &start[1], &diff[1]);
	printf("%d %d: %lu.%06lu %lu.%06lu\n", bits, iterations,
		diff[0].tv_sec, diff[0].tv_usec,
		diff[1].tv_sec, diff[1].tv_usec);

	return 0;
}

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
---
 include/linux/bitmap.h |    5 +++-
 lib/bitmap.c           |   49 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletions(-)

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 7ad6345..9191c7d 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -111,6 +111,7 @@ extern int __bitmap_intersects(const unsigned long *bitmap1,
 extern int __bitmap_subset(const unsigned long *bitmap1,
 			const unsigned long *bitmap2, int bits);
 extern int __bitmap_weight(const unsigned long *bitmap, int bits);
+extern int __bitmap_weight_fast(const unsigned long *bitmap, int bits);
 
 extern void bitmap_set(unsigned long *map, int i, int len);
 extern void bitmap_clear(unsigned long *map, int start, int nr);
@@ -277,7 +278,9 @@ static inline int bitmap_weight(const unsigned long *src, int nbits)
 {
 	if (small_const_nbits(nbits))
 		return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
-	return __bitmap_weight(src, nbits);
+	else if (__builtin_constant_p(nbits) && (nbits) < BITS_PER_LONG * 8)
+		return __bitmap_weight(src, nbits);
+	return __bitmap_weight_fast(src, nbits);
 }
 
 static inline void bitmap_shift_right(unsigned long *dst,
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 18c67d8..7352263 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -254,6 +254,55 @@ int __bitmap_weight(const unsigned long *bitmap, int bits)
 }
 EXPORT_SYMBOL(__bitmap_weight);
 
+#define CSA(h, l, a, b, c) \
+	{ \
+		unsigned long u = a ^ b; \
+		unsigned long v = c; \
+		h = (a & b) | (u & v); \
+		l = u ^ v; \
+	} while (0)
+
+/**
+ * __bitmap_weight_fast - count the set bits in a bitmap
+ * @bitmap: pointer to bitmap
+ * @bits: bitmap size, in bits
+ *
+ * This implementation is based on the algorithm from the subsection
+ * "Counting 1-bits in an Array" in the revisions of the book
+ * 'Hacker's Delight'.
+ *
+ * http://www.hackersdelight.org/revisions.pdf
+ * http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
+ */
+int __bitmap_weight_fast(const unsigned long *bitmap, int bits)
+{
+	int k, w = 0, lim = bits / BITS_PER_LONG;
+	unsigned long ones, twos, twosA, twosB, fours, foursA, foursB, eights;
+	fours = twos = ones = 0;
+
+	for (k = 0; k < lim - 7; k += 8) {
+		CSA(twosA, ones, ones, bitmap[k], bitmap[k + 1]);
+		CSA(twosB, ones, ones, bitmap[k + 2], bitmap[k + 3]);
+		CSA(foursA, twos, twos, twosA, twosB);
+		CSA(twosA, ones, ones, bitmap[k + 4], bitmap[k + 5]);
+		CSA(twosB, ones, ones, bitmap[k + 6], bitmap[k + 7]);
+		CSA(foursB, twos, twos, twosA, twosB);
+		CSA(eights, fours, fours, foursA, foursB);
+		w += hweight_long(eights);
+	}
+	w = 8 * w + 4 * hweight_long(fours) + 2 * hweight_long(twos) +
+		hweight_long(ones);
+
+	for (; k < lim; k++)
+		w += hweight_long(bitmap[k]);
+
+	if (bits % BITS_PER_LONG)
+		w += hweight_long(bitmap[k] & BITMAP_LAST_WORD_MASK(bits));
+
+	return w;
+}
+EXPORT_SYMBOL(__bitmap_weight_fast);
+
 void bitmap_set(unsigned long *map, int start, int nr)
 {
 	unsigned long *p = map + BIT_WORD(start);
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] Optimize bitmap_weight
  2012-05-11 14:10 [PATCH] Optimize bitmap_weight Akinobu Mita
@ 2012-05-11 22:48 ` Andrew Morton
  2012-05-13 21:50   ` Akinobu Mita
  2012-05-25 22:57 ` Akinobu Mita
  1 sibling, 1 reply; 6+ messages in thread
From: Andrew Morton @ 2012-05-11 22:48 UTC (permalink / raw)
  To: Akinobu Mita; +Cc: linux-kernel

On Fri, 11 May 2012 23:10:14 +0900
Akinobu Mita <akinobu.mita@gmail.com> wrote:

> The current implementation of bitmap_weight simply evaluates the
> population count for each long word of the array, and adds.
> 
> The subsection "Counting 1-bits in an Array" in the revisions of
> the book 'Hacker's Delight' explains more superior methods than
> the naive method.
> 
> http://www.hackersdelight.org/revisions.pdf
> http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
> 
> My benchmark results on Intel Core i3 CPU with 32-bit kernel
> showed 50% faster for 8192 bits bitmap.  However, it is not faster
> for small bitmap (< BITS_PER_LONG * 8) than the naive method.
> So if the bitmap size is known to be small at compile time,
> use the naive method.
> 
> ...
>
>  extern void bitmap_clear(unsigned long *map, int start, int nr);
> @@ -277,7 +278,9 @@ static inline int bitmap_weight(const unsigned long *src, int nbits)
>  {
>  	if (small_const_nbits(nbits))
>  		return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));

Why do we require a constant_p `nbits' for this case?

> -	return __bitmap_weight(src, nbits);
> +	else if (__builtin_constant_p(nbits) && (nbits) < BITS_PER_LONG * 8)
> +		return __bitmap_weight(src, nbits);
> +	return __bitmap_weight_fast(src, nbits);
>  }

BITS_PER_LONG*8 sounds like a large bitmap: 256 or 512 entries.  Will
the kernel call __bitmap_weight_fast() sufficiently often to make this
extra code worth merging?


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Optimize bitmap_weight
  2012-05-11 22:48 ` Andrew Morton
@ 2012-05-13 21:50   ` Akinobu Mita
  2012-05-14 20:36     ` Andrew Morton
  0 siblings, 1 reply; 6+ messages in thread
From: Akinobu Mita @ 2012-05-13 21:50 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

2012/5/12 Andrew Morton <akpm@linux-foundation.org>:
> On Fri, 11 May 2012 23:10:14 +0900
> Akinobu Mita <akinobu.mita@gmail.com> wrote:
>
>> The current implementation of bitmap_weight simply evaluates the
>> population count for each long word of the array, and adds.
>>
>> The subsection "Counting 1-bits in an Array" in the revisions of
>> the book 'Hacker's Delight' explains more superior methods than
>> the naive method.
>>
>> http://www.hackersdelight.org/revisions.pdf
>> http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
>>
>> My benchmark results on Intel Core i3 CPU with 32-bit kernel
>> showed 50% faster for 8192 bits bitmap.  However, it is not faster
>> for small bitmap (< BITS_PER_LONG * 8) than the naive method.
>> So if the bitmap size is known to be small at compile time,
>> use the naive method.
>>
>> ...
>>
>>  extern void bitmap_clear(unsigned long *map, int start, int nr);
>> @@ -277,7 +278,9 @@ static inline int bitmap_weight(const unsigned long *src, int nbits)
>>  {
>>       if (small_const_nbits(nbits))
>>               return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
>
> Why do we require a constant_p `nbits' for this case?
>
>> -     return __bitmap_weight(src, nbits);
>> +     else if (__builtin_constant_p(nbits) && (nbits) < BITS_PER_LONG * 8)
>> +             return __bitmap_weight(src, nbits);
>> +     return __bitmap_weight_fast(src, nbits);
>>  }
>
> BITS_PER_LONG*8 sounds like a large bitmap: 256 or 512 entries.  Will
> the kernel call __bitmap_weight_fast() sufficiently often to make this
> extra code worth merging?
>

I roughly checked the call sites of bitmap_weight() and picked up some
outstanding usages below.

Some filesystems (udf, omfs, ntfs, and hpfs) use bitmap_weight() to
the block size bytes region in statfs() path.

num_online_cpus() and the variants are bitmap_weight() to the NR_CPUS
bitmap and num_online_nodes() and the variants are to the MAX_NUMNODES
bitmap.  So these bitmaps could be large on extremely large system.

bm_count_bits() in drivers/block/drbd/drbd_bitmap.c computes the
population count for multiple pages.  But it is currently open-coded
loops with hweight_long() which can be converted to bitmap_weight().

I consider introducing bitmap_weight_large() which is specialized for
the large bitmap instead of optimizing bitmap_weight() and replace the
call sites like above.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Optimize bitmap_weight
  2012-05-13 21:50   ` Akinobu Mita
@ 2012-05-14 20:36     ` Andrew Morton
  2012-05-15 11:58       ` Akinobu Mita
  0 siblings, 1 reply; 6+ messages in thread
From: Andrew Morton @ 2012-05-14 20:36 UTC (permalink / raw)
  To: Akinobu Mita; +Cc: linux-kernel

On Mon, 14 May 2012 06:50:15 +0900
Akinobu Mita <akinobu.mita@gmail.com> wrote:

> 2012/5/12 Andrew Morton <akpm@linux-foundation.org>:
> > On Fri, 11 May 2012 23:10:14 +0900
> > Akinobu Mita <akinobu.mita@gmail.com> wrote:
> >
> >> The current implementation of bitmap_weight simply evaluates the
> >> population count for each long word of the array, and adds.
> >>
> >> The subsection "Counting 1-bits in an Array" in the revisions of
> >> the book 'Hacker's Delight' explains more superior methods than
> >> the naive method.
> >>
> >> http://www.hackersdelight.org/revisions.pdf
> >> http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
> >>
> >> My benchmark results on Intel Core i3 CPU with 32-bit kernel
> >> showed 50% faster for 8192 bits bitmap.  However, it is not faster
> >> for small bitmap (< BITS_PER_LONG * 8) than the naive method.
> >> So if the bitmap size is known to be small at compile time,
> >> use the naive method.
> >>
> >> ...
> >>
> >>  extern void bitmap_clear(unsigned long *map, int start, int nr);
> >> @@ -277,7 +278,9 @@ static inline int bitmap_weight(const unsigned long *src, int nbits)
> >>  {
> >>       if (small_const_nbits(nbits))
> >>               return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
> >
> > Why do we require a constant_p `nbits' for this case?

^^ this?

> >> -     return  bitmap_weight(src, nbits);
> >> +     else if ( builtin_constant_p(nbits) && (nbits) < BITS_PER_LONG * 8)
> >> +             return  bitmap_weight(src, nbits);
> >> +     return  bitmap_weight_fast(src, nbits);
> >>  }
> >
> > BITS_PER_LONG*8 sounds like a large bitmap: 256 or 512 entries.  Will
> > the kernel call  bitmap_weight_fast() sufficiently often to make this
> > extra code worth merging?
> >
> 
> I roughly checked the call sites of bitmap_weight() and picked up some
> outstanding usages below.
> 
> Some filesystems (udf, omfs, ntfs, and hpfs) use bitmap_weight() to
> the block size bytes region in statfs() path.
> 
> num_online_cpus() and the variants are bitmap_weight() to the NR_CPUS
> bitmap and num_online_nodes() and the variants are to the MAX_NUMNODES
> bitmap.  So these bitmaps could be large on extremely large system.
> 
> bm_count_bits() in drivers/block/drbd/drbd_bitmap.c computes the
> population count for multiple pages.  But it is currently open-coded
> loops with hweight_long() which can be converted to bitmap_weight().
> 
> I consider introducing bitmap_weight_large() which is specialized for
> the large bitmap instead of optimizing bitmap_weight() and replace the
> call sites like above.

I don't see much advantage to that - it would be better if
bitmap_weight() Just Works.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Optimize bitmap_weight
  2012-05-14 20:36     ` Andrew Morton
@ 2012-05-15 11:58       ` Akinobu Mita
  0 siblings, 0 replies; 6+ messages in thread
From: Akinobu Mita @ 2012-05-15 11:58 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

2012/5/15 Andrew Morton <akpm@linux-foundation.org>:
> On Mon, 14 May 2012 06:50:15 +0900
> Akinobu Mita <akinobu.mita@gmail.com> wrote:
>
>> 2012/5/12 Andrew Morton <akpm@linux-foundation.org>:
>> > On Fri, 11 May 2012 23:10:14 +0900
>> > Akinobu Mita <akinobu.mita@gmail.com> wrote:
>> >
>> >> The current implementation of bitmap_weight simply evaluates the
>> >> population count for each long word of the array, and adds.
>> >>
>> >> The subsection "Counting 1-bits in an Array" in the revisions of
>> >> the book 'Hacker's Delight' explains more superior methods than
>> >> the naive method.
>> >>
>> >> http://www.hackersdelight.org/revisions.pdf
>> >> http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
>> >>
>> >> My benchmark results on Intel Core i3 CPU with 32-bit kernel
>> >> showed 50% faster for 8192 bits bitmap.  However, it is not faster
>> >> for small bitmap (< BITS_PER_LONG * 8) than the naive method.
>> >> So if the bitmap size is known to be small at compile time,
>> >> use the naive method.
>> >>
>> >> ...
>> >>
>> >>  extern void bitmap_clear(unsigned long *map, int start, int nr);
>> >> @@ -277,7 +278,9 @@ static inline int bitmap_weight(const unsigned long *src, int nbits)
>> >>  {
>> >>       if (small_const_nbits(nbits))
>> >>               return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
>> >
>> > Why do we require a constant_p `nbits' for this case?
>
> ^^ this?

The commit 4b0bc0bca83f3fb7cf920e2ec80684c15d2269c0
"bitmap: test for constant as well as small size for inline versions"
explains that it's for reducing text size, especially for the variable
cpumasks.

>> >> -     return  bitmap_weight(src, nbits);
>> >> +     else if ( builtin_constant_p(nbits) && (nbits) < BITS_PER_LONG * 8)
>> >> +             return  bitmap_weight(src, nbits);
>> >> +     return  bitmap_weight_fast(src, nbits);
>> >>  }
>> >
>> > BITS_PER_LONG*8 sounds like a large bitmap: 256 or 512 entries.  Will
>> > the kernel call  bitmap_weight_fast() sufficiently often to make this
>> > extra code worth merging?
>> >
>>
>> I roughly checked the call sites of bitmap_weight() and picked up some
>> outstanding usages below.
>>
>> Some filesystems (udf, omfs, ntfs, and hpfs) use bitmap_weight() to
>> the block size bytes region in statfs() path.
>>
>> num_online_cpus() and the variants are bitmap_weight() to the NR_CPUS
>> bitmap and num_online_nodes() and the variants are to the MAX_NUMNODES
>> bitmap.  So these bitmaps could be large on extremely large system.
>>
>> bm_count_bits() in drivers/block/drbd/drbd_bitmap.c computes the
>> population count for multiple pages.  But it is currently open-coded
>> loops with hweight_long() which can be converted to bitmap_weight().
>>
>> I consider introducing bitmap_weight_large() which is specialized for
>> the large bitmap instead of optimizing bitmap_weight() and replace the
>> call sites like above.
>
> I don't see much advantage to that - it would be better if
> bitmap_weight() Just Works.

OK, I withdraw the idea of bitmap_weight_large().
Do you still have doubts about merging this and want examples
counting large bitmap popcount in kernel?

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Optimize bitmap_weight
  2012-05-11 14:10 [PATCH] Optimize bitmap_weight Akinobu Mita
  2012-05-11 22:48 ` Andrew Morton
@ 2012-05-25 22:57 ` Akinobu Mita
  1 sibling, 0 replies; 6+ messages in thread
From: Akinobu Mita @ 2012-05-25 22:57 UTC (permalink / raw)
  To: linux-kernel, akpm

2012/5/11 Akinobu Mita <akinobu.mita@gmail.com>:
> The current implementation of bitmap_weight simply evaluates the
> population count for each long word of the array, and adds.
>
> The subsection "Counting 1-bits in an Array" in the revisions of
> the book 'Hacker's Delight' explains more superior methods than
> the naive method.
>
> http://www.hackersdelight.org/revisions.pdf
> http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
>
> My benchmark results on Intel Core i3 CPU with 32-bit kernel
> showed 50% faster for 8192 bits bitmap.  However, it is not faster
> for small bitmap (< BITS_PER_LONG * 8) than the naive method.
> So if the bitmap size is known to be small at compile time,
> use the naive method.

My benchmark was bogus.  I used __sw_hweight32 for hweight_long.
It is much slower on x86 with X86_FEATURE_POPCNT.

This patch slows bitmap_weight().  So please drop this patch
(bitmap_weight-optimize-for-large-bitmaps.patch) from -mm tree.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2012-05-25 22:57 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-05-11 14:10 [PATCH] Optimize bitmap_weight Akinobu Mita
2012-05-11 22:48 ` Andrew Morton
2012-05-13 21:50   ` Akinobu Mita
2012-05-14 20:36     ` Andrew Morton
2012-05-15 11:58       ` Akinobu Mita
2012-05-25 22:57 ` Akinobu Mita

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.