* [PATCH] Optimize bitmap_weight
@ 2012-05-11 14:10 Akinobu Mita
2012-05-11 22:48 ` Andrew Morton
2012-05-25 22:57 ` Akinobu Mita
0 siblings, 2 replies; 6+ messages in thread
From: Akinobu Mita @ 2012-05-11 14:10 UTC (permalink / raw)
To: linux-kernel, akpm; +Cc: Akinobu Mita
The current implementation of bitmap_weight simply evaluates the
population count for each long word of the array, and adds.
The subsection "Counting 1-bits in an Array" in the revisions of
the book 'Hacker's Delight' explains more superior methods than
the naive method.
http://www.hackersdelight.org/revisions.pdf
http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
My benchmark results on Intel Core i3 CPU with 32-bit kernel
showed 50% faster for 8192 bits bitmap. However, it is not faster
for small bitmap (< BITS_PER_LONG * 8) than the naive method.
So if the bitmap size is known to be small at compile time,
use the naive method.
/*
* userspace bitmap_weight() benchmark program
*/
extern int __bitmap_weight(const unsigned long *bitmap, int bits);
extern int __bitmap_weight_fast(const unsigned long *bitmap, int bits);
int main(int argc, char **argv)
{
int i;
struct timeval start[2], stop[2], diff[2];
unsigned int bits;
unsigned long bitmap[1024 * 1024];
int n, m;
int iterations;
if (argc != 4) {
fprintf(stderr, "Usage: %s SEED BITS ITERATIONS\n", argv[0]);
return -1;
}
srand(atoi(argv[1]));
bits = atoi(argv[2]) % (sizeof(bitmap) * 8);
iterations = atoi(argv[3]);
for (i = 0; i < sizeof(bitmap); i++)
((unsigned char *)bitmap)[i] = rand();
/* dummy call */
gettimeofday(&start[0], NULL);
n = __bitmap_weight(bitmap, bits);
gettimeofday(&stop[0], NULL);
gettimeofday(&start[1], NULL);
m = __bitmap_weight_fast(bitmap, bits);
gettimeofday(&stop[1], NULL);
if (n != m) {
fprintf(stderr, "Oops %d != %d (%d)\n", n, m, atoi(argv[1]));
return -1;
}
gettimeofday(&start[0], NULL);
for (i = 0; i < iterations; i++)
n += __bitmap_weight(bitmap, bits);
gettimeofday(&stop[0], NULL);
gettimeofday(&start[1], NULL);
for (i = 0; i < iterations; i++)
m += __bitmap_weight_fast(bitmap, bits);
gettimeofday(&stop[1], NULL);
if (n != m)
return -1;
timersub(&stop[0], &start[0], &diff[0]);
timersub(&stop[1], &start[1], &diff[1]);
printf("%d %d: %lu.%06lu %lu.%06lu\n", bits, iterations,
diff[0].tv_sec, diff[0].tv_usec,
diff[1].tv_sec, diff[1].tv_usec);
return 0;
}
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
---
include/linux/bitmap.h | 5 +++-
lib/bitmap.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 53 insertions(+), 1 deletions(-)
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 7ad6345..9191c7d 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -111,6 +111,7 @@ extern int __bitmap_intersects(const unsigned long *bitmap1,
extern int __bitmap_subset(const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_weight(const unsigned long *bitmap, int bits);
+extern int __bitmap_weight_fast(const unsigned long *bitmap, int bits);
extern void bitmap_set(unsigned long *map, int i, int len);
extern void bitmap_clear(unsigned long *map, int start, int nr);
@@ -277,7 +278,9 @@ static inline int bitmap_weight(const unsigned long *src, int nbits)
{
if (small_const_nbits(nbits))
return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
- return __bitmap_weight(src, nbits);
+ else if (__builtin_constant_p(nbits) && (nbits) < BITS_PER_LONG * 8)
+ return __bitmap_weight(src, nbits);
+ return __bitmap_weight_fast(src, nbits);
}
static inline void bitmap_shift_right(unsigned long *dst,
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 18c67d8..7352263 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -254,6 +254,55 @@ int __bitmap_weight(const unsigned long *bitmap, int bits)
}
EXPORT_SYMBOL(__bitmap_weight);
+#define CSA(h, l, a, b, c) \
+ { \
+ unsigned long u = a ^ b; \
+ unsigned long v = c; \
+ h = (a & b) | (u & v); \
+ l = u ^ v; \
+ } while (0)
+
+/**
+ * __bitmap_weight_fast - count the set bits in a bitmap
+ * @bitmap: pointer to bitmap
+ * @bits: bitmap size, in bits
+ *
+ * This implementation is based on the algorithm from the subsection
+ * "Counting 1-bits in an Array" in the revisions of the book
+ * 'Hacker's Delight'.
+ *
+ * http://www.hackersdelight.org/revisions.pdf
+ * http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
+ */
+int __bitmap_weight_fast(const unsigned long *bitmap, int bits)
+{
+ int k, w = 0, lim = bits / BITS_PER_LONG;
+ unsigned long ones, twos, twosA, twosB, fours, foursA, foursB, eights;
+ fours = twos = ones = 0;
+
+ for (k = 0; k < lim - 7; k += 8) {
+ CSA(twosA, ones, ones, bitmap[k], bitmap[k + 1]);
+ CSA(twosB, ones, ones, bitmap[k + 2], bitmap[k + 3]);
+ CSA(foursA, twos, twos, twosA, twosB);
+ CSA(twosA, ones, ones, bitmap[k + 4], bitmap[k + 5]);
+ CSA(twosB, ones, ones, bitmap[k + 6], bitmap[k + 7]);
+ CSA(foursB, twos, twos, twosA, twosB);
+ CSA(eights, fours, fours, foursA, foursB);
+ w += hweight_long(eights);
+ }
+ w = 8 * w + 4 * hweight_long(fours) + 2 * hweight_long(twos) +
+ hweight_long(ones);
+
+ for (; k < lim; k++)
+ w += hweight_long(bitmap[k]);
+
+ if (bits % BITS_PER_LONG)
+ w += hweight_long(bitmap[k] & BITMAP_LAST_WORD_MASK(bits));
+
+ return w;
+}
+EXPORT_SYMBOL(__bitmap_weight_fast);
+
void bitmap_set(unsigned long *map, int start, int nr)
{
unsigned long *p = map + BIT_WORD(start);
--
1.7.7.6
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH] Optimize bitmap_weight
2012-05-11 14:10 [PATCH] Optimize bitmap_weight Akinobu Mita
@ 2012-05-11 22:48 ` Andrew Morton
2012-05-13 21:50 ` Akinobu Mita
2012-05-25 22:57 ` Akinobu Mita
1 sibling, 1 reply; 6+ messages in thread
From: Andrew Morton @ 2012-05-11 22:48 UTC (permalink / raw)
To: Akinobu Mita; +Cc: linux-kernel
On Fri, 11 May 2012 23:10:14 +0900
Akinobu Mita <akinobu.mita@gmail.com> wrote:
> The current implementation of bitmap_weight simply evaluates the
> population count for each long word of the array, and adds.
>
> The subsection "Counting 1-bits in an Array" in the revisions of
> the book 'Hacker's Delight' explains more superior methods than
> the naive method.
>
> http://www.hackersdelight.org/revisions.pdf
> http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
>
> My benchmark results on Intel Core i3 CPU with 32-bit kernel
> showed 50% faster for 8192 bits bitmap. However, it is not faster
> for small bitmap (< BITS_PER_LONG * 8) than the naive method.
> So if the bitmap size is known to be small at compile time,
> use the naive method.
>
> ...
>
> extern void bitmap_clear(unsigned long *map, int start, int nr);
> @@ -277,7 +278,9 @@ static inline int bitmap_weight(const unsigned long *src, int nbits)
> {
> if (small_const_nbits(nbits))
> return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
Why do we require a constant_p `nbits' for this case?
> - return __bitmap_weight(src, nbits);
> + else if (__builtin_constant_p(nbits) && (nbits) < BITS_PER_LONG * 8)
> + return __bitmap_weight(src, nbits);
> + return __bitmap_weight_fast(src, nbits);
> }
BITS_PER_LONG*8 sounds like a large bitmap: 256 or 512 entries. Will
the kernel call __bitmap_weight_fast() sufficiently often to make this
extra code worth merging?
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] Optimize bitmap_weight
2012-05-11 22:48 ` Andrew Morton
@ 2012-05-13 21:50 ` Akinobu Mita
2012-05-14 20:36 ` Andrew Morton
0 siblings, 1 reply; 6+ messages in thread
From: Akinobu Mita @ 2012-05-13 21:50 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
2012/5/12 Andrew Morton <akpm@linux-foundation.org>:
> On Fri, 11 May 2012 23:10:14 +0900
> Akinobu Mita <akinobu.mita@gmail.com> wrote:
>
>> The current implementation of bitmap_weight simply evaluates the
>> population count for each long word of the array, and adds.
>>
>> The subsection "Counting 1-bits in an Array" in the revisions of
>> the book 'Hacker's Delight' explains more superior methods than
>> the naive method.
>>
>> http://www.hackersdelight.org/revisions.pdf
>> http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
>>
>> My benchmark results on Intel Core i3 CPU with 32-bit kernel
>> showed 50% faster for 8192 bits bitmap. However, it is not faster
>> for small bitmap (< BITS_PER_LONG * 8) than the naive method.
>> So if the bitmap size is known to be small at compile time,
>> use the naive method.
>>
>> ...
>>
>> extern void bitmap_clear(unsigned long *map, int start, int nr);
>> @@ -277,7 +278,9 @@ static inline int bitmap_weight(const unsigned long *src, int nbits)
>> {
>> if (small_const_nbits(nbits))
>> return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
>
> Why do we require a constant_p `nbits' for this case?
>
>> - return __bitmap_weight(src, nbits);
>> + else if (__builtin_constant_p(nbits) && (nbits) < BITS_PER_LONG * 8)
>> + return __bitmap_weight(src, nbits);
>> + return __bitmap_weight_fast(src, nbits);
>> }
>
> BITS_PER_LONG*8 sounds like a large bitmap: 256 or 512 entries. Will
> the kernel call __bitmap_weight_fast() sufficiently often to make this
> extra code worth merging?
>
I roughly checked the call sites of bitmap_weight() and picked up some
outstanding usages below.
Some filesystems (udf, omfs, ntfs, and hpfs) use bitmap_weight() to
the block size bytes region in statfs() path.
num_online_cpus() and the variants are bitmap_weight() to the NR_CPUS
bitmap and num_online_nodes() and the variants are to the MAX_NUMNODES
bitmap. So these bitmaps could be large on extremely large system.
bm_count_bits() in drivers/block/drbd/drbd_bitmap.c computes the
population count for multiple pages. But it is currently open-coded
loops with hweight_long() which can be converted to bitmap_weight().
I consider introducing bitmap_weight_large() which is specialized for
the large bitmap instead of optimizing bitmap_weight() and replace the
call sites like above.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] Optimize bitmap_weight
2012-05-13 21:50 ` Akinobu Mita
@ 2012-05-14 20:36 ` Andrew Morton
2012-05-15 11:58 ` Akinobu Mita
0 siblings, 1 reply; 6+ messages in thread
From: Andrew Morton @ 2012-05-14 20:36 UTC (permalink / raw)
To: Akinobu Mita; +Cc: linux-kernel
On Mon, 14 May 2012 06:50:15 +0900
Akinobu Mita <akinobu.mita@gmail.com> wrote:
> 2012/5/12 Andrew Morton <akpm@linux-foundation.org>:
> > On Fri, 11 May 2012 23:10:14 +0900
> > Akinobu Mita <akinobu.mita@gmail.com> wrote:
> >
> >> The current implementation of bitmap_weight simply evaluates the
> >> population count for each long word of the array, and adds.
> >>
> >> The subsection "Counting 1-bits in an Array" in the revisions of
> >> the book 'Hacker's Delight' explains more superior methods than
> >> the naive method.
> >>
> >> http://www.hackersdelight.org/revisions.pdf
> >> http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
> >>
> >> My benchmark results on Intel Core i3 CPU with 32-bit kernel
> >> showed 50% faster for 8192 bits bitmap. However, it is not faster
> >> for small bitmap (< BITS_PER_LONG * 8) than the naive method.
> >> So if the bitmap size is known to be small at compile time,
> >> use the naive method.
> >>
> >> ...
> >>
> >> extern void bitmap_clear(unsigned long *map, int start, int nr);
> >> @@ -277,7 +278,9 @@ static inline int bitmap_weight(const unsigned long *src, int nbits)
> >> {
> >> if (small_const_nbits(nbits))
> >> return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
> >
> > Why do we require a constant_p `nbits' for this case?
^^ this?
> >> - return bitmap_weight(src, nbits);
> >> + else if ( builtin_constant_p(nbits) && (nbits) < BITS_PER_LONG * 8)
> >> + return bitmap_weight(src, nbits);
> >> + return bitmap_weight_fast(src, nbits);
> >> }
> >
> > BITS_PER_LONG*8 sounds like a large bitmap: 256 or 512 entries. Will
> > the kernel call bitmap_weight_fast() sufficiently often to make this
> > extra code worth merging?
> >
>
> I roughly checked the call sites of bitmap_weight() and picked up some
> outstanding usages below.
>
> Some filesystems (udf, omfs, ntfs, and hpfs) use bitmap_weight() to
> the block size bytes region in statfs() path.
>
> num_online_cpus() and the variants are bitmap_weight() to the NR_CPUS
> bitmap and num_online_nodes() and the variants are to the MAX_NUMNODES
> bitmap. So these bitmaps could be large on extremely large system.
>
> bm_count_bits() in drivers/block/drbd/drbd_bitmap.c computes the
> population count for multiple pages. But it is currently open-coded
> loops with hweight_long() which can be converted to bitmap_weight().
>
> I consider introducing bitmap_weight_large() which is specialized for
> the large bitmap instead of optimizing bitmap_weight() and replace the
> call sites like above.
I don't see much advantage to that - it would be better if
bitmap_weight() Just Works.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] Optimize bitmap_weight
2012-05-14 20:36 ` Andrew Morton
@ 2012-05-15 11:58 ` Akinobu Mita
0 siblings, 0 replies; 6+ messages in thread
From: Akinobu Mita @ 2012-05-15 11:58 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
2012/5/15 Andrew Morton <akpm@linux-foundation.org>:
> On Mon, 14 May 2012 06:50:15 +0900
> Akinobu Mita <akinobu.mita@gmail.com> wrote:
>
>> 2012/5/12 Andrew Morton <akpm@linux-foundation.org>:
>> > On Fri, 11 May 2012 23:10:14 +0900
>> > Akinobu Mita <akinobu.mita@gmail.com> wrote:
>> >
>> >> The current implementation of bitmap_weight simply evaluates the
>> >> population count for each long word of the array, and adds.
>> >>
>> >> The subsection "Counting 1-bits in an Array" in the revisions of
>> >> the book 'Hacker's Delight' explains more superior methods than
>> >> the naive method.
>> >>
>> >> http://www.hackersdelight.org/revisions.pdf
>> >> http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
>> >>
>> >> My benchmark results on Intel Core i3 CPU with 32-bit kernel
>> >> showed 50% faster for 8192 bits bitmap. However, it is not faster
>> >> for small bitmap (< BITS_PER_LONG * 8) than the naive method.
>> >> So if the bitmap size is known to be small at compile time,
>> >> use the naive method.
>> >>
>> >> ...
>> >>
>> >> extern void bitmap_clear(unsigned long *map, int start, int nr);
>> >> @@ -277,7 +278,9 @@ static inline int bitmap_weight(const unsigned long *src, int nbits)
>> >> {
>> >> if (small_const_nbits(nbits))
>> >> return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
>> >
>> > Why do we require a constant_p `nbits' for this case?
>
> ^^ this?
The commit 4b0bc0bca83f3fb7cf920e2ec80684c15d2269c0
"bitmap: test for constant as well as small size for inline versions"
explains that it's for reducing text size, especially for the variable
cpumasks.
>> >> - return bitmap_weight(src, nbits);
>> >> + else if ( builtin_constant_p(nbits) && (nbits) < BITS_PER_LONG * 8)
>> >> + return bitmap_weight(src, nbits);
>> >> + return bitmap_weight_fast(src, nbits);
>> >> }
>> >
>> > BITS_PER_LONG*8 sounds like a large bitmap: 256 or 512 entries. Will
>> > the kernel call bitmap_weight_fast() sufficiently often to make this
>> > extra code worth merging?
>> >
>>
>> I roughly checked the call sites of bitmap_weight() and picked up some
>> outstanding usages below.
>>
>> Some filesystems (udf, omfs, ntfs, and hpfs) use bitmap_weight() to
>> the block size bytes region in statfs() path.
>>
>> num_online_cpus() and the variants are bitmap_weight() to the NR_CPUS
>> bitmap and num_online_nodes() and the variants are to the MAX_NUMNODES
>> bitmap. So these bitmaps could be large on extremely large system.
>>
>> bm_count_bits() in drivers/block/drbd/drbd_bitmap.c computes the
>> population count for multiple pages. But it is currently open-coded
>> loops with hweight_long() which can be converted to bitmap_weight().
>>
>> I consider introducing bitmap_weight_large() which is specialized for
>> the large bitmap instead of optimizing bitmap_weight() and replace the
>> call sites like above.
>
> I don't see much advantage to that - it would be better if
> bitmap_weight() Just Works.
OK, I withdraw the idea of bitmap_weight_large().
Do you still have doubts about merging this and want examples
counting large bitmap popcount in kernel?
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] Optimize bitmap_weight
2012-05-11 14:10 [PATCH] Optimize bitmap_weight Akinobu Mita
2012-05-11 22:48 ` Andrew Morton
@ 2012-05-25 22:57 ` Akinobu Mita
1 sibling, 0 replies; 6+ messages in thread
From: Akinobu Mita @ 2012-05-25 22:57 UTC (permalink / raw)
To: linux-kernel, akpm
2012/5/11 Akinobu Mita <akinobu.mita@gmail.com>:
> The current implementation of bitmap_weight simply evaluates the
> population count for each long word of the array, and adds.
>
> The subsection "Counting 1-bits in an Array" in the revisions of
> the book 'Hacker's Delight' explains more superior methods than
> the naive method.
>
> http://www.hackersdelight.org/revisions.pdf
> http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
>
> My benchmark results on Intel Core i3 CPU with 32-bit kernel
> showed 50% faster for 8192 bits bitmap. However, it is not faster
> for small bitmap (< BITS_PER_LONG * 8) than the naive method.
> So if the bitmap size is known to be small at compile time,
> use the naive method.
My benchmark was bogus. I used __sw_hweight32 for hweight_long.
It is much slower on x86 with X86_FEATURE_POPCNT.
This patch slows bitmap_weight(). So please drop this patch
(bitmap_weight-optimize-for-large-bitmaps.patch) from -mm tree.
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2012-05-25 22:57 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-05-11 14:10 [PATCH] Optimize bitmap_weight Akinobu Mita
2012-05-11 22:48 ` Andrew Morton
2012-05-13 21:50 ` Akinobu Mita
2012-05-14 20:36 ` Andrew Morton
2012-05-15 11:58 ` Akinobu Mita
2012-05-25 22:57 ` Akinobu Mita
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.