From: Alexander van Heukelum <heukelum@mailshack.com>
To: Alexander van Heukelum <heukelum@fastmail.fm>
Cc: dean gaudet <dean@arctic.org>, Ingo Molnar <mingo@elte.hu>,
Andi Kleen <andi@firstfloor.org>,
LKML <linux-kernel@vger.kernel.org>
Subject: Alternative implementation of the generic __ffs
Date: Fri, 18 Apr 2008 22:18:09 +0200 [thread overview]
Message-ID: <20080418201809.GA5036@mailshack.com> (raw)
In-Reply-To: <1207563950.7880.1246457209@webmail.messagingengine.com>
On Mon, Apr 07, 2008 at 12:25:50PM +0200, Alexander van Heukelum wrote:
> On Sun, 6 Apr 2008 13:22:58 -0700 (PDT), "dean gaudet" <dean@arctic.org> said:
> > On Sun, 6 Apr 2008, Alexander van Heukelum wrote:
> > > The current generic implementation of ffz is O(lg(n)) already
> >
> > it's O(lg(n)) time... the operations all depend on each other.
> >
> > the implementation i pointed to is O(lg(n)) code space... and the time
> > depends on how parallel the machine is, they're not dependent on each
> > other.
>
> Indeed. The worst dependencies are in the sum of all the partial
> results in this implementation. And addition is associative, so
> partial results can be written as ((a+b)+(c+d))+(e+f). Assuming
> perfect parallel execution this would lead to O(ln(ln(n))). Good.
Hello all,
I've implemented ffs (find first set bit) like it is shown
in http://www.hackersdelight.org/ (see revisions, page 21).
I would be interested to see how it compares with the generic
implementation of __ffs in the linux kernel, in particular
on architectures that do not have an obvious fast way of
determining the first set bit of a word.
I have included a simple benchmark program that should give
a reasonable estimate of the performance ratio of the two
implementations. Please test and report :).
Is this implementation suitable to replace the current one?
Greetings,
Alexander
P.S. Some results for some machines I could test on:
-----------
On a 2.1 GHz Athlon XP:
$ gcc -m32 -Os -fomit-frame-pointer ffs.c && time ./a.out
Original: 396 tics, 756 tics
New: 378 tics, 851 tics
Assembly: 262 tics, 383 tics
Empty loop: 128 tics, 203 tics
real 0m33.862s
user 0m33.026s
sys 0m0.344s
-----------
On a 2.33 GHz Xeon:
$ gcc -m64 -Os ffs.c && time ./a.out
Original: 277 tics, 324 tics
New: 230 tics, 236 tics
Assembly: 90 tics, 84 tics
Empty loop: 90 tics, 82 tics
real 0m14.490s
user 0m14.270s
sys 0m0.220s
$ gcc -m32 -Os -fomit-frame-pointer ffs.c && time ./a.out
Original: 303 tics, 449 tics
New: 231 tics, 394 tics
Assembly: 90 tics, 144 tics
Empty loop: 102 tics, 116 tics
real 0m18.521s
user 0m18.380s
sys 0m0.140s
-----------
On an alpha EV6.7 (21264A) operating at 667 MHz:
$ gcc -Os ffs.c && time ./a.out
Original: 622 tics, 633 tics
New: 431 tics, 465 tics
Assembly: 235 tics, 210 tics
Empty loop: 199 tics, 218 tics
50.358u 0.259s 1:14.28 68.1% 0+1k 2+0io 2pf+0w
-----------
#include <stdio.h>
#include <sys/times.h>
#define LOOPS32 (1<<(30-5-1))
#define LOOPS64 (1<<(30-6-1))
#define ATTR __attribute__((noinline))
static ATTR int __ffs32_orig(unsigned int word)
{
int num = 0;
if ((word & 0xffff) == 0) {
num += 16;
word >>= 16;
}
if ((word & 0xff) == 0) {
num += 8;
word >>= 8;
}
if ((word & 0xf) == 0) {
num += 4;
word >>= 4;
}
if ((word & 0x3) == 0) {
num += 2;
word >>= 2;
}
if ((word & 0x1) == 0)
num += 1;
return num;
}
static ATTR int __ffs64_orig(unsigned long long word)
{
int num = 0;
if ((word & 0xffffffff) == 0) {
num += 32;
word >>= 32;
}
if ((word & 0xffff) == 0) {
num += 16;
word >>= 16;
}
if ((word & 0xff) == 0) {
num += 8;
word >>= 8;
}
if ((word & 0xf) == 0) {
num += 4;
word >>= 4;
}
if ((word & 0x3) == 0) {
num += 2;
word >>= 2;
}
if ((word & 0x1) == 0)
num += 1;
return num;
}
static ATTR int __ffs32_new(unsigned int value)
{
int x0, x1, x2, x3, x4;
value &= -value;
x0 = (value & 0x55555555) ? 0 : 1;
x1 = (value & 0x33333333) ? 0 : 2;
x2 = (value & 0x0f0f0f0f) ? 0 : 4;
x3 = (value & 0x00ff00ff) ? 0 : 8;
x4 = (value & 0x0000ffff) ? 0 : 16;
return x0 | x1 | x2 | x3 | x4;
}
static ATTR int __ffs64_new(unsigned long long value)
{
int x0, x1, x2, x3, x4, x5;
value &= -value;
x0 = (value & 0x5555555555555555ull) ? 0 : 1;
x1 = (value & 0x3333333333333333ull) ? 0 : 2;
x2 = (value & 0x0f0f0f0f0f0f0f0full) ? 0 : 4;
x3 = (value & 0x00ff00ff00ff00ffull) ? 0 : 8;
x4 = (value & 0x0000ffff0000ffffull) ? 0 : 16;
x5 = (value & 0x00000000ffffffffull) ? 0 : 32;
return x0 | x1 | x2 | x3 | x4 | x5;
}
#ifdef __x86_64__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
int res;
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
static ATTR int __ffs64_asm(unsigned long long value)
{
long res;
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
#endif
#ifdef __i386__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
int res;
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
static ATTR int __ffs64_asm(unsigned long long value)
{
unsigned int low, high;
int res;
low = value;
if (low) {
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (low)
);
return res;
}
high = value >> 32;
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (high)
);
return res | 32;
}
#endif
#ifdef __alpha__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
int res;
asm volatile("cttz %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
static ATTR int __ffs64_asm(unsigned long long value)
{
long res;
asm volatile("cttz %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
#endif
static ATTR int __nothing32(unsigned int value)
{
return value;
}
static ATTR int __nothing64(unsigned long long value)
{
return (int)value;
}
/* Random numbers: modified version of libc mrand48 */
static unsigned long long myrand_next = 0x330eull;
static const unsigned long long myrand_a = 0x5deece66dull;
static const unsigned long long myrand_b = 0xbull;
unsigned int myrand(void)
{
myrand_next = myrand_a * myrand_next + myrand_b;
return (unsigned int)(myrand_next >> 16);
}
unsigned long long myrand64(void)
{
return ((unsigned long long)myrand() << 32) | myrand();
}
void myrand_seed(unsigned int seed)
{
int n;
myrand_next = ((unsigned long long)seed << 16) + 0x330eull;
for (n=0; n<100; n++) myrand(); /* warm it up a bit */
}
/* tic: wait for next clock tick and save current tick */
clock_t tictime;
void tic(void)
{
struct tms t;
times(&t);
tictime = t.tms_utime;
do {
times(&t);
} while (tictime == t.tms_utime);
tictime = t.tms_utime;
}
/* toc: report number of ticks since tic() */
long toc(void)
{
struct tms t;
times(&t);
return (long)(t.tms_utime - tictime);
}
__attribute__((noinline))
int bench_orig32(void)
{
unsigned int i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS32; n++) {
i = myrand();
while (i) {
res ^= __ffs32_orig(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_orig64(void)
{
unsigned long long i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS64; n++) {
i = myrand64();
while (i) {
res ^= __ffs64_orig(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_new32(void)
{
unsigned int i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS32; n++) {
i = myrand();
while (i) {
res ^= __ffs32_new(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_new64(void)
{
unsigned long long i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS64; n++) {
i = myrand64();
while (i) {
res ^= __ffs64_new(i);
i &= i - 1;
}
}
return res;
}
#ifdef ASSEMBLYVERSION
__attribute__((noinline))
int bench_asm32(void)
{
unsigned int i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS32; n++) {
i = myrand();
while (i) {
res ^= __ffs32_asm(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_asm64(void)
{
unsigned long long i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS64; n++) {
i = myrand64();
while (i) {
res ^= __ffs64_asm(i);
i &= i - 1;
}
}
return res;
}
#endif
__attribute__((noinline))
int bench_none32(void)
{
unsigned int i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS32; n++) {
i = myrand();
while (i) {
res ^= __nothing32(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_none64(void)
{
unsigned long long i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS64; n++) {
i = myrand64();
while (i) {
res ^= __nothing64(i);
i &= i - 1;
}
}
return res;
}
void test(void)
{
unsigned long long int i;
int n, res1, res2;
myrand_seed(0);
for (n=0; n<1000; n++) {
i = myrand64();
while (i) {
res1 = __ffs64_orig(i);
res2 = __ffs64_new(i);
if (res1 != res2) {
printf("%llu %i %i\n", i,
res1, res2);
}
i &= i - 1;
}
}
}
int main(void)
{
long tics;
setvbuf(stdout, NULL, _IONBF, 0);
test();
printf("%-14s", "Original:");
tic(); bench_orig32(); tics = toc();
printf("%6lu tics,", tics);
tic(); bench_orig64(); tics = toc();
printf("%6lu tics\n", tics);
printf("%-14s", "New:");
tic(); bench_new32(); tics = toc();
printf("%6lu tics,", tics);
tic(); bench_new64(); tics = toc();
printf("%6lu tics\n", tics);
#ifdef ASSEMBLYVERSION
printf("%-14s", "Assembly:");
tic(); bench_asm32(); tics = toc();
printf("%6lu tics,", tics);
tic(); bench_asm64(); tics = toc();
printf("%6lu tics\n", tics);
#endif
printf("%-14s", "Empty loop:");
tic(); bench_none32(); tics = toc();
printf("%6lu tics,", tics);
tic(); bench_none64(); tics = toc();
printf("%6lu tics\n", tics);
return 0;
}
next prev parent reply other threads:[~2008-04-18 20:20 UTC|newest]
Thread overview: 34+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-03-31 17:15 [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386 Alexander van Heukelum
2008-03-31 17:22 ` Stephen Hemminger
2008-03-31 19:38 ` Alexander van Heukelum
2008-03-31 21:58 ` Andi Kleen
2008-04-01 8:47 ` Ingo Molnar
2008-04-01 9:46 ` Alexander van Heukelum
2008-04-01 15:41 ` [PATCH] x86: switch x86_64 to generic find_first_bit Alexander van Heukelum
2008-04-01 15:42 ` [PATCH] x86: optimize find_first_bit for small bitmaps Alexander van Heukelum
2008-04-01 15:47 ` [PATCH] x86: remove x86-specific implementations of find_first_bit Alexander van Heukelum
2008-04-03 9:34 ` Alexander van Heukelum
2008-04-04 8:47 ` Ingo Molnar
2008-04-06 17:03 ` [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386 dean gaudet
2008-04-06 18:51 ` Alexander van Heukelum
2008-04-06 20:22 ` dean gaudet
2008-04-07 8:43 ` Ingo Molnar
2008-04-07 10:25 ` Alexander van Heukelum
2008-04-18 20:18 ` Alexander van Heukelum [this message]
2008-04-18 23:46 ` Alternative implementation of the generic __ffs dean gaudet
2008-04-19 0:09 ` Harvey Harrison
2008-04-19 0:20 ` dean gaudet
2008-04-19 0:58 ` Joe Perches
2008-04-19 1:04 ` Harvey Harrison
2008-04-19 1:11 ` dean gaudet
2008-04-19 2:55 ` Joe Perches
2008-04-19 4:13 ` dean gaudet
2008-04-19 10:05 ` Mikael Pettersson
2008-04-19 12:10 ` Alexander van Heukelum
2008-04-19 18:17 ` Joe Perches
2008-04-19 20:26 ` Alexander van Heukelum
2008-04-19 22:29 ` Matti Aarnio
2008-04-20 3:06 ` Joe Perches
2008-04-20 8:42 ` Alexander van Heukelum
2008-04-20 12:31 ` Matti Aarnio
2008-04-21 11:43 ` Alexander van Heukelum
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080418201809.GA5036@mailshack.com \
--to=heukelum@mailshack.com \
--cc=andi@firstfloor.org \
--cc=dean@arctic.org \
--cc=heukelum@fastmail.fm \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.