From: Alexander van Heukelum <heukelum@mailshack.com>
To: Alexander van Heukelum <heukelum@fastmail.fm>
Cc: dean gaudet <dean@arctic.org>, Ingo Molnar <mingo@elte.hu>,
Andi Kleen <andi@firstfloor.org>,
LKML <linux-kernel@vger.kernel.org>
Subject: Alternative implementation of the generic __ffs
Date: Fri, 18 Apr 2008 22:18:09 +0200 [thread overview]
Message-ID: <20080418201809.GA5036@mailshack.com> (raw)
In-Reply-To: <1207563950.7880.1246457209@webmail.messagingengine.com>
On Mon, Apr 07, 2008 at 12:25:50PM +0200, Alexander van Heukelum wrote:
> On Sun, 6 Apr 2008 13:22:58 -0700 (PDT), "dean gaudet" <dean@arctic.org> said:
> > On Sun, 6 Apr 2008, Alexander van Heukelum wrote:
> > > The current generic implementation of ffz is O(lg(n)) already
> >
> > it's O(lg(n)) time... the operations all depend on each other.
> >
> > the implementation i pointed to is O(lg(n)) code space... and the time
> > depends on how parallel the machine is, they're not dependent on each
> > other.
>
> Indeed. The worst dependencies are in the sum of all the partial
> results in this implementation. And addition is associative, so
> partial results can be written as ((a+b)+(c+d))+(e+f). Assuming
> perfect parallel execution this would lead to O(ln(ln(n))). Good.
Hello all,
I've implemented ffs (find first set bit) like it is shown
in http://www.hackersdelight.org/ (see revisions, page 21).
I would be interested to see how it compares with the generic
implementation of __ffs in the linux kernel, in particular
on architectures that do not have an obvious fast way of
determining the first set bit of a word.
I have included a simple benchmark program that should give
a reasonable estimate of the performance ratio of the two
implementations. Please test and report :).
Is this implementation suitable to replace the current one?
Greetings,
Alexander
P.S. Some results for some machines I could test on:
-----------
On a 2.1 GHz Athlon XP:
$ gcc -m32 -Os -fomit-frame-pointer ffs.c && time ./a.out
Original: 396 tics, 756 tics
New: 378 tics, 851 tics
Assembly: 262 tics, 383 tics
Empty loop: 128 tics, 203 tics
real 0m33.862s
user 0m33.026s
sys 0m0.344s
-----------
On a 2.33 GHz Xeon:
$ gcc -m64 -Os ffs.c && time ./a.out
Original: 277 tics, 324 tics
New: 230 tics, 236 tics
Assembly: 90 tics, 84 tics
Empty loop: 90 tics, 82 tics
real 0m14.490s
user 0m14.270s
sys 0m0.220s
$ gcc -m32 -Os -fomit-frame-pointer ffs.c && time ./a.out
Original: 303 tics, 449 tics
New: 231 tics, 394 tics
Assembly: 90 tics, 144 tics
Empty loop: 102 tics, 116 tics
real 0m18.521s
user 0m18.380s
sys 0m0.140s
-----------
On an alpha EV6.7 (21264A) operating at 667 MHz:
$ gcc -Os ffs.c && time ./a.out
Original: 622 tics, 633 tics
New: 431 tics, 465 tics
Assembly: 235 tics, 210 tics
Empty loop: 199 tics, 218 tics
50.358u 0.259s 1:14.28 68.1% 0+1k 2+0io 2pf+0w
-----------
#include <stdio.h>
#include <sys/times.h>
#define LOOPS32 (1<<(30-5-1))
#define LOOPS64 (1<<(30-6-1))
#define ATTR __attribute__((noinline))
static ATTR int __ffs32_orig(unsigned int word)
{
int num = 0;
if ((word & 0xffff) == 0) {
num += 16;
word >>= 16;
}
if ((word & 0xff) == 0) {
num += 8;
word >>= 8;
}
if ((word & 0xf) == 0) {
num += 4;
word >>= 4;
}
if ((word & 0x3) == 0) {
num += 2;
word >>= 2;
}
if ((word & 0x1) == 0)
num += 1;
return num;
}
static ATTR int __ffs64_orig(unsigned long long word)
{
int num = 0;
if ((word & 0xffffffff) == 0) {
num += 32;
word >>= 32;
}
if ((word & 0xffff) == 0) {
num += 16;
word >>= 16;
}
if ((word & 0xff) == 0) {
num += 8;
word >>= 8;
}
if ((word & 0xf) == 0) {
num += 4;
word >>= 4;
}
if ((word & 0x3) == 0) {
num += 2;
word >>= 2;
}
if ((word & 0x1) == 0)
num += 1;
return num;
}
static ATTR int __ffs32_new(unsigned int value)
{
int x0, x1, x2, x3, x4;
value &= -value;
x0 = (value & 0x55555555) ? 0 : 1;
x1 = (value & 0x33333333) ? 0 : 2;
x2 = (value & 0x0f0f0f0f) ? 0 : 4;
x3 = (value & 0x00ff00ff) ? 0 : 8;
x4 = (value & 0x0000ffff) ? 0 : 16;
return x0 | x1 | x2 | x3 | x4;
}
static ATTR int __ffs64_new(unsigned long long value)
{
int x0, x1, x2, x3, x4, x5;
value &= -value;
x0 = (value & 0x5555555555555555ull) ? 0 : 1;
x1 = (value & 0x3333333333333333ull) ? 0 : 2;
x2 = (value & 0x0f0f0f0f0f0f0f0full) ? 0 : 4;
x3 = (value & 0x00ff00ff00ff00ffull) ? 0 : 8;
x4 = (value & 0x0000ffff0000ffffull) ? 0 : 16;
x5 = (value & 0x00000000ffffffffull) ? 0 : 32;
return x0 | x1 | x2 | x3 | x4 | x5;
}
#ifdef __x86_64__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
int res;
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
static ATTR int __ffs64_asm(unsigned long long value)
{
long res;
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
#endif
#ifdef __i386__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
int res;
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
static ATTR int __ffs64_asm(unsigned long long value)
{
unsigned int low, high;
int res;
low = value;
if (low) {
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (low)
);
return res;
}
high = value >> 32;
asm volatile("bsf %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (high)
);
return res | 32;
}
#endif
#ifdef __alpha__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
int res;
asm volatile("cttz %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
static ATTR int __ffs64_asm(unsigned long long value)
{
long res;
asm volatile("cttz %[value], %[res]"
: [res] "=r" (res)
: [value] "r" (value)
);
return res;
}
#endif
static ATTR int __nothing32(unsigned int value)
{
return value;
}
static ATTR int __nothing64(unsigned long long value)
{
return (int)value;
}
/* Random numbers: modified version of libc mrand48 */
static unsigned long long myrand_next = 0x330eull;
static const unsigned long long myrand_a = 0x5deece66dull;
static const unsigned long long myrand_b = 0xbull;
unsigned int myrand(void)
{
myrand_next = myrand_a * myrand_next + myrand_b;
return (unsigned int)(myrand_next >> 16);
}
unsigned long long myrand64(void)
{
return ((unsigned long long)myrand() << 32) | myrand();
}
void myrand_seed(unsigned int seed)
{
int n;
myrand_next = ((unsigned long long)seed << 16) + 0x330eull;
for (n=0; n<100; n++) myrand(); /* warm it up a bit */
}
/* tic: wait for next clock tick and save current tick */
clock_t tictime;
void tic(void)
{
struct tms t;
times(&t);
tictime = t.tms_utime;
do {
times(&t);
} while (tictime == t.tms_utime);
tictime = t.tms_utime;
}
/* toc: report number of ticks since tic() */
long toc(void)
{
struct tms t;
times(&t);
return (long)(t.tms_utime - tictime);
}
__attribute__((noinline))
int bench_orig32(void)
{
unsigned int i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS32; n++) {
i = myrand();
while (i) {
res ^= __ffs32_orig(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_orig64(void)
{
unsigned long long i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS64; n++) {
i = myrand64();
while (i) {
res ^= __ffs64_orig(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_new32(void)
{
unsigned int i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS32; n++) {
i = myrand();
while (i) {
res ^= __ffs32_new(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_new64(void)
{
unsigned long long i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS64; n++) {
i = myrand64();
while (i) {
res ^= __ffs64_new(i);
i &= i - 1;
}
}
return res;
}
#ifdef ASSEMBLYVERSION
__attribute__((noinline))
int bench_asm32(void)
{
unsigned int i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS32; n++) {
i = myrand();
while (i) {
res ^= __ffs32_asm(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_asm64(void)
{
unsigned long long i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS64; n++) {
i = myrand64();
while (i) {
res ^= __ffs64_asm(i);
i &= i - 1;
}
}
return res;
}
#endif
__attribute__((noinline))
int bench_none32(void)
{
unsigned int i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS32; n++) {
i = myrand();
while (i) {
res ^= __nothing32(i);
i &= i - 1;
}
}
return res;
}
__attribute__((noinline))
int bench_none64(void)
{
unsigned long long i;
int n, res = 0;
myrand_seed(0);
for (n=0; n<LOOPS64; n++) {
i = myrand64();
while (i) {
res ^= __nothing64(i);
i &= i - 1;
}
}
return res;
}
void test(void)
{
unsigned long long int i;
int n, res1, res2;
myrand_seed(0);
for (n=0; n<1000; n++) {
i = myrand64();
while (i) {
res1 = __ffs64_orig(i);
res2 = __ffs64_new(i);
if (res1 != res2) {
printf("%llu %i %i\n", i,
res1, res2);
}
i &= i - 1;
}
}
}
int main(void)
{
long tics;
setvbuf(stdout, NULL, _IONBF, 0);
test();
printf("%-14s", "Original:");
tic(); bench_orig32(); tics = toc();
printf("%6lu tics,", tics);
tic(); bench_orig64(); tics = toc();
printf("%6lu tics\n", tics);
printf("%-14s", "New:");
tic(); bench_new32(); tics = toc();
printf("%6lu tics,", tics);
tic(); bench_new64(); tics = toc();
printf("%6lu tics\n", tics);
#ifdef ASSEMBLYVERSION
printf("%-14s", "Assembly:");
tic(); bench_asm32(); tics = toc();
printf("%6lu tics,", tics);
tic(); bench_asm64(); tics = toc();
printf("%6lu tics\n", tics);
#endif
printf("%-14s", "Empty loop:");
tic(); bench_none32(); tics = toc();
printf("%6lu tics,", tics);
tic(); bench_none64(); tics = toc();
printf("%6lu tics\n", tics);
return 0;
}
next prev parent reply other threads:[~2008-04-18 20:20 UTC|newest]
Thread overview: 34+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-03-31 17:15 [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386 Alexander van Heukelum
2008-03-31 17:22 ` Stephen Hemminger
2008-03-31 19:38 ` Alexander van Heukelum
2008-03-31 21:58 ` Andi Kleen
2008-04-01 8:47 ` Ingo Molnar
2008-04-01 9:46 ` Alexander van Heukelum
2008-04-01 15:41 ` [PATCH] x86: switch x86_64 to generic find_first_bit Alexander van Heukelum
2008-04-01 15:42 ` [PATCH] x86: optimize find_first_bit for small bitmaps Alexander van Heukelum
2008-04-01 15:47 ` [PATCH] x86: remove x86-specific implementations of find_first_bit Alexander van Heukelum
2008-04-03 9:34 ` Alexander van Heukelum
2008-04-04 8:47 ` Ingo Molnar
2008-04-06 17:03 ` [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386 dean gaudet
2008-04-06 18:51 ` Alexander van Heukelum
2008-04-06 20:22 ` dean gaudet
2008-04-07 8:43 ` Ingo Molnar
2008-04-07 10:25 ` Alexander van Heukelum
2008-04-18 20:18 ` Alexander van Heukelum [this message]
2008-04-18 23:46 ` Alternative implementation of the generic __ffs dean gaudet
2008-04-19 0:09 ` Harvey Harrison
2008-04-19 0:20 ` dean gaudet
2008-04-19 0:58 ` Joe Perches
2008-04-19 1:04 ` Harvey Harrison
2008-04-19 1:11 ` dean gaudet
2008-04-19 2:55 ` Joe Perches
2008-04-19 4:13 ` dean gaudet
2008-04-19 10:05 ` Mikael Pettersson
2008-04-19 12:10 ` Alexander van Heukelum
2008-04-19 18:17 ` Joe Perches
2008-04-19 20:26 ` Alexander van Heukelum
2008-04-19 22:29 ` Matti Aarnio
2008-04-20 3:06 ` Joe Perches
2008-04-20 8:42 ` Alexander van Heukelum
2008-04-20 12:31 ` Matti Aarnio
2008-04-21 11:43 ` Alexander van Heukelum
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080418201809.GA5036@mailshack.com \
--to=heukelum@mailshack.com \
--cc=andi@firstfloor.org \
--cc=dean@arctic.org \
--cc=heukelum@fastmail.fm \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).