Alternative implementation of the generic __ffs

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Alexander van Heukelum <heukelum@mailshack.com>
To: Alexander van Heukelum <heukelum@fastmail.fm>
Cc: dean gaudet <dean@arctic.org>, Ingo Molnar <mingo@elte.hu>,
	Andi Kleen <andi@firstfloor.org>,
	LKML <linux-kernel@vger.kernel.org>
Subject: Alternative implementation of the generic __ffs
Date: Fri, 18 Apr 2008 22:18:09 +0200	[thread overview]
Message-ID: <20080418201809.GA5036@mailshack.com> (raw)
In-Reply-To: <1207563950.7880.1246457209@webmail.messagingengine.com>

On Mon, Apr 07, 2008 at 12:25:50PM +0200, Alexander van Heukelum wrote:
> On Sun, 6 Apr 2008 13:22:58 -0700 (PDT), "dean gaudet" <dean@arctic.org> said:
> > On Sun, 6 Apr 2008, Alexander van Heukelum wrote:
> > > The current generic implementation of ffz is O(lg(n)) already
> > 
> > it's O(lg(n)) time... the operations all depend on each other.
> > 
> > the implementation i pointed to is O(lg(n)) code space... and the time 
> > depends on how parallel the machine is, they're not dependent on each 
> > other.
> 
> Indeed. The worst dependencies are in the sum of all the partial
> results in this implementation. And addition is associative, so
> partial results can be written as ((a+b)+(c+d))+(e+f). Assuming
> perfect parallel execution this would lead to O(ln(ln(n))). Good.

Hello all,

I've implemented ffs (find first set bit) like it is shown
in http://www.hackersdelight.org/ (see revisions, page 21).
I would be interested to see how it compares with the generic
implementation of __ffs in the linux kernel, in particular
on architectures that do not have an obvious fast way of
determining the first set bit of a word.

I have included a simple benchmark program that should give
a reasonable estimate of the performance ratio of the two
implementations. Please test and report :).

Is this implementation suitable to replace the current one?

Greetings,
	Alexander

P.S. Some results for some machines I could test on:

-----------

On a 2.1 GHz Athlon XP:
$ gcc -m32 -Os -fomit-frame-pointer ffs.c && time ./a.out
Original:        396 tics,   756 tics
New:             378 tics,   851 tics
Assembly:        262 tics,   383 tics
Empty loop:      128 tics,   203 tics

real    0m33.862s
user    0m33.026s
sys     0m0.344s

-----------

On a 2.33 GHz Xeon:
$ gcc -m64 -Os ffs.c && time ./a.out 
Original:        277 tics,   324 tics
New:             230 tics,   236 tics
Assembly:         90 tics,    84 tics
Empty loop:       90 tics,    82 tics

real    0m14.490s
user    0m14.270s
sys     0m0.220s
$ gcc -m32 -Os -fomit-frame-pointer ffs.c && time ./a.out
Original:        303 tics,   449 tics
New:             231 tics,   394 tics
Assembly:         90 tics,   144 tics
Empty loop:      102 tics,   116 tics

real    0m18.521s
user    0m18.380s
sys     0m0.140s

-----------

On an alpha EV6.7 (21264A) operating at 667 MHz:
$ gcc -Os ffs.c && time ./a.out
Original:        622 tics,   633 tics
New:             431 tics,   465 tics
Assembly:        235 tics,   210 tics
Empty loop:      199 tics,   218 tics
50.358u 0.259s 1:14.28 68.1%    0+1k 2+0io 2pf+0w

-----------

#include <stdio.h>
#include <sys/times.h>

#define LOOPS32 (1<<(30-5-1))
#define LOOPS64 (1<<(30-6-1))
#define ATTR __attribute__((noinline))

static ATTR int __ffs32_orig(unsigned int word)
{
	int num = 0;

	if ((word & 0xffff) == 0) {
		num += 16;
		word >>= 16;
	}
	if ((word & 0xff) == 0) {
		num += 8;
		word >>= 8;
	}
	if ((word & 0xf) == 0) {
		num += 4;
		word >>= 4;
	}
	if ((word & 0x3) == 0) {
		num += 2;
		word >>= 2;
	}
	if ((word & 0x1) == 0)
		num += 1;

	return num;
}

static ATTR int __ffs64_orig(unsigned long long word)
{
	int num = 0;

	if ((word & 0xffffffff) == 0) {
		num += 32;
		word >>= 32;
	}
	if ((word & 0xffff) == 0) {
		num += 16;
		word >>= 16;
	}
	if ((word & 0xff) == 0) {
		num += 8;
		word >>= 8;
	}
	if ((word & 0xf) == 0) {
		num += 4;
		word >>= 4;
	}
	if ((word & 0x3) == 0) {
		num += 2;
		word >>= 2;
	}
	if ((word & 0x1) == 0)
		num += 1;

	return num;
}

static ATTR int __ffs32_new(unsigned int value)
{
	int x0, x1, x2, x3, x4;

	value &= -value;
	x0 = (value & 0x55555555) ? 0 : 1;
	x1 = (value & 0x33333333) ? 0 : 2;
	x2 = (value & 0x0f0f0f0f) ? 0 : 4;
	x3 = (value & 0x00ff00ff) ? 0 : 8;
	x4 = (value & 0x0000ffff) ? 0 : 16;

	return x0 | x1 | x2 | x3 | x4;
}

static ATTR int __ffs64_new(unsigned long long value)
{
	int x0, x1, x2, x3, x4, x5;

	value &= -value;
	x0 = (value & 0x5555555555555555ull) ? 0 : 1;
	x1 = (value & 0x3333333333333333ull) ? 0 : 2;
	x2 = (value & 0x0f0f0f0f0f0f0f0full) ? 0 : 4;
	x3 = (value & 0x00ff00ff00ff00ffull) ? 0 : 8;
	x4 = (value & 0x0000ffff0000ffffull) ? 0 : 16;
	x5 = (value & 0x00000000ffffffffull) ? 0 : 32;

	return x0 | x1 | x2 | x3 | x4 | x5;
}

#ifdef __x86_64__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
	int res;

	asm volatile("bsf %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}

static ATTR int __ffs64_asm(unsigned long long value)
{
	long res;

	asm volatile("bsf %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}
#endif

#ifdef __i386__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
	int res;

	asm volatile("bsf %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}

static ATTR int __ffs64_asm(unsigned long long value)
{
	unsigned int low, high;
	int res;

	low = value;
	if (low) {
		asm volatile("bsf %[value], %[res]"
			: [res] "=r" (res)
			: [value] "r" (low)
		);
		return res;
	}

	high = value >> 32;
	asm volatile("bsf %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (high)
	);

	return res | 32;
}
#endif

#ifdef __alpha__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
	int res;

	asm volatile("cttz %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}

static ATTR int __ffs64_asm(unsigned long long value)
{
	long res;

	asm volatile("cttz %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}
#endif

static ATTR int __nothing32(unsigned int value)
{
	return value;
}

static ATTR int __nothing64(unsigned long long value)
{
	return (int)value;
}

/* Random numbers: modified version of libc mrand48 */
static unsigned long long myrand_next = 0x330eull;
static const unsigned long long myrand_a = 0x5deece66dull;
static const unsigned long long myrand_b = 0xbull;
unsigned int myrand(void)
{
	myrand_next = myrand_a * myrand_next + myrand_b;
	return (unsigned int)(myrand_next >> 16);
}

unsigned long long myrand64(void)
{
	return ((unsigned long long)myrand() << 32) | myrand();
}

void myrand_seed(unsigned int seed)
{
	int n;
	myrand_next = ((unsigned long long)seed << 16) + 0x330eull;
	for (n=0; n<100; n++) myrand(); /* warm it up a bit */
}

/* tic: wait for next clock tick and save current tick */
clock_t tictime;
void tic(void)
{
	struct tms t;
	times(&t);
	tictime = t.tms_utime;
	do {
		times(&t);
	} while (tictime == t.tms_utime);
	tictime = t.tms_utime;
}

/* toc: report number of ticks since tic() */
long toc(void)
{
	struct tms t;
	times(&t);
	return (long)(t.tms_utime - tictime);
}

__attribute__((noinline))
int bench_orig32(void)
{
	unsigned int i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS32; n++) {
		i = myrand();
		while (i) {
			res ^= __ffs32_orig(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_orig64(void)
{
	unsigned long long i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS64; n++) {
		i = myrand64();
		while (i) {
			res ^= __ffs64_orig(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_new32(void)
{
	unsigned int i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS32; n++) {
		i = myrand();
		while (i) {
			res ^= __ffs32_new(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_new64(void)
{
	unsigned long long i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS64; n++) {
		i = myrand64();
		while (i) {
			res ^= __ffs64_new(i);
			i &= i - 1;	
		}	
	}
	return res;
}

#ifdef ASSEMBLYVERSION
__attribute__((noinline))
int bench_asm32(void)
{
	unsigned int i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS32; n++) {
		i = myrand();
		while (i) {
			res ^= __ffs32_asm(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_asm64(void)
{
	unsigned long long i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS64; n++) {
		i = myrand64();
		while (i) {
			res ^= __ffs64_asm(i);
			i &= i - 1;	
		}	
	}
	return res;
}
#endif

__attribute__((noinline))
int bench_none32(void)
{
	unsigned int i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS32; n++) {
		i = myrand();
		while (i) {
			res ^= __nothing32(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_none64(void)
{
	unsigned long long i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS64; n++) {
		i = myrand64();
		while (i) {
			res ^= __nothing64(i);
			i &= i - 1;
		}	
	}
	return res;
}

void test(void)
{
	unsigned long long int i;
	int n, res1, res2;

	myrand_seed(0);

	for (n=0; n<1000; n++) {
		i = myrand64();
		while (i) {
			res1 = __ffs64_orig(i);
			res2 = __ffs64_new(i);
			if (res1 != res2) {
				printf("%llu %i %i\n", i,
						res1, res2);
			}
			i &= i - 1;
		}
	}
}

int main(void)
{
	long tics;

	setvbuf(stdout, NULL, _IONBF, 0);
	test();

	printf("%-14s", "Original:");
	tic(); bench_orig32(); tics = toc();
	printf("%6lu tics,", tics);
	tic(); bench_orig64(); tics = toc();
	printf("%6lu tics\n", tics);

	printf("%-14s", "New:");
	tic(); bench_new32(); tics = toc();
	printf("%6lu tics,", tics);
	tic(); bench_new64(); tics = toc();
	printf("%6lu tics\n", tics);

#ifdef ASSEMBLYVERSION
	printf("%-14s", "Assembly:");
	tic(); bench_asm32(); tics = toc();
	printf("%6lu tics,", tics);
	tic(); bench_asm64(); tics = toc();
	printf("%6lu tics\n", tics);
#endif

	printf("%-14s", "Empty loop:");
	tic(); bench_none32(); tics = toc();
	printf("%6lu tics,", tics);
	tic(); bench_none64(); tics = toc();
	printf("%6lu tics\n", tics);

	return 0;
}

next prev parent reply	other threads:[~2008-04-18 20:20 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-03-31 17:15 [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386 Alexander van Heukelum
2008-03-31 17:22 ` Stephen Hemminger
2008-03-31 19:38   ` Alexander van Heukelum
2008-03-31 21:58     ` Andi Kleen
2008-04-01  8:47 ` Ingo Molnar
2008-04-01  9:46   ` Alexander van Heukelum
2008-04-01 15:41     ` [PATCH] x86: switch x86_64 to generic find_first_bit Alexander van Heukelum
2008-04-01 15:42       ` [PATCH] x86: optimize find_first_bit for small bitmaps Alexander van Heukelum
2008-04-01 15:47         ` [PATCH] x86: remove x86-specific implementations of find_first_bit Alexander van Heukelum
2008-04-03  9:34           ` Alexander van Heukelum
2008-04-04  8:47           ` Ingo Molnar
2008-04-06 17:03     ` [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386 dean gaudet
2008-04-06 18:51       ` Alexander van Heukelum
2008-04-06 20:22         ` dean gaudet
2008-04-07  8:43           ` Ingo Molnar
2008-04-07 10:25           ` Alexander van Heukelum
2008-04-18 20:18             ` Alexander van Heukelum [this message]
2008-04-18 23:46               ` Alternative implementation of the generic __ffs dean gaudet
2008-04-19  0:09                 ` Harvey Harrison
2008-04-19  0:20                   ` dean gaudet
2008-04-19  0:58                     ` Joe Perches
2008-04-19  1:04                       ` Harvey Harrison
2008-04-19  1:11                         ` dean gaudet
2008-04-19  2:55                           ` Joe Perches
2008-04-19  4:13                             ` dean gaudet
2008-04-19 10:05                               ` Mikael Pettersson
2008-04-19 12:10                               ` Alexander van Heukelum
2008-04-19 18:17                                 ` Joe Perches
2008-04-19 20:26                                   ` Alexander van Heukelum
2008-04-19 22:29                             ` Matti Aarnio
2008-04-20  3:06                               ` Joe Perches
2008-04-20  8:42                                 ` Alexander van Heukelum
2008-04-20 12:31                                   ` Matti Aarnio
2008-04-21 11:43                                     ` Alexander van Heukelum

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20080418201809.GA5036@mailshack.com \
    --to=heukelum@mailshack.com \
    --cc=andi@firstfloor.org \
    --cc=dean@arctic.org \
    --cc=heukelum@fastmail.fm \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).