From: David Mosberger <davidm@napali.hpl.hp.com>
To: linux-ia64@vger.kernel.org
Subject: Re: [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)]
Date: Fri, 08 Apr 2005 23:49:11 +0000 [thread overview]
Message-ID: <16983.6263.400429.553064@napali.hpl.hp.com> (raw)
In-Reply-To: <20050408103324.6c5231df.akpm@osdl.org>
With the attached test-program, I'm seeing these results on a 1.5GHz
Madison:
measured time number of
per call: bundles:
GCC v3.4 GCC pre-v4.1
generic: 8.696 ns 14 7.345 ns
womack: 9.355 ns 10 10.017 ns
popcount: 7.351 ns 8 6.676 ns
arch: 9.357 ns 7 8.683 ns
ia64_fls: 8.016 ns 4 8.011 ns
popcount64: 9.357 ns 8 9.349 ns
(GCC prior to v3.4 doesn't have the __builtin_popcount() so we'd need
to fallback on asm in that case).
Given these results, I'm inclined to continue to use the
FP-normalization for 64-bit ia64_fls() and to use the (32-bit-only)
popcount-version for fls().
I should note that there are faster ways of doing fls() on ia64 but
they require lookup-tables and given that fls() isn't called all that
frequently, those table-lookups would likely result in cache misses.
Of course, if you know of a place in the kernel where fls() is used
very frequently, do let me know.
--david
/*
* Copyright (C) 2005 Grant Grundler (grundler at parisc-linux.org)
*
* Copies code from:
* Copyright (C) 1998-2003 Hewlett-Packard Co
* David Mosberger-Tang <davidm@hpl.hp.com>
*
* testfls.c may be redistributed under GPL 2.0.
*/
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#ifdef __INTEL_COMPILER
# include <ia64intrin.h>
# define ia64_getf_exp(x) __getf_exp(x)
# define popcount(x) _m64_popcnt(x)
# define popcount64(x) _m64_popcnt(x)
#else
# define ia64_getf_exp(x) \
({ \
long ia64_intri_res; \
\
asm ("getf.exp %0=%1" : "=r"(ia64_intri_res) : "f"(x)); \
\
ia64_intri_res; \
})
# define ia64_popcnt(x) \
({ \
unsigned long ia64_intri_res; \
asm ("popcnt %0=%1" : "=r" (ia64_intri_res) : "r" (x)); \
\
ia64_intri_res; \
})
# define popcount(x) __builtin_popcount(x)
# define popcount64(x) ia64_popcnt(x)
#endif
static inline unsigned long
ia64_fls (unsigned long x)
{
long double d = x;
long exp;
exp = ia64_getf_exp(d);
return exp - 0xffff;
}
static inline int
arch_fls (int x)
{
if (!x)
return 0;
return ia64_fls((unsigned int) x) + 1;
}
/*
* fls: find last bit set.
*/
static __inline__ int generic_fls(int x)
{
int r = 32;
if (!x)
return 0;
if (!(x & 0xffff0000u)) {
x <<= 16;
r -= 16;
}
if (!(x & 0xff000000u)) {
x <<= 8;
r -= 8;
}
if (!(x & 0xf0000000u)) {
x <<= 4;
r -= 4;
}
if (!(x & 0xc0000000u)) {
x <<= 2;
r -= 2;
}
if (!(x & 0x80000000u)) {
x <<= 1;
r -= 1;
}
return r;
}
static __inline__ int womack_fls(int x)
{
unsigned int t = x;
int r = 1;
if (!t)
return 0;
if (t & 0xffff0000u) {
t >>= 16;
r += 16;
}
if (t & 0xff00) {
t >>= 8;
r += 8;
}
if (t & 0xf0u) {
t >>= 4;
r += 4;
}
/*
* This is based on an article by Paul Womack posted to
* comp.arch on May 15, 2000.
*
* Last 4 bits are handled by a lookup table. Each
* table-entry occupies only 2 bits, so we can encode the
* table in 16*2 = 32 bits. The table value is constructed
* like this:
*
* 0000 = don't care (not possible)
* 0001 = 1
* 0010 = 2
* 0011 = 2
* 0100 = 3
* 0101 = 3
* 0110 = 3
* 0111 = 3
* 1000 = 4
* 1001 = 4
* 1010 = 4
* 1011 = 4
* 1100 = 4
* 1101 = 4
* 1110 = 4
* 1111 = 4
*
* to make it fit in 2 bits, we subtract 1 before encoding, so
* in binary:
*
* = 0b11111111111111111010101001010000
* 3 3 3 3 3 3 3 3 2 2 2 2 1 1 0 0
* hex (thanks, bc!)
* = 0xffffaa50
*/
return r + ((0xffffaa50u >> (2 * t)) & 0x3);
}
static __inline__ int
popcount_fls(int t)
{
unsigned long x = t & 0xffffffffu;
if (!x)
return 0;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return popcount(x);
}
static __inline__ unsigned long
popcount_fls64(unsigned long x)
{
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
x |= x >> 32;
return popcount64(x) - 1;
}
int testval[] = { -2, -1, 0, 6, 10 };
static inline void
check (int val)
{
if (arch_fls(val) != generic_fls(val))
printf ("arch_fls(%08x) %6d generic_fls(%08x) %6d\n",
val, arch_fls(val), val, generic_fls(val));
if (womack_fls(val) != generic_fls(val))
printf ("arch_fls(%08x) %6d generic_fls(%08x) %6d\n",
val, womack_fls(val), val, generic_fls(val));
if (popcount_fls(val) != generic_fls(val))
printf ("popcount_fls(%08x) %6d generic_fls(%08x) %6d\n",
val, popcount_fls(val), val, generic_fls(val));
}
static inline void
check64 (unsigned long val)
{
if (!val)
return;
if (popcount_fls64(val) != ia64_fls(val))
printf ("popcount_fls64(%016lx) %6ld ia64_fls(%016lx) %6ld\n",
val, popcount_fls64(val), val, ia64_fls(val));
}
static int
empty (int x)
{
return 0;
}
static double
measure (const char *label, int (*func)(int), double offset)
{
struct timeval start, stop;
long i, count = 1000000;
double t;
while (1) {
gettimeofday(&start, NULL);
for (i = 0; i < count; ++i)
(*func)(i | (i << 16));
gettimeofday(&stop, NULL);
t = ((stop.tv_sec + 1e-6*stop.tv_usec)
- (start.tv_sec + 1e-6*start.tv_usec));
if (t >= 1.0)
break;
if (t <= 0.0)
t = 1e-3;
count = (1.1 / t) * count;
}
t = (t / count) * 1e9 - offset;
printf ("%10s: %9.3f ns\n", label, t);
return t;
}
static double
measure64 (const char *label, unsigned long (*func)(unsigned long),
double offset)
{
struct timeval start, stop;
long i, count = 1000000;
double t;
while (1) {
gettimeofday(&start, NULL);
for (i = 0; i < count; ++i)
(*func)(i | (i << 16));
gettimeofday(&stop, NULL);
t = ((stop.tv_sec + 1e-6*stop.tv_usec)
- (start.tv_sec + 1e-6*start.tv_usec));
if (t >= 1.0)
break;
if (t <= 0.0)
t = 1e-3;
count = (1.1 / t) * count;
}
t = (t / count) * 1e9 - offset;
printf ("%10s: %9.3f ns\n", label, t);
return t;
}
int
main (int argc, char **argv)
{
double overhead;
int j;
for (j = 0; j < sizeof(testval)/sizeof(int); j++) {
check(testval[j]);
check64(testval[j]);
}
for (j = 0; j < 32; ++j) {
check(1 << j);
check((1 << j) - 1);
check((1 << j) + 1);
}
for (j = 0; j < 64; ++j) {
check(1UL << j);
check((1UL << j) - 1);
check((1UL << j) + 1);
}
printf ("done with correctness test\n");
overhead = measure("overhead", empty, 0.0);
measure("generic", generic_fls, overhead);
measure("womack", womack_fls, overhead);
measure("popcount", popcount_fls, overhead);
measure("arch", arch_fls, overhead);
measure64("ia64_fls", ia64_fls, overhead);
measure64("popcount64", popcount_fls64, overhead);
return 0;
}
next prev parent reply other threads:[~2005-04-08 23:49 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-04-08 17:33 [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random Andrew Morton
2005-04-08 17:48 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)] Matt Mackall
2005-04-08 18:02 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random Andrew Morton
2005-04-08 19:05 ` David Mosberger
2005-04-08 20:46 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)] David Mosberger
2005-04-08 22:49 ` Matt Mackall
2005-04-08 23:01 ` David Mosberger
2005-04-08 23:02 ` Luck, Tony
2005-04-08 23:15 ` Matt Mackall
2005-04-08 23:17 ` Matt Mackall
2005-04-08 23:19 ` David Mosberger
2005-04-08 23:49 ` David Mosberger [this message]
2005-04-09 0:21 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random David Mosberger
2005-04-09 0:28 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)] Matt Mackall
2005-04-09 0:34 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random Arthur Kepner
2005-04-09 0:46 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)] Grant Grundler
2005-04-09 3:00 ` Grant Grundler
2005-04-09 4:05 ` David Mosberger
2005-04-09 4:32 ` David Mosberger
2005-04-09 5:09 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random David Mosberger
2005-04-09 5:16 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)] Grant Grundler
2005-04-09 6:00 ` David Mosberger
2005-04-22 3:50 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random Andrew Morton
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=16983.6263.400429.553064@napali.hpl.hp.com \
--to=davidm@napali.hpl.hp.com \
--cc=linux-ia64@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox