Re: [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)]

public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed

From: David Mosberger <davidm@napali.hpl.hp.com>
To: linux-ia64@vger.kernel.org
Subject: Re: [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)]
Date: Fri, 08 Apr 2005 23:49:11 +0000	[thread overview]
Message-ID: <16983.6263.400429.553064@napali.hpl.hp.com> (raw)
In-Reply-To: <20050408103324.6c5231df.akpm@osdl.org>

With the attached test-program, I'm seeing these results on a 1.5GHz
Madison:

                measured time number of
                per call:     bundles:

                GCC v3.4                 GCC pre-v4.1
   generic:     8.696 ns      14          7.345 ns
    womack:     9.355 ns      10         10.017 ns
  popcount:     7.351 ns       8          6.676 ns
      arch:     9.357 ns       7          8.683 ns
  ia64_fls:     8.016 ns       4          8.011 ns
popcount64:     9.357 ns       8          9.349 ns

(GCC prior to v3.4 doesn't have the __builtin_popcount() so we'd need
 to fallback on asm in that case).

Given these results, I'm inclined to continue to use the
FP-normalization for 64-bit ia64_fls() and to use the (32-bit-only)
popcount-version for fls().

I should note that there are faster ways of doing fls() on ia64 but
they require lookup-tables and given that fls() isn't called all that
frequently, those table-lookups would likely result in cache misses.
Of course, if you know of a place in the kernel where fls() is used
very frequently, do let me know.

	--david

/*
 * Copyright (C) 2005 Grant Grundler (grundler at parisc-linux.org)
 *
 *  Copies code from:
 *  Copyright (C) 1998-2003 Hewlett-Packard Co
 *		David Mosberger-Tang <davidm@hpl.hp.com>
 *
 * testfls.c may be redistributed under GPL 2.0.
 */
#include <stdlib.h>
#include <stdio.h>

#include <sys/time.h>

#ifdef __INTEL_COMPILER
# include <ia64intrin.h>
# define ia64_getf_exp(x)	__getf_exp(x)
# define popcount(x)		_m64_popcnt(x)
# define popcount64(x)		_m64_popcnt(x)
#else
# define ia64_getf_exp(x)                                       \
({                                                              \
        long ia64_intri_res;                                    \
                                                                \
        asm ("getf.exp %0=%1" : "=r"(ia64_intri_res) : "f"(x)); \
                                                                \
        ia64_intri_res;                                         \
})
# define ia64_popcnt(x)						\
({								\
	unsigned long ia64_intri_res;				\
	asm ("popcnt %0=%1" : "=r" (ia64_intri_res) : "r" (x));	\
								\
	ia64_intri_res;						\
})
# define popcount(x)		__builtin_popcount(x)
# define popcount64(x)		ia64_popcnt(x)
#endif

static inline unsigned long
ia64_fls (unsigned long x)
{
        long double d = x;
        long exp;

        exp = ia64_getf_exp(d);
        return exp - 0xffff;
}

static inline int
arch_fls (int x)
{
	if (!x)
		return 0;
	return ia64_fls((unsigned int) x) + 1;
}

/*
 * fls: find last bit set.
 */
static __inline__ int generic_fls(int x)
{
	int r = 32;

	if (!x)
		return 0;
	if (!(x & 0xffff0000u)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xff000000u)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xf0000000u)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xc0000000u)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x80000000u)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

static __inline__ int womack_fls(int x)
{
	unsigned int t = x;
	int r = 1;

	if (!t)
		return 0;
	if (t & 0xffff0000u) {
		t >>= 16;
		r += 16;
	}
	if (t & 0xff00) {
		t >>= 8;
		r += 8;
	}
	if (t & 0xf0u) {
		t >>= 4;
		r += 4;
	}
	/*
	 * This is based on an article by Paul Womack posted to
	 * comp.arch on May 15, 2000.
	 *
	 * Last 4 bits are handled by a lookup table.  Each
	 * table-entry occupies only 2 bits, so we can encode the
	 * table in 16*2 = 32 bits.  The table value is constructed
	 * like this:
	 *
	 * 0000 = don't care (not possible)
	 * 0001 = 1
	 * 0010 = 2
	 * 0011 = 2
	 * 0100 = 3
	 * 0101 = 3
	 * 0110 = 3
	 * 0111 = 3
	 * 1000 = 4
	 * 1001 = 4
	 * 1010 = 4
	 * 1011 = 4
	 * 1100 = 4
	 * 1101 = 4
	 * 1110 = 4
	 * 1111 = 4
	 *
	 * to make it fit in 2 bits, we subtract 1 before encoding, so
	 * in binary:
	 *
	 *  = 0b11111111111111111010101001010000
	 *       3 3 3 3 3 3 3 3 2 2 2 2 1 1 0 0
	 * hex (thanks, bc!)
	 *  = 0xffffaa50
	 */
	return r + ((0xffffaa50u >> (2 * t)) & 0x3);
}

static __inline__ int
popcount_fls(int t)
{
	unsigned long x = t & 0xffffffffu;

	if (!x)
		return 0;
	x |= x >> 1;
	x |= x >> 2;
	x |= x >> 4;
	x |= x >> 8;
	x |= x >> 16;
	return popcount(x);
}

static __inline__ unsigned long
popcount_fls64(unsigned long x)
{
	x |= x >> 1;
	x |= x >> 2;
	x |= x >> 4;
	x |= x >> 8;
	x |= x >> 16;
	x |= x >> 32;
	return popcount64(x) - 1;
}

int testval[] = { -2, -1, 0, 6, 10 };

static inline void
check (int val)
{
	if (arch_fls(val) != generic_fls(val))
		printf ("arch_fls(%08x) %6d generic_fls(%08x) %6d\n",
			val, arch_fls(val), val, generic_fls(val));
	if (womack_fls(val) != generic_fls(val))
		printf ("arch_fls(%08x) %6d generic_fls(%08x) %6d\n",
			val, womack_fls(val), val, generic_fls(val));
	if (popcount_fls(val) != generic_fls(val))
		printf ("popcount_fls(%08x) %6d generic_fls(%08x) %6d\n",
			val, popcount_fls(val), val, generic_fls(val));
}

static inline void
check64 (unsigned long val)
{
	if (!val)
		return;

	if (popcount_fls64(val) != ia64_fls(val))
		printf ("popcount_fls64(%016lx) %6ld ia64_fls(%016lx) %6ld\n",
			val, popcount_fls64(val), val, ia64_fls(val));
}

static int
empty (int x)
{
	return 0;
}

static double
measure (const char *label, int (*func)(int), double offset)
{
	struct timeval start, stop;
	long i, count = 1000000;
	double t;

	while (1) {
		gettimeofday(&start, NULL);
		for (i = 0; i < count; ++i)
			(*func)(i | (i << 16));
		gettimeofday(&stop, NULL);

		t = ((stop.tv_sec + 1e-6*stop.tv_usec)
		     - (start.tv_sec + 1e-6*start.tv_usec));

		if (t >= 1.0)
			break;

		if (t <= 0.0)
			t = 1e-3;

		count = (1.1 / t) * count;
	}
	t = (t / count) * 1e9 - offset;

	printf ("%10s: %9.3f ns\n", label, t);
	return t;
}

static double
measure64 (const char *label, unsigned long (*func)(unsigned long),
	   double offset)
{
	struct timeval start, stop;
	long i, count = 1000000;
	double t;

	while (1) {
		gettimeofday(&start, NULL);
		for (i = 0; i < count; ++i)
			(*func)(i | (i << 16));
		gettimeofday(&stop, NULL);

		t = ((stop.tv_sec + 1e-6*stop.tv_usec)
		     - (start.tv_sec + 1e-6*start.tv_usec));

		if (t >= 1.0)
			break;

		if (t <= 0.0)
			t = 1e-3;

		count = (1.1 / t) * count;
	}
	t = (t / count) * 1e9 - offset;

	printf ("%10s: %9.3f ns\n", label, t);
	return t;
}

int
main (int argc, char **argv)
{
	double overhead;
	int j;

	for (j = 0; j < sizeof(testval)/sizeof(int); j++) {
		check(testval[j]);
		check64(testval[j]);
	}
	for (j = 0; j < 32; ++j) {
		check(1 << j);
		check((1 << j) - 1);
		check((1 << j) + 1);
	}
	for (j = 0; j < 64; ++j) {
		check(1UL << j);
		check((1UL << j) - 1);
		check((1UL << j) + 1);
	}

	printf ("done with correctness test\n");

	overhead = measure("overhead", empty, 0.0);
	measure("generic", generic_fls, overhead);
	measure("womack", womack_fls, overhead);
	measure("popcount", popcount_fls, overhead);
	measure("arch", arch_fls, overhead);
	measure64("ia64_fls", ia64_fls, overhead);
	measure64("popcount64", popcount_fls64, overhead);
	return 0;
}

next prev parent reply	other threads:[~2005-04-08 23:49 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-04-08 17:33 [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random Andrew Morton
2005-04-08 17:48 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)] Matt Mackall
2005-04-08 18:02 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random Andrew Morton
2005-04-08 19:05 ` David Mosberger
2005-04-08 20:46 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)] David Mosberger
2005-04-08 22:49 ` Matt Mackall
2005-04-08 23:01 ` David Mosberger
2005-04-08 23:02 ` Luck, Tony
2005-04-08 23:15 ` Matt Mackall
2005-04-08 23:17 ` Matt Mackall
2005-04-08 23:19 ` David Mosberger
2005-04-08 23:49 ` David Mosberger [this message]
2005-04-09  0:21 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random David Mosberger
2005-04-09  0:28 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)] Matt Mackall
2005-04-09  0:34 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random Arthur Kepner
2005-04-09  0:46 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)] Grant Grundler
2005-04-09  3:00 ` Grant Grundler
2005-04-09  4:05 ` David Mosberger
2005-04-09  4:32 ` David Mosberger
2005-04-09  5:09 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random David Mosberger
2005-04-09  5:16 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)] Grant Grundler
2005-04-09  6:00 ` David Mosberger
2005-04-22  3:50 ` [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random Andrew Morton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=16983.6263.400429.553064@napali.hpl.hp.com \
    --to=davidm@napali.hpl.hp.com \
    --cc=linux-ia64@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox