git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/5] compat/bswap.h: Fix build on cygwin, MinGW and msvc
@ 2013-11-07 21:59 Ramsay Jones
  2013-11-08  0:45 ` SZEDER Gábor
  0 siblings, 1 reply; 3+ messages in thread
From: Ramsay Jones @ 2013-11-07 21:59 UTC (permalink / raw)
  To: Jeff King
  Cc: Vicent Marti, Junio C Hamano, Torsten Bögershausen,
	GIT Mailing-list


Signed-off-by: Ramsay Jones <ramsay@ramsay1.demon.co.uk>
---
 compat/bswap.h | 97 ++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 68 insertions(+), 29 deletions(-)

diff --git a/compat/bswap.h b/compat/bswap.h
index ea1a9ed..c18a78e 100644
--- a/compat/bswap.h
+++ b/compat/bswap.h
@@ -17,7 +17,20 @@ static inline uint32_t default_swab32(uint32_t val)
 		((val & 0x000000ff) << 24));
 }
 
+static inline uint64_t default_bswap64(uint64_t val)
+{
+	return (((val & (uint64_t)0x00000000000000ffULL) << 56) |
+		((val & (uint64_t)0x000000000000ff00ULL) << 40) |
+		((val & (uint64_t)0x0000000000ff0000ULL) << 24) |
+		((val & (uint64_t)0x00000000ff000000ULL) <<  8) |
+		((val & (uint64_t)0x000000ff00000000ULL) >>  8) |
+		((val & (uint64_t)0x0000ff0000000000ULL) >> 24) |
+		((val & (uint64_t)0x00ff000000000000ULL) >> 40) |
+		((val & (uint64_t)0xff00000000000000ULL) >> 56));
+}
+
 #undef bswap32
+#undef bswap64
 
 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 
@@ -32,54 +45,80 @@ static inline uint32_t git_bswap32(uint32_t x)
 	return result;
 }
 
+#define bswap64 git_bswap64
+#if defined(__x86_64__)
+static inline uint64_t git_bswap64(uint64_t x)
+{
+	uint64_t result;
+	if (__builtin_constant_p(x))
+		result = default_bswap64(x);
+	else
+		__asm__("bswap %q0" : "=r" (result) : "0" (x));
+	return result;
+}
+#else
+static inline uint64_t git_bswap64(uint64_t x)
+{
+	union { uint64_t i64; uint32_t i32[2]; } tmp, result;
+	if (__builtin_constant_p(x))
+		result.i64 = default_bswap64(x);
+	else {
+		tmp.i64 = x;
+		result.i32[0] = git_bswap32(tmp.i32[1]);
+		result.i32[1] = git_bswap32(tmp.i32[0]);
+	}
+	return result.i64;
+}
+#endif
+
 #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
 
 #include <stdlib.h>
 
 #define bswap32(x) _byteswap_ulong(x)
+#define bswap64(x) _byteswap_uint64(x)
 
 #endif
 
-#ifdef bswap32
+#if defined(bswap32)
 
 #undef ntohl
 #undef htonl
 #define ntohl(x) bswap32(x)
 #define htonl(x) bswap32(x)
 
-#ifndef __BYTE_ORDER
-#	if defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN)
-#		define __BYTE_ORDER BYTE_ORDER
-#		define __LITTLE_ENDIAN LITTLE_ENDIAN
-#		define __BIG_ENDIAN BIG_ENDIAN
-#	else
-#		error "Cannot determine endianness"
-#	endif
+#endif
+
+#if defined(bswap64)
+
+#undef ntohll
+#undef htonll
+#define ntohll(x) bswap64(x)
+#define htonll(x) bswap64(x)
+
+#else
+
+#undef ntohll
+#undef htonll
+
+#if !defined(__BYTE_ORDER)
+# if defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN)
+#  define __BYTE_ORDER BYTE_ORDER
+#  define __LITTLE_ENDIAN LITTLE_ENDIAN
+#  define __BIG_ENDIAN BIG_ENDIAN
+# endif
+#endif
+
+#if !defined(__BYTE_ORDER)
+# error "Cannot determine endianness"
 #endif
 
 #if __BYTE_ORDER == __BIG_ENDIAN
 # define ntohll(n) (n)
 # define htonll(n) (n)
-#elif __BYTE_ORDER == __LITTLE_ENDIAN
-#	if defined(__GNUC__) && defined(__GLIBC__)
-#		include <byteswap.h>
-#	else /* GNUC & GLIBC */
-static inline uint64_t bswap_64(uint64_t val)
-{
-	return ((val & (uint64_t)0x00000000000000ffULL) << 56)
-		| ((val & (uint64_t)0x000000000000ff00ULL) << 40)
-		| ((val & (uint64_t)0x0000000000ff0000ULL) << 24)
-		| ((val & (uint64_t)0x00000000ff000000ULL) <<  8)
-		| ((val & (uint64_t)0x000000ff00000000ULL) >>  8)
-		| ((val & (uint64_t)0x0000ff0000000000ULL) >> 24)
-		| ((val & (uint64_t)0x00ff000000000000ULL) >> 40)
-		| ((val & (uint64_t)0xff00000000000000ULL) >> 56);
-}
-#	endif /* GNUC & GLIBC */
-#	define ntohll(n) bswap_64(n)
-#	define htonll(n) bswap_64(n)
-#else /* __BYTE_ORDER */
-#	error "Can't define htonll or ntohll!"
+#else
+# define ntohll(n) default_bswap64(n)
+# define htonll(n) default_bswap64(n)
 #endif
 
 #endif
-- 
1.8.4

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH 1/5] compat/bswap.h: Fix build on cygwin, MinGW and msvc
  2013-11-07 21:59 [PATCH 1/5] compat/bswap.h: Fix build on cygwin, MinGW and msvc Ramsay Jones
@ 2013-11-08  0:45 ` SZEDER Gábor
  2013-11-08 22:27   ` Jeff King
  0 siblings, 1 reply; 3+ messages in thread
From: SZEDER Gábor @ 2013-11-08  0:45 UTC (permalink / raw)
  To: Ramsay Jones
  Cc: Jeff King, Vicent Marti, Junio C Hamano,
	Torsten Bögershausen, GIT Mailing-list

Hi,

On Thu, Nov 07, 2013 at 09:59:38PM +0000, Ramsay Jones wrote:
> +static inline uint64_t default_bswap64(uint64_t val)
> +{
> +	return (((val & (uint64_t)0x00000000000000ffULL) << 56) |
> +		((val & (uint64_t)0x000000000000ff00ULL) << 40) |
> +		((val & (uint64_t)0x0000000000ff0000ULL) << 24) |
> +		((val & (uint64_t)0x00000000ff000000ULL) <<  8) |
> +		((val & (uint64_t)0x000000ff00000000ULL) >>  8) |
> +		((val & (uint64_t)0x0000ff0000000000ULL) >> 24) |
> +		((val & (uint64_t)0x00ff000000000000ULL) >> 40) |
> +		((val & (uint64_t)0xff00000000000000ULL) >> 56));
> +}

This got me thinking.
To swap 8 bytes this function performs 8 bitwise shifts, 8 bitwise
ANDs and 7 bitwise ORs plus uses 8 64bit constants.  We could do
better than that:

static inline uint64_t hacked_bswap64(uint64_t val)
{
	uint64_t tmp = val << 32 | val >> 32;
	return (((tmp & (uint64_t)0xff000000ff000000ULL) >> 24) |
		((tmp & (uint64_t)0x00ff000000ff0000ULL) >>  8) |
		((tmp & (uint64_t)0x0000ff000000ff00ULL) <<  8) |
		((tmp & (uint64_t)0x000000ff000000ffULL) << 24));
}

This performs only 6 shifts, 4 ANDs, 4 ORs and uses 4 64bit constants.

bswap64ing 1000000000 64bit ints with default_bswap64() compiled
with -O2 takes:

  real    0m1.808s
  user    0m1.796s
  sys     0m0.000s

The same with hacked_bswap64():

  real    0m0.823s
  user    0m0.816s
  sys     0m0.000s

I doubt that in normal usage git would spend enough time bswap64ing to
make this noticeable, but it was a fun micro-optimization on a wet
Thursday evening nevertheless :)

Best,
Gábor

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH 1/5] compat/bswap.h: Fix build on cygwin, MinGW and msvc
  2013-11-08  0:45 ` SZEDER Gábor
@ 2013-11-08 22:27   ` Jeff King
  0 siblings, 0 replies; 3+ messages in thread
From: Jeff King @ 2013-11-08 22:27 UTC (permalink / raw)
  To: SZEDER Gábor
  Cc: Ramsay Jones, Vicent Marti, Junio C Hamano,
	Torsten Bögershausen, GIT Mailing-list

On Fri, Nov 08, 2013 at 01:45:50AM +0100, SZEDER Gábor wrote:

> Hi,
> 
> On Thu, Nov 07, 2013 at 09:59:38PM +0000, Ramsay Jones wrote:
> > +static inline uint64_t default_bswap64(uint64_t val)
> > +{
> > +	return (((val & (uint64_t)0x00000000000000ffULL) << 56) |
> > +		((val & (uint64_t)0x000000000000ff00ULL) << 40) |
> > +		((val & (uint64_t)0x0000000000ff0000ULL) << 24) |
> > +		((val & (uint64_t)0x00000000ff000000ULL) <<  8) |
> > +		((val & (uint64_t)0x000000ff00000000ULL) >>  8) |
> > +		((val & (uint64_t)0x0000ff0000000000ULL) >> 24) |
> > +		((val & (uint64_t)0x00ff000000000000ULL) >> 40) |
> > +		((val & (uint64_t)0xff00000000000000ULL) >> 56));
> > +}
> 
> This got me thinking.
> To swap 8 bytes this function performs 8 bitwise shifts, 8 bitwise
> ANDs and 7 bitwise ORs plus uses 8 64bit constants.  We could do
> better than that:
> 
> static inline uint64_t hacked_bswap64(uint64_t val)
> {
> 	uint64_t tmp = val << 32 | val >> 32;
> 	return (((tmp & (uint64_t)0xff000000ff000000ULL) >> 24) |
> 		((tmp & (uint64_t)0x00ff000000ff0000ULL) >>  8) |
> 		((tmp & (uint64_t)0x0000ff000000ff00ULL) <<  8) |
> 		((tmp & (uint64_t)0x000000ff000000ffULL) << 24));
> }
> 
> This performs only 6 shifts, 4 ANDs, 4 ORs and uses 4 64bit constants.

You can do better still by using the bswap instruction. :)

I tried timing the program below.

With -O0, your version is actually _slower_ than the naive version (14s
versus 13s), and the bswap version is better than either (8s). Oddly, in
an earlier iteration of my test, your version was faster than naive,
with bswap faster still (10s, 8s, 5s, respectively). In that version, I
was not assigning the result of the bswap anywhere.

So I think the timing is highly dependent on the code surrounding the
call, but of course the asm instruction seems to always perform better.

If I turn on -O2, all three take about 1.2s.  Inspecting the generated
assembly, it's because gcc converts the raw-code cases into a bswap
instruction.  We end up with something like:

  .L3:
          subq $1, %rdx
          bswap %rax
          jne .L3

And according to perf, we spend all of our time on the jump. The bswap
hardly registers.

> I doubt that in normal usage git would spend enough time bswap64ing to
> make this noticeable, but it was a fun micro-optimization on a wet
> Thursday evening nevertheless :)

We do a fair number, but it's dwarfed by the real work. And from the
numbers above, I think our best bet is to use the system bswap if it's
there, and then not worry too hard about micro-optimizing the rest.

-Peff

-- >8 --
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#ifdef NAIVE
static inline uint64_t test(uint64_t val)
{
        return ((val & (uint64_t)0x00000000000000ffULL) << 56)
                | ((val & (uint64_t)0x000000000000ff00ULL) << 40)
                | ((val & (uint64_t)0x0000000000ff0000ULL) << 24)
                | ((val & (uint64_t)0x00000000ff000000ULL) <<  8)
                | ((val & (uint64_t)0x000000ff00000000ULL) >>  8)
                | ((val & (uint64_t)0x0000ff0000000000ULL) >> 24)
                | ((val & (uint64_t)0x00ff000000000000ULL) >> 40)
                | ((val & (uint64_t)0xff00000000000000ULL) >> 56);
}

#elif defined(OPTIM)
static inline uint64_t test(uint64_t val)
{
        uint64_t tmp = val << 32 | val >> 32;
	return (((tmp & (uint64_t)0xff000000ff000000ULL) >> 24) |
                ((tmp & (uint64_t)0x00ff000000ff0000ULL) >>  8) |
                ((tmp & (uint64_t)0x0000ff000000ff00ULL) <<  8) |
                ((tmp & (uint64_t)0x000000ff000000ffULL) << 24));
}

#elif defined(ASM)
static inline uint64_t test(uint64_t val)
{
	register uint64_t v, x = val;
	__asm__("bswap %q0" : "=r" (v) : "0" (x));
	return v;
}
#endif

int main(int argc, char **argv)
{
	unsigned long i;
        uint64_t n = strtoull(argv[1], NULL, 10);

	for (i = 0; i < 1000000000; i++)
		n = test(n);
	/* convince gcc that we really want the output value, so
	 * it won't optimize out the whole program */
        printf("%d\n", (int)n);
	return 0;
}

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2013-11-08 22:27 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-11-07 21:59 [PATCH 1/5] compat/bswap.h: Fix build on cygwin, MinGW and msvc Ramsay Jones
2013-11-08  0:45 ` SZEDER Gábor
2013-11-08 22:27   ` Jeff King

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).