* [PATCH 1/1] compat: modernize and simplify byte swapping functions
2026-01-02 0:27 [PATCH 0/1] compat: modernize and simplify byte swapping functions Rostislav Krasny
@ 2026-01-02 0:27 ` Rostislav Krasny
2026-01-02 6:16 ` Jeff King
2026-01-02 7:29 ` [PATCH 0/1] " Jeff King
1 sibling, 1 reply; 7+ messages in thread
From: Rostislav Krasny @ 2026-01-02 0:27 UTC (permalink / raw)
To: git; +Cc: Rostislav Krasny, Junio C Hamano, Sebastian Andrzej Siewior,
Jeff King
Replace manual bit operations with memcpy + network functions for better
maintainability. Add missing 16-bit network byte order conversion.
Key improvements:
- Simplify the get_be*() and put_be*() functions
- Add bswap16() macro with automatic compiler optimization:
* GCC/Clang: __builtin_bswap16() intrinsic
* MSVC: _byteswap_ushort() intrinsic
- Add default_bswap16() static inline function with manual fallback implementation
- Rename default_swab32() to default_bswap32() for naming consistency
- Add ntohs() and htons() macros for complete 16/32/64-bit network conversion
- Add put_be16() function for API completeness alongside existing put_be*() and
get_be*() functions
- Performance improvements (GCC 15.2.1, Clang 21.1.7):
* on x86-64 with -O0 4.2x faster (GCC), 3.7x faster (Clang)
* on x86-64 with -O1 4x faster (GCC), identical (Clang)
* on x86-64 with -O2 identical (GCC), 1.8x faster (Clang)
Signed-off-by: Rostislav Krasny <rostiprodev@gmail.com>
---
compat/bswap.h | 74 ++++++++++++++++++++++++++++++--------------------
1 file changed, 44 insertions(+), 30 deletions(-)
diff --git a/compat/bswap.h b/compat/bswap.h
index 28635ebc69..f9954ef090 100644
--- a/compat/bswap.h
+++ b/compat/bswap.h
@@ -9,10 +9,15 @@
*/
/*
- * Default version that the compiler ought to optimize properly with
+ * Default versions that the compiler ought to optimize properly with
* constant values.
*/
-static inline uint32_t default_swab32(uint32_t val)
+static inline uint16_t default_bswap16(uint16_t val)
+{
+ return ((val & 0xff00) >> 8) | ((val & 0x00ff) << 8);
+}
+
+static inline uint32_t default_bswap32(uint32_t val)
{
return (((val & 0xff000000) >> 24) |
((val & 0x00ff0000) >> 8) |
@@ -40,6 +45,7 @@ static inline uint64_t default_bswap64(uint64_t val)
# define __has_builtin(x) 0
#endif
+#undef bswap16
#undef bswap32
#undef bswap64
@@ -47,6 +53,7 @@ static inline uint64_t default_bswap64(uint64_t val)
#include <stdlib.h>
+#define bswap16(x) _byteswap_ushort(x)
#define bswap32(x) _byteswap_ulong(x)
#define bswap64(x) _byteswap_uint64(x)
@@ -54,8 +61,9 @@ static inline uint64_t default_bswap64(uint64_t val)
#define GIT_BIG_ENDIAN 4321
#define GIT_BYTE_ORDER GIT_LITTLE_ENDIAN
-#elif __has_builtin(__builtin_bswap32) && __has_builtin(__builtin_bswap64)
+#elif __has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) && __has_builtin(__builtin_bswap64)
+#define bswap16(x) __builtin_bswap16((x))
#define bswap32(x) __builtin_bswap32((x))
#define bswap64(x) __builtin_bswap64((x))
@@ -98,24 +106,36 @@ static inline uint64_t default_bswap64(uint64_t val)
#endif
+#undef ntohs
+#undef htons
#undef ntohl
#undef htonl
#undef ntohll
#undef htonll
#if GIT_BYTE_ORDER == GIT_BIG_ENDIAN
+# define ntohs(x) (x)
+# define htons(x) (x)
# define ntohl(x) (x)
# define htonl(x) (x)
# define ntohll(x) (x)
# define htonll(x) (x)
#else
+# if defined(bswap16)
+# define ntohs(x) bswap16(x)
+# define htons(x) bswap16(x)
+# else
+# define ntohs(x) default_bswap16(x)
+# define htons(x) default_bswap16(x)
+# endif
+
# if defined(bswap32)
# define ntohl(x) bswap32(x)
# define htonl(x) bswap32(x)
# else
-# define ntohl(x) default_swab32(x)
-# define htonl(x) default_swab32(x)
+# define ntohl(x) default_bswap32(x)
+# define htonl(x) default_bswap32(x)
# endif
# if defined(bswap64)
@@ -129,47 +149,41 @@ static inline uint64_t default_bswap64(uint64_t val)
static inline uint16_t get_be16(const void *ptr)
{
- const unsigned char *p = ptr;
- return (uint16_t)p[0] << 8 |
- (uint16_t)p[1] << 0;
+ uint16_t n;
+ memcpy(&n, ptr, sizeof n);
+ return ntohs(n);
}
static inline uint32_t get_be32(const void *ptr)
{
- const unsigned char *p = ptr;
- return (uint32_t)p[0] << 24 |
- (uint32_t)p[1] << 16 |
- (uint32_t)p[2] << 8 |
- (uint32_t)p[3] << 0;
+ uint32_t n;
+ memcpy(&n, ptr, sizeof n);
+ return ntohl(n);
}
static inline uint64_t get_be64(const void *ptr)
{
- const unsigned char *p = ptr;
- return (uint64_t)get_be32(&p[0]) << 32 |
- (uint64_t)get_be32(&p[4]) << 0;
+ uint64_t n;
+ memcpy(&n, ptr, sizeof n);
+ return ntohll(n);
+}
+
+static inline void put_be16(void *ptr, uint16_t value)
+{
+ uint16_t n = htons(value);
+ memcpy(ptr, &n, sizeof n);
}
static inline void put_be32(void *ptr, uint32_t value)
{
- unsigned char *p = ptr;
- p[0] = (value >> 24) & 0xff;
- p[1] = (value >> 16) & 0xff;
- p[2] = (value >> 8) & 0xff;
- p[3] = (value >> 0) & 0xff;
+ uint32_t n = htonl(value);
+ memcpy(ptr, &n, sizeof n);
}
static inline void put_be64(void *ptr, uint64_t value)
{
- unsigned char *p = ptr;
- p[0] = (value >> 56) & 0xff;
- p[1] = (value >> 48) & 0xff;
- p[2] = (value >> 40) & 0xff;
- p[3] = (value >> 32) & 0xff;
- p[4] = (value >> 24) & 0xff;
- p[5] = (value >> 16) & 0xff;
- p[6] = (value >> 8) & 0xff;
- p[7] = (value >> 0) & 0xff;
+ uint64_t n = htonll(value);
+ memcpy(ptr, &n, sizeof n);
}
#endif /* COMPAT_BSWAP_H */
--
2.52.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* Re: [PATCH 0/1] compat: modernize and simplify byte swapping functions
2026-01-02 0:27 [PATCH 0/1] compat: modernize and simplify byte swapping functions Rostislav Krasny
2026-01-02 0:27 ` [PATCH 1/1] " Rostislav Krasny
@ 2026-01-02 7:29 ` Jeff King
1 sibling, 0 replies; 7+ messages in thread
From: Jeff King @ 2026-01-02 7:29 UTC (permalink / raw)
To: Rostislav Krasny; +Cc: git
On Fri, Jan 02, 2026 at 02:27:34AM +0200, Rostislav Krasny wrote:
> The main reason it was implemented in such a complicated way is the UB that
> arises when a pointer to one object type is converted into a pointer to a
> different object type. memcpy, on the other hand, is not subject to that UB,
> which allows us to make the code simpler and, in some cases, even faster.
We actually used to do casts and unaligned loads on platforms that
allowed it (like x86). But what I measured in c578e29ba0 (bswap.h: drop
unaligned loads, 2020-09-24) indicated that it did not really help, and
the -O2-optimized long-hand code performed the same. So I'm a little
surprised that going back in that direction is beneficial. But my
numbers were all with gcc, and your improvement was seen with clang, so
let's see if we can tease out the reasons.
> #define ITERATIONS 1000000
> #define BUF_SIZE 8192
>
> int main() {
> uint8_t buffer[BUF_SIZE];
> uint64_t sum = 0;
>
> for (int i = 0; i < BUF_SIZE; i++) {
> buffer[i] = (uint8_t)i;
> }
>
> clock_t start = clock();
>
> for (int i = 0; i < ITERATIONS; i++) {
> // use a volatile pointer to force the compiler to read memory
> volatile uint8_t *p = buffer;
> for (int j = 0; j < BUF_SIZE - 8; j++) {
> sum += get_be64((const void*)(p + j));
> }
> }
OK, so this is a measure of pure be64 speeds. That's over-emphasizing
what we'd see in a real workload, but it should at least help us focus
on whether we can see any improvement.
You mentioned sha256 earlier, but it only has a single get_be32() call.
I couldn't measure any change with "test-tool sha256" before/after your
patch. For block-sha1, we'd see get_be32() calls, too, but these days
we'd almost always use the (much slower) sha1dc anyway.
It looks like you iterate byte by byte, so we do get some unaligned
calls there. Good.
I pulled this into Git itself to make it easier to test particular
builds, and to measure it with hyperfine (which introduces some extra
noise due to program startup and exit, but also gives us nicer
statistics):
diff --git a/common-main.c b/common-main.c
index 6b7ab077b0..9f19dfe68c 100644
--- a/common-main.c
+++ b/common-main.c
@@ -1,10 +1,31 @@
#include "git-compat-util.h"
#include "common-init.h"
+#define ITERATIONS 1000000
+#define BUF_SIZE 8192
+
int main(int argc, const char **argv)
{
int result;
+ uint8_t buffer[BUF_SIZE];
+ uint64_t sum = 0;
+
+ for (int i = 0; i < BUF_SIZE; i++) {
+ buffer[i] = (uint8_t)i;
+ }
+
+ for (int i = 0; i < ITERATIONS; i++) {
+ // use a volatile pointer to force the compiler to read memory
+ volatile uint8_t *p = buffer;
+ for (int j = 0; j < BUF_SIZE - 8; j++) {
+ sum += get_be64((const void*)(p + j));
+ }
+ }
+ printf("%"PRIuMAX, (uintmax_t)sum);
+ /* skip git stuff */
+ exit(0);
+
init_git(argv);
result = cmd_main(argc, argv);
> And these are the results:
>
> GCC 15.2.1
> version | -Os | -O0 | -O1 | -O2 | -O3
> ================================================================
> | 3.721806 |72.342204 |11.956021 | 3.119833 | 0.919873
> original| 3.726111 |72.326920 |11.963618 | 3.128222 | 0.921128
> | 3.719791 |72.328175 |11.949108 | 3.130956 | 0.920296
> ================================================================
> | 3.719899 |17.177719 | 3.005065 | 3.120747 | 0.920609
> new | 3.714785 |17.168950 | 3.004978 | 3.119227 | 0.918851
> | 3.716782 |17.145386 | 3.009364 | 3.119573 | 0.920030
> ================================================================
I think we can disregard the -O0 results. They're both horribly slow,
and for obvious reasons.
The -O2 results match what I saw back when I measured sha1 performance
(though I also wouldn't be surprised if get_be32() is lost in the noise
there). That was also with gcc. Testing again with the patch above, I
get similar results to you (actually a slight slowdown, but within the
statistical noise).
The -O3 results are interesting. Not because they change, but because
they outperform -O2 so handily in this case. I get similar results here.
The diff of the generated asm between o2 and o3 looks like this:
--- asm.o2 2026-01-02 01:51:35.312098163 -0500
+++ asm.o3 2026-01-02 01:51:45.744134814 -0500
@@ -19,6 +19,7 @@
movdqa .LC0(%rip), %xmm2
movd %edi, %xmm9
movl $8, %edi
+ movq %rsp, %rcx
movq %rsp, %rax
movd %edi, %xmm8
movl $12, %edi
@@ -59,24 +60,23 @@
pand %xmm5, %xmm0
packuswb %xmm0, %xmm1
movaps %xmm1, -16(%rax)
- cmpq %rdi, %rax
+ cmpq %rax, %rdi
jne .L2
- movl $1000000, %edi
+ leaq 8184(%rcx), %rdi
xorl %esi, %esi
- leaq 8184(%rsp), %rcx
.L3:
- movq %rsp, %rax
- .p2align 5
+ movq (%rcx), %rdx
+ movl $1000000, %eax
+ bswap %rdx
+ .p2align 4
.p2align 4
.p2align 3
.L4:
- movq (%rax), %rdx
- addq $1, %rax
- bswap %rdx
- addq %rdx, %rsi
- cmpq %rcx, %rax
+ leaq (%rsi,%rdx,2), %rsi
+ subl $2, %eax
jne .L4
- subl $1, %edi
+ addq $1, %rcx
+ cmpq %rcx, %rdi
jne .L3
leaq .LC6(%rip), %rdi
xorl %eax, %eax
It looks like the loops were inverted and the bswap was hoisted out of
the inner loop. ;) So that is really just an artifact of how this test
code was written (since we are summing, we do not care about iteration
order).
Oh well, a 3x speedup for -O3 was probably too good to be true anyway.
It feels like there is an easy optimization on top of that loop
inversion: rather than iterating and adding, we could just multiply by
ITERATIONS. And indeed, if we re-order the loops ourselves:
diff --git a/common-main.c b/common-main.c
index 9f19dfe68c..5960755dcc 100644
--- a/common-main.c
+++ b/common-main.c
@@ -15,11 +15,12 @@ int main(int argc, const char **argv)
buffer[i] = (uint8_t)i;
}
- for (int i = 0; i < ITERATIONS; i++) {
+ for (int j = 0; j < BUF_SIZE - 8; j++) {
// use a volatile pointer to force the compiler to read memory
volatile uint8_t *p = buffer;
- for (int j = 0; j < BUF_SIZE - 8; j++) {
- sum += get_be64((const void*)(p + j));
+ uint64_t swapped = get_be64((const void *)(p + j));
+ for (int i = 0; i < ITERATIONS; i++) {
+ sum += swapped;
}
}
printf("%"PRIuMAX, (uintmax_t)sum);
then even at -O2 the whole thing runs in ~2ms, and the generated asm
looks like this:
bswap %rax
imulq $1000000, %rax, %rax
addq %rax, %rsi
It's a little funny that -O3 can do the loop inversion but misses the
multiplication. :)
> Clang 21.1.7
> version | -Os | -O0 | -O1 | -O2 | -O3
> ================================================================
> | 3.690718 |62.916338 | 3.017460 | 3.768443 | 3.778840
> original| 3.686283 |62.965916 | 3.014674 | 3.777897 | 3.774776
> | 3.687775 |62.850648 | 3.003496 | 3.766108 | 3.765313
> ================================================================
> | 3.681818 |16.753385 | 3.008131 | 2.075271 | 2.076090
> new | 3.687184 |16.737982 | 3.004365 | 2.071597 | 2.074507
> | 3.683960 |16.765067 | 2.999775 | 2.075354 | 2.075759
> ================================================================
OK, so here we are getting into the interesting bits, I think. -O0 is
again not really that interesting. And -O3 here behaves like -O2, so
presumably it doesn't figure out the loop inversion.
But -O2 shows two interesting things. One, the existing code is much
slower with clang than gcc. And two, it gets much faster with your
patch. I was able to replicate both results.
For the first, diffing against gcc's -O2 asm might be interesting. Using
"diff" is not productive because there are too many other differences. A
cursory inspection by hand shows the inner loops pretty similar:
gcc:
movq (%rax), %rdx
addq $1, %rax
bswap %rdx
addq %rdx, %rsi
cmpq %rcx, %rax
jne .L4
clang:
movq -7(%rsp,%rcx), %rdx
bswapq %rdx
addq %rdx, %rsi
incq %rcx
cmpq $8191, %rcx # imm = 0x1FFF
jne .LBB0_4
I think the bswapq vs bswap difference is a red herring, since rdx is a
64-bit register and both should do a 64-bit swap. The loop counters are
handled a bit differently, but fundamentally we are just bswapping,
adding, and looping. I wonder why they perform so differently. But I
suspect it has more to do with the fake test looping and not get_be64()
itself.
So let's see what changes after your patch:
--- asm.old 2026-01-02 02:07:25.455631337 -0500
+++ asm.new 2026-01-02 02:17:40.702545167 -0500
@@ -87,15 +87,30 @@
.p2align 4
.LBB0_3: # =>This Loop Header: Depth=1
# Child Loop BB0_4 Depth 2
- movl $7, %ecx
+ movl $5, %ecx
.p2align 4
.LBB0_4: # Parent Loop BB0_3 Depth=1
# => This Inner Loop Header: Depth=2
- movq -7(%rsp,%rcx), %rdx
+ movq -5(%rsp,%rcx), %rdx
+ movq -4(%rsp,%rcx), %rdi
bswapq %rdx
+ bswapq %rdi
+ addq %rsi, %rdx
+ movq -3(%rsp,%rcx), %r8
+ bswapq %r8
+ addq %rdi, %r8
+ movq -2(%rsp,%rcx), %rsi
+ bswapq %rsi
+ addq %rdx, %r8
+ movq -1(%rsp,%rcx), %rdx
+ bswapq %rdx
+ addq %rsi, %rdx
+ movq (%rsp,%rcx), %rsi
+ bswapq %rsi
addq %rdx, %rsi
- incq %rcx
- cmpq $8191, %rcx # imm = 0x1FFF
+ addq %r8, %rsi
+ addq $6, %rcx
+ cmpq $8189, %rcx # imm = 0x1FFD
jne .LBB0_4
# %bb.5: # in Loop: Header=BB0_3 Depth=1
incl %eax
Huh. So we do a bunch more bswap instructions, but move in larger chunks
through the loop. I didn't work out the details, but it looks like it's
taking advantage of the fact that we're bswapping overlapping bits of
memory (which is something a real program would hardly ever do).
If we tweak our program to just swap the whole buffer, 8 bytes at a
time:
diff --git a/common-main.c b/common-main.c
index 9f19dfe68c..55a7066c85 100644
--- a/common-main.c
+++ b/common-main.c
@@ -18,7 +18,7 @@ int main(int argc, const char **argv)
for (int i = 0; i < ITERATIONS; i++) {
// use a volatile pointer to force the compiler to read memory
volatile uint8_t *p = buffer;
- for (int j = 0; j < BUF_SIZE - 8; j++) {
+ for (int j = 0; j < BUF_SIZE; j += 8) {
sum += get_be64((const void*)(p + j));
}
}
then the difference before/after your patch goes away:
Benchmark 1: ./git.old.clang.o2
Time (mean ± σ): 482.9 ms ± 7.2 ms [User: 482.5 ms, System: 0.4 ms]
Range (min … max): 471.7 ms … 495.3 ms 10 runs
Benchmark 2: ./git.new.clang.o2
Time (mean ± σ): 472.4 ms ± 4.5 ms [User: 471.6 ms, System: 0.8 ms]
Range (min … max): 465.3 ms … 477.8 ms 10 runs
Summary
./git.new.clang.o2 ran
1.02 ± 0.02 times faster than ./git.old.clang.o2
Note that everything is 8x faster than before because we are doing 8x
fewer bswaps. But the time doesn't change before/after the patch.
So what does it all mean?
I _think_ most of the speed differences we've seen are artifacts of the
test program, and not a big difference in how get_be64() is being
implemented. We end up with bswap instructions either way. I could be
wrong, though; there were a lot of versions to juggle, and I'm pretty
bad at reading assembly language.
All that said, I'm not particularly opposed to your patch. The memcpy
version may be easier to read. I'm just a little skeptical that it
provides performance improvements.
-Peff
^ permalink raw reply related [flat|nested] 7+ messages in thread