* [patch v3 7/7] crc32: final-cleanup.diff
@ 2011-08-09 5:29 Bob Pearson
2011-08-09 6:21 ` Bob Pearson
0 siblings, 1 reply; 8+ messages in thread
From: Bob Pearson @ 2011-08-09 5:29 UTC (permalink / raw)
To: linux-kernel, linux, joakim.tjernlund, fzago, rpearson, akpm
Some final cleanup changes
- added a comment at the top of crc32.c
- moved macros ahead of function prototype
- replaced loops with for (i = 0; i < xxx; i++) which
requires fewer instructions on x86 since the
buffer lookups can use i as an index.
Signed-off-by: Bob Pearson <rpearson@systemfabricworks.com>
---
lib/crc32.c | 89 ++++++++++++++++++++++++++++--------------------------------
1 file changed, 43 insertions(+), 46 deletions(-)
Index: infiniband/lib/crc32.c
===================================================================
--- infiniband.orig/lib/crc32.c
+++ infiniband/lib/crc32.c
@@ -1,4 +1,8 @@
/*
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
* Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
* Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks!
* Code was from the public domain, copyright abandoned. Code was
@@ -45,43 +49,39 @@ MODULE_LICENSE("GPL");
#if CRC_LE_BITS > 8 || CRC_BE_BITS > 8
-/* implements slicing-by-4 or slicing-by-8 algorithm */
-static inline u32
-crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
-{
# ifdef __LITTLE_ENDIAN
# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
-# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
- tab[2][(crc >> 8) & 255] ^ \
- tab[1][(crc >> 16) & 255] ^ \
- tab[0][(crc >> 24) & 255]
-# define DO_CRC8a (tab[7][(q) & 255] ^ \
- tab[6][(q >> 8) & 255] ^ \
- tab[5][(q >> 16) & 255] ^ \
- tab[4][(q >> 24) & 255])
-# define DO_CRC8b (tab[3][(q) & 255] ^ \
+# define DO_CRC4 (tab[3][(q) & 255] ^ \
tab[2][(q >> 8) & 255] ^ \
tab[1][(q >> 16) & 255] ^ \
tab[0][(q >> 24) & 255])
+# define DO_CRC8 (tab[7][(q) & 255] ^ \
+ tab[6][(q >> 8) & 255] ^ \
+ tab[5][(q >> 16) & 255] ^ \
+ tab[4][(q >> 24) & 255])
# else
# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
-# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
- tab[1][(crc >> 8) & 255] ^ \
- tab[2][(crc >> 16) & 255] ^ \
- tab[3][(crc >> 24) & 255]
-# define DO_CRC8a (tab[4][(q) & 255] ^ \
- tab[5][(q >> 8) & 255] ^ \
- tab[6][(q >> 16) & 255] ^ \
- tab[8][(q >> 24) & 255])
-# define DO_CRC8b (tab[0][(q) & 255] ^ \
+# define DO_CRC4 (tab[0][(q) & 255] ^ \
tab[1][(q >> 8) & 255] ^ \
tab[2][(q >> 16) & 255] ^ \
tab[3][(q >> 24) & 255])
+# define DO_CRC8 (tab[4][(q) & 255] ^ \
+ tab[5][(q >> 8) & 255] ^ \
+ tab[6][(q >> 16) & 255] ^ \
+ tab[7][(q >> 24) & 255])
# endif
+
+/* implements slicing-by-4 or slicing-by-8 algorithm */
+static inline u32 crc32_body(u32 crc, unsigned char const *buf,
+ size_t len, const u32 (*tab)[256])
+{
const u32 *b;
+ u8 *p;
+ u32 q;
size_t init_len;
size_t middle_len;
size_t rem_len;
+ size_t i;
/* break buf into init_len bytes before and
* rem_len bytes after a middle section with
@@ -96,37 +96,34 @@ crc32_body(u32 crc, unsigned char const
rem_len = (len - init_len) & 7;
# endif
- /* Align it */
- if (unlikely(init_len)) {
- do {
- DO_CRC(*buf++);
- } while (--init_len);
- }
- b = (const u32 *)buf;
- for (--b; middle_len; --middle_len) {
+ /* process unaligned initial bytes */
+ for (i = 0; i < init_len; i++)
+ DO_CRC(*buf++);
+
+ /* process aligned words */
+ b = (const u32 *)(buf - 4);
+
+ for (i = 0; i < middle_len; i++) {
# if CRC_LE_BITS == 32
- crc ^= *++b; /* use pre increment for speed */
- DO_CRC4;
+ /* slicing-by-4 algorithm */
+ q = crc ^ *++b; /* use pre increment for speed */
+ crc = DO_CRC4;
# else
- u32 q;
+ /* slicing-by-8 algorithm */
q = crc ^ *++b;
- crc = DO_CRC8a;
+ crc = DO_CRC8;
q = *++b;
- crc ^= DO_CRC8b;
+ crc ^= DO_CRC4;
# endif
}
- /* And the last few bytes */
- if (rem_len) {
- u8 *p = (u8 *)(b + 1) - 1;
- do {
- DO_CRC(*++p); /* use pre increment for speed */
- } while (--rem_len);
- }
+
+ /* process unaligned remaining bytes */
+ p = (u8 *)(b + 1) - 1;
+
+ for (i = 0; i < rem_len; i++)
+ DO_CRC(*++p); /* use pre increment for speed */
+
return crc;
-#undef DO_CRC
-#undef DO_CRC4
-#undef DO_CRC8a
-#undef DO_CRC8b
}
#endif
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-09 5:29 [patch v3 7/7] crc32: final-cleanup.diff Bob Pearson
@ 2011-08-09 6:21 ` Bob Pearson
2011-08-09 17:39 ` Joakim Tjernlund
0 siblings, 1 reply; 8+ messages in thread
From: Bob Pearson @ 2011-08-09 6:21 UTC (permalink / raw)
To: linux-kernel, linux, joakim.tjernlund, fzago, akpm
Additional comments.
> Some final cleanup changes
>
> - added a comment at the top of crc32.c
> - moved macros ahead of function prototype
> - replaced loops with for (i = 0; i < xxx; i++) which
> requires fewer instructions on x86 since the
> buffer lookups can use i as an index.
>
> -/* implements slicing-by-4 or slicing-by-8 algorithm */
> -static inline u32
> -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32
> (*tab)[256])
After careful measurements and looking at asm code I figured out that there
was no penalty for using 2D array. That allowed me to go back to the
original form.
> -{
> # ifdef __LITTLE_ENDIAN
> # define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
> -# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
> - tab[2][(crc >> 8) & 255] ^ \
> - tab[1][(crc >> 16) & 255] ^ \
> - tab[0][(crc >> 24) & 255]
> -# define DO_CRC8a (tab[7][(q) & 255] ^ \
> - tab[6][(q >> 8) & 255] ^ \
> - tab[5][(q >> 16) & 255] ^ \
> - tab[4][(q >> 24) & 255])
> -# define DO_CRC8b (tab[3][(q) & 255] ^ \
> +# define DO_CRC4 (tab[3][(q) & 255] ^ \
> tab[2][(q >> 8) & 255] ^ \
> tab[1][(q >> 16) & 255] ^ \
> tab[0][(q >> 24) & 255])
By changing the syntax a little so that the assignment is done down below
the 4 byte and 8 byte algorithms can share DO_CRC4.
> +# define DO_CRC8 (tab[7][(q) & 255] ^ \
> + tab[6][(q >> 8) & 255] ^ \
> + tab[5][(q >> 16) & 255] ^ \
> + tab[4][(q >> 24) & 255])
> # else
> # define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
> -# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
> - tab[1][(crc >> 8) & 255] ^ \
> - tab[2][(crc >> 16) & 255] ^ \
> - tab[3][(crc >> 24) & 255]
> -# define DO_CRC8a (tab[4][(q) & 255] ^ \
> - tab[5][(q >> 8) & 255] ^ \
> - tab[6][(q >> 16) & 255] ^ \
> - tab[8][(q >> 24) & 255])
> -# define DO_CRC8b (tab[0][(q) & 255] ^ \
> +# define DO_CRC4 (tab[0][(q) & 255] ^ \
> tab[1][(q >> 8) & 255] ^ \
> tab[2][(q >> 16) & 255] ^ \
> tab[3][(q >> 24) & 255])
> +# define DO_CRC8 (tab[4][(q) & 255] ^ \
> + tab[5][(q >> 8) & 255] ^ \
> + tab[6][(q >> 16) & 255] ^ \
> + tab[7][(q >> 24) & 255])
> # endif
> +
> +/* implements slicing-by-4 or slicing-by-8 algorithm */
> +static inline u32 crc32_body(u32 crc, unsigned char const *buf,
> + size_t len, const u32 (*tab)[256])
> +{
> const u32 *b;
> + u8 *p;
> + u32 q;
> size_t init_len;
> size_t middle_len;
> size_t rem_len;
> + size_t i;
>
> /* break buf into init_len bytes before and
> * rem_len bytes after a middle section with
> @@ -96,37 +96,34 @@ crc32_body(u32 crc, unsigned char const
> rem_len = (len - init_len) & 7;
> # endif
>
> - /* Align it */
> - if (unlikely(init_len)) {
> - do {
> - DO_CRC(*buf++);
> - } while (--init_len);
> - }
> - b = (const u32 *)buf;
> - for (--b; middle_len; --middle_len) {
> + /* process unaligned initial bytes */
> + for (i = 0; i < init_len; i++)
> + DO_CRC(*buf++);
> +
> + /* process aligned words */
> + b = (const u32 *)(buf - 4);
> +
> + for (i = 0; i < middle_len; i++) {
> # if CRC_LE_BITS == 32
> - crc ^= *++b; /* use pre increment for speed */
> - DO_CRC4;
> + /* slicing-by-4 algorithm */
> + q = crc ^ *++b; /* use pre increment for speed */
> + crc = DO_CRC4;
> # else
> - u32 q;
> + /* slicing-by-8 algorithm */
> q = crc ^ *++b;
> - crc = DO_CRC8a;
> + crc = DO_CRC8;
> q = *++b;
> - crc ^= DO_CRC8b;
> + crc ^= DO_CRC4;
> # endif
This really shows how close the two algorithms are.
> }
> - /* And the last few bytes */
> - if (rem_len) {
> - u8 *p = (u8 *)(b + 1) - 1;
> - do {
> - DO_CRC(*++p); /* use pre increment for speed */
> - } while (--rem_len);
> - }
> +
> + /* process unaligned remaining bytes */
> + p = (u8 *)(b + 1) - 1;
> +
> + for (i = 0; i < rem_len; i++)
> + DO_CRC(*++p); /* use pre increment for speed */
> +
> return crc;
> -#undef DO_CRC
> -#undef DO_CRC4
> -#undef DO_CRC8a
> -#undef DO_CRC8b
> }
> #endif
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-09 6:21 ` Bob Pearson
@ 2011-08-09 17:39 ` Joakim Tjernlund
2011-08-09 23:05 ` Bob Pearson
0 siblings, 1 reply; 8+ messages in thread
From: Joakim Tjernlund @ 2011-08-09 17:39 UTC (permalink / raw)
To: Bob Pearson; +Cc: akpm, fzago, linux, linux-kernel
"Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/09 08:21:38:
>
> Additional comments.
>
>
> > Some final cleanup changes
> >
> > - added a comment at the top of crc32.c
> > - moved macros ahead of function prototype
> > - replaced loops with for (i = 0; i < xxx; i++) which
> > requires fewer instructions on x86 since the
> > buffer lookups can use i as an index.
Doing this adds one insn to ppc. What is good for x86 isn't for
ppc32 (and I guess any modern arch).
> >
> > -/* implements slicing-by-4 or slicing-by-8 algorithm */
> > -static inline u32
> > -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32
> > (*tab)[256])
>
> After careful measurements and looking at asm code I figured out that there
> was no penalty for using 2D array. That allowed me to go back to the
> original form.
What about my suggestion to assign each table to a ptr? This was good
for ppc so if it doesn't slow x86 it should be added.
>
> > -{
> > # ifdef __LITTLE_ENDIAN
> > # define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
> > -# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
> > - tab[2][(crc >> 8) & 255] ^ \
> > - tab[1][(crc >> 16) & 255] ^ \
> > - tab[0][(crc >> 24) & 255]
> > -# define DO_CRC8a (tab[7][(q) & 255] ^ \
> > - tab[6][(q >> 8) & 255] ^ \
> > - tab[5][(q >> 16) & 255] ^ \
> > - tab[4][(q >> 24) & 255])
> > -# define DO_CRC8b (tab[3][(q) & 255] ^ \
> > +# define DO_CRC4 (tab[3][(q) & 255] ^ \
> > tab[2][(q >> 8) & 255] ^ \
> > tab[1][(q >> 16) & 255] ^ \
> > tab[0][(q >> 24) & 255])
>
> By changing the syntax a little so that the assignment is done down below
> the 4 byte and 8 byte algorithms can share DO_CRC4.
>
> > +# define DO_CRC8 (tab[7][(q) & 255] ^ \
> > + tab[6][(q >> 8) & 255] ^ \
> > + tab[5][(q >> 16) & 255] ^ \
> > + tab[4][(q >> 24) & 255])
> > # else
> > # define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
> > -# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
> > - tab[1][(crc >> 8) & 255] ^ \
> > - tab[2][(crc >> 16) & 255] ^ \
> > - tab[3][(crc >> 24) & 255]
> > -# define DO_CRC8a (tab[4][(q) & 255] ^ \
> > - tab[5][(q >> 8) & 255] ^ \
> > - tab[6][(q >> 16) & 255] ^ \
> > - tab[8][(q >> 24) & 255])
> > -# define DO_CRC8b (tab[0][(q) & 255] ^ \
> > +# define DO_CRC4 (tab[0][(q) & 255] ^ \
> > tab[1][(q >> 8) & 255] ^ \
> > tab[2][(q >> 16) & 255] ^ \
> > tab[3][(q >> 24) & 255])
> > +# define DO_CRC8 (tab[4][(q) & 255] ^ \
> > + tab[5][(q >> 8) & 255] ^ \
> > + tab[6][(q >> 16) & 255] ^ \
> > + tab[7][(q >> 24) & 255])
> > # endif
> > +
> > +/* implements slicing-by-4 or slicing-by-8 algorithm */
> > +static inline u32 crc32_body(u32 crc, unsigned char const *buf,
> > + size_t len, const u32 (*tab)[256])
> > +{
> > const u32 *b;
> > + u8 *p;
> > + u32 q;
> > size_t init_len;
> > size_t middle_len;
> > size_t rem_len;
> > + size_t i;
> >
> > /* break buf into init_len bytes before and
> > * rem_len bytes after a middle section with
> > @@ -96,37 +96,34 @@ crc32_body(u32 crc, unsigned char const
> > rem_len = (len - init_len) & 7;
> > # endif
> >
> > - /* Align it */
> > - if (unlikely(init_len)) {
> > - do {
> > - DO_CRC(*buf++);
> > - } while (--init_len);
> > - }
> > - b = (const u32 *)buf;
> > - for (--b; middle_len; --middle_len) {
> > + /* process unaligned initial bytes */
> > + for (i = 0; i < init_len; i++)
> > + DO_CRC(*buf++);
> > +
> > + /* process aligned words */
> > + b = (const u32 *)(buf - 4);
> > +
> > + for (i = 0; i < middle_len; i++) {
> > # if CRC_LE_BITS == 32
> > - crc ^= *++b; /* use pre increment for speed */
> > - DO_CRC4;
> > + /* slicing-by-4 algorithm */
> > + q = crc ^ *++b; /* use pre increment for speed */
> > + crc = DO_CRC4;
> > # else
> > - u32 q;
> > + /* slicing-by-8 algorithm */
> > q = crc ^ *++b;
> > - crc = DO_CRC8a;
> > + crc = DO_CRC8;
> > q = *++b;
> > - crc ^= DO_CRC8b;
> > + crc ^= DO_CRC4;
> > # endif
>
> This really shows how close the two algorithms are.
Indeed. Look at the WIP patch I sent. There it is even closer :)
>
> > }
> > - /* And the last few bytes */
> > - if (rem_len) {
> > - u8 *p = (u8 *)(b + 1) - 1;
> > - do {
> > - DO_CRC(*++p); /* use pre increment for speed */
> > - } while (--rem_len);
> > - }
> > +
> > + /* process unaligned remaining bytes */
> > + p = (u8 *)(b + 1) - 1;
> > +
> > + for (i = 0; i < rem_len; i++)
> > + DO_CRC(*++p); /* use pre increment for speed */
> > +
> > return crc;
> > -#undef DO_CRC
> > -#undef DO_CRC4
> > -#undef DO_CRC8a
> > -#undef DO_CRC8b
> > }
> > #endif
>
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-09 17:39 ` Joakim Tjernlund
@ 2011-08-09 23:05 ` Bob Pearson
2011-08-10 11:40 ` Joakim Tjernlund
0 siblings, 1 reply; 8+ messages in thread
From: Bob Pearson @ 2011-08-09 23:05 UTC (permalink / raw)
To: 'Joakim Tjernlund', George Spelvin
Cc: akpm, fzago, linux, linux-kernel
>
> Doing this adds one insn to ppc. What is good for x86 isn't for
> ppc32 (and I guess any modern arch).
I looked at the assembly output from the original code and I see two
compares at the end of the first loop, one for --len and one for (buf & 3)
which it has to compute. You are better off just computing buf & 3 once and
comparing with len once to compute init_len then in each iteration in the
loop you only have to compare one thing.
As proposed:
init_len = min((-((uintptr_t)buf)) & 3, len);
...
/* process unaligned initial bytes */
for (i = 0; i < init_len; i++)
DO_CRC(*buf++);
crc32x_le:
<.....>
negq %rcx # (-buf)
andl $3, %ecx # (-buf) & 3
cmpq %rdx, %rcx # min()
cmova %rdx, %rcx # init_len =
xorl %edi, %edi # i = 0
<.....>
jmp .L2
.L3:
# crc = tab[0][(crc ^ (*buf++)) & 255] ^ (crc >> 8)
movb (%rsi,%rdi), %dl # buf[i]
incq %rdi # buf++
xorl %eax, %edx # crc ^ *buf++
shrl $8, %eax # crc >> 8
movzbl %dl, %edx # & 255
xorl crc32table_le(,%rdx,4), %eax # crc =
tab[...] ^ (crc >> 8)
.L2:
cmpq %rcx, %rdi # compare i with
init_len
jb .L3
<.....>
As was originally:
if (unlikely((long)buf & 3 && len)) {
do {
DO_CRC(*buf++);
} while ((--len) && ((long)buf)&3);
}
crc32x_le:
pushq %rbp
movl %edi, %eax
pushq %rbx
testb $3, %sil
je .L16
testq %rdx, %rdx
.L25:
je .L16
movb (%rsi), %cl # *p
incq %rsi # p++
xorl %eax, %ecx # crc ^ *p
shrl $8, %eax # crc >> 8
movzbl %cl, %ecx # ... & 255
xorl crc32table_le(,%rcx,4), %eax # tab[...] ^ (crc >>
8)
decq %rdx # len--
je .L16 # if len != 0
continue
testb $3, %sil # buf & 3
jmp .L25 # if buf & 3
continue
.L16:
The first loop has 8 instructions second one has 11 instructions. First loop
has slightly more setup to compute init_len, second loop has a couple of
extra branches to compute the if() outside of the loop.
I get better performance with the new form of the loop. How, does PPC get
better results with two tests?
>
> > >
> > > -/* implements slicing-by-4 or slicing-by-8 algorithm */
> > > -static inline u32
> > > -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32
> > > (*tab)[256])
> >
> > After careful measurements and looking at asm code I figured out that
> there
> > was no penalty for using 2D array. That allowed me to go back to the
> > original form.
>
> What about my suggestion to assign each table to a ptr? This was good
> for ppc so if it doesn't slow x86 it should be added.
Gcc is very clever and inlined crc32_body() into crc32_le() and crc32_be()
and then replaced references to tab[x][y] with references to crc32table_le
and crc32table_be plus a constant offset which depended on the first index
([x]). So in effect it completely unrolled back to 1D arrays as
(crc32table_le + offset)[y], which is equivalent to the t0_le[y] code I had
before.
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-09 23:05 ` Bob Pearson
@ 2011-08-10 11:40 ` Joakim Tjernlund
2011-08-10 15:13 ` Bob Pearson
0 siblings, 1 reply; 8+ messages in thread
From: Joakim Tjernlund @ 2011-08-10 11:40 UTC (permalink / raw)
To: Bob Pearson; +Cc: akpm, fzago, George Spelvin, linux-kernel
"Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/10 01:05:56:
>
> >
> > Doing this adds one insn to ppc. What is good for x86 isn't for
> > ppc32 (and I guess any modern arch).
>
> I looked at the assembly output from the original code and I see two
> compares at the end of the first loop, one for --len and one for (buf & 3)
> which it has to compute. You are better off just computing buf & 3 once and
> comparing with len once to compute init_len then in each iteration in the
> loop you only have to compare one thing.
>
> As proposed:
> init_len = min((-((uintptr_t)buf)) & 3, len);
> ...
>
> /* process unaligned initial bytes */
> for (i = 0; i < init_len; i++)
> DO_CRC(*buf++);
>
>
> crc32x_le:
> <.....>
> negq %rcx # (-buf)
> andl $3, %ecx # (-buf) & 3
> cmpq %rdx, %rcx # min()
> cmova %rdx, %rcx # init_len =
> xorl %edi, %edi # i = 0
> <.....>
> jmp .L2
> .L3:
> # crc = tab[0][(crc ^ (*buf++)) & 255] ^ (crc >> 8)
> movb (%rsi,%rdi), %dl # buf[i]
> incq %rdi # buf++
> xorl %eax, %edx # crc ^ *buf++
> shrl $8, %eax # crc >> 8
> movzbl %dl, %edx # & 255
> xorl crc32table_le(,%rdx,4), %eax # crc =
> tab[...] ^ (crc >> 8)
> .L2:
> cmpq %rcx, %rdi # compare i with
> init_len
> jb .L3
> <.....>
>
> As was originally:
> if (unlikely((long)buf & 3 && len)) {
> do {
> DO_CRC(*buf++);
> } while ((--len) && ((long)buf)&3);
> }
>
> crc32x_le:
> pushq %rbp
> movl %edi, %eax
> pushq %rbx
> testb $3, %sil
> je .L16
> testq %rdx, %rdx
> .L25:
> je .L16
> movb (%rsi), %cl # *p
> incq %rsi # p++
> xorl %eax, %ecx # crc ^ *p
> shrl $8, %eax # crc >> 8
> movzbl %cl, %ecx # ... & 255
> xorl crc32table_le(,%rcx,4), %eax # tab[...] ^ (crc >>
> 8)
> decq %rdx # len--
> je .L16 # if len != 0
> continue
> testb $3, %sil # buf & 3
> jmp .L25 # if buf & 3
> continue
> .L16:
>
> The first loop has 8 instructions second one has 11 instructions. First loop
> has slightly more setup to compute init_len, second loop has a couple of
> extra branches to compute the if() outside of the loop.
>
> I get better performance with the new form of the loop. How, does PPC get
> better results with two tests?
You are right, even ppc gets less insns in the byte loop. However I tested
your latest patch series and your version is a tiny bit slower than my best
version(numbers below)
Probably due to that your version has a higher startup cost and uses more
registers(3 regs are pushed to the stack vs. 2 regs in mine)
>
> >
> > > >
> > > > -/* implements slicing-by-4 or slicing-by-8 algorithm */
> > > > -static inline u32
> > > > -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32
> > > > (*tab)[256])
> > >
> > > After careful measurements and looking at asm code I figured out that
> > there
> > > was no penalty for using 2D array. That allowed me to go back to the
> > > original form.
> >
> > What about my suggestion to assign each table to a ptr? This was good
> > for ppc so if it doesn't slow x86 it should be added.
>
> Gcc is very clever and inlined crc32_body() into crc32_le() and crc32_be()
> and then replaced references to tab[x][y] with references to crc32table_le
> and crc32table_be plus a constant offset which depended on the first index
> ([x]). So in effect it completely unrolled back to 1D arrays as
> (crc32table_le + offset)[y], which is equivalent to the t0_le[y] code I had
> before.
So x86 optimizes the table accesses well then. PPC32 doesn't and adding
my small optimization makes a huge difference:
org:
crc32: CRC_LE_BITS = 8, CRC_BE BITS = 8
crc32: self tests passed, processed 225944 bytes in 2257673 nsec
my t0, t1, t2, t3 ptrs
crc32: CRC_LE_BITS = 8, CRC_BE BITS = 8
crc32: self tests passed, processed 225944 bytes in 1949869 nsec
Numbers for 32 bits with your patch.
v3 up to 6 of 7
crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32
crc32: self tests passed, processed 225944 bytes in 2362044 nsec
v3 up to 7 of 7
crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32
crc32: self tests passed, processed 225944 bytes in 2149972 nsec
my t0, t1, t2, t3 ptrs on v3 7 of 7
crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32
crc32: self tests passed, processed 225944 bytes in 1953290 nsec
Finally, 64 bits on PPC is broken in v3!
Jocke
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-10 11:40 ` Joakim Tjernlund
@ 2011-08-10 15:13 ` Bob Pearson
2011-08-10 15:48 ` Joakim Tjernlund
0 siblings, 1 reply; 8+ messages in thread
From: Bob Pearson @ 2011-08-10 15:13 UTC (permalink / raw)
To: 'Joakim Tjernlund'
Cc: akpm, fzago, 'George Spelvin', linux-kernel
OK. Can you post your current version of crc32.c? I'll try to merge them
together.
> -----Original Message-----
> From: Joakim Tjernlund [mailto:joakim.tjernlund@transmode.se]
> Sent: Wednesday, August 10, 2011 6:41 AM
> To: Bob Pearson
> Cc: akpm@linux-foundation.org; fzago@systemfabricworks.com; George
> Spelvin; linux-kernel@vger.kernel.org
> Subject: RE: [patch v3 7/7] crc32: final-cleanup.diff
>
> "Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/10
> 01:05:56:
> >
> > >
> > > Doing this adds one insn to ppc. What is good for x86 isn't for
> > > ppc32 (and I guess any modern arch).
> >
> > I looked at the assembly output from the original code and I see two
> > compares at the end of the first loop, one for --len and one for (buf &
3)
> > which it has to compute. You are better off just computing buf & 3 once
> and
> > comparing with len once to compute init_len then in each iteration in
the
> > loop you only have to compare one thing.
> >
> > As proposed:
> > init_len = min((-((uintptr_t)buf)) & 3, len);
> > ...
> >
> > /* process unaligned initial bytes */
> > for (i = 0; i < init_len; i++)
> > DO_CRC(*buf++);
> >
> >
> > crc32x_le:
> > <.....>
> > negq %rcx # (-buf)
> > andl $3, %ecx # (-buf) & 3
> > cmpq %rdx, %rcx # min()
> > cmova %rdx, %rcx # init_len =
> > xorl %edi, %edi # i = 0
> > <.....>
> > jmp .L2
> > .L3:
> > # crc = tab[0][(crc ^ (*buf++)) & 255] ^ (crc >> 8)
> > movb (%rsi,%rdi), %dl # buf[i]
> > incq %rdi # buf++
> > xorl %eax, %edx # crc ^ *buf++
> > shrl $8, %eax # crc >> 8
> > movzbl %dl, %edx # & 255
> > xorl crc32table_le(,%rdx,4), %eax # crc =
> > tab[...] ^ (crc >> 8)
> > .L2:
> > cmpq %rcx, %rdi # compare i with
> > init_len
> > jb .L3
> > <.....>
> >
> > As was originally:
> > if (unlikely((long)buf & 3 && len)) {
> > do {
> > DO_CRC(*buf++);
> > } while ((--len) && ((long)buf)&3);
> > }
> >
> > crc32x_le:
> > pushq %rbp
> > movl %edi, %eax
> > pushq %rbx
> > testb $3, %sil
> > je .L16
> > testq %rdx, %rdx
> > .L25:
> > je .L16
> > movb (%rsi), %cl # *p
> > incq %rsi # p++
> > xorl %eax, %ecx # crc ^ *p
> > shrl $8, %eax # crc >> 8
> > movzbl %cl, %ecx # ... & 255
> > xorl crc32table_le(,%rcx,4), %eax # tab[...] ^ (crc >>
> > 8)
> > decq %rdx # len--
> > je .L16 # if len != 0
> > continue
> > testb $3, %sil # buf & 3
> > jmp .L25 # if buf & 3
> > continue
> > .L16:
> >
> > The first loop has 8 instructions second one has 11 instructions. First
loop
> > has slightly more setup to compute init_len, second loop has a couple of
> > extra branches to compute the if() outside of the loop.
> >
> > I get better performance with the new form of the loop. How, does PPC
> get
> > better results with two tests?
>
> You are right, even ppc gets less insns in the byte loop. However I tested
> your latest patch series and your version is a tiny bit slower than my
best
> version(numbers below)
> Probably due to that your version has a higher startup cost and uses more
> registers(3 regs are pushed to the stack vs. 2 regs in mine)
>
> >
> > >
> > > > >
> > > > > -/* implements slicing-by-4 or slicing-by-8 algorithm */
> > > > > -static inline u32
> > > > > -crc32_body(u32 crc, unsigned char const *buf, size_t len, const
u32
> > > > > (*tab)[256])
> > > >
> > > > After careful measurements and looking at asm code I figured out
that
> > > there
> > > > was no penalty for using 2D array. That allowed me to go back to the
> > > > original form.
> > >
> > > What about my suggestion to assign each table to a ptr? This was good
> > > for ppc so if it doesn't slow x86 it should be added.
> >
> > Gcc is very clever and inlined crc32_body() into crc32_le() and
crc32_be()
> > and then replaced references to tab[x][y] with references to
crc32table_le
> > and crc32table_be plus a constant offset which depended on the first
index
> > ([x]). So in effect it completely unrolled back to 1D arrays as
> > (crc32table_le + offset)[y], which is equivalent to the t0_le[y] code I
had
> > before.
>
> So x86 optimizes the table accesses well then. PPC32 doesn't and adding
> my small optimization makes a huge difference:
> org:
> crc32: CRC_LE_BITS = 8, CRC_BE BITS = 8
> crc32: self tests passed, processed 225944 bytes in 2257673 nsec
>
> my t0, t1, t2, t3 ptrs
> crc32: CRC_LE_BITS = 8, CRC_BE BITS = 8
> crc32: self tests passed, processed 225944 bytes in 1949869 nsec
>
> Numbers for 32 bits with your patch.
>
> v3 up to 6 of 7
> crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32
> crc32: self tests passed, processed 225944 bytes in 2362044 nsec
>
> v3 up to 7 of 7
> crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32
> crc32: self tests passed, processed 225944 bytes in 2149972 nsec
>
> my t0, t1, t2, t3 ptrs on v3 7 of 7
> crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32
> crc32: self tests passed, processed 225944 bytes in 1953290 nsec
>
> Finally, 64 bits on PPC is broken in v3!
>
> Jocke
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-10 15:13 ` Bob Pearson
@ 2011-08-10 15:48 ` Joakim Tjernlund
2011-08-10 21:57 ` Bob Pearson
0 siblings, 1 reply; 8+ messages in thread
From: Joakim Tjernlund @ 2011-08-10 15:48 UTC (permalink / raw)
To: Bob Pearson; +Cc: akpm, fzago, 'George Spelvin', linux-kernel
"Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/10 17:13:00:
> From: "Bob Pearson" <rpearson@systemfabricworks.com>
> To: "'Joakim Tjernlund'" <joakim.tjernlund@transmode.se>
> Cc: <akpm@linux-foundation.org>, <fzago@systemfabricworks.com>, "'George Spelvin'" <linux@horizon.com>, <linux-kernel@vger.kernel.org>
> Date: 2011/08/10 17:13
> Subject: RE: [patch v3 7/7] crc32: final-cleanup.diff
>
> OK. Can you post your current version of crc32.c? I'll try to merge them
> together.
OK, here it comes again; preferably this should be the first patch
in the series.
>From f5268d74f1a81610820e92785397f1247946ce15 Mon Sep 17 00:00:00 2001
From: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Date: Fri, 5 Aug 2011 17:49:42 +0200
Subject: [PATCH] crc32: Optimize inner loop.
taking a pointer reference to each row in the crc table matrix,
one can reduce the inner loop with a few insn's on RISC
archs like PowerPC.
---
lib/crc32.c | 21 +++++++++++----------
1 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/lib/crc32.c b/lib/crc32.c
index 4855995..b06d1e7 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -51,20 +51,21 @@ static inline u32
crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
{
# ifdef __LITTLE_ENDIAN
-# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
-# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
- tab[2][(crc >> 8) & 255] ^ \
- tab[1][(crc >> 16) & 255] ^ \
- tab[0][(crc >> 24) & 255]
+# define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8)
+# define DO_CRC4 crc = t3[(crc) & 255] ^ \
+ t2[(crc >> 8) & 255] ^ \
+ t1[(crc >> 16) & 255] ^ \
+ t0[(crc >> 24) & 255]
# else
-# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
-# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
- tab[1][(crc >> 8) & 255] ^ \
- tab[2][(crc >> 16) & 255] ^ \
- tab[3][(crc >> 24) & 255]
+# define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
+# define DO_CRC4 crc = t0[(crc) & 255] ^ \
+ t1[(crc >> 8) & 255] ^ \
+ t2[(crc >> 16) & 255] ^ \
+ t3[(crc >> 24) & 255]
# endif
const u32 *b;
size_t rem_len;
+ const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3];
/* Align it */
if (unlikely((long)buf & 3 && len)) {
--
1.7.3.4
^ permalink raw reply related [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-10 15:48 ` Joakim Tjernlund
@ 2011-08-10 21:57 ` Bob Pearson
0 siblings, 0 replies; 8+ messages in thread
From: Bob Pearson @ 2011-08-10 21:57 UTC (permalink / raw)
To: 'Joakim Tjernlund'
Cc: akpm, fzago, 'George Spelvin', linux-kernel
> -----Original Message-----
> From: Joakim Tjernlund [mailto:joakim.tjernlund@transmode.se]
> Sent: Wednesday, August 10, 2011 10:49 AM
> To: Bob Pearson
> Cc: akpm@linux-foundation.org; fzago@systemfabricworks.com; 'George
> Spelvin'; linux-kernel@vger.kernel.org
> Subject: RE: [patch v3 7/7] crc32: final-cleanup.diff
>
> "Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/10
> 17:13:00:
>
> > From: "Bob Pearson" <rpearson@systemfabricworks.com>
> > To: "'Joakim Tjernlund'" <joakim.tjernlund@transmode.se>
> > Cc: <akpm@linux-foundation.org>, <fzago@systemfabricworks.com>,
> "'George Spelvin'" <linux@horizon.com>, <linux-kernel@vger.kernel.org>
> > Date: 2011/08/10 17:13
> > Subject: RE: [patch v3 7/7] crc32: final-cleanup.diff
> >
> > OK. Can you post your current version of crc32.c? I'll try to merge them
> > together.
>
> OK, here it comes again; preferably this should be the first patch
> in the series.
>
> From f5268d74f1a81610820e92785397f1247946ce15 Mon Sep 17 00:00:00 2001
> From: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
> Date: Fri, 5 Aug 2011 17:49:42 +0200
> Subject: [PATCH] crc32: Optimize inner loop.
>
> taking a pointer reference to each row in the crc table matrix,
> one can reduce the inner loop with a few insn's on RISC
> archs like PowerPC.
> ---
> lib/crc32.c | 21 +++++++++++----------
> 1 files changed, 11 insertions(+), 10 deletions(-)
>
> diff --git a/lib/crc32.c b/lib/crc32.c
> index 4855995..b06d1e7 100644
> --- a/lib/crc32.c
> +++ b/lib/crc32.c
> @@ -51,20 +51,21 @@ static inline u32
> crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32
> (*tab)[256])
> {
> # ifdef __LITTLE_ENDIAN
> -# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
> -# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
> - tab[2][(crc >> 8) & 255] ^ \
> - tab[1][(crc >> 16) & 255] ^ \
> - tab[0][(crc >> 24) & 255]
> +# define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8)
> +# define DO_CRC4 crc = t3[(crc) & 255] ^ \
> + t2[(crc >> 8) & 255] ^ \
> + t1[(crc >> 16) & 255] ^ \
> + t0[(crc >> 24) & 255]
> # else
> -# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
> -# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
> - tab[1][(crc >> 8) & 255] ^ \
> - tab[2][(crc >> 16) & 255] ^ \
> - tab[3][(crc >> 24) & 255]
> +# define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
> +# define DO_CRC4 crc = t0[(crc) & 255] ^ \
> + t1[(crc >> 8) & 255] ^ \
> + t2[(crc >> 16) & 255] ^ \
> + t3[(crc >> 24) & 255]
> # endif
> const u32 *b;
> size_t rem_len;
> + const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3];
>
> /* Align it */
> if (unlikely((long)buf & 3 && len)) {
> --
> 1.7.3.4
I tried this on X86_64 and Sparc 64. Very small improvement for Intel and
significant improvement for Sparc. Here are the results based on current
self test which is a mix of crc32_le and crc32_be with random offsets and
lengths:
Results are 'best case' I.e. I picked the shortest time from a handful of
runs.
Arch    CPU          Freq    BITS  bytes   nsec    cycles/byte
____________________________________________________________________________
Current proposed patch
X86_64  Intel E5520  2.268G  64    225944  161294  1.619
X86_64  Intel E5520  2.268G  32    225944  267795  2.688
Sun     Sparc III+   900M    64    225944  757235  3.028
Sun     Sparc III+   900M    32    225944  935558  3.727
With pointers instead of 2D array references
X86_64  Intel E5520  2.268G  64    225944  157975  1.584
X86_64  Intel E5520  2.268G  32    225944  273366  2.744
Sun     Sparc III+   900M    64    225944  570724  2.273
Sun     Sparc III+   900M    32    225944  848897  3.381
The change doesn't really help or hurt for X86_64 but significantly helps
Sparc and you report gains for PPC so it looks good.
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2011-08-10 21:57 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-08-09 5:29 [patch v3 7/7] crc32: final-cleanup.diff Bob Pearson
2011-08-09 6:21 ` Bob Pearson
2011-08-09 17:39 ` Joakim Tjernlund
2011-08-09 23:05 ` Bob Pearson
2011-08-10 11:40 ` Joakim Tjernlund
2011-08-10 15:13 ` Bob Pearson
2011-08-10 15:48 ` Joakim Tjernlund
2011-08-10 21:57 ` Bob Pearson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox