* [patch v3 7/7] crc32: final-cleanup.diff
@ 2011-08-09 5:29 Bob Pearson
2011-08-09 6:21 ` Bob Pearson
0 siblings, 1 reply; 8+ messages in thread
From: Bob Pearson @ 2011-08-09 5:29 UTC (permalink / raw)
To: linux-kernel, linux, joakim.tjernlund, fzago, rpearson, akpm
Some final cleanup changes
- added a comment at the top of crc32.c
- moved macros ahead of function prototype
- replaced loops with for (i = 0; i < xxx; i++) which
requires fewer instructions on x86 since the
buffer lookups can use i as an index.
Signed-off-by: Bob Pearson <rpearson@systemfabricworks.com>
---
lib/crc32.c | 89 ++++++++++++++++++++++++++++--------------------------------
1 file changed, 43 insertions(+), 46 deletions(-)
Index: infiniband/lib/crc32.c
===================================================================
--- infiniband.orig/lib/crc32.c
+++ infiniband/lib/crc32.c
@@ -1,4 +1,8 @@
/*
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
* Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
* Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks!
* Code was from the public domain, copyright abandoned. Code was
@@ -45,43 +49,39 @@ MODULE_LICENSE("GPL");
#if CRC_LE_BITS > 8 || CRC_BE_BITS > 8
-/* implements slicing-by-4 or slicing-by-8 algorithm */
-static inline u32
-crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
-{
# ifdef __LITTLE_ENDIAN
# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
-# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
- tab[2][(crc >> 8) & 255] ^ \
- tab[1][(crc >> 16) & 255] ^ \
- tab[0][(crc >> 24) & 255]
-# define DO_CRC8a (tab[7][(q) & 255] ^ \
- tab[6][(q >> 8) & 255] ^ \
- tab[5][(q >> 16) & 255] ^ \
- tab[4][(q >> 24) & 255])
-# define DO_CRC8b (tab[3][(q) & 255] ^ \
+# define DO_CRC4 (tab[3][(q) & 255] ^ \
tab[2][(q >> 8) & 255] ^ \
tab[1][(q >> 16) & 255] ^ \
tab[0][(q >> 24) & 255])
+# define DO_CRC8 (tab[7][(q) & 255] ^ \
+ tab[6][(q >> 8) & 255] ^ \
+ tab[5][(q >> 16) & 255] ^ \
+ tab[4][(q >> 24) & 255])
# else
# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
-# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
- tab[1][(crc >> 8) & 255] ^ \
- tab[2][(crc >> 16) & 255] ^ \
- tab[3][(crc >> 24) & 255]
-# define DO_CRC8a (tab[4][(q) & 255] ^ \
- tab[5][(q >> 8) & 255] ^ \
- tab[6][(q >> 16) & 255] ^ \
- tab[8][(q >> 24) & 255])
-# define DO_CRC8b (tab[0][(q) & 255] ^ \
+# define DO_CRC4 (tab[0][(q) & 255] ^ \
tab[1][(q >> 8) & 255] ^ \
tab[2][(q >> 16) & 255] ^ \
tab[3][(q >> 24) & 255])
+# define DO_CRC8 (tab[4][(q) & 255] ^ \
+ tab[5][(q >> 8) & 255] ^ \
+ tab[6][(q >> 16) & 255] ^ \
+ tab[7][(q >> 24) & 255])
# endif
+
+/* implements slicing-by-4 or slicing-by-8 algorithm */
+static inline u32 crc32_body(u32 crc, unsigned char const *buf,
+ size_t len, const u32 (*tab)[256])
+{
const u32 *b;
+ u8 *p;
+ u32 q;
size_t init_len;
size_t middle_len;
size_t rem_len;
+ size_t i;
/* break buf into init_len bytes before and
* rem_len bytes after a middle section with
@@ -96,37 +96,34 @@ crc32_body(u32 crc, unsigned char const
rem_len = (len - init_len) & 7;
# endif
- /* Align it */
- if (unlikely(init_len)) {
- do {
- DO_CRC(*buf++);
- } while (--init_len);
- }
- b = (const u32 *)buf;
- for (--b; middle_len; --middle_len) {
+ /* process unaligned initial bytes */
+ for (i = 0; i < init_len; i++)
+ DO_CRC(*buf++);
+
+ /* process aligned words */
+ b = (const u32 *)(buf - 4);
+
+ for (i = 0; i < middle_len; i++) {
# if CRC_LE_BITS == 32
- crc ^= *++b; /* use pre increment for speed */
- DO_CRC4;
+ /* slicing-by-4 algorithm */
+ q = crc ^ *++b; /* use pre increment for speed */
+ crc = DO_CRC4;
# else
- u32 q;
+ /* slicing-by-8 algorithm */
q = crc ^ *++b;
- crc = DO_CRC8a;
+ crc = DO_CRC8;
q = *++b;
- crc ^= DO_CRC8b;
+ crc ^= DO_CRC4;
# endif
}
- /* And the last few bytes */
- if (rem_len) {
- u8 *p = (u8 *)(b + 1) - 1;
- do {
- DO_CRC(*++p); /* use pre increment for speed */
- } while (--rem_len);
- }
+
+ /* process unaligned remaining bytes */
+ p = (u8 *)(b + 1) - 1;
+
+ for (i = 0; i < rem_len; i++)
+ DO_CRC(*++p); /* use pre increment for speed */
+
return crc;
-#undef DO_CRC
-#undef DO_CRC4
-#undef DO_CRC8a
-#undef DO_CRC8b
}
#endif
^ permalink raw reply [flat|nested] 8+ messages in thread* RE: [patch v3 7/7] crc32: final-cleanup.diff 2011-08-09 5:29 [patch v3 7/7] crc32: final-cleanup.diff Bob Pearson @ 2011-08-09 6:21 ` Bob Pearson 2011-08-09 17:39 ` Joakim Tjernlund 0 siblings, 1 reply; 8+ messages in thread From: Bob Pearson @ 2011-08-09 6:21 UTC (permalink / raw) To: linux-kernel, linux, joakim.tjernlund, fzago, akpm Additional comments. > Some final cleanup changes > > - added a comment at the top of crc32.c > - moved macros ahead of function prototype > - replaced loops with for (i = 0; i < xxx; i++) which > requires fewer instructions on x86 since the > buffer lookups can use i as an index. > > -/* implements slicing-by-4 or slicing-by-8 algorithm */ > -static inline u32 > -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 > (*tab)[256]) After careful measurements and looking at asm code I figured out that there was no penalty for using 2D array. That allowed me to go back to the original form. > -{ > # ifdef __LITTLE_ENDIAN > # define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8) > -# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \ > - tab[2][(crc >> 8) & 255] ^ \ > - tab[1][(crc >> 16) & 255] ^ \ > - tab[0][(crc >> 24) & 255] > -# define DO_CRC8a (tab[7][(q) & 255] ^ \ > - tab[6][(q >> 8) & 255] ^ \ > - tab[5][(q >> 16) & 255] ^ \ > - tab[4][(q >> 24) & 255]) > -# define DO_CRC8b (tab[3][(q) & 255] ^ \ > +# define DO_CRC4 (tab[3][(q) & 255] ^ \ > tab[2][(q >> 8) & 255] ^ \ > tab[1][(q >> 16) & 255] ^ \ > tab[0][(q >> 24) & 255]) By changing the syntax a little so that the assignment is done down below the 4 byte and 8 byte algorithms can share DO_CRC4. 
> +# define DO_CRC8 (tab[7][(q) & 255] ^ \ > + tab[6][(q >> 8) & 255] ^ \ > + tab[5][(q >> 16) & 255] ^ \ > + tab[4][(q >> 24) & 255]) > # else > # define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8) > -# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \ > - tab[1][(crc >> 8) & 255] ^ \ > - tab[2][(crc >> 16) & 255] ^ \ > - tab[3][(crc >> 24) & 255] > -# define DO_CRC8a (tab[4][(q) & 255] ^ \ > - tab[5][(q >> 8) & 255] ^ \ > - tab[6][(q >> 16) & 255] ^ \ > - tab[8][(q >> 24) & 255]) > -# define DO_CRC8b (tab[0][(q) & 255] ^ \ > +# define DO_CRC4 (tab[0][(q) & 255] ^ \ > tab[1][(q >> 8) & 255] ^ \ > tab[2][(q >> 16) & 255] ^ \ > tab[3][(q >> 24) & 255]) > +# define DO_CRC8 (tab[4][(q) & 255] ^ \ > + tab[5][(q >> 8) & 255] ^ \ > + tab[6][(q >> 16) & 255] ^ \ > + tab[7][(q >> 24) & 255]) > # endif > + > +/* implements slicing-by-4 or slicing-by-8 algorithm */ > +static inline u32 crc32_body(u32 crc, unsigned char const *buf, > + size_t len, const u32 (*tab)[256]) > +{ > const u32 *b; > + u8 *p; > + u32 q; > size_t init_len; > size_t middle_len; > size_t rem_len; > + size_t i; > > /* break buf into init_len bytes before and > * rem_len bytes after a middle section with > @@ -96,37 +96,34 @@ crc32_body(u32 crc, unsigned char const > rem_len = (len - init_len) & 7; > # endif > > - /* Align it */ > - if (unlikely(init_len)) { > - do { > - DO_CRC(*buf++); > - } while (--init_len); > - } > - b = (const u32 *)buf; > - for (--b; middle_len; --middle_len) { > + /* process unaligned initial bytes */ > + for (i = 0; i < init_len; i++) > + DO_CRC(*buf++); > + > + /* process aligned words */ > + b = (const u32 *)(buf - 4); > + > + for (i = 0; i < middle_len; i++) { > # if CRC_LE_BITS == 32 > - crc ^= *++b; /* use pre increment for speed */ > - DO_CRC4; > + /* slicing-by-4 algorithm */ > + q = crc ^ *++b; /* use pre increment for speed */ > + crc = DO_CRC4; > # else > - u32 q; > + /* slicing-by-8 algorithm */ > q = crc ^ *++b; > - crc = DO_CRC8a; > + crc = DO_CRC8; > q 
= *++b; > - crc ^= DO_CRC8b; > + crc ^= DO_CRC4; > # endif This really shows how close the two algorithms are. > } > - /* And the last few bytes */ > - if (rem_len) { > - u8 *p = (u8 *)(b + 1) - 1; > - do { > - DO_CRC(*++p); /* use pre increment for speed */ > - } while (--rem_len); > - } > + > + /* process unaligned remaining bytes */ > + p = (u8 *)(b + 1) - 1; > + > + for (i = 0; i < rem_len; i++) > + DO_CRC(*++p); /* use pre increment for speed */ > + > return crc; > -#undef DO_CRC > -#undef DO_CRC4 > -#undef DO_CRC8a > -#undef DO_CRC8b > } > #endif ^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff 2011-08-09 6:21 ` Bob Pearson @ 2011-08-09 17:39 ` Joakim Tjernlund 2011-08-09 23:05 ` Bob Pearson 0 siblings, 1 reply; 8+ messages in thread From: Joakim Tjernlund @ 2011-08-09 17:39 UTC (permalink / raw) To: Bob Pearson; +Cc: akpm, fzago, linux, linux-kernel "Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/09 08:21:38: > > Additional comments. > > > > Some final cleanup changes > > > > - added a comment at the top of crc32.c > > - moved macros ahead of function prototype > > - replaced loops with for (i = 0; i < xxx; i++) which > > requires fewer instructions on x86 since the > > buffer lookups can use i as an index. Doing this adds one insn to ppc. What is good for x86 isn't for pcc32 (and I guess any modern arch). > > > > -/* implements slicing-by-4 or slicing-by-8 algorithm */ > > -static inline u32 > > -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 > > (*tab)[256]) > > After careful measurements and looking at asm code I figured out that there > was no penalty for using 2D array. That allowed me to go back to the > original form. What about my suggestion to assign each table to a ptr? This was good for ppc so if it doesn't slow x86 it should be added. > > > -{ > > # ifdef __LITTLE_ENDIAN > > # define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8) > > -# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \ > > - tab[2][(crc >> 8) & 255] ^ \ > > - tab[1][(crc >> 16) & 255] ^ \ > > - tab[0][(crc >> 24) & 255] > > -# define DO_CRC8a (tab[7][(q) & 255] ^ \ > > - tab[6][(q >> 8) & 255] ^ \ > > - tab[5][(q >> 16) & 255] ^ \ > > - tab[4][(q >> 24) & 255]) > > -# define DO_CRC8b (tab[3][(q) & 255] ^ \ > > +# define DO_CRC4 (tab[3][(q) & 255] ^ \ > > tab[2][(q >> 8) & 255] ^ \ > > tab[1][(q >> 16) & 255] ^ \ > > tab[0][(q >> 24) & 255]) > > By changing the syntax a little so that the assignment is done down below > the 4 byte and 8 byte algorithms can share DO_CRC4. 
> > > +# define DO_CRC8 (tab[7][(q) & 255] ^ \ > > + tab[6][(q >> 8) & 255] ^ \ > > + tab[5][(q >> 16) & 255] ^ \ > > + tab[4][(q >> 24) & 255]) > > # else > > # define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8) > > -# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \ > > - tab[1][(crc >> 8) & 255] ^ \ > > - tab[2][(crc >> 16) & 255] ^ \ > > - tab[3][(crc >> 24) & 255] > > -# define DO_CRC8a (tab[4][(q) & 255] ^ \ > > - tab[5][(q >> 8) & 255] ^ \ > > - tab[6][(q >> 16) & 255] ^ \ > > - tab[8][(q >> 24) & 255]) > > -# define DO_CRC8b (tab[0][(q) & 255] ^ \ > > +# define DO_CRC4 (tab[0][(q) & 255] ^ \ > > tab[1][(q >> 8) & 255] ^ \ > > tab[2][(q >> 16) & 255] ^ \ > > tab[3][(q >> 24) & 255]) > > +# define DO_CRC8 (tab[4][(q) & 255] ^ \ > > + tab[5][(q >> 8) & 255] ^ \ > > + tab[6][(q >> 16) & 255] ^ \ > > + tab[7][(q >> 24) & 255]) > > # endif > > + > > +/* implements slicing-by-4 or slicing-by-8 algorithm */ > > +static inline u32 crc32_body(u32 crc, unsigned char const *buf, > > + size_t len, const u32 (*tab)[256]) > > +{ > > const u32 *b; > > + u8 *p; > > + u32 q; > > size_t init_len; > > size_t middle_len; > > size_t rem_len; > > + size_t i; > > > > /* break buf into init_len bytes before and > > * rem_len bytes after a middle section with > > @@ -96,37 +96,34 @@ crc32_body(u32 crc, unsigned char const > > rem_len = (len - init_len) & 7; > > # endif > > > > - /* Align it */ > > - if (unlikely(init_len)) { > > - do { > > - DO_CRC(*buf++); > > - } while (--init_len); > > - } > > - b = (const u32 *)buf; > > - for (--b; middle_len; --middle_len) { > > + /* process unaligned initial bytes */ > > + for (i = 0; i < init_len; i++) > > + DO_CRC(*buf++); > > + > > + /* process aligned words */ > > + b = (const u32 *)(buf - 4); > > + > > + for (i = 0; i < middle_len; i++) { > > # if CRC_LE_BITS == 32 > > - crc ^= *++b; /* use pre increment for speed */ > > - DO_CRC4; > > + /* slicing-by-4 algorithm */ > > + q = crc ^ *++b; /* use pre increment for speed */ > 
> + crc = DO_CRC4; > > # else > > - u32 q; > > + /* slicing-by-8 algorithm */ > > q = crc ^ *++b; > > - crc = DO_CRC8a; > > + crc = DO_CRC8; > > q = *++b; > > - crc ^= DO_CRC8b; > > + crc ^= DO_CRC4; > > # endif > > This really shows how close the two algorithms are. Indeed. Look at the WIP patch I sent. There it is even closer :) > > > } > > - /* And the last few bytes */ > > - if (rem_len) { > > - u8 *p = (u8 *)(b + 1) - 1; > > - do { > > - DO_CRC(*++p); /* use pre increment for speed */ > > - } while (--rem_len); > > - } > > + > > + /* process unaligned remaining bytes */ > > + p = (u8 *)(b + 1) - 1; > > + > > + for (i = 0; i < rem_len; i++) > > + DO_CRC(*++p); /* use pre increment for speed */ > > + > > return crc; > > -#undef DO_CRC > > -#undef DO_CRC4 > > -#undef DO_CRC8a > > -#undef DO_CRC8b > > } > > #endif > > ^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff 2011-08-09 17:39 ` Joakim Tjernlund @ 2011-08-09 23:05 ` Bob Pearson 2011-08-10 11:40 ` Joakim Tjernlund 0 siblings, 1 reply; 8+ messages in thread From: Bob Pearson @ 2011-08-09 23:05 UTC (permalink / raw) To: 'Joakim Tjernlund', George Spelvin Cc: akpm, fzago, linux, linux-kernel > > Doing this adds one insn to ppc. What is good for x86 isn't for > pcc32 (and I guess any modern arch). I looked at the assembly output from the original code and I see two compares at the end of the first loop, one for --len and one for (buf & 3) which it has to compute. You are better off just computing buf & 3 once and comparing with len once to compute init_len then in each iteration in the loop you only have to compare one thing. As proposed: init_len = min((-((uintptr_t)buf)) & 3, len); ... /* process unaligned initial bytes */ for (i = 0; i < init_len; i++) DO_CRC(*buf++); crc32x_le: <.....> negq %rcx # (-buf) andl $3, %ecx # (-buf) & 3 cmpq %rdx, %rcx # min() cmova %rdx, %rcx # init_len = xorl %edi, %edi # i = 0 <.....> jmp .L2 .L3: # crc = tab[0][(crc ^ (*buf++)) & 255] ^ (crc >> 8) movb (%rsi,%rdi), %dl # buf[i] incq %rdi # buf++ xorl %eax, %edx # crc ^ *buf++ shrl $8, %eax # crc >> 8 movzbl %dl, %edx # & 255 xorl crc32table_le(,%rdx,4), %eax # crc = tab[...] ^ (crc >> 8) .L2: cmpq %rcx, %rdi # compare i with init_len jb .L3 <.....> As was originally: if (unlikely((long)buf & 3 && len)) { do { DO_CRC(*buf++); } while ((--len) && ((long)buf)&3); } crc32x_le: pushq %rbp movl %edi, %eax pushq %rbx testb $3, %sil je .L16 testq %rdx, %rdx .L25: je .L16 movb (%rsi), %cl # *p incq %rsi # p++ xorl %eax, %ecx # crc ^ *p shrl $8, %eax # crc >> 8 movzbl %cl, %ecx # ... & 255 xorl crc32table_le(,%rcx,4), %eax # tab[...] ^ (crc >> 8) decq %rdx # len-- je .L16 # if len != 0 continue testb $3, %sil # buf & 3 jmp .L25 # if buf & 3 continue .L16: The first loop has 8 instructions second one has 11 instructions. 
First loop has slightly more setup to compute init_len, second loop has a couple of extra branches to compute the if() outside of the loop. I get better performance with the new form of the loop. How, does PPC get better results with two tests? > > > > > > > -/* implements slicing-by-4 or slicing-by-8 algorithm */ > > > -static inline u32 > > > -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 > > > (*tab)[256]) > > > > After careful measurements and looking at asm code I figured out that > there > > was no penalty for using 2D array. That allowed me to go back to the > > original form. > > What about my suggestion to assign each table to a ptr? This was good > for ppc so if it doesn't slow x86 it should be added. Gcc is very clever and inlined crc32_body() into crc32_le() and crc32_be() and then replaced references to tab[x][y] with references to crc32table_le and crc32table_be plus a constant offset which depended on the first index ([x]). So in effect it completely unrolled back to 1D arrays as (crc32table_le + offset)[y], which is equivalent to the t0_le[y] code I had before. ^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff 2011-08-09 23:05 ` Bob Pearson @ 2011-08-10 11:40 ` Joakim Tjernlund 2011-08-10 15:13 ` Bob Pearson 0 siblings, 1 reply; 8+ messages in thread From: Joakim Tjernlund @ 2011-08-10 11:40 UTC (permalink / raw) To: Bob Pearson; +Cc: akpm, fzago, George Spelvin, linux-kernel "Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/10 01:05:56: > > > > > Doing this adds one insn to ppc. What is good for x86 isn't for > > pcc32 (and I guess any modern arch). > > I looked at the assembly output from the original code and I see two > compares at the end of the first loop, one for --len and one for (buf & 3) > which it has to compute. You are better off just computing buf & 3 once and > comparing with len once to compute init_len then in each iteration in the > loop you only have to compare one thing. > > As proposed: > init_len = min((-((uintptr_t)buf)) & 3, len); > ... > > /* process unaligned initial bytes */ > for (i = 0; i < init_len; i++) > DO_CRC(*buf++); > > > crc32x_le: > <.....> > negq %rcx # (-buf) > andl $3, %ecx # (-buf) & 3 > cmpq %rdx, %rcx # min() > cmova %rdx, %rcx # init_len = > xorl %edi, %edi # i = 0 > <.....> > jmp .L2 > .L3: > # crc = tab[0][(crc ^ (*buf++)) & 255] ^ (crc >> 8) > movb (%rsi,%rdi), %dl # buf[i] > incq %rdi # buf++ > xorl %eax, %edx # crc ^ *buf++ > shrl $8, %eax # crc >> 8 > movzbl %dl, %edx # & 255 > xorl crc32table_le(,%rdx,4), %eax # crc = > tab[...] ^ (crc >> 8) > .L2: > cmpq %rcx, %rdi # compare i with > init_len > jb .L3 > <.....> > > As was originally: > if (unlikely((long)buf & 3 && len)) { > do { > DO_CRC(*buf++); > } while ((--len) && ((long)buf)&3); > } > > crc32x_le: > pushq %rbp > movl %edi, %eax > pushq %rbx > testb $3, %sil > je .L16 > testq %rdx, %rdx > .L25: > je .L16 > movb (%rsi), %cl # *p > incq %rsi # p++ > xorl %eax, %ecx # crc ^ *p > shrl $8, %eax # crc >> 8 > movzbl %cl, %ecx # ... & 255 > xorl crc32table_le(,%rcx,4), %eax # tab[...] 
^ (crc >> > 8) > decq %rdx # len-- > je .L16 # if len != 0 > continue > testb $3, %sil # buf & 3 > jmp .L25 # if buf & 3 > continue > .L16: > > The first loop has 8 instructions second one has 11 instructions. First loop > has slightly more setup to compute init_len, second loop has a couple of > extra branches to compute the if() outside of the loop. > > I get better performance with the new form of the loop. How, does PPC get > better results with two tests? You are right, even ppc gets less insns in the byte loop. However I tested your latest patch series and your version is a tiny bit slower than my best version(numbers below) Probably due to that your version has a higher startup cost and uses more registers(3 regs are pushed to the stack vs. 2 regs in mine) > > > > > > > > > > > -/* implements slicing-by-4 or slicing-by-8 algorithm */ > > > > -static inline u32 > > > > -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 > > > > (*tab)[256]) > > > > > > After careful measurements and looking at asm code I figured out that > > there > > > was no penalty for using 2D array. That allowed me to go back to the > > > original form. > > > > What about my suggestion to assign each table to a ptr? This was good > > for ppc so if it doesn't slow x86 it should be added. > > Gcc is very clever and inlined crc32_body() into crc32_le() and crc32_be() > and then replaced references to tab[x][y] with references to crc32table_le > and crc32table_be plus a constant offset which depended on the first index > ([x]). So in effect it completely unrolled back to 1D arrays as > (crc32table_le + offset)[y], which is equivalent to the t0_le[y] code I had > before. So x86 optimizes the table accesses well then. 
PPC32 doesn't and adding my small optimization makes a huge difference: org: crc32: CRC_LE_BITS = 8, CRC_BE BITS = 8 crc32: self tests passed, processed 225944 bytes in 2257673 nsec my t0, t1, t2, t3 ptrs crc32: CRC_LE_BITS = 8, CRC_BE BITS = 8 crc32: self tests passed, processed 225944 bytes in 1949869 nsec Numbers for 32 bits with your patch. v3 up to 6 of 7 crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32 crc32: self tests passed, processed 225944 bytes in 2362044 nsec v3 up to 7 of 7 crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32 crc32: self tests passed, processed 225944 bytes in 2149972 nsec my t0, t1, t2, t3 ptrs on v3 7 of 7 crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32 crc32: self tests passed, processed 225944 bytes in 1953290 nsec Finally, 64 bits on PPC is broken in v3! Jocke ^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff 2011-08-10 11:40 ` Joakim Tjernlund @ 2011-08-10 15:13 ` Bob Pearson 2011-08-10 15:48 ` Joakim Tjernlund 0 siblings, 1 reply; 8+ messages in thread From: Bob Pearson @ 2011-08-10 15:13 UTC (permalink / raw) To: 'Joakim Tjernlund' Cc: akpm, fzago, 'George Spelvin', linux-kernel OK. Can you post your current version of crc32.c? I'll try to merge them together. > -----Original Message----- > From: Joakim Tjernlund [mailto:joakim.tjernlund@transmode.se] > Sent: Wednesday, August 10, 2011 6:41 AM > To: Bob Pearson > Cc: akpm@linux-foundation.org; fzago@systemfabricworks.com; George > Spelvin; linux-kernel@vger.kernel.org > Subject: RE: [patch v3 7/7] crc32: final-cleanup.diff > > "Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/10 > 01:05:56: > > > > > > > > Doing this adds one insn to ppc. What is good for x86 isn't for > > > pcc32 (and I guess any modern arch). > > > > I looked at the assembly output from the original code and I see two > > compares at the end of the first loop, one for --len and one for (buf & 3) > > which it has to compute. You are better off just computing buf & 3 once > and > > comparing with len once to compute init_len then in each iteration in the > > loop you only have to compare one thing. > > > > As proposed: > > init_len = min((-((uintptr_t)buf)) & 3, len); > > ... > > > > /* process unaligned initial bytes */ > > for (i = 0; i < init_len; i++) > > DO_CRC(*buf++); > > > > > > crc32x_le: > > <.....> > > negq %rcx # (-buf) > > andl $3, %ecx # (-buf) & 3 > > cmpq %rdx, %rcx # min() > > cmova %rdx, %rcx # init_len = > > xorl %edi, %edi # i = 0 > > <.....> > > jmp .L2 > > .L3: > > # crc = tab[0][(crc ^ (*buf++)) & 255] ^ (crc >> 8) > > movb (%rsi,%rdi), %dl # buf[i] > > incq %rdi # buf++ > > xorl %eax, %edx # crc ^ *buf++ > > shrl $8, %eax # crc >> 8 > > movzbl %dl, %edx # & 255 > > xorl crc32table_le(,%rdx,4), %eax # crc = > > tab[...] 
^ (crc >> 8) > > .L2: > > cmpq %rcx, %rdi # compare i with > > init_len > > jb .L3 > > <.....> > > > > As was originally: > > if (unlikely((long)buf & 3 && len)) { > > do { > > DO_CRC(*buf++); > > } while ((--len) && ((long)buf)&3); > > } > > > > crc32x_le: > > pushq %rbp > > movl %edi, %eax > > pushq %rbx > > testb $3, %sil > > je .L16 > > testq %rdx, %rdx > > .L25: > > je .L16 > > movb (%rsi), %cl # *p > > incq %rsi # p++ > > xorl %eax, %ecx # crc ^ *p > > shrl $8, %eax # crc >> 8 > > movzbl %cl, %ecx # ... & 255 > > xorl crc32table_le(,%rcx,4), %eax # tab[...] ^ (crc >> > > 8) > > decq %rdx # len-- > > je .L16 # if len != 0 > > continue > > testb $3, %sil # buf & 3 > > jmp .L25 # if buf & 3 > > continue > > .L16: > > > > The first loop has 8 instructions second one has 11 instructions. First loop > > has slightly more setup to compute init_len, second loop has a couple of > > extra branches to compute the if() outside of the loop. > > > > I get better performance with the new form of the loop. How, does PPC > get > > better results with two tests? > > You are right, even ppc gets less insns in the byte loop. However I tested > your latest patch series and your version is a tiny bit slower than my best > version(numbers below) > Probably due to that your version has a higher startup cost and uses more > registers(3 regs are pushed to the stack vs. 2 regs in mine) > > > > > > > > > > > > > > > > -/* implements slicing-by-4 or slicing-by-8 algorithm */ > > > > > -static inline u32 > > > > > -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 > > > > > (*tab)[256]) > > > > > > > > After careful measurements and looking at asm code I figured out that > > > there > > > > was no penalty for using 2D array. That allowed me to go back to the > > > > original form. > > > > > > What about my suggestion to assign each table to a ptr? This was good > > > for ppc so if it doesn't slow x86 it should be added. 
> > > > Gcc is very clever and inlined crc32_body() into crc32_le() and crc32_be() > > and then replaced references to tab[x][y] with references to crc32table_le > > and crc32table_be plus a constant offset which depended on the first index > > ([x]). So in effect it completely unrolled back to 1D arrays as > > (crc32table_le + offset)[y], which is equivalent to the t0_le[y] code I had > > before. > > So x86 optimizes the table accesses well then. PPC32 doesn't and adding > my small optimization makes a huge difference: > org: > crc32: CRC_LE_BITS = 8, CRC_BE BITS = 8 > crc32: self tests passed, processed 225944 bytes in 2257673 nsec > > my t0, t1, t2, t3 ptrs > crc32: CRC_LE_BITS = 8, CRC_BE BITS = 8 > crc32: self tests passed, processed 225944 bytes in 1949869 nsec > > Numbers for 32 bits with your patch. > > v3 up to 6 of 7 > crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32 > crc32: self tests passed, processed 225944 bytes in 2362044 nsec > > v3 up to 7 of 7 > crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32 > crc32: self tests passed, processed 225944 bytes in 2149972 nsec > > my t0, t1, t2, t3 ptrs on v3 7 of 7 > crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32 > crc32: self tests passed, processed 225944 bytes in 1953290 nsec > > Finally, 64 bits on PPC is broken in v3! > > Jocke ^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff 2011-08-10 15:13 ` Bob Pearson @ 2011-08-10 15:48 ` Joakim Tjernlund 2011-08-10 21:57 ` Bob Pearson 0 siblings, 1 reply; 8+ messages in thread From: Joakim Tjernlund @ 2011-08-10 15:48 UTC (permalink / raw) To: Bob Pearson; +Cc: akpm, fzago, 'George Spelvin', linux-kernel "Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/10 17:13:00: > From: "Bob Pearson" <rpearson@systemfabricworks.com> > To: "'Joakim Tjernlund'" <joakim.tjernlund@transmode.se> > Cc: <akpm@linux-foundation.org>, <fzago@systemfabricworks.com>, "'George Spelvin'" <linux@horizon.com>, <linux-kernel@vger.kernel.org> > Date: 2011/08/10 17:13 > Subject: RE: [patch v3 7/7] crc32: final-cleanup.diff > > OK. Can you post your current version of crc32.c? I'll try to merge them > together. OK, here it comes again, preferably this should be the first patch in the series. >From f5268d74f1a81610820e92785397f1247946ce15 Mon Sep 17 00:00:00 2001 From: Joakim Tjernlund <Joakim.Tjernlund@transmode.se> Date: Fri, 5 Aug 2011 17:49:42 +0200 Subject: [PATCH] crc32: Optimize inner loop. taking a pointer reference to each row in the crc table matrix, one can reduce the inner loop with a few insn's on RISC archs like PowerPC. 
--- lib/crc32.c | 21 +++++++++++---------- 1 files changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 4855995..b06d1e7 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -51,20 +51,21 @@ static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN -# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8) -# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \ - tab[2][(crc >> 8) & 255] ^ \ - tab[1][(crc >> 16) & 255] ^ \ - tab[0][(crc >> 24) & 255] +# define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8) +# define DO_CRC4 crc = t3[(crc) & 255] ^ \ + t2[(crc >> 8) & 255] ^ \ + t1[(crc >> 16) & 255] ^ \ + t0[(crc >> 24) & 255] # else -# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8) -# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \ - tab[1][(crc >> 8) & 255] ^ \ - tab[2][(crc >> 16) & 255] ^ \ - tab[3][(crc >> 24) & 255] +# define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8) +# define DO_CRC4 crc = t0[(crc) & 255] ^ \ + t1[(crc >> 8) & 255] ^ \ + t2[(crc >> 16) & 255] ^ \ + t3[(crc >> 24) & 255] # endif const u32 *b; size_t rem_len; + const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3]; /* Align it */ if (unlikely((long)buf & 3 && len)) { -- 1.7.3.4 ^ permalink raw reply related [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff 2011-08-10 15:48 ` Joakim Tjernlund @ 2011-08-10 21:57 ` Bob Pearson 0 siblings, 0 replies; 8+ messages in thread From: Bob Pearson @ 2011-08-10 21:57 UTC (permalink / raw) To: 'Joakim Tjernlund' Cc: akpm, fzago, 'George Spelvin', linux-kernel > -----Original Message----- > From: Joakim Tjernlund [mailto:joakim.tjernlund@transmode.se] > Sent: Wednesday, August 10, 2011 10:49 AM > To: Bob Pearson > Cc: akpm@linux-foundation.org; fzago@systemfabricworks.com; 'George > Spelvin'; linux-kernel@vger.kernel.org > Subject: RE: [patch v3 7/7] crc32: final-cleanup.diff > > "Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/10 > 17:13:00: > > > From: "Bob Pearson" <rpearson@systemfabricworks.com> > > To: "'Joakim Tjernlund'" <joakim.tjernlund@transmode.se> > > Cc: <akpm@linux-foundation.org>, <fzago@systemfabricworks.com>, > "'George Spelvin'" <linux@horizon.com>, <linux-kernel@vger.kernel.org> > > Date: 2011/08/10 17:13 > > Subject: RE: [patch v3 7/7] crc32: final-cleanup.diff > > > > OK. Can you post your current version of crc32.c? I'll try to merge them > > together. > > OK, here it comes again, prefably this should be the first patch > in the series. > > From f5268d74f1a81610820e92785397f1247946ce15 Mon Sep 17 00:00:00 2001 > From: Joakim Tjernlund <Joakim.Tjernlund@transmode.se> > Date: Fri, 5 Aug 2011 17:49:42 +0200 > Subject: [PATCH] crc32: Optimize inner loop. > > taking a pointer reference to each row in the crc table matrix, > one can reduce the inner loop with a few insn's on RISC > archs like PowerPC. 
> --- > lib/crc32.c | 21 +++++++++++---------- > 1 files changed, 11 insertions(+), 10 deletions(-) > > diff --git a/lib/crc32.c b/lib/crc32.c > index 4855995..b06d1e7 100644 > --- a/lib/crc32.c > +++ b/lib/crc32.c > @@ -51,20 +51,21 @@ static inline u32 > crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 > (*tab)[256]) > { > # ifdef __LITTLE_ENDIAN > -# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8) > -# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \ > - tab[2][(crc >> 8) & 255] ^ \ > - tab[1][(crc >> 16) & 255] ^ \ > - tab[0][(crc >> 24) & 255] > +# define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8) > +# define DO_CRC4 crc = t3[(crc) & 255] ^ \ > + t2[(crc >> 8) & 255] ^ \ > + t1[(crc >> 16) & 255] ^ \ > + t0[(crc >> 24) & 255] > # else > -# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8) > -# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \ > - tab[1][(crc >> 8) & 255] ^ \ > - tab[2][(crc >> 16) & 255] ^ \ > - tab[3][(crc >> 24) & 255] > +# define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8) > +# define DO_CRC4 crc = t0[(crc) & 255] ^ \ > + t1[(crc >> 8) & 255] ^ \ > + t2[(crc >> 16) & 255] ^ \ > + t3[(crc >> 24) & 255] > # endif > const u32 *b; > size_t rem_len; > + const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3]; > > /* Align it */ > if (unlikely((long)buf & 3 && len)) { > -- > 1.7.3.4 I tried this on X86_64 and Sparc 64. Very small improvement for Intel and significant improvement for Sparc. Here are the results based on current self test which is a mix of crc32_le and crc32_be with random offsets and lengths: Results are 'best case' I.e. I picked the shortest time from a handful of runs. 
Arch CPU Freq BITS bytes nsec cycles/byte ____________________________________________________________________________ Current proposed patch X86_64 Intel E5520 2.268G 64 225944 161294 1.619 X86_64 Intel E5520 2.268G 32 225944 267795 2.688 Sun Sparc III+ 900M 64 225944 757235 3.028 Sun Sparc III+ 900M 32 225944 935558 3.727 With pointers instead of 2D array references X86_64 E5520 2.268G 64 225944 157975 1.584 X86_64 E5520 2.268M 32 225944 273366 2.744 Sun Sparc III+ 900M 64 225944 570724 2.273 Sun Sparc III+ 900M 32 225944 848897 3.381 The change doesn't really help or hurt for X86_64 but significantly helps Sparc and you report gains for PPC so it looks good. ^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2011-08-10 21:57 UTC | newest] Thread overview: 8+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2011-08-09 5:29 [patch v3 7/7] crc32: final-cleanup.diff Bob Pearson 2011-08-09 6:21 ` Bob Pearson 2011-08-09 17:39 ` Joakim Tjernlund 2011-08-09 23:05 ` Bob Pearson 2011-08-10 11:40 ` Joakim Tjernlund 2011-08-10 15:13 ` Bob Pearson 2011-08-10 15:48 ` Joakim Tjernlund 2011-08-10 21:57 ` Bob Pearson
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox