* [patch v3 7/7] crc32: final-cleanup.diff
@ 2011-08-09 5:29 Bob Pearson
2011-08-09 6:21 ` Bob Pearson
0 siblings, 1 reply; 8+ messages in thread
From: Bob Pearson @ 2011-08-09 5:29 UTC (permalink / raw)
To: linux-kernel, linux, joakim.tjernlund, fzago, rpearson, akpm
Some final cleanup changes
- added a comment at the top of crc32.c
- moved macros ahead of function prototype
- replaced loops with for (i = 0; i < xxx; i++) which
requires fewer instructions on x86 since the
buffer lookups can use i as an index.
Signed-off-by: Bob Pearson <rpearson@systemfabricworks.com>
---
lib/crc32.c | 89 ++++++++++++++++++++++++++++--------------------------------
1 file changed, 43 insertions(+), 46 deletions(-)
Index: infiniband/lib/crc32.c
===================================================================
--- infiniband.orig/lib/crc32.c
+++ infiniband/lib/crc32.c
@@ -1,4 +1,8 @@
/*
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
* Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
* Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks!
* Code was from the public domain, copyright abandoned. Code was
@@ -45,43 +49,39 @@ MODULE_LICENSE("GPL");
#if CRC_LE_BITS > 8 || CRC_BE_BITS > 8
-/* implements slicing-by-4 or slicing-by-8 algorithm */
-static inline u32
-crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
-{
# ifdef __LITTLE_ENDIAN
# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
-# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
- tab[2][(crc >> 8) & 255] ^ \
- tab[1][(crc >> 16) & 255] ^ \
- tab[0][(crc >> 24) & 255]
-# define DO_CRC8a (tab[7][(q) & 255] ^ \
- tab[6][(q >> 8) & 255] ^ \
- tab[5][(q >> 16) & 255] ^ \
- tab[4][(q >> 24) & 255])
-# define DO_CRC8b (tab[3][(q) & 255] ^ \
+# define DO_CRC4 (tab[3][(q) & 255] ^ \
tab[2][(q >> 8) & 255] ^ \
tab[1][(q >> 16) & 255] ^ \
tab[0][(q >> 24) & 255])
+# define DO_CRC8 (tab[7][(q) & 255] ^ \
+ tab[6][(q >> 8) & 255] ^ \
+ tab[5][(q >> 16) & 255] ^ \
+ tab[4][(q >> 24) & 255])
# else
# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
-# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
- tab[1][(crc >> 8) & 255] ^ \
- tab[2][(crc >> 16) & 255] ^ \
- tab[3][(crc >> 24) & 255]
-# define DO_CRC8a (tab[4][(q) & 255] ^ \
- tab[5][(q >> 8) & 255] ^ \
- tab[6][(q >> 16) & 255] ^ \
- tab[8][(q >> 24) & 255])
-# define DO_CRC8b (tab[0][(q) & 255] ^ \
+# define DO_CRC4 (tab[0][(q) & 255] ^ \
tab[1][(q >> 8) & 255] ^ \
tab[2][(q >> 16) & 255] ^ \
tab[3][(q >> 24) & 255])
+# define DO_CRC8 (tab[4][(q) & 255] ^ \
+ tab[5][(q >> 8) & 255] ^ \
+ tab[6][(q >> 16) & 255] ^ \
+ tab[7][(q >> 24) & 255])
# endif
+
+/* implements slicing-by-4 or slicing-by-8 algorithm */
+static inline u32 crc32_body(u32 crc, unsigned char const *buf,
+ size_t len, const u32 (*tab)[256])
+{
const u32 *b;
+ u8 *p;
+ u32 q;
size_t init_len;
size_t middle_len;
size_t rem_len;
+ size_t i;
/* break buf into init_len bytes before and
* rem_len bytes after a middle section with
@@ -96,37 +96,34 @@ crc32_body(u32 crc, unsigned char const
rem_len = (len - init_len) & 7;
# endif
- /* Align it */
- if (unlikely(init_len)) {
- do {
- DO_CRC(*buf++);
- } while (--init_len);
- }
- b = (const u32 *)buf;
- for (--b; middle_len; --middle_len) {
+ /* process unaligned initial bytes */
+ for (i = 0; i < init_len; i++)
+ DO_CRC(*buf++);
+
+ /* process aligned words */
+ b = (const u32 *)(buf - 4);
+
+ for (i = 0; i < middle_len; i++) {
# if CRC_LE_BITS == 32
- crc ^= *++b; /* use pre increment for speed */
- DO_CRC4;
+ /* slicing-by-4 algorithm */
+ q = crc ^ *++b; /* use pre increment for speed */
+ crc = DO_CRC4;
# else
- u32 q;
+ /* slicing-by-8 algorithm */
q = crc ^ *++b;
- crc = DO_CRC8a;
+ crc = DO_CRC8;
q = *++b;
- crc ^= DO_CRC8b;
+ crc ^= DO_CRC4;
# endif
}
- /* And the last few bytes */
- if (rem_len) {
- u8 *p = (u8 *)(b + 1) - 1;
- do {
- DO_CRC(*++p); /* use pre increment for speed */
- } while (--rem_len);
- }
+
+ /* process unaligned remaining bytes */
+ p = (u8 *)(b + 1) - 1;
+
+ for (i = 0; i < rem_len; i++)
+ DO_CRC(*++p); /* use pre increment for speed */
+
return crc;
-#undef DO_CRC
-#undef DO_CRC4
-#undef DO_CRC8a
-#undef DO_CRC8b
}
#endif
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-09 5:29 [patch v3 7/7] crc32: final-cleanup.diff Bob Pearson
@ 2011-08-09 6:21 ` Bob Pearson
2011-08-09 17:39 ` Joakim Tjernlund
0 siblings, 1 reply; 8+ messages in thread
From: Bob Pearson @ 2011-08-09 6:21 UTC (permalink / raw)
To: linux-kernel, linux, joakim.tjernlund, fzago, akpm
Additional comments.
> Some final cleanup changes
>
> - added a comment at the top of crc32.c
> - moved macros ahead of function prototype
> - replaced loops with for (i = 0; i < xxx; i++) which
> requires fewer instructions on x86 since the
> buffer lookups can use i as an index.
>
> -/* implements slicing-by-4 or slicing-by-8 algorithm */
> -static inline u32
> -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32
> (*tab)[256])
After careful measurements and looking at asm code I figured out that there
was no penalty for using 2D array. That allowed me to go back to the
original form.
> -{
> # ifdef __LITTLE_ENDIAN
> # define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
> -# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
> - tab[2][(crc >> 8) & 255] ^ \
> - tab[1][(crc >> 16) & 255] ^ \
> - tab[0][(crc >> 24) & 255]
> -# define DO_CRC8a (tab[7][(q) & 255] ^ \
> - tab[6][(q >> 8) & 255] ^ \
> - tab[5][(q >> 16) & 255] ^ \
> - tab[4][(q >> 24) & 255])
> -# define DO_CRC8b (tab[3][(q) & 255] ^ \
> +# define DO_CRC4 (tab[3][(q) & 255] ^ \
> tab[2][(q >> 8) & 255] ^ \
> tab[1][(q >> 16) & 255] ^ \
> tab[0][(q >> 24) & 255])
By changing the syntax a little so that the assignment is done down below
the 4 byte and 8 byte algorithms can share DO_CRC4.
> +# define DO_CRC8 (tab[7][(q) & 255] ^ \
> + tab[6][(q >> 8) & 255] ^ \
> + tab[5][(q >> 16) & 255] ^ \
> + tab[4][(q >> 24) & 255])
> # else
> # define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
> -# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
> - tab[1][(crc >> 8) & 255] ^ \
> - tab[2][(crc >> 16) & 255] ^ \
> - tab[3][(crc >> 24) & 255]
> -# define DO_CRC8a (tab[4][(q) & 255] ^ \
> - tab[5][(q >> 8) & 255] ^ \
> - tab[6][(q >> 16) & 255] ^ \
> - tab[8][(q >> 24) & 255])
> -# define DO_CRC8b (tab[0][(q) & 255] ^ \
> +# define DO_CRC4 (tab[0][(q) & 255] ^ \
> tab[1][(q >> 8) & 255] ^ \
> tab[2][(q >> 16) & 255] ^ \
> tab[3][(q >> 24) & 255])
> +# define DO_CRC8 (tab[4][(q) & 255] ^ \
> + tab[5][(q >> 8) & 255] ^ \
> + tab[6][(q >> 16) & 255] ^ \
> + tab[7][(q >> 24) & 255])
> # endif
> +
> +/* implements slicing-by-4 or slicing-by-8 algorithm */
> +static inline u32 crc32_body(u32 crc, unsigned char const *buf,
> + size_t len, const u32 (*tab)[256])
> +{
> const u32 *b;
> + u8 *p;
> + u32 q;
> size_t init_len;
> size_t middle_len;
> size_t rem_len;
> + size_t i;
>
> /* break buf into init_len bytes before and
> * rem_len bytes after a middle section with
> @@ -96,37 +96,34 @@ crc32_body(u32 crc, unsigned char const
> rem_len = (len - init_len) & 7;
> # endif
>
> - /* Align it */
> - if (unlikely(init_len)) {
> - do {
> - DO_CRC(*buf++);
> - } while (--init_len);
> - }
> - b = (const u32 *)buf;
> - for (--b; middle_len; --middle_len) {
> + /* process unaligned initial bytes */
> + for (i = 0; i < init_len; i++)
> + DO_CRC(*buf++);
> +
> + /* process aligned words */
> + b = (const u32 *)(buf - 4);
> +
> + for (i = 0; i < middle_len; i++) {
> # if CRC_LE_BITS == 32
> - crc ^= *++b; /* use pre increment for speed */
> - DO_CRC4;
> + /* slicing-by-4 algorithm */
> + q = crc ^ *++b; /* use pre increment for speed */
> + crc = DO_CRC4;
> # else
> - u32 q;
> + /* slicing-by-8 algorithm */
> q = crc ^ *++b;
> - crc = DO_CRC8a;
> + crc = DO_CRC8;
> q = *++b;
> - crc ^= DO_CRC8b;
> + crc ^= DO_CRC4;
> # endif
This really shows how close the two algorithms are.
> }
> - /* And the last few bytes */
> - if (rem_len) {
> - u8 *p = (u8 *)(b + 1) - 1;
> - do {
> - DO_CRC(*++p); /* use pre increment for speed */
> - } while (--rem_len);
> - }
> +
> + /* process unaligned remaining bytes */
> + p = (u8 *)(b + 1) - 1;
> +
> + for (i = 0; i < rem_len; i++)
> + DO_CRC(*++p); /* use pre increment for speed */
> +
> return crc;
> -#undef DO_CRC
> -#undef DO_CRC4
> -#undef DO_CRC8a
> -#undef DO_CRC8b
> }
> #endif
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-09 6:21 ` Bob Pearson
@ 2011-08-09 17:39 ` Joakim Tjernlund
2011-08-09 23:05 ` Bob Pearson
0 siblings, 1 reply; 8+ messages in thread
From: Joakim Tjernlund @ 2011-08-09 17:39 UTC (permalink / raw)
To: Bob Pearson; +Cc: akpm, fzago, linux, linux-kernel
"Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/09 08:21:38:
>
> Additional comments.
>
>
> > Some final cleanup changes
> >
> > - added a comment at the top of crc32.c
> > - moved macros ahead of function prototype
> > - replaced loops with for (i = 0; i < xxx; i++) which
> > requires fewer instructions on x86 since the
> > buffer lookups can use i as an index.
Doing this adds one insn to ppc. What is good for x86 isn't for
ppc32 (and I guess any modern arch).
> >
> > -/* implements slicing-by-4 or slicing-by-8 algorithm */
> > -static inline u32
> > -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32
> > (*tab)[256])
>
> After careful measurements and looking at asm code I figured out that there
> was no penalty for using 2D array. That allowed me to go back to the
> original form.
What about my suggestion to assign each table to a ptr? This was good
for ppc so if it doesn't slow x86 it should be added.
>
> > -{
> > # ifdef __LITTLE_ENDIAN
> > # define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
> > -# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
> > - tab[2][(crc >> 8) & 255] ^ \
> > - tab[1][(crc >> 16) & 255] ^ \
> > - tab[0][(crc >> 24) & 255]
> > -# define DO_CRC8a (tab[7][(q) & 255] ^ \
> > - tab[6][(q >> 8) & 255] ^ \
> > - tab[5][(q >> 16) & 255] ^ \
> > - tab[4][(q >> 24) & 255])
> > -# define DO_CRC8b (tab[3][(q) & 255] ^ \
> > +# define DO_CRC4 (tab[3][(q) & 255] ^ \
> > tab[2][(q >> 8) & 255] ^ \
> > tab[1][(q >> 16) & 255] ^ \
> > tab[0][(q >> 24) & 255])
>
> By changing the syntax a little so that the assignment is done down below
> the 4 byte and 8 byte algorithms can share DO_CRC4.
>
> > +# define DO_CRC8 (tab[7][(q) & 255] ^ \
> > + tab[6][(q >> 8) & 255] ^ \
> > + tab[5][(q >> 16) & 255] ^ \
> > + tab[4][(q >> 24) & 255])
> > # else
> > # define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
> > -# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
> > - tab[1][(crc >> 8) & 255] ^ \
> > - tab[2][(crc >> 16) & 255] ^ \
> > - tab[3][(crc >> 24) & 255]
> > -# define DO_CRC8a (tab[4][(q) & 255] ^ \
> > - tab[5][(q >> 8) & 255] ^ \
> > - tab[6][(q >> 16) & 255] ^ \
> > - tab[8][(q >> 24) & 255])
> > -# define DO_CRC8b (tab[0][(q) & 255] ^ \
> > +# define DO_CRC4 (tab[0][(q) & 255] ^ \
> > tab[1][(q >> 8) & 255] ^ \
> > tab[2][(q >> 16) & 255] ^ \
> > tab[3][(q >> 24) & 255])
> > +# define DO_CRC8 (tab[4][(q) & 255] ^ \
> > + tab[5][(q >> 8) & 255] ^ \
> > + tab[6][(q >> 16) & 255] ^ \
> > + tab[7][(q >> 24) & 255])
> > # endif
> > +
> > +/* implements slicing-by-4 or slicing-by-8 algorithm */
> > +static inline u32 crc32_body(u32 crc, unsigned char const *buf,
> > + size_t len, const u32 (*tab)[256])
> > +{
> > const u32 *b;
> > + u8 *p;
> > + u32 q;
> > size_t init_len;
> > size_t middle_len;
> > size_t rem_len;
> > + size_t i;
> >
> > /* break buf into init_len bytes before and
> > * rem_len bytes after a middle section with
> > @@ -96,37 +96,34 @@ crc32_body(u32 crc, unsigned char const
> > rem_len = (len - init_len) & 7;
> > # endif
> >
> > - /* Align it */
> > - if (unlikely(init_len)) {
> > - do {
> > - DO_CRC(*buf++);
> > - } while (--init_len);
> > - }
> > - b = (const u32 *)buf;
> > - for (--b; middle_len; --middle_len) {
> > + /* process unaligned initial bytes */
> > + for (i = 0; i < init_len; i++)
> > + DO_CRC(*buf++);
> > +
> > + /* process aligned words */
> > + b = (const u32 *)(buf - 4);
> > +
> > + for (i = 0; i < middle_len; i++) {
> > # if CRC_LE_BITS == 32
> > - crc ^= *++b; /* use pre increment for speed */
> > - DO_CRC4;
> > + /* slicing-by-4 algorithm */
> > + q = crc ^ *++b; /* use pre increment for speed */
> > + crc = DO_CRC4;
> > # else
> > - u32 q;
> > + /* slicing-by-8 algorithm */
> > q = crc ^ *++b;
> > - crc = DO_CRC8a;
> > + crc = DO_CRC8;
> > q = *++b;
> > - crc ^= DO_CRC8b;
> > + crc ^= DO_CRC4;
> > # endif
>
> This really shows how close the two algorithms are.
Indeed. Look at the WIP patch I sent. There it is even closer :)
>
> > }
> > - /* And the last few bytes */
> > - if (rem_len) {
> > - u8 *p = (u8 *)(b + 1) - 1;
> > - do {
> > - DO_CRC(*++p); /* use pre increment for speed */
> > - } while (--rem_len);
> > - }
> > +
> > + /* process unaligned remaining bytes */
> > + p = (u8 *)(b + 1) - 1;
> > +
> > + for (i = 0; i < rem_len; i++)
> > + DO_CRC(*++p); /* use pre increment for speed */
> > +
> > return crc;
> > -#undef DO_CRC
> > -#undef DO_CRC4
> > -#undef DO_CRC8a
> > -#undef DO_CRC8b
> > }
> > #endif
>
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-09 17:39 ` Joakim Tjernlund
@ 2011-08-09 23:05 ` Bob Pearson
2011-08-10 11:40 ` Joakim Tjernlund
0 siblings, 1 reply; 8+ messages in thread
From: Bob Pearson @ 2011-08-09 23:05 UTC (permalink / raw)
To: 'Joakim Tjernlund', George Spelvin
Cc: akpm, fzago, linux, linux-kernel
>
> Doing this adds one insn to ppc. What is good for x86 isn't for
> ppc32 (and I guess any modern arch).
I looked at the assembly output from the original code and I see two
compares at the end of the first loop, one for --len and one for (buf & 3)
which it has to compute. You are better off just computing buf & 3 once and
comparing with len once to compute init_len then in each iteration in the
loop you only have to compare one thing.
As proposed:
init_len = min((-((uintptr_t)buf)) & 3, len);
...
/* process unaligned initial bytes */
for (i = 0; i < init_len; i++)
DO_CRC(*buf++);
crc32x_le:
<.....>
negq %rcx # (-buf)
andl $3, %ecx # (-buf) & 3
cmpq %rdx, %rcx # min()
cmova %rdx, %rcx # init_len =
xorl %edi, %edi # i = 0
<.....>
jmp .L2
.L3:
# crc = tab[0][(crc ^ (*buf++)) & 255] ^ (crc >> 8)
movb (%rsi,%rdi), %dl # buf[i]
incq %rdi # buf++
xorl %eax, %edx # crc ^ *buf++
shrl $8, %eax # crc >> 8
movzbl %dl, %edx # & 255
xorl crc32table_le(,%rdx,4), %eax # crc =
tab[...] ^ (crc >> 8)
.L2:
cmpq %rcx, %rdi # compare i with
init_len
jb .L3
<.....>
As was originally:
if (unlikely((long)buf & 3 && len)) {
do {
DO_CRC(*buf++);
} while ((--len) && ((long)buf)&3);
}
crc32x_le:
pushq %rbp
movl %edi, %eax
pushq %rbx
testb $3, %sil
je .L16
testq %rdx, %rdx
.L25:
je .L16
movb (%rsi), %cl # *p
incq %rsi # p++
xorl %eax, %ecx # crc ^ *p
shrl $8, %eax # crc >> 8
movzbl %cl, %ecx # ... & 255
xorl crc32table_le(,%rcx,4), %eax # tab[...] ^ (crc >>
8)
decq %rdx # len--
je .L16 # if len != 0
continue
testb $3, %sil # buf & 3
jmp .L25 # if buf & 3
continue
.L16:
The first loop has 8 instructions second one has 11 instructions. First loop
has slightly more setup to compute init_len, second loop has a couple of
extra branches to compute the if() outside of the loop.
I get better performance with the new form of the loop. How, does PPC get
better results with two tests?
>
> > >
> > > -/* implements slicing-by-4 or slicing-by-8 algorithm */
> > > -static inline u32
> > > -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32
> > > (*tab)[256])
> >
> > After careful measurements and looking at asm code I figured out that
> there
> > was no penalty for using 2D array. That allowed me to go back to the
> > original form.
>
> What about my suggestion to assign each table to a ptr? This was good
> for ppc so if it doesn't slow x86 it should be added.
Gcc is very clever and inlined crc32_body() into crc32_le() and crc32_be()
and then replaced references to tab[x][y] with references to crc32table_le
and crc32table_be plus a constant offset which depended on the first index
([x]). So in effect it completely unrolled back to 1D arrays as
(crc32table_le + offset)[y], which is equivalent to the t0_le[y] code I had
before.
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-09 23:05 ` Bob Pearson
@ 2011-08-10 11:40 ` Joakim Tjernlund
2011-08-10 15:13 ` Bob Pearson
0 siblings, 1 reply; 8+ messages in thread
From: Joakim Tjernlund @ 2011-08-10 11:40 UTC (permalink / raw)
To: Bob Pearson; +Cc: akpm, fzago, George Spelvin, linux-kernel
"Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/10 01:05:56:
>
> >
> > Doing this adds one insn to ppc. What is good for x86 isn't for
> > ppc32 (and I guess any modern arch).
>
> I looked at the assembly output from the original code and I see two
> compares at the end of the first loop, one for --len and one for (buf & 3)
> which it has to compute. You are better off just computing buf & 3 once and
> comparing with len once to compute init_len then in each iteration in the
> loop you only have to compare one thing.
>
> As proposed:
> init_len = min((-((uintptr_t)buf)) & 3, len);
> ...
>
> /* process unaligned initial bytes */
> for (i = 0; i < init_len; i++)
> DO_CRC(*buf++);
>
>
> crc32x_le:
> <.....>
> negq %rcx # (-buf)
> andl $3, %ecx # (-buf) & 3
> cmpq %rdx, %rcx # min()
> cmova %rdx, %rcx # init_len =
> xorl %edi, %edi # i = 0
> <.....>
> jmp .L2
> .L3:
> # crc = tab[0][(crc ^ (*buf++)) & 255] ^ (crc >> 8)
> movb (%rsi,%rdi), %dl # buf[i]
> incq %rdi # buf++
> xorl %eax, %edx # crc ^ *buf++
> shrl $8, %eax # crc >> 8
> movzbl %dl, %edx # & 255
> xorl crc32table_le(,%rdx,4), %eax # crc =
> tab[...] ^ (crc >> 8)
> .L2:
> cmpq %rcx, %rdi # compare i with
> init_len
> jb .L3
> <.....>
>
> As was originally:
> if (unlikely((long)buf & 3 && len)) {
> do {
> DO_CRC(*buf++);
> } while ((--len) && ((long)buf)&3);
> }
>
> crc32x_le:
> pushq %rbp
> movl %edi, %eax
> pushq %rbx
> testb $3, %sil
> je .L16
> testq %rdx, %rdx
> .L25:
> je .L16
> movb (%rsi), %cl # *p
> incq %rsi # p++
> xorl %eax, %ecx # crc ^ *p
> shrl $8, %eax # crc >> 8
> movzbl %cl, %ecx # ... & 255
> xorl crc32table_le(,%rcx,4), %eax # tab[...] ^ (crc >>
> 8)
> decq %rdx # len--
> je .L16 # if len != 0
> continue
> testb $3, %sil # buf & 3
> jmp .L25 # if buf & 3
> continue
> .L16:
>
> The first loop has 8 instructions second one has 11 instructions. First loop
> has slightly more setup to compute init_len, second loop has a couple of
> extra branches to compute the if() outside of the loop.
>
> I get better performance with the new form of the loop. How, does PPC get
> better results with two tests?
You are right, even ppc gets less insns in the byte loop. However I tested
your latest patch series and your version is a tiny bit slower than my best
version(numbers below)
Probably due to that your version has a higher startup cost and uses more
registers(3 regs are pushed to the stack vs. 2 regs in mine)
>
> >
> > > >
> > > > -/* implements slicing-by-4 or slicing-by-8 algorithm */
> > > > -static inline u32
> > > > -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32
> > > > (*tab)[256])
> > >
> > > After careful measurements and looking at asm code I figured out that
> > there
> > > was no penalty for using 2D array. That allowed me to go back to the
> > > original form.
> >
> > What about my suggestion to assign each table to a ptr? This was good
> > for ppc so if it doesn't slow x86 it should be added.
>
> Gcc is very clever and inlined crc32_body() into crc32_le() and crc32_be()
> and then replaced references to tab[x][y] with references to crc32table_le
> and crc32table_be plus a constant offset which depended on the first index
> ([x]). So in effect it completely unrolled back to 1D arrays as
> (crc32table_le + offset)[y], which is equivalent to the t0_le[y] code I had
> before.
So x86 optimizes the table accesses well then. PPC32 doesn't and adding
my small optimization makes a huge difference:
org:
crc32: CRC_LE_BITS = 8, CRC_BE BITS = 8
crc32: self tests passed, processed 225944 bytes in 2257673 nsec
my t0, t1, t2, t3 ptrs
crc32: CRC_LE_BITS = 8, CRC_BE BITS = 8
crc32: self tests passed, processed 225944 bytes in 1949869 nsec
Numbers for 32 bits with your patch.
v3 up to 6 of 7
crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32
crc32: self tests passed, processed 225944 bytes in 2362044 nsec
v3 up to 7 of 7
crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32
crc32: self tests passed, processed 225944 bytes in 2149972 nsec
my t0, t1, t2, t3 ptrs on v3 7 of 7
crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32
crc32: self tests passed, processed 225944 bytes in 1953290 nsec
Finally, 64 bits on PPC is broken in v3!
Jocke
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-10 11:40 ` Joakim Tjernlund
@ 2011-08-10 15:13 ` Bob Pearson
2011-08-10 15:48 ` Joakim Tjernlund
0 siblings, 1 reply; 8+ messages in thread
From: Bob Pearson @ 2011-08-10 15:13 UTC (permalink / raw)
To: 'Joakim Tjernlund'
Cc: akpm, fzago, 'George Spelvin', linux-kernel
OK. Can you post your current version of crc32.c? I'll try to merge them
together.
> -----Original Message-----
> From: Joakim Tjernlund [mailto:joakim.tjernlund@transmode.se]
> Sent: Wednesday, August 10, 2011 6:41 AM
> To: Bob Pearson
> Cc: akpm@linux-foundation.org; fzago@systemfabricworks.com; George
> Spelvin; linux-kernel@vger.kernel.org
> Subject: RE: [patch v3 7/7] crc32: final-cleanup.diff
>
> "Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/10
> 01:05:56:
> >
> > >
> > > Doing this adds one insn to ppc. What is good for x86 isn't for
> > > ppc32 (and I guess any modern arch).
> >
> > I looked at the assembly output from the original code and I see two
> > compares at the end of the first loop, one for --len and one for (buf &
3)
> > which it has to compute. You are better off just computing buf & 3 once
> and
> > comparing with len once to compute init_len then in each iteration in
the
> > loop you only have to compare one thing.
> >
> > As proposed:
> > init_len = min((-((uintptr_t)buf)) & 3, len);
> > ...
> >
> > /* process unaligned initial bytes */
> > for (i = 0; i < init_len; i++)
> > DO_CRC(*buf++);
> >
> >
> > crc32x_le:
> > <.....>
> > negq %rcx # (-buf)
> > andl $3, %ecx # (-buf) & 3
> > cmpq %rdx, %rcx # min()
> > cmova %rdx, %rcx # init_len =
> > xorl %edi, %edi # i = 0
> > <.....>
> > jmp .L2
> > .L3:
> > # crc = tab[0][(crc ^ (*buf++)) & 255] ^ (crc >> 8)
> > movb (%rsi,%rdi), %dl # buf[i]
> > incq %rdi # buf++
> > xorl %eax, %edx # crc ^ *buf++
> > shrl $8, %eax # crc >> 8
> > movzbl %dl, %edx # & 255
> > xorl crc32table_le(,%rdx,4), %eax # crc =
> > tab[...] ^ (crc >> 8)
> > .L2:
> > cmpq %rcx, %rdi # compare i with
> > init_len
> > jb .L3
> > <.....>
> >
> > As was originally:
> > if (unlikely((long)buf & 3 && len)) {
> > do {
> > DO_CRC(*buf++);
> > } while ((--len) && ((long)buf)&3);
> > }
> >
> > crc32x_le:
> > pushq %rbp
> > movl %edi, %eax
> > pushq %rbx
> > testb $3, %sil
> > je .L16
> > testq %rdx, %rdx
> > .L25:
> > je .L16
> > movb (%rsi), %cl # *p
> > incq %rsi # p++
> > xorl %eax, %ecx # crc ^ *p
> > shrl $8, %eax # crc >> 8
> > movzbl %cl, %ecx # ... & 255
> > xorl crc32table_le(,%rcx,4), %eax # tab[...] ^ (crc >>
> > 8)
> > decq %rdx # len--
> > je .L16 # if len != 0
> > continue
> > testb $3, %sil # buf & 3
> > jmp .L25 # if buf & 3
> > continue
> > .L16:
> >
> > The first loop has 8 instructions second one has 11 instructions. First
loop
> > has slightly more setup to compute init_len, second loop has a couple of
> > extra branches to compute the if() outside of the loop.
> >
> > I get better performance with the new form of the loop. How, does PPC
> get
> > better results with two tests?
>
> You are right, even ppc gets less insns in the byte loop. However I tested
> your latest patch series and your version is a tiny bit slower than my
best
> version(numbers below)
> Probably due to that your version has a higher startup cost and uses more
> registers(3 regs are pushed to the stack vs. 2 regs in mine)
>
> >
> > >
> > > > >
> > > > > -/* implements slicing-by-4 or slicing-by-8 algorithm */
> > > > > -static inline u32
> > > > > -crc32_body(u32 crc, unsigned char const *buf, size_t len, const
u32
> > > > > (*tab)[256])
> > > >
> > > > After careful measurements and looking at asm code I figured out
that
> > > there
> > > > was no penalty for using 2D array. That allowed me to go back to the
> > > > original form.
> > >
> > > What about my suggestion to assign each table to a ptr? This was good
> > > for ppc so if it doesn't slow x86 it should be added.
> >
> > Gcc is very clever and inlined crc32_body() into crc32_le() and
crc32_be()
> > and then replaced references to tab[x][y] with references to
crc32table_le
> > and crc32table_be plus a constant offset which depended on the first
index
> > ([x]). So in effect it completely unrolled back to 1D arrays as
> > (crc32table_le + offset)[y], which is equivalent to the t0_le[y] code I
had
> > before.
>
> So x86 optimizes the table accesses well then. PPC32 doesn't and adding
> my small optimization makes a huge difference:
> org:
> crc32: CRC_LE_BITS = 8, CRC_BE BITS = 8
> crc32: self tests passed, processed 225944 bytes in 2257673 nsec
>
> my t0, t1, t2, t3 ptrs
> crc32: CRC_LE_BITS = 8, CRC_BE BITS = 8
> crc32: self tests passed, processed 225944 bytes in 1949869 nsec
>
> Numbers for 32 bits with your patch.
>
> v3 up to 6 of 7
> crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32
> crc32: self tests passed, processed 225944 bytes in 2362044 nsec
>
> v3 up to 7 of 7
> crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32
> crc32: self tests passed, processed 225944 bytes in 2149972 nsec
>
> my t0, t1, t2, t3 ptrs on v3 7 of 7
> crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32
> crc32: self tests passed, processed 225944 bytes in 1953290 nsec
>
> Finally, 64 bits on PPC is broken in v3!
>
> Jocke
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-10 15:13 ` Bob Pearson
@ 2011-08-10 15:48 ` Joakim Tjernlund
2011-08-10 21:57 ` Bob Pearson
0 siblings, 1 reply; 8+ messages in thread
From: Joakim Tjernlund @ 2011-08-10 15:48 UTC (permalink / raw)
To: Bob Pearson; +Cc: akpm, fzago, 'George Spelvin', linux-kernel
"Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/10 17:13:00:
> From: "Bob Pearson" <rpearson@systemfabricworks.com>
> To: "'Joakim Tjernlund'" <joakim.tjernlund@transmode.se>
> Cc: <akpm@linux-foundation.org>, <fzago@systemfabricworks.com>, "'George Spelvin'" <linux@horizon.com>, <linux-kernel@vger.kernel.org>
> Date: 2011/08/10 17:13
> Subject: RE: [patch v3 7/7] crc32: final-cleanup.diff
>
> OK. Can you post your current version of crc32.c? I'll try to merge them
> together.
OK, here it comes again; preferably this should be the first patch
in the series.
>From f5268d74f1a81610820e92785397f1247946ce15 Mon Sep 17 00:00:00 2001
From: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Date: Fri, 5 Aug 2011 17:49:42 +0200
Subject: [PATCH] crc32: Optimize inner loop.
taking a pointer reference to each row in the crc table matrix,
one can reduce the inner loop with a few insn's on RISC
archs like PowerPC.
---
lib/crc32.c | 21 +++++++++++----------
1 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/lib/crc32.c b/lib/crc32.c
index 4855995..b06d1e7 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -51,20 +51,21 @@ static inline u32
crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
{
# ifdef __LITTLE_ENDIAN
-# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
-# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
- tab[2][(crc >> 8) & 255] ^ \
- tab[1][(crc >> 16) & 255] ^ \
- tab[0][(crc >> 24) & 255]
+# define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8)
+# define DO_CRC4 crc = t3[(crc) & 255] ^ \
+ t2[(crc >> 8) & 255] ^ \
+ t1[(crc >> 16) & 255] ^ \
+ t0[(crc >> 24) & 255]
# else
-# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
-# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
- tab[1][(crc >> 8) & 255] ^ \
- tab[2][(crc >> 16) & 255] ^ \
- tab[3][(crc >> 24) & 255]
+# define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
+# define DO_CRC4 crc = t0[(crc) & 255] ^ \
+ t1[(crc >> 8) & 255] ^ \
+ t2[(crc >> 16) & 255] ^ \
+ t3[(crc >> 24) & 255]
# endif
const u32 *b;
size_t rem_len;
+ const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3];
/* Align it */
if (unlikely((long)buf & 3 && len)) {
--
1.7.3.4
^ permalink raw reply related [flat|nested] 8+ messages in thread
* RE: [patch v3 7/7] crc32: final-cleanup.diff
2011-08-10 15:48 ` Joakim Tjernlund
@ 2011-08-10 21:57 ` Bob Pearson
0 siblings, 0 replies; 8+ messages in thread
From: Bob Pearson @ 2011-08-10 21:57 UTC (permalink / raw)
To: 'Joakim Tjernlund'
Cc: akpm, fzago, 'George Spelvin', linux-kernel
> -----Original Message-----
> From: Joakim Tjernlund [mailto:joakim.tjernlund@transmode.se]
> Sent: Wednesday, August 10, 2011 10:49 AM
> To: Bob Pearson
> Cc: akpm@linux-foundation.org; fzago@systemfabricworks.com; 'George
> Spelvin'; linux-kernel@vger.kernel.org
> Subject: RE: [patch v3 7/7] crc32: final-cleanup.diff
>
> "Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/10
> 17:13:00:
>
> > From: "Bob Pearson" <rpearson@systemfabricworks.com>
> > To: "'Joakim Tjernlund'" <joakim.tjernlund@transmode.se>
> > Cc: <akpm@linux-foundation.org>, <fzago@systemfabricworks.com>,
> "'George Spelvin'" <linux@horizon.com>, <linux-kernel@vger.kernel.org>
> > Date: 2011/08/10 17:13
> > Subject: RE: [patch v3 7/7] crc32: final-cleanup.diff
> >
> > OK. Can you post your current version of crc32.c? I'll try to merge them
> > together.
>
> OK, here it comes again; preferably this should be the first patch
> in the series.
>
> From f5268d74f1a81610820e92785397f1247946ce15 Mon Sep 17 00:00:00 2001
> From: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
> Date: Fri, 5 Aug 2011 17:49:42 +0200
> Subject: [PATCH] crc32: Optimize inner loop.
>
> taking a pointer reference to each row in the crc table matrix,
> one can reduce the inner loop with a few insn's on RISC
> archs like PowerPC.
> ---
> lib/crc32.c | 21 +++++++++++----------
> 1 files changed, 11 insertions(+), 10 deletions(-)
>
> diff --git a/lib/crc32.c b/lib/crc32.c
> index 4855995..b06d1e7 100644
> --- a/lib/crc32.c
> +++ b/lib/crc32.c
> @@ -51,20 +51,21 @@ static inline u32
> crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32
> (*tab)[256])
> {
> # ifdef __LITTLE_ENDIAN
> -# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
> -# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
> - tab[2][(crc >> 8) & 255] ^ \
> - tab[1][(crc >> 16) & 255] ^ \
> - tab[0][(crc >> 24) & 255]
> +# define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8)
> +# define DO_CRC4 crc = t3[(crc) & 255] ^ \
> + t2[(crc >> 8) & 255] ^ \
> + t1[(crc >> 16) & 255] ^ \
> + t0[(crc >> 24) & 255]
> # else
> -# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
> -# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
> - tab[1][(crc >> 8) & 255] ^ \
> - tab[2][(crc >> 16) & 255] ^ \
> - tab[3][(crc >> 24) & 255]
> +# define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
> +# define DO_CRC4 crc = t0[(crc) & 255] ^ \
> + t1[(crc >> 8) & 255] ^ \
> + t2[(crc >> 16) & 255] ^ \
> + t3[(crc >> 24) & 255]
> # endif
> const u32 *b;
> size_t rem_len;
> + const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3];
>
> /* Align it */
> if (unlikely((long)buf & 3 && len)) {
> --
> 1.7.3.4
I tried this on X86_64 and Sparc 64. Very small improvement for Intel and
significant improvement for Sparc. Here are the results based on current
self test which is a mix of crc32_le and crc32_be with random offsets and
lengths:
Results are 'best case' I.e. I picked the shortest time from a handful of
runs.
Arch    CPU          Freq    BITS  bytes   nsec    cycles/byte
____________________________________________________________________________
Current proposed patch
X86_64  Intel E5520  2.268G  64    225944  161294  1.619
X86_64  Intel E5520  2.268G  32    225944  267795  2.688
Sun     Sparc III+   900M    64    225944  757235  3.028
Sun     Sparc III+   900M    32    225944  935558  3.727
With pointers instead of 2D array references
X86_64  Intel E5520  2.268G  64    225944  157975  1.584
X86_64  Intel E5520  2.268G  32    225944  273366  2.744
Sun     Sparc III+   900M    64    225944  570724  2.273
Sun     Sparc III+   900M    32    225944  848897  3.381
The change doesn't really help or hurt for X86_64 but significantly helps
Sparc and you report gains for PPC so it looks good.
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2011-08-10 21:57 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-08-09 5:29 [patch v3 7/7] crc32: final-cleanup.diff Bob Pearson
2011-08-09 6:21 ` Bob Pearson
2011-08-09 17:39 ` Joakim Tjernlund
2011-08-09 23:05 ` Bob Pearson
2011-08-10 11:40 ` Joakim Tjernlund
2011-08-10 15:13 ` Bob Pearson
2011-08-10 15:48 ` Joakim Tjernlund
2011-08-10 21:57 ` Bob Pearson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox