public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [patch v3 6/7] crc32: add-slicing-by-8.diff
@ 2011-08-09  5:27 Bob Pearson
  2011-08-09 11:21 ` George Spelvin
  2011-08-09 17:21 ` Joakim Tjernlund
  0 siblings, 2 replies; 6+ messages in thread
From: Bob Pearson @ 2011-08-09  5:27 UTC (permalink / raw)
  To: linux-kernel, linux, joakim.tjernlund, fzago, rpearson, akpm

add slicing-by-8 algorithm to the existing
slicing-by-4 algorithm. This consists of:

	- extend largest BITS size from 32 to 64
	- extend table from tab[4][256] to tab[8][256]
	- change algorithm to align on 8 byte boundary
	  (it was noted that all that is really required
	  is that the inner loop must comsume 8 bytes.
	  As written it can start on even or odd 4 byte
	  boundary.)
	- Add code for inner loop.

Signed-off-by: Bob Pearson <rpearson@systemfabricworks.com>

---
 lib/crc32.c          |   58 ++++++++++++++++++++++++++++++++++++++++++---------
 lib/crc32defs.h      |   14 ++++++------
 lib/gen_crc32table.c |   40 +++++++++++++++++++++--------------
 3 files changed, 79 insertions(+), 33 deletions(-)

Index: infiniband/lib/crc32.c
===================================================================
--- infiniband.orig/lib/crc32.c
+++ infiniband/lib/crc32.c
@@ -45,6 +45,7 @@ MODULE_LICENSE("GPL");

 #if CRC_LE_BITS > 8 || CRC_BE_BITS > 8

+/* implements slicing-by-4 or slicing-by-8 algorithm */
 static inline u32
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
 {
@@ -54,41 +55,78 @@ crc32_body(u32 crc, unsigned char const
 		tab[2][(crc >> 8) & 255] ^ \
 		tab[1][(crc >> 16) & 255] ^ \
 		tab[0][(crc >> 24) & 255]
+#  define DO_CRC8a (tab[7][(q) & 255] ^ \
+		tab[6][(q >> 8) & 255] ^ \
+		tab[5][(q >> 16) & 255] ^ \
+		tab[4][(q >> 24) & 255])
+#  define DO_CRC8b (tab[3][(q) & 255] ^ \
+		tab[2][(q >> 8) & 255] ^ \
+		tab[1][(q >> 16) & 255] ^ \
+		tab[0][(q >> 24) & 255])
 # else
 #  define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
 #  define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
 		tab[1][(crc >> 8) & 255] ^ \
 		tab[2][(crc >> 16) & 255] ^ \
 		tab[3][(crc >> 24) & 255]
+#  define DO_CRC8a (tab[4][(q) & 255] ^ \
+		tab[5][(q >> 8) & 255] ^ \
+		tab[6][(q >> 16) & 255] ^ \
+		tab[8][(q >> 24) & 255])
+#  define DO_CRC8b (tab[0][(q) & 255] ^ \
+		tab[1][(q >> 8) & 255] ^ \
+		tab[2][(q >> 16) & 255] ^ \
+		tab[3][(q >> 24) & 255])
 # endif
 	const u32 *b;
-	size_t    rem_len;
+	size_t init_len;
+	size_t middle_len;
+	size_t rem_len;
+
+	/* break buf into init_len bytes before and
+	 * rem_len bytes after a middle section with
+	 * middle_len properly aligned words */
+# if CRC_LE_BITS == 32
+	init_len = min((-((uintptr_t)buf)) & 3, len);
+	middle_len = (len - init_len) >> 2;
+	rem_len = (len - init_len) & 3;
+# else
+	init_len = min((-((uintptr_t)buf)) & 7, len);
+	middle_len = (len - init_len) >> 3;
+	rem_len = (len - init_len) & 7;
+# endif

 	/* Align it */
-	if (unlikely((long)buf & 3 && len)) {
+	if (unlikely(init_len)) {
 		do {
 			DO_CRC(*buf++);
-		} while ((--len) && ((long)buf)&3);
+		} while (--init_len);
 	}
-	rem_len = len & 3;
-	/* load data 32 bits wide, xor data 32 bits wide. */
-	len = len >> 2;
 	b = (const u32 *)buf;
-	for (--b; len; --len) {
+	for (--b; middle_len; --middle_len) {
+# if CRC_LE_BITS == 32
 		crc ^= *++b; /* use pre increment for speed */
 		DO_CRC4;
+# else
+		u32 q;
+		q = crc ^ *++b;
+		crc = DO_CRC8a;
+		q = *++b;
+		crc ^= DO_CRC8b;
+# endif
 	}
-	len = rem_len;
 	/* And the last few bytes */
-	if (len) {
+	if (rem_len) {
 		u8 *p = (u8 *)(b + 1) - 1;
 		do {
 			DO_CRC(*++p); /* use pre increment for speed */
-		} while (--len);
+		} while (--rem_len);
 	}
 	return crc;
 #undef DO_CRC
 #undef DO_CRC4
+#undef DO_CRC8a
+#undef DO_CRC8b
 }
 #endif

Index: infiniband/lib/crc32defs.h
===================================================================
--- infiniband.orig/lib/crc32defs.h
+++ infiniband/lib/crc32defs.h
@@ -6,29 +6,29 @@
 #define CRCPOLY_LE 0xedb88320
 #define CRCPOLY_BE 0x04c11db7

-/* How many bits at a time to use.  Valid values are 1, 2, 4, 8, and 32. */
+/* How many bits at a time to use.  Valid values are 1, 2, 4, 8, 32 and 64. */
 /* For less performance-sensitive, use 4 or 8 */
 #ifndef CRC_LE_BITS
-# define CRC_LE_BITS 32
+# define CRC_LE_BITS 64
 #endif
 #ifndef CRC_BE_BITS
-# define CRC_BE_BITS 32
+# define CRC_BE_BITS 64
 #endif

 /*
  * Little-endian CRC computation.  Used with serial bit streams sent
  * lsbit-first.  Be sure to use cpu_to_le32() to append the computed CRC.
  */
-#if CRC_LE_BITS > 32 || CRC_LE_BITS < 1 || CRC_LE_BITS == 16 || \
+#if CRC_LE_BITS > 64 || CRC_LE_BITS < 1 || CRC_LE_BITS == 16 || \
 	CRC_LE_BITS & CRC_LE_BITS-1
-# error "CRC_LE_BITS must be one of {1, 2, 4, 8, 32}"
+# error "CRC_LE_BITS must be one of {1, 2, 4, 8, 64}"
 #endif

 /*
  * Big-endian CRC computation.  Used with serial bit streams sent
  * msbit-first.  Be sure to use cpu_to_be32() to append the computed CRC.
  */
-#if CRC_BE_BITS > 32 || CRC_BE_BITS < 1 || CRC_BE_BITS == 16 || \
+#if CRC_BE_BITS > 64 || CRC_BE_BITS < 1 || CRC_BE_BITS == 16 || \
 	CRC_BE_BITS & CRC_BE_BITS-1
-# error "CRC_BE_BITS must be one of {1, 2, 4, 8, 32}"
+# error "CRC_BE_BITS must be one of {1, 2, 4, 8, 64}"
 #endif
Index: infiniband/lib/gen_crc32table.c
===================================================================
--- infiniband.orig/lib/gen_crc32table.c
+++ infiniband/lib/gen_crc32table.c
@@ -4,20 +4,24 @@

 #define ENTRIES_PER_LINE 4

-#if CRC_LE_BITS <= 8
-#define LE_TABLE_SIZE (1 << CRC_LE_BITS)
+#if CRC_LE_BITS > 8
+# define LE_TABLE_ROWS (CRC_LE_BITS/8)
+# define LE_TABLE_SIZE 256
 #else
-#define LE_TABLE_SIZE 256
+# define LE_TABLE_ROWS 1
+# define LE_TABLE_SIZE (1 << CRC_LE_BITS)
 #endif

-#if CRC_BE_BITS <= 8
-#define BE_TABLE_SIZE (1 << CRC_BE_BITS)
+#if CRC_BE_BITS > 8
+# define BE_TABLE_ROWS (CRC_BE_BITS/8)
+# define BE_TABLE_SIZE 256
 #else
-#define BE_TABLE_SIZE 256
+# define BE_TABLE_ROWS 1
+# define BE_TABLE_SIZE (1 << CRC_BE_BITS)
 #endif

-static uint32_t crc32table_le[8][256];
-static uint32_t crc32table_be[8][256];
+static uint32_t crc32table_le[LE_TABLE_ROWS][256];
+static uint32_t crc32table_be[BE_TABLE_ROWS][256];

 /**
  * crc32init_le() - allocate and initialize LE table data
@@ -40,7 +44,7 @@ static void crc32init_le(void)
 	}
 	for (i = 0; i < LE_TABLE_SIZE; i++) {
 		crc = crc32table_le[0][i];
-		for (j = 1; j < 8; j++) {
+		for (j = 1; j < LE_TABLE_ROWS; j++) {
 			crc = crc32table_le[0][crc & 0xff] ^ (crc >> 8);
 			crc32table_le[j][i] = crc;
 		}
@@ -64,18 +68,18 @@ static void crc32init_be(void)
 	}
 	for (i = 0; i < BE_TABLE_SIZE; i++) {
 		crc = crc32table_be[0][i];
-		for (j = 1; j < 8; j++) {
+		for (j = 1; j < BE_TABLE_ROWS; j++) {
 			crc = crc32table_be[0][(crc >> 24) & 0xff] ^ (crc << 8);
 			crc32table_be[j][i] = crc;
 		}
 	}
 }

-static void output_table(uint32_t table[8][256], int len, char *trans)
+static void output_table(uint32_t (*table)[256], int rows, int len, char *trans)
 {
 	int i, j;

-	for (j = 0 ; j < 8; j++) {
+	for (j = 0 ; j < rows; j++) {
 		printf("{");
 		for (i = 0; i < len - 1; i++) {
 			if (i % ENTRIES_PER_LINE == 0)
@@ -92,15 +96,19 @@ int main(int argc, char** argv)

 	if (CRC_LE_BITS > 1) {
 		crc32init_le();
-		printf("static const u32 crc32table_le[8][256] = {");
-		output_table(crc32table_le, LE_TABLE_SIZE, "tole");
+		printf("static const u32 crc32table_le[%d][%d] = {",
+		       LE_TABLE_ROWS, LE_TABLE_SIZE);
+		output_table(crc32table_le, LE_TABLE_ROWS,
+			     LE_TABLE_SIZE, "tole");
 		printf("};\n");
 	}

 	if (CRC_BE_BITS > 1) {
 		crc32init_be();
-		printf("static const u32 crc32table_be[8][256] = {");
-		output_table(crc32table_be, BE_TABLE_SIZE, "tobe");
+		printf("static const u32 crc32table_be[%d][%d] = {",
+		       BE_TABLE_ROWS, BE_TABLE_SIZE);
+		output_table(crc32table_be, LE_TABLE_ROWS,
+			     BE_TABLE_SIZE, "tobe");
 		printf("};\n");
 	}


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [patch v3 6/7] crc32: add-slicing-by-8.diff
  2011-08-09  5:27 [patch v3 6/7] crc32: add-slicing-by-8.diff Bob Pearson
@ 2011-08-09 11:21 ` George Spelvin
  2011-08-09 15:28   ` Bob Pearson
  2011-08-09 17:21 ` Joakim Tjernlund
  1 sibling, 1 reply; 6+ messages in thread
From: George Spelvin @ 2011-08-09 11:21 UTC (permalink / raw)
  To: akpm, fzago, joakim.tjernlund, linux-kernel, linux, rpearson

While writing up some documentation of this algorithm, I came up with
a potential speedup.  Or, at least, realized why slicing by more than
4 is so much faster than slicing by 4 or less.

Note that the inner loop of the algorithm is as follows:

+#  define DO_CRC8a (tab[7][(q) & 255] ^ \
+		tab[6][(q >> 8) & 255] ^ \
+		tab[5][(q >> 16) & 255] ^ \
+		tab[4][(q >> 24) & 255])
+#  define DO_CRC8b (tab[3][(q) & 255] ^ \
+		tab[2][(q >> 8) & 255] ^ \
+		tab[1][(q >> 16) & 255] ^ \
+		tab[0][(q >> 24) & 255])

+	for (--b; middle_len; --middle_len) {
+		u32 q;
+		q = crc ^ *++b;
+		crc = DO_CRC8a;
+		q = *++b;
+		crc ^= DO_CRC8b;
 	}

Note the data dependencies: DO_CRC8a depends on the
previous crc, which depends on the previous DO_CRC8b.
But DO_CRC8b does not depend on anything except the
input data at *++b.

It would increase parallelism to schedule DO_CRC8b before DO_CRC8a,
to start those loads before the previous crc value is available.

Maybe the compiler and/pr processor can find this parallelism already,
but if not, it might be useful to try reordering it:

#  define DO_CRC8a(x) (tab[7][(x) & 255] ^ \
		tab[6][((x) >> 8) & 255] ^ \
		tab[5][((x) >> 16) & 255] ^ \
		tab[4][((x) >> 24) & 255])
#  define DO_CRC8b(x) (tab[3][(x) & 255] ^ \
		tab[2][((x) >> 8) & 255] ^ \
		tab[1][((x) >> 16) & 255] ^ \
		tab[0][((x) >> 24) & 255])

	for ( ; middle_len; --middle_len, b += 2) {
		u32 q = DO_CRC8b(b[1]);
		crc ^= b[0];
		crc = q ^ DO_CRC8a(crc);
	}

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [patch v3 6/7] crc32: add-slicing-by-8.diff
  2011-08-09 11:21 ` George Spelvin
@ 2011-08-09 15:28   ` Bob Pearson
  0 siblings, 0 replies; 6+ messages in thread
From: Bob Pearson @ 2011-08-09 15:28 UTC (permalink / raw)
  To: 'George Spelvin', akpm, fzago, joakim.tjernlund,
	linux-kernel



> -----Original Message-----
> From: George Spelvin [mailto:linux@horizon.com]
> Sent: Tuesday, August 09, 2011 6:21 AM
> To: akpm@linux-foundation.org; fzago@systemfabricworks.com;
> joakim.tjernlund@transmode.se; linux-kernel@vger.kernel.org;
> linux@horizon.com; rpearson@systemfabricworks.com
> Subject: Re: [patch v3 6/7] crc32: add-slicing-by-8.diff
> 
> While writing up some documentation of this algorithm, I came up with
> a potential speedup.  Or, at least, realized why slicing by more than
> 4 is so much faster than slicing by 4 or less.
> 
> Note that the inner loop of the algorithm is as follows:
> 
> +#  define DO_CRC8a (tab[7][(q) & 255] ^ \
> +		tab[6][(q >> 8) & 255] ^ \
> +		tab[5][(q >> 16) & 255] ^ \
> +		tab[4][(q >> 24) & 255])
> +#  define DO_CRC8b (tab[3][(q) & 255] ^ \
> +		tab[2][(q >> 8) & 255] ^ \
> +		tab[1][(q >> 16) & 255] ^ \
> +		tab[0][(q >> 24) & 255])
> 
> +	for (--b; middle_len; --middle_len) {
> +		u32 q;
> +		q = crc ^ *++b;
> +		crc = DO_CRC8a;
> +		q = *++b;
> +		crc ^= DO_CRC8b;
>  	}
> 
> Note the data dependencies: DO_CRC8a depends on the
> previous crc, which depends on the previous DO_CRC8b.
> But DO_CRC8b does not depend on anything except the
> input data at *++b.

I think you've got it. That is exactly why it works. The hardware is pretty
good at finding the parallelism since this code runs at almost 2X the speed
of the CRC4 loop.

> 
> It would increase parallelism to schedule DO_CRC8b before DO_CRC8a,
> to start those loads before the previous crc value is available.
> 
> Maybe the compiler and/pr processor can find this parallelism already,
> but if not, it might be useful to try reordering it:
> 
> #  define DO_CRC8a(x) (tab[7][(x) & 255] ^ \
> 		tab[6][((x) >> 8) & 255] ^ \
> 		tab[5][((x) >> 16) & 255] ^ \
> 		tab[4][((x) >> 24) & 255])
> #  define DO_CRC8b(x) (tab[3][(x) & 255] ^ \
> 		tab[2][((x) >> 8) & 255] ^ \
> 		tab[1][((x) >> 16) & 255] ^ \
> 		tab[0][((x) >> 24) & 255])
> 
> 	for ( ; middle_len; --middle_len, b += 2) {
> 		u32 q = DO_CRC8b(b[1]);
> 		crc ^= b[0];
> 		crc = q ^ DO_CRC8a(crc);
> 	}


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [patch v3 6/7] crc32: add-slicing-by-8.diff
  2011-08-09  5:27 [patch v3 6/7] crc32: add-slicing-by-8.diff Bob Pearson
  2011-08-09 11:21 ` George Spelvin
@ 2011-08-09 17:21 ` Joakim Tjernlund
  2011-08-09 20:52   ` Bob Pearson
  1 sibling, 1 reply; 6+ messages in thread
From: Joakim Tjernlund @ 2011-08-09 17:21 UTC (permalink / raw)
  To: Bob Pearson; +Cc: akpm, fzago, linux, linux-kernel


Bob Pearson <rpearson@systemfabricworks.com> wrote on 2011/08/09 07:27:59:
>
> add slicing-by-8 algorithm to the existing
> slicing-by-4 algorithm. This consists of:
>
>    - extend largest BITS size from 32 to 64
>    - extend table from tab[4][256] to tab[8][256]
>    - change algorithm to align on 8 byte boundary
>      (it was noted that all that is really required
>      is that the inner loop must comsume 8 bytes.
>      As written it can start on even or odd 4 byte
>      boundary.)
So why not do that in the code too?

>    - Add code for inner loop.
>
> Signed-off-by: Bob Pearson <rpearson@systemfabricworks.com>
>
> ---
>  lib/crc32.c          |   58 ++++++++++++++++++++++++++++++++++++++++++---------
>  lib/crc32defs.h      |   14 ++++++------
>  lib/gen_crc32table.c |   40 +++++++++++++++++++++--------------
>  3 files changed, 79 insertions(+), 33 deletions(-)
>
> Index: infiniband/lib/crc32.c
> ===================================================================
> --- infiniband.orig/lib/crc32.c
> +++ infiniband/lib/crc32.c
> @@ -45,6 +45,7 @@ MODULE_LICENSE("GPL");
>
>  #if CRC_LE_BITS > 8 || CRC_BE_BITS > 8
>
> +/* implements slicing-by-4 or slicing-by-8 algorithm */
>  static inline u32
>  crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
>  {
> @@ -54,41 +55,78 @@ crc32_body(u32 crc, unsigned char const
>        tab[2][(crc >> 8) & 255] ^ \
>        tab[1][(crc >> 16) & 255] ^ \
>        tab[0][(crc >> 24) & 255]
> +#  define DO_CRC8a (tab[7][(q) & 255] ^ \
> +      tab[6][(q >> 8) & 255] ^ \
> +      tab[5][(q >> 16) & 255] ^ \
> +      tab[4][(q >> 24) & 255])
> +#  define DO_CRC8b (tab[3][(q) & 255] ^ \
> +      tab[2][(q >> 8) & 255] ^ \
> +      tab[1][(q >> 16) & 255] ^ \
> +      tab[0][(q >> 24) & 255])
>  # else
>  #  define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
>  #  define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
>        tab[1][(crc >> 8) & 255] ^ \
>        tab[2][(crc >> 16) & 255] ^ \
>        tab[3][(crc >> 24) & 255]
> +#  define DO_CRC8a (tab[4][(q) & 255] ^ \
> +      tab[5][(q >> 8) & 255] ^ \
> +      tab[6][(q >> 16) & 255] ^ \
> +      tab[8][(q >> 24) & 255])
> +#  define DO_CRC8b (tab[0][(q) & 255] ^ \
> +      tab[1][(q >> 8) & 255] ^ \
> +      tab[2][(q >> 16) & 255] ^ \
> +      tab[3][(q >> 24) & 255])
>  # endif
>     const u32 *b;
> -   size_t    rem_len;
> +   size_t init_len;
> +   size_t middle_len;
> +   size_t rem_len;
> +
> +   /* break buf into init_len bytes before and
> +    * rem_len bytes after a middle section with
> +    * middle_len properly aligned words */
> +# if CRC_LE_BITS == 32
> +   init_len = min((-((uintptr_t)buf)) & 3, len);
> +   middle_len = (len - init_len) >> 2;
> +   rem_len = (len - init_len) & 3;
> +# else
> +   init_len = min((-((uintptr_t)buf)) & 7, len);
> +   middle_len = (len - init_len) >> 3;
> +   rem_len = (len - init_len) & 7;
> +# endif
>
>     /* Align it */
> -   if (unlikely((long)buf & 3 && len)) {
> +   if (unlikely(init_len)) {
>        do {
>           DO_CRC(*buf++);
> -      } while ((--len) && ((long)buf)&3);
> +      } while (--init_len);
>     }

As discussed earlier, don't change initial loop. Best left alone unless
you have verified it generates better code.

> -   rem_len = len & 3;
> -   /* load data 32 bits wide, xor data 32 bits wide. */
> -   len = len >> 2;
>     b = (const u32 *)buf;
> -   for (--b; len; --len) {
> +   for (--b; middle_len; --middle_len) {
> +# if CRC_LE_BITS == 32
>        crc ^= *++b; /* use pre increment for speed */
>        DO_CRC4;
> +# else
> +      u32 q;
> +      q = crc ^ *++b;
> +      crc = DO_CRC8a;
> +      q = *++b;
> +      crc ^= DO_CRC8b;
> +# endif
>     }
> -   len = rem_len;
>     /* And the last few bytes */
> -   if (len) {
> +   if (rem_len) {
>        u8 *p = (u8 *)(b + 1) - 1;
>        do {
>           DO_CRC(*++p); /* use pre increment for speed */
> -      } while (--len);
> +      } while (--rem_len);
>     }
>     return crc;
>  #undef DO_CRC
>  #undef DO_CRC4
> +#undef DO_CRC8a
> +#undef DO_CRC8b
>  }
>  #endif
>
> Index: infiniband/lib/crc32defs.h
> ===================================================================
> --- infiniband.orig/lib/crc32defs.h
> +++ infiniband/lib/crc32defs.h
> @@ -6,29 +6,29 @@
>  #define CRCPOLY_LE 0xedb88320
>  #define CRCPOLY_BE 0x04c11db7
>
> -/* How many bits at a time to use.  Valid values are 1, 2, 4, 8, and 32. */
> +/* How many bits at a time to use.  Valid values are 1, 2, 4, 8, 32 and 64. */
>  /* For less performance-sensitive, use 4 or 8 */
>  #ifndef CRC_LE_BITS
> -# define CRC_LE_BITS 32
> +# define CRC_LE_BITS 64

64 bits as default for all archs? Not very likely, especially
as I already told you that 64 bits made my ppc32 drop to
half the size.

>  #endif
>  #ifndef CRC_BE_BITS
> -# define CRC_BE_BITS 32
> +# define CRC_BE_BITS 64
>  #endif
>
>  /*
>   * Little-endian CRC computation.  Used with serial bit streams sent
>   * lsbit-first.  Be sure to use cpu_to_le32() to append the computed CRC.
>   */
> -#if CRC_LE_BITS > 32 || CRC_LE_BITS < 1 || CRC_LE_BITS == 16 || \
> +#if CRC_LE_BITS > 64 || CRC_LE_BITS < 1 || CRC_LE_BITS == 16 || \
>     CRC_LE_BITS & CRC_LE_BITS-1
> -# error "CRC_LE_BITS must be one of {1, 2, 4, 8, 32}"
> +# error "CRC_LE_BITS must be one of {1, 2, 4, 8, 64}"

32 is missing

>  #endif
>
>  /*
>   * Big-endian CRC computation.  Used with serial bit streams sent
>   * msbit-first.  Be sure to use cpu_to_be32() to append the computed CRC.
>   */
> -#if CRC_BE_BITS > 32 || CRC_BE_BITS < 1 || CRC_BE_BITS == 16 || \
> +#if CRC_BE_BITS > 64 || CRC_BE_BITS < 1 || CRC_BE_BITS == 16 || \
>     CRC_BE_BITS & CRC_BE_BITS-1
> -# error "CRC_BE_BITS must be one of {1, 2, 4, 8, 32}"
> +# error "CRC_BE_BITS must be one of {1, 2, 4, 8, 64}"

Ditto.

>  #endif
> Index: infiniband/lib/gen_crc32table.c
> ===================================================================
> --- infiniband.orig/lib/gen_crc32table.c
> +++ infiniband/lib/gen_crc32table.c
> @@ -4,20 +4,24 @@
>
>  #define ENTRIES_PER_LINE 4
>
> -#if CRC_LE_BITS <= 8
> -#define LE_TABLE_SIZE (1 << CRC_LE_BITS)
> +#if CRC_LE_BITS > 8
> +# define LE_TABLE_ROWS (CRC_LE_BITS/8)
> +# define LE_TABLE_SIZE 256
>  #else
> -#define LE_TABLE_SIZE 256
> +# define LE_TABLE_ROWS 1
> +# define LE_TABLE_SIZE (1 << CRC_LE_BITS)
>  #endif
>
> -#if CRC_BE_BITS <= 8
> -#define BE_TABLE_SIZE (1 << CRC_BE_BITS)
> +#if CRC_BE_BITS > 8
> +# define BE_TABLE_ROWS (CRC_BE_BITS/8)
> +# define BE_TABLE_SIZE 256
>  #else
> -#define BE_TABLE_SIZE 256
> +# define BE_TABLE_ROWS 1
> +# define BE_TABLE_SIZE (1 << CRC_BE_BITS)
>  #endif
>
> -static uint32_t crc32table_le[8][256];
> -static uint32_t crc32table_be[8][256];
> +static uint32_t crc32table_le[LE_TABLE_ROWS][256];
> +static uint32_t crc32table_be[BE_TABLE_ROWS][256];

256 should be XX_TABLE_SIZE

>
>  /**
>   * crc32init_le() - allocate and initialize LE table data
> @@ -40,7 +44,7 @@ static void crc32init_le(void)
>     }
>     for (i = 0; i < LE_TABLE_SIZE; i++) {
>        crc = crc32table_le[0][i];
> -      for (j = 1; j < 8; j++) {
> +      for (j = 1; j < LE_TABLE_ROWS; j++) {
>           crc = crc32table_le[0][crc & 0xff] ^ (crc >> 8);
>           crc32table_le[j][i] = crc;
>        }
> @@ -64,18 +68,18 @@ static void crc32init_be(void)
>     }
>     for (i = 0; i < BE_TABLE_SIZE; i++) {
>        crc = crc32table_be[0][i];
> -      for (j = 1; j < 8; j++) {
> +      for (j = 1; j < BE_TABLE_ROWS; j++) {
>           crc = crc32table_be[0][(crc >> 24) & 0xff] ^ (crc << 8);
>           crc32table_be[j][i] = crc;
>        }
>     }
>  }
>
> -static void output_table(uint32_t table[8][256], int len, char *trans)
> +static void output_table(uint32_t (*table)[256], int rows, int len, char *trans)

Size is not always 256.

>  {
>     int i, j;
>
> -   for (j = 0 ; j < 8; j++) {
> +   for (j = 0 ; j < rows; j++) {
>        printf("{");
>        for (i = 0; i < len - 1; i++) {
>           if (i % ENTRIES_PER_LINE == 0)
> @@ -92,15 +96,19 @@ int main(int argc, char** argv)
>
>     if (CRC_LE_BITS > 1) {
>        crc32init_le();
> -      printf("static const u32 crc32table_le[8][256] = {");
> -      output_table(crc32table_le, LE_TABLE_SIZE, "tole");
> +      printf("static const u32 crc32table_le[%d][%d] = {",
> +             LE_TABLE_ROWS, LE_TABLE_SIZE);
> +      output_table(crc32table_le, LE_TABLE_ROWS,
> +              LE_TABLE_SIZE, "tole");
>        printf("};\n");
>     }
>
>     if (CRC_BE_BITS > 1) {
>        crc32init_be();
> -      printf("static const u32 crc32table_be[8][256] = {");
> -      output_table(crc32table_be, BE_TABLE_SIZE, "tobe");
> +      printf("static const u32 crc32table_be[%d][%d] = {",
> +             BE_TABLE_ROWS, BE_TABLE_SIZE);
> +      output_table(crc32table_be, LE_TABLE_ROWS,
> +              BE_TABLE_SIZE, "tobe");
>        printf("};\n");
>     }
>


^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [patch v3 6/7] crc32: add-slicing-by-8.diff
  2011-08-09 17:21 ` Joakim Tjernlund
@ 2011-08-09 20:52   ` Bob Pearson
  2011-08-10  9:32     ` Joakim Tjernlund
  0 siblings, 1 reply; 6+ messages in thread
From: Bob Pearson @ 2011-08-09 20:52 UTC (permalink / raw)
  To: 'Joakim Tjernlund'; +Cc: akpm, fzago, linux, linux-kernel

> >    - extend largest BITS size from 32 to 64
> >    - extend table from tab[4][256] to tab[8][256]
> >    - change algorithm to align on 8 byte boundary
> >      (it was noted that all that is really required
> >      is that the inner loop must comsume 8 bytes.
> >      As written it can start on even or odd 4 byte
> >      boundary.)
> So why not do that in the code too?
> 

I did the experiment with the random test set and it is a couple of % faster
so I made the change. I had thought that it wouldn't make a measurable
difference.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [patch v3 6/7] crc32: add-slicing-by-8.diff
  2011-08-09 20:52   ` Bob Pearson
@ 2011-08-10  9:32     ` Joakim Tjernlund
  0 siblings, 0 replies; 6+ messages in thread
From: Joakim Tjernlund @ 2011-08-10  9:32 UTC (permalink / raw)
  To: Bob Pearson; +Cc: akpm, fzago, linux, linux-kernel

"Bob Pearson" <rpearson@systemfabricworks.com> wrote on 2011/08/09 22:52:34:

> From: "Bob Pearson" <rpearson@systemfabricworks.com>
> To: "'Joakim Tjernlund'" <joakim.tjernlund@transmode.se>
> Cc: <akpm@linux-foundation.org>, <fzago@systemfabricworks.com>, <linux@horizon.com>, <linux-kernel@vger.kernel.org>
> Date: 2011/08/09 22:52
> Subject: RE: [patch v3 6/7] crc32: add-slicing-by-8.diff
>
> > >    - extend largest BITS size from 32 to 64
> > >    - extend table from tab[4][256] to tab[8][256]
> > >    - change algorithm to align on 8 byte boundary
> > >      (it was noted that all that is really required
> > >      is that the inner loop must comsume 8 bytes.
> > >      As written it can start on even or odd 4 byte
> > >      boundary.)
> > So why not do that in the code too?
> >
>
> I did the experiment with the random test set and it is a couple of % faster
> so I made the change. I had thought that it wouldn't make a measurable
> difference.

That could depend on how the test data table is aligned. What if you change
the alignment of the table?


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2011-08-10  9:33 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-08-09  5:27 [patch v3 6/7] crc32: add-slicing-by-8.diff Bob Pearson
2011-08-09 11:21 ` George Spelvin
2011-08-09 15:28   ` Bob Pearson
2011-08-09 17:21 ` Joakim Tjernlund
2011-08-09 20:52   ` Bob Pearson
2011-08-10  9:32     ` Joakim Tjernlund

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox