RE: [patch v3 7/7] crc32: final-cleanup.diff

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Bob Pearson" <rpearson@systemfabricworks.com>
To: <linux-kernel@vger.kernel.org>, <linux@horizon.com>,
	<joakim.tjernlund@transmode.se>, <fzago@systemfabricworks.com>,
	<akpm@linux-foundation.org>
Subject: RE: [patch v3 7/7] crc32: final-cleanup.diff
Date: Tue, 9 Aug 2011 01:21:38 -0500	[thread overview]
Message-ID: <00d401cc565c$98eeab60$cacc0220$@systemfabricworks.com> (raw)
In-Reply-To: <4E40C5C7.2050609@systemfabricworks.com>

Additional comments.

 
> Some final cleanup changes
> 
> 	- added a comment at the top of crc32.c
> 	- moved macros ahead of function prototype
> 	- replaced loops with for (i = 0; i < xxx; i++) which
> 	  requires fewer instructions on x86 since the
> 	  buffer lookups can use i as an index.
> 
> -/* implements slicing-by-4 or slicing-by-8 algorithm */
> -static inline u32
> -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])

After careful measurements and looking at the asm code, I figured out that
there was no penalty for using a 2D array. That allowed me to go back to the
original form.

> -{
>  # ifdef __LITTLE_ENDIAN
>  #  define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
> -#  define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
> -		tab[2][(crc >> 8) & 255] ^ \
> -		tab[1][(crc >> 16) & 255] ^ \
> -		tab[0][(crc >> 24) & 255]
> -#  define DO_CRC8a (tab[7][(q) & 255] ^ \
> -		tab[6][(q >> 8) & 255] ^ \
> -		tab[5][(q >> 16) & 255] ^ \
> -		tab[4][(q >> 24) & 255])
> -#  define DO_CRC8b (tab[3][(q) & 255] ^ \
> +#  define DO_CRC4 (tab[3][(q) & 255] ^ \
>  		tab[2][(q >> 8) & 255] ^ \
>  		tab[1][(q >> 16) & 255] ^ \
>  		tab[0][(q >> 24) & 255])

By changing the syntax a little so that the assignment is done down below,
the 4-byte and 8-byte algorithms can share DO_CRC4.

> +#  define DO_CRC8 (tab[7][(q) & 255] ^ \
> +		tab[6][(q >> 8) & 255] ^ \
> +		tab[5][(q >> 16) & 255] ^ \
> +		tab[4][(q >> 24) & 255])
>  # else
>  #  define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
> -#  define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
> -		tab[1][(crc >> 8) & 255] ^ \
> -		tab[2][(crc >> 16) & 255] ^ \
> -		tab[3][(crc >> 24) & 255]
> -#  define DO_CRC8a (tab[4][(q) & 255] ^ \
> -		tab[5][(q >> 8) & 255] ^ \
> -		tab[6][(q >> 16) & 255] ^ \
> -		tab[8][(q >> 24) & 255])
> -#  define DO_CRC8b (tab[0][(q) & 255] ^ \
> +#  define DO_CRC4 (tab[0][(q) & 255] ^ \
>  		tab[1][(q >> 8) & 255] ^ \
>  		tab[2][(q >> 16) & 255] ^ \
>  		tab[3][(q >> 24) & 255])
> +#  define DO_CRC8 (tab[4][(q) & 255] ^ \
> +		tab[5][(q >> 8) & 255] ^ \
> +		tab[6][(q >> 16) & 255] ^ \
> +		tab[7][(q >> 24) & 255])
>  # endif
> +
> +/* implements slicing-by-4 or slicing-by-8 algorithm */
> +static inline u32 crc32_body(u32 crc, unsigned char const *buf,
> +			     size_t len, const u32 (*tab)[256])
> +{
>  	const u32 *b;
> +	u8 *p;
> +	u32 q;
>  	size_t init_len;
>  	size_t middle_len;
>  	size_t rem_len;
> +	size_t i;
> 
>  	/* break buf into init_len bytes before and
>  	 * rem_len bytes after a middle section with
> @@ -96,37 +96,34 @@ crc32_body(u32 crc, unsigned char const
>  	rem_len = (len - init_len) & 7;
>  # endif
> 
> -	/* Align it */
> -	if (unlikely(init_len)) {
> -		do {
> -			DO_CRC(*buf++);
> -		} while (--init_len);
> -	}
> -	b = (const u32 *)buf;
> -	for (--b; middle_len; --middle_len) {
> +	/* process unaligned initial bytes */
> +	for (i = 0; i < init_len; i++)
> +		DO_CRC(*buf++);
> +
> +	/* process aligned words */
> +	b = (const u32 *)(buf - 4);
> +
> +	for (i = 0; i < middle_len; i++) {
>  # if CRC_LE_BITS == 32
> -		crc ^= *++b; /* use pre increment for speed */
> -		DO_CRC4;
> +		/* slicing-by-4 algorithm */
> +		q = crc ^ *++b; /* use pre increment for speed */
> +		crc = DO_CRC4;
>  # else
> -		u32 q;
> +		/* slicing-by-8 algorithm */
>  		q = crc ^ *++b;
> -		crc = DO_CRC8a;
> +		crc = DO_CRC8;
>  		q = *++b;
> -		crc ^= DO_CRC8b;
> +		crc ^= DO_CRC4;
>  # endif

This really shows how close the two algorithms are.

>  	}
> -	/* And the last few bytes */
> -	if (rem_len) {
> -		u8 *p = (u8 *)(b + 1) - 1;
> -		do {
> -			DO_CRC(*++p); /* use pre increment for speed */
> -		} while (--rem_len);
> -	}
> +
> +	/* process unaligned remaining bytes */
> +	p = (u8 *)(b + 1) - 1;
> +
> +	for (i = 0; i < rem_len; i++)
> +		DO_CRC(*++p); /* use pre increment for speed */
> +
>  	return crc;
> -#undef DO_CRC
> -#undef DO_CRC4
> -#undef DO_CRC8a
> -#undef DO_CRC8b
>  }
>  #endif

next prev parent reply	other threads:[~2011-08-09  6:21 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-08-09  5:29 [patch v3 7/7] crc32: final-cleanup.diff Bob Pearson
2011-08-09  6:21 ` Bob Pearson [this message]
2011-08-09 17:39   ` Joakim Tjernlund
2011-08-09 23:05     ` Bob Pearson
2011-08-10 11:40       ` Joakim Tjernlund
2011-08-10 15:13         ` Bob Pearson
2011-08-10 15:48           ` Joakim Tjernlund
2011-08-10 21:57             ` Bob Pearson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='00d401cc565c$98eeab60$cacc0220$@systemfabricworks.com' \
    --to=rpearson@systemfabricworks.com \
    --cc=akpm@linux-foundation.org \
    --cc=fzago@systemfabricworks.com \
    --cc=joakim.tjernlund@transmode.se \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux@horizon.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.