* [PATCH 0/3] crypto: x86/crc32c - jump table elimination and other cleanups
@ 2024-10-14 4:24 Eric Biggers
2024-10-14 4:24 ` [PATCH 1/3] crypto: x86/crc32c - simplify code for handling fewer than 200 bytes Eric Biggers
` (4 more replies)
0 siblings, 5 replies; 10+ messages in thread
From: Eric Biggers @ 2024-10-14 4:24 UTC (permalink / raw)
To: linux-crypto
Cc: x86, linux-kernel, Ard Biesheuvel, Josh Poimboeuf, Peter Zijlstra
This series cleans up the x86_64 assembly implementation of CRC32C to
reduce code size, improve performance, and eliminate the use of the
outdated and problematic jump table idiom.
Eric Biggers (3):
crypto: x86/crc32c - simplify code for handling fewer than 200 bytes
crypto: x86/crc32c - access 32-bit arguments as 32-bit
crypto: x86/crc32c - eliminate jump table and excessive unrolling
arch/x86/crypto/crc32c-intel_glue.c | 2 +-
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 354 ++++++++--------------
2 files changed, 126 insertions(+), 230 deletions(-)
base-commit: cfea70e835b9180029257d8b772c9e99c3305a9a
--
2.47.0
* [PATCH 1/3] crypto: x86/crc32c - simplify code for handling fewer than 200 bytes
2024-10-14 4:24 [PATCH 0/3] crypto: x86/crc32c - jump table elimination and other cleanups Eric Biggers
@ 2024-10-14 4:24 ` Eric Biggers
2024-10-14 4:24 ` [PATCH 2/3] crypto: x86/crc32c - access 32-bit arguments as 32-bit Eric Biggers
` (3 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: Eric Biggers @ 2024-10-14 4:24 UTC (permalink / raw)
To: linux-crypto
Cc: x86, linux-kernel, Ard Biesheuvel, Josh Poimboeuf, Peter Zijlstra
From: Eric Biggers <ebiggers@google.com>
The assembly code in crc32c-pcl-intel-asm_64.S is invoked only for
lengths >= 512, due to the overhead of saving and restoring FPU state.
Therefore, it is unnecessary for this code to be excessively "optimized"
for lengths < 200. Eliminate the excessive unrolling of this part of
the code and use a more straightforward qword-at-a-time loop.
Note: the part of the code in question is not entirely redundant, as it
is still used to process any remainder mod 24, as well as any remaining
data when fewer than 200 bytes remain after at least one 3072-byte chunk.
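For reference, the new remainder handling is roughly equivalent to the
following C sketch (illustrative only, not part of the patch; SSE4.2
intrinsics stand in for the hand-written assembly, and the plain pointer
casts assume unaligned loads are acceptable, as they are on x86):

  #include <stdint.h>
  #include <nmmintrin.h>                /* SSE4.2 crc32 intrinsics */

  static uint32_t crc32c_small(uint32_t crc, const uint8_t *p, unsigned int len)
  {
          uint64_t crc64 = crc;         /* crc32q works on a 64-bit register */

          for (; len >= 8; len -= 8, p += 8)
                  crc64 = _mm_crc32_u64(crc64, *(const uint64_t *)p);
          crc = (uint32_t)crc64;
          if (len & 4) { crc = _mm_crc32_u32(crc, *(const uint32_t *)p); p += 4; }
          if (len & 2) { crc = _mm_crc32_u16(crc, *(const uint16_t *)p); p += 2; }
          if (len & 1) crc = _mm_crc32_u8(crc, *p);
          return crc;
  }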
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 116 ++++++----------------
1 file changed, 33 insertions(+), 83 deletions(-)
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index bbcff1fb78cb2..466cea4943963 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -54,24 +54,14 @@
.macro JMPTBL_ENTRY i
.quad .Lcrc_\i
.endm
-.macro JNC_LESS_THAN j
- jnc .Lless_than_\j
-.endm
-
-# Define threshold where buffers are considered "small" and routed to more
-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
-# SMALL_SIZE can be no larger than 255.
-
+# Define threshold below which buffers are considered "small" and routed to
+# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200
-.if (SMALL_SIZE > 255)
-.error "SMALL_ SIZE must be < 256"
-.endif
-
# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
.text
SYM_FUNC_START(crc_pcl)
#define bufp rdi
@@ -98,29 +88,22 @@ SYM_FUNC_START(crc_pcl)
pushq %rsi
## Move crc_init for Linux to a different
mov crc_init_arg, crc_init
+ mov %bufp, bufptmp # rdi = *buf
+ cmp $SMALL_SIZE, len
+ jb .Lsmall
+
################################################################
## 1) ALIGN:
################################################################
-
- mov %bufp, bufptmp # rdi = *buf
neg %bufp
and $7, %bufp # calculate the unalignment amount of
# the address
je .Lproc_block # Skip if aligned
- ## If len is less than 8 and we're unaligned, we need to jump
- ## to special code to avoid reading beyond the end of the buffer
- cmp $8, len
- jae .Ldo_align
- # less_than_8 expects length in upper 3 bits of len_dw
- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
- shl $32-3+1, len_dw
- jmp .Lless_than_8_post_shl1
-
.Ldo_align:
#### Calculate CRC of unaligned bytes of the buffer (if any)
movq (bufptmp), tmp # load a quadward from the buffer
add %bufp, bufptmp # align buffer pointer for quadword
# processing
@@ -142,13 +125,10 @@ SYM_FUNC_START(crc_pcl)
cmpq $128*24, len
jae .Lfull_block
.Lcontinue_block:
- cmpq $SMALL_SIZE, len
- jb .Lsmall
-
## len < 128*24
movq $2731, %rax # 2731 = ceil(2^16 / 24)
mul len_dw
shrq $16, %rax
@@ -241,72 +221,42 @@ LABEL crc_ %i
LABEL crc_ 0
ENDBR
mov tmp, len
cmp $128*24, tmp
jae .Lfull_block
- cmp $24, tmp
+ cmp $SMALL_SIZE, tmp
jae .Lcontinue_block
-.Lless_than_24:
- shl $32-4, len_dw # less_than_16 expects length
- # in upper 4 bits of len_dw
- jnc .Lless_than_16
- crc32q (bufptmp), crc_init
- crc32q 8(bufptmp), crc_init
- jz .Ldo_return
- add $16, bufptmp
- # len is less than 8 if we got here
- # less_than_8 expects length in upper 3 bits of len_dw
- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
- shl $2, len_dw
- jmp .Lless_than_8_post_shl1
-
#######################################################################
- ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
+ ## 6) Process any remainder without interleaving:
#######################################################################
.Lsmall:
- shl $32-8, len_dw # Prepare len_dw for less_than_256
- j=256
-.rept 5 # j = {256, 128, 64, 32, 16}
-.altmacro
-LABEL less_than_ %j # less_than_j: Length should be in
- # upper lg(j) bits of len_dw
- j=(j/2)
- shl $1, len_dw # Get next MSB
- JNC_LESS_THAN %j
-.noaltmacro
- i=0
-.rept (j/8)
- crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
- i=i+8
-.endr
- jz .Ldo_return # Return if remaining length is zero
- add $j, bufptmp # Advance buf
-.endr
-
-.Lless_than_8: # Length should be stored in
- # upper 3 bits of len_dw
- shl $1, len_dw
-.Lless_than_8_post_shl1:
- jnc .Lless_than_4
- crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
- jz .Ldo_return # return if remaining data is zero
- add $4, bufptmp
-.Lless_than_4: # Length should be stored in
- # upper 2 bits of len_dw
- shl $1, len_dw
- jnc .Lless_than_2
- crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
- jz .Ldo_return # return if remaining data is zero
- add $2, bufptmp
-.Lless_than_2: # Length should be stored in the MSB
- # of len_dw
- shl $1, len_dw
- jnc .Lless_than_1
- crc32b (bufptmp), crc_init_dw # CRC of 1 byte
-.Lless_than_1: # Length should be zero
-.Ldo_return:
+ test len, len
+ jz .Ldone
+ mov len_dw, %eax
+ shr $3, %eax
+ jz .Ldo_dword
+.Ldo_qwords:
+ crc32q (bufptmp), crc_init
+ add $8, bufptmp
+ dec %eax
+ jnz .Ldo_qwords
+.Ldo_dword:
+ test $4, len_dw
+ jz .Ldo_word
+ crc32l (bufptmp), crc_init_dw
+ add $4, bufptmp
+.Ldo_word:
+ test $2, len_dw
+ jz .Ldo_byte
+ crc32w (bufptmp), crc_init_dw
+ add $2, bufptmp
+.Ldo_byte:
+ test $1, len_dw
+ jz .Ldone
+ crc32b (bufptmp), crc_init_dw
+.Ldone:
movq crc_init, %rax
popq %rsi
popq %rdi
popq %rbx
RET
--
2.47.0
* [PATCH 2/3] crypto: x86/crc32c - access 32-bit arguments as 32-bit
2024-10-14 4:24 [PATCH 0/3] crypto: x86/crc32c - jump table elimination and other cleanups Eric Biggers
2024-10-14 4:24 ` [PATCH 1/3] crypto: x86/crc32c - simplify code for handling fewer than 200 bytes Eric Biggers
@ 2024-10-14 4:24 ` Eric Biggers
2024-10-14 4:24 ` [PATCH 3/3] crypto: x86/crc32c - eliminate jump table and excessive unrolling Eric Biggers
` (2 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: Eric Biggers @ 2024-10-14 4:24 UTC (permalink / raw)
To: linux-crypto
Cc: x86, linux-kernel, Ard Biesheuvel, Josh Poimboeuf, Peter Zijlstra
From: Eric Biggers <ebiggers@google.com>
Fix crc32c-pcl-intel-asm_64.S to access 32-bit arguments as 32-bit
values instead of 64-bit, since the upper bits of the corresponding
64-bit registers are not guaranteed to be zero. Also update the type of
the length argument to be unsigned int rather than int, as the assembly
code treats it as unsigned.
Note: there haven't been any reports of this bug actually causing
incorrect behavior. Neither gcc nor clang guarantee zero-extension to
64 bits, but zero-extension is likely to happen in practice because most
instructions that operate on 32-bit registers zero-extend to 64 bits.
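For illustration only (not part of the patch): the issue is purely an
ABI-level one.  With the fixed prototype, a hypothetical caller could
look like this:

  /* The x86_64 SysV ABI passes a 32-bit argument in the low half of a
   * register (len arrives in %esi); bits 63:32 of %rsi are unspecified,
   * so the assembly must read the argument as %esi, not %rsi. */
  asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
                                  unsigned int crc_init);

  static u32 crc32c_update(const u8 *p, u64 nbytes, u32 crc)
  {
          /* Nothing here obliges the compiler to clear bits 63:32 of the
           * register that ends up holding the truncated length. */
          return crc_pcl(p, (unsigned int)nbytes, crc);
  }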
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/crc32c-intel_glue.c | 2 +-
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 57 +++++++++++------------
2 files changed, 27 insertions(+), 32 deletions(-)
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index feccb5254c7e5..52c5d47ef5a14 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -39,11 +39,11 @@
* size is >= 512 to account
* for fpu state save/restore overhead.
*/
#define CRC32C_PCL_BREAKEVEN 512
-asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
+asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
unsigned int crc_init);
#endif /* CONFIG_X86_64 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 466cea4943963..bbf860e90951d 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -58,11 +58,11 @@
# Define threshold below which buffers are considered "small" and routed to
# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200
-# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
+# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
.text
SYM_FUNC_START(crc_pcl)
#define bufp rdi
#define bufp_dw %edi
@@ -70,18 +70,15 @@ SYM_FUNC_START(crc_pcl)
#define bufp_b %dil
#define bufptmp %rcx
#define block_0 %rcx
#define block_1 %rdx
#define block_2 %r11
-#define len %rsi
-#define len_dw %esi
-#define len_w %si
-#define len_b %sil
-#define crc_init_arg %rdx
+#define len %esi
+#define crc_init_arg %edx
#define tmp %rbx
-#define crc_init %r8
-#define crc_init_dw %r8d
+#define crc_init %r8d
+#define crc_init_q %r8
#define crc1 %r9
#define crc2 %r10
pushq %rbx
pushq %rdi
@@ -105,13 +102,13 @@ SYM_FUNC_START(crc_pcl)
.Ldo_align:
#### Calculate CRC of unaligned bytes of the buffer (if any)
movq (bufptmp), tmp # load a quadward from the buffer
add %bufp, bufptmp # align buffer pointer for quadword
# processing
- sub %bufp, len # update buffer length
+ sub bufp_dw, len # update buffer length
.Lalign_loop:
- crc32b %bl, crc_init_dw # compute crc32 of 1-byte
+ crc32b %bl, crc_init # compute crc32 of 1-byte
shr $8, tmp # get next byte
dec %bufp
jne .Lalign_loop
.Lproc_block:
@@ -119,19 +116,18 @@ SYM_FUNC_START(crc_pcl)
################################################################
## 2) PROCESS BLOCKS:
################################################################
## compute num of bytes to be processed
- movq len, tmp # save num bytes in tmp
- cmpq $128*24, len
+ cmp $128*24, len
jae .Lfull_block
.Lcontinue_block:
## len < 128*24
movq $2731, %rax # 2731 = ceil(2^16 / 24)
- mul len_dw
+ mul len
shrq $16, %rax
## eax contains floor(bytes / 24) = num 24-byte chunks to do
## process rax 24-byte chunks (128 >= rax >= 0)
@@ -174,21 +170,21 @@ SYM_FUNC_START(crc_pcl)
.rept 128-1
.altmacro
LABEL crc_ %i
.noaltmacro
ENDBR
- crc32q -i*8(block_0), crc_init
+ crc32q -i*8(block_0), crc_init_q
crc32q -i*8(block_1), crc1
crc32q -i*8(block_2), crc2
i=(i-1)
.endr
.altmacro
LABEL crc_ %i
.noaltmacro
ENDBR
- crc32q -i*8(block_0), crc_init
+ crc32q -i*8(block_0), crc_init_q
crc32q -i*8(block_1), crc1
# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
mov block_2, block_0
@@ -198,66 +194,65 @@ LABEL crc_ %i
lea (K_table-8)(%rip), %bufp # first entry is for idx 1
shlq $3, %rax # rax *= 8
pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
- subq %rax, tmp # tmp -= rax*24
+ sub %eax, len # len -= rax*24
- movq crc_init, %xmm1 # CRC for block 1
+ movq crc_init_q, %xmm1 # CRC for block 1
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
movq crc1, %xmm2 # CRC for block 2
pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
pxor %xmm2,%xmm1
movq %xmm1, %rax
xor -i*8(block_2), %rax
- mov crc2, crc_init
- crc32 %rax, crc_init
+ mov crc2, crc_init_q
+ crc32 %rax, crc_init_q
################################################################
## 5) Check for end:
################################################################
LABEL crc_ 0
ENDBR
- mov tmp, len
- cmp $128*24, tmp
+ cmp $128*24, len
jae .Lfull_block
- cmp $SMALL_SIZE, tmp
+ cmp $SMALL_SIZE, len
jae .Lcontinue_block
#######################################################################
## 6) Process any remainder without interleaving:
#######################################################################
.Lsmall:
test len, len
jz .Ldone
- mov len_dw, %eax
+ mov len, %eax
shr $3, %eax
jz .Ldo_dword
.Ldo_qwords:
- crc32q (bufptmp), crc_init
+ crc32q (bufptmp), crc_init_q
add $8, bufptmp
dec %eax
jnz .Ldo_qwords
.Ldo_dword:
- test $4, len_dw
+ test $4, len
jz .Ldo_word
- crc32l (bufptmp), crc_init_dw
+ crc32l (bufptmp), crc_init
add $4, bufptmp
.Ldo_word:
- test $2, len_dw
+ test $2, len
jz .Ldo_byte
- crc32w (bufptmp), crc_init_dw
+ crc32w (bufptmp), crc_init
add $2, bufptmp
.Ldo_byte:
- test $1, len_dw
+ test $1, len
jz .Ldone
- crc32b (bufptmp), crc_init_dw
+ crc32b (bufptmp), crc_init
.Ldone:
- movq crc_init, %rax
+ mov crc_init, %eax
popq %rsi
popq %rdi
popq %rbx
RET
SYM_FUNC_END(crc_pcl)
--
2.47.0
* [PATCH 3/3] crypto: x86/crc32c - eliminate jump table and excessive unrolling
2024-10-14 4:24 [PATCH 0/3] crypto: x86/crc32c - jump table elimination and other cleanups Eric Biggers
2024-10-14 4:24 ` [PATCH 1/3] crypto: x86/crc32c - simplify code for handling fewer than 200 bytes Eric Biggers
2024-10-14 4:24 ` [PATCH 2/3] crypto: x86/crc32c - access 32-bit arguments as 32-bit Eric Biggers
@ 2024-10-14 4:24 ` Eric Biggers
2024-10-14 16:30 ` David Laight
2024-10-15 10:55 ` [PATCH 0/3] crypto: x86/crc32c - jump table elimination and other cleanups Ard Biesheuvel
2024-10-26 6:53 ` Herbert Xu
4 siblings, 1 reply; 10+ messages in thread
From: Eric Biggers @ 2024-10-14 4:24 UTC (permalink / raw)
To: linux-crypto
Cc: x86, linux-kernel, Ard Biesheuvel, Josh Poimboeuf, Peter Zijlstra
From: Eric Biggers <ebiggers@google.com>
crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully
unrolled and uses a jump table to jump into the correct location. This
optimization is misguided, as it bloats the binary code size and
introduces an indirect call. x86_64 CPUs can predict loops well, so it
is fine to just use a loop instead. Loop bookkeeping instructions can
compete with the crc instructions for the ALUs, but this is easily
mitigated by unrolling the loop by a smaller amount, such as 4 times.
Therefore, re-roll the loop and make related tweaks to the code.
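For reference, the resulting structure corresponds roughly to the
following C sketch (illustrative only; intrinsics stand in for the
hand-written assembly, and the PCLMULQDQ step that folds the three
per-lane CRCs together is omitted):

  #include <stddef.h>
  #include <stdint.h>
  #include <nmmintrin.h>                /* _mm_crc32_u64, SSE4.2 */

  static void crc32c_3lanes(const uint64_t *p, size_t qwords_per_lane,
                            uint64_t *crc0, uint64_t *crc1, uint64_t *crc2)
  {
          const uint64_t *l0 = p;
          const uint64_t *l1 = p + qwords_per_lane;
          const uint64_t *l2 = p + 2 * qwords_per_lane;

          /* Three independent crc32q dependency chains; a CPU with 3-cycle
           * latency and 1-per-cycle throughput can then retire one crc32q
           * per cycle.  The assembly additionally unrolls this loop 4x to
           * dilute the add/sub/jge bookkeeping. */
          for (size_t i = 0; i < qwords_per_lane; i++) {
                  *crc0 = _mm_crc32_u64(*crc0, l0[i]);
                  *crc1 = _mm_crc32_u64(*crc1, l1[i]);
                  *crc2 = _mm_crc32_u64(*crc2, l2[i]);
          }
          /* The real code then multiplies crc0 and crc1 by constants from
           * K_table with pclmulqdq and merges them into crc2 (not shown). */
  }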
This reduces the binary code size of crc_pcl() from 4546 bytes to 418
bytes, a 91% reduction. In general it also makes the code faster, with
some large improvements seen when retpoline is enabled.
More detailed performance results are shown below. They are given as
percent improvement in throughput (negative means regressed) for CPU
microarchitecture vs. input length in bytes. E.g. an improvement from
40 GB/s to 50 GB/s would be listed as 25%.
Table 1: Results with retpoline enabled (the default):
                     |  512  |  833  | 1024  | 2000  | 3173  | 4096  |
---------------------+-------+-------+-------+-------+-------+-------+
Intel Haswell        | 35.0% | 20.7% | 17.8% |  9.7% | -0.2% |  4.4% |
Intel Emerald Rapids | 66.8% | 45.2% | 36.3% | 19.3% |  0.0% |  5.4% |
AMD Zen 2            | 29.5% | 17.2% | 13.5% |  8.6% | -0.5% |  2.8% |
Table 2: Results with retpoline disabled:
                     |  512  |  833  | 1024  | 2000  | 3173  | 4096  |
---------------------+-------+-------+-------+-------+-------+-------+
Intel Haswell        |  3.3% |  4.8% |  4.5% |  0.9% | -2.9% |  0.3% |
Intel Emerald Rapids |  7.5% |  6.4% |  5.2% |  2.3% | -0.0% |  0.6% |
AMD Zen 2            | 11.8% |  1.4% |  0.2% |  1.3% | -0.9% | -0.2% |
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 233 +++++++++-------------
1 file changed, 92 insertions(+), 141 deletions(-)
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index bbf860e90951d..752812bc4991d 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -5,10 +5,11 @@
* downloaded from:
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
*
* Copyright (C) 2012 Intel Corporation.
+ * Copyright 2024 Google LLC
*
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* James Guilford <james.guilford@intel.com>
* David Cote <david.m.cote@intel.com>
@@ -42,186 +43,153 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/linkage.h>
-#include <asm/nospec-branch.h>
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
-.macro LABEL prefix n
-.L\prefix\n\():
-.endm
-
-.macro JMPTBL_ENTRY i
-.quad .Lcrc_\i
-.endm
-
# Define threshold below which buffers are considered "small" and routed to
# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200
# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
.text
SYM_FUNC_START(crc_pcl)
-#define bufp rdi
-#define bufp_dw %edi
-#define bufp_w %di
-#define bufp_b %dil
-#define bufptmp %rcx
-#define block_0 %rcx
-#define block_1 %rdx
-#define block_2 %r11
-#define len %esi
-#define crc_init_arg %edx
-#define tmp %rbx
-#define crc_init %r8d
-#define crc_init_q %r8
-#define crc1 %r9
-#define crc2 %r10
-
- pushq %rbx
- pushq %rdi
- pushq %rsi
+#define bufp %rdi
+#define bufp_d %edi
+#define len %esi
+#define crc_init %edx
+#define crc_init_q %rdx
+#define n_misaligned %ecx /* overlaps chunk_bytes! */
+#define n_misaligned_q %rcx
+#define chunk_bytes %ecx /* overlaps n_misaligned! */
+#define chunk_bytes_q %rcx
+#define crc1 %r8
+#define crc2 %r9
- ## Move crc_init for Linux to a different
- mov crc_init_arg, crc_init
-
- mov %bufp, bufptmp # rdi = *buf
cmp $SMALL_SIZE, len
jb .Lsmall
################################################################
## 1) ALIGN:
################################################################
- neg %bufp
- and $7, %bufp # calculate the unalignment amount of
+ mov bufp_d, n_misaligned
+ neg n_misaligned
+ and $7, n_misaligned # calculate the misalignment amount of
# the address
- je .Lproc_block # Skip if aligned
+ je .Laligned # Skip if aligned
+ # Process 1 <= n_misaligned <= 7 bytes individually in order to align
+ # the remaining data to an 8-byte boundary.
.Ldo_align:
- #### Calculate CRC of unaligned bytes of the buffer (if any)
- movq (bufptmp), tmp # load a quadward from the buffer
- add %bufp, bufptmp # align buffer pointer for quadword
- # processing
- sub bufp_dw, len # update buffer length
+ movq (bufp), %rax
+ add n_misaligned_q, bufp
+ sub n_misaligned, len
.Lalign_loop:
- crc32b %bl, crc_init # compute crc32 of 1-byte
- shr $8, tmp # get next byte
- dec %bufp
+ crc32b %al, crc_init # compute crc32 of 1-byte
+ shr $8, %rax # get next byte
+ dec n_misaligned
jne .Lalign_loop
-
-.Lproc_block:
+.Laligned:
################################################################
- ## 2) PROCESS BLOCKS:
+ ## 2) PROCESS BLOCK:
################################################################
- ## compute num of bytes to be processed
-
cmp $128*24, len
jae .Lfull_block
-.Lcontinue_block:
- ## len < 128*24
- movq $2731, %rax # 2731 = ceil(2^16 / 24)
- mul len
- shrq $16, %rax
-
- ## eax contains floor(bytes / 24) = num 24-byte chunks to do
-
- ## process rax 24-byte chunks (128 >= rax >= 0)
-
- ## compute end address of each block
- ## block 0 (base addr + RAX * 8)
- ## block 1 (base addr + RAX * 16)
- ## block 2 (base addr + RAX * 24)
- lea (bufptmp, %rax, 8), block_0
- lea (block_0, %rax, 8), block_1
- lea (block_1, %rax, 8), block_2
-
- xor crc1, crc1
- xor crc2, crc2
-
- ## branch into array
- leaq jump_table(%rip), %bufp
- mov (%bufp,%rax,8), %bufp
- JMP_NOSPEC bufp
+.Lpartial_block:
+ # Compute floor(len / 24) to get num qwords to process from each lane.
+ imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
+ shr $16, %eax
+ jmp .Lcrc_3lanes
- ################################################################
- ## 2a) PROCESS FULL BLOCKS:
- ################################################################
.Lfull_block:
- movl $128,%eax
- lea 128*8*2(block_0), block_1
- lea 128*8*3(block_0), block_2
- add $128*8*1, block_0
-
- xor crc1,crc1
- xor crc2,crc2
-
- # Fall through into top of crc array (crc_128)
+ # Processing 128 qwords from each lane.
+ mov $128, %eax
################################################################
- ## 3) CRC Array:
+ ## 3) CRC each of three lanes:
################################################################
- i=128
-.rept 128-1
-.altmacro
-LABEL crc_ %i
-.noaltmacro
- ENDBR
- crc32q -i*8(block_0), crc_init_q
- crc32q -i*8(block_1), crc1
- crc32q -i*8(block_2), crc2
- i=(i-1)
-.endr
-
-.altmacro
-LABEL crc_ %i
-.noaltmacro
- ENDBR
- crc32q -i*8(block_0), crc_init_q
- crc32q -i*8(block_1), crc1
-# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
+.Lcrc_3lanes:
+ xor crc1,crc1
+ xor crc2,crc2
+ mov %eax, chunk_bytes
+ shl $3, chunk_bytes # num bytes to process from each lane
+ sub $5, %eax # 4 for 4x_loop, 1 for special last iter
+ jl .Lcrc_3lanes_4x_done
+
+ # Unroll the loop by a factor of 4 to reduce the overhead of the loop
+ # bookkeeping instructions, which can compete with crc32q for the ALUs.
+.Lcrc_3lanes_4x_loop:
+ crc32q (bufp), crc_init_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ crc32q 8(bufp), crc_init_q
+ crc32q 8(bufp,chunk_bytes_q), crc1
+ crc32q 8(bufp,chunk_bytes_q,2), crc2
+ crc32q 16(bufp), crc_init_q
+ crc32q 16(bufp,chunk_bytes_q), crc1
+ crc32q 16(bufp,chunk_bytes_q,2), crc2
+ crc32q 24(bufp), crc_init_q
+ crc32q 24(bufp,chunk_bytes_q), crc1
+ crc32q 24(bufp,chunk_bytes_q,2), crc2
+ add $32, bufp
+ sub $4, %eax
+ jge .Lcrc_3lanes_4x_loop
+
+.Lcrc_3lanes_4x_done:
+ add $4, %eax
+ jz .Lcrc_3lanes_last_qword
+
+.Lcrc_3lanes_1x_loop:
+ crc32q (bufp), crc_init_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ add $8, bufp
+ dec %eax
+ jnz .Lcrc_3lanes_1x_loop
- mov block_2, block_0
+.Lcrc_3lanes_last_qword:
+ crc32q (bufp), crc_init_q
+ crc32q (bufp,chunk_bytes_q), crc1
+# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
################################################################
## 4) Combine three results:
################################################################
- lea (K_table-8)(%rip), %bufp # first entry is for idx 1
- shlq $3, %rax # rax *= 8
- pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
- leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
- sub %eax, len # len -= rax*24
+ lea (K_table-8)(%rip), %rax # first entry is for idx 1
+ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
+ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
+ sub %eax, len # len -= chunk_bytes * 3
movq crc_init_q, %xmm1 # CRC for block 1
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
movq crc1, %xmm2 # CRC for block 2
pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
pxor %xmm2,%xmm1
movq %xmm1, %rax
- xor -i*8(block_2), %rax
+ xor (bufp,chunk_bytes_q,2), %rax
mov crc2, crc_init_q
crc32 %rax, crc_init_q
+ lea 8(bufp,chunk_bytes_q,2), bufp
################################################################
- ## 5) Check for end:
+ ## 5) If more blocks remain, goto (2):
################################################################
-LABEL crc_ 0
- ENDBR
cmp $128*24, len
- jae .Lfull_block
+ jae .Lfull_block
cmp $SMALL_SIZE, len
- jae .Lcontinue_block
+ jae .Lpartial_block
#######################################################################
## 6) Process any remainder without interleaving:
#######################################################################
.Lsmall:
@@ -229,51 +197,34 @@ LABEL crc_ 0
jz .Ldone
mov len, %eax
shr $3, %eax
jz .Ldo_dword
.Ldo_qwords:
- crc32q (bufptmp), crc_init_q
- add $8, bufptmp
+ crc32q (bufp), crc_init_q
+ add $8, bufp
dec %eax
jnz .Ldo_qwords
.Ldo_dword:
test $4, len
jz .Ldo_word
- crc32l (bufptmp), crc_init
- add $4, bufptmp
+ crc32l (bufp), crc_init
+ add $4, bufp
.Ldo_word:
test $2, len
jz .Ldo_byte
- crc32w (bufptmp), crc_init
- add $2, bufptmp
+ crc32w (bufp), crc_init
+ add $2, bufp
.Ldo_byte:
test $1, len
jz .Ldone
- crc32b (bufptmp), crc_init
+ crc32b (bufp), crc_init
.Ldone:
mov crc_init, %eax
- popq %rsi
- popq %rdi
- popq %rbx
RET
SYM_FUNC_END(crc_pcl)
.section .rodata, "a", @progbits
- ################################################################
- ## jump table Table is 129 entries x 2 bytes each
- ################################################################
-.align 4
-jump_table:
- i=0
-.rept 129
-.altmacro
-JMPTBL_ENTRY %i
-.noaltmacro
- i=i+1
-.endr
-
-
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each
################################################################
.align 8
--
2.47.0
* RE: [PATCH 3/3] crypto: x86/crc32c - eliminate jump table and excessive unrolling
2024-10-14 4:24 ` [PATCH 3/3] crypto: x86/crc32c - eliminate jump table and excessive unrolling Eric Biggers
@ 2024-10-14 16:30 ` David Laight
2024-10-14 19:01 ` Eric Biggers
0 siblings, 1 reply; 10+ messages in thread
From: David Laight @ 2024-10-14 16:30 UTC (permalink / raw)
To: 'Eric Biggers', linux-crypto@vger.kernel.org
Cc: x86@kernel.org, linux-kernel@vger.kernel.org, Ard Biesheuvel,
Josh Poimboeuf, Peter Zijlstra
From: Eric Biggers
> Sent: 14 October 2024 05:25
>
> crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully
> unrolled and uses a jump table to jump into the correct location. This
> optimization is misguided, as it bloats the binary code size and
> introduces an indirect call. x86_64 CPUs can predict loops well, so it
> is fine to just use a loop instead. Loop bookkeeping instructions can
> compete with the crc instructions for the ALUs, but this is easily
> mitigated by unrolling the loop by a smaller amount, such as 4 times.
Do you need to unroll it at all?
...
> + # Unroll the loop by a factor of 4 to reduce the overhead of the loop
> + # bookkeeping instructions, which can compete with crc32q for the ALUs.
> +.Lcrc_3lanes_4x_loop:
> + crc32q (bufp), crc_init_q
> + crc32q (bufp,chunk_bytes_q), crc1
> + crc32q (bufp,chunk_bytes_q,2), crc2
> + crc32q 8(bufp), crc_init_q
> + crc32q 8(bufp,chunk_bytes_q), crc1
> + crc32q 8(bufp,chunk_bytes_q,2), crc2
> + crc32q 16(bufp), crc_init_q
> + crc32q 16(bufp,chunk_bytes_q), crc1
> + crc32q 16(bufp,chunk_bytes_q,2), crc2
> + crc32q 24(bufp), crc_init_q
> + crc32q 24(bufp,chunk_bytes_q), crc1
> + crc32q 24(bufp,chunk_bytes_q,2), crc2
> + add $32, bufp
> + sub $4, %eax
> + jge .Lcrc_3lanes_4x_loop
If you are really lucky you'll get two memory reads/clock.
So you won't ever do more than two crc32/clock.
Looking at Agner's instruction latency tables I don't think
any cpu can do more than one per clock, or pipeline them.
I think that means you don't even need two (never mind 3)
buffers.
Most modern x86 can do 4 or 5 (or even more) ALU operations
per clock - depending on the combination of instructions.
Replace the loop termination with a comparison of 'bufp'
against a pre-calculated limit and you get two instructions
(that might get merged into one u-op) for the loop overhead.
They'll run in parallel with the crc32q instructions.
I've never managed to get a 1-clock loop, but two is easy.
You might find that just:
10:
crc32q (bufp), crc
crc32q 8(bufp), crc
add $16, bufp
cmp bufp, buf_lim
jne 10b
will run at 8 bytes/clock on modern intel cpu.
You can write that in C with a simple asm function for the crc32
instruction itself.
You might need the more complex-to-set-up loop:
offset = -length;
bufend = buf + length;
10:
crc32q (offset, bufend), crc
crc32q 8(offset, bufend), crc
add $16, offset
jne 10b
which uses negative offsets from the end of the buffer.
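For concreteness, an untested sketch of the "simple asm function" idea in
GNU C (the compiler is then free to pick either loop shape above):

  #include <stdint.h>

  static inline uint64_t crc32q_step(uint64_t crc, uint64_t data)
  {
          asm("crc32q %1, %0" : "+r" (crc) : "rm" (data));
          return crc;
  }

  uint32_t crc32c_qwords(uint32_t crc, const uint64_t *buf,
                         const uint64_t *buf_lim)
  {
          uint64_t c = crc;

          while (buf != buf_lim)
                  c = crc32q_step(c, *buf++);
          return (uint32_t)c;
  }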
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
* Re: [PATCH 3/3] crypto: x86/crc32c - eliminate jump table and excessive unrolling
2024-10-14 16:30 ` David Laight
@ 2024-10-14 19:01 ` Eric Biggers
2024-10-14 22:32 ` David Laight
0 siblings, 1 reply; 10+ messages in thread
From: Eric Biggers @ 2024-10-14 19:01 UTC (permalink / raw)
To: David Laight
Cc: linux-crypto@vger.kernel.org, x86@kernel.org,
linux-kernel@vger.kernel.org, Ard Biesheuvel, Josh Poimboeuf,
Peter Zijlstra
On Mon, Oct 14, 2024 at 04:30:05PM +0000, David Laight wrote:
> From: Eric Biggers
> > Sent: 14 October 2024 05:25
> >
> > crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully
> > unrolled and uses a jump table to jump into the correct location. This
> > optimization is misguided, as it bloats the binary code size and
> > introduces an indirect call. x86_64 CPUs can predict loops well, so it
> > is fine to just use a loop instead. Loop bookkeeping instructions can
> > compete with the crc instructions for the ALUs, but this is easily
> > mitigated by unrolling the loop by a smaller amount, such as 4 times.
>
> Do you need to unroll it at all?
It looks like on most CPUs, no. On Haswell, Emerald Rapids, Zen 2 it does not
make a significant difference. However, it helps on Zen 5.
> > + # Unroll the loop by a factor of 4 to reduce the overhead of the loop
> > + # bookkeeping instructions, which can compete with crc32q for the ALUs.
> > +.Lcrc_3lanes_4x_loop:
> > + crc32q (bufp), crc_init_q
> > + crc32q (bufp,chunk_bytes_q), crc1
> > + crc32q (bufp,chunk_bytes_q,2), crc2
> > + crc32q 8(bufp), crc_init_q
> > + crc32q 8(bufp,chunk_bytes_q), crc1
> > + crc32q 8(bufp,chunk_bytes_q,2), crc2
> > + crc32q 16(bufp), crc_init_q
> > + crc32q 16(bufp,chunk_bytes_q), crc1
> > + crc32q 16(bufp,chunk_bytes_q,2), crc2
> > + crc32q 24(bufp), crc_init_q
> > + crc32q 24(bufp,chunk_bytes_q), crc1
> > + crc32q 24(bufp,chunk_bytes_q,2), crc2
> > + add $32, bufp
> > + sub $4, %eax
> > + jge .Lcrc_3lanes_4x_loop
>
> If you are really lucky you'll get two memory reads/clock.
> So you won't ever do more than two crc32/clock.
> Looking at Agner's instruction latency tables I don't think
> any cpu can do more than one per clock, or pipeline them.
> I think that means you don't even need two (never mind 3)
> buffers.
On most Intel and AMD CPUs (I tested Haswell for old Intel, Emerald Rapids for
new Intel, and Zen 2 for slightly-old AMD), crc32q has 3 cycle latency and 1 per
cycle throughput. So you do need at least 3 streams.
AMD Zen 5 has much higher crc32q throughput and seems to want up to 7 streams.
This is not implemented yet.
> Most modern x86 can do 4 or 5 (or even more) ALU operations
> per clock - depending on the combination of instructions.
>
> Replace the loop termination with a comparison of 'bufp'
> against a pre-calculated limit and you get two instructions
> (that might get merged into one u-op) for the loop overhead.
> They'll run in parallel with the crc32q instructions.
That's actually still three instructions: add, cmp, and jne.
I tried it on both Intel and AMD, and it did not help.
> I've never managed to get a 1-clock loop, but two is easy.
> You might find that just:
> 10:
> crc32q (bufp), crc
> crc32q 8(bufp), crc
> add $16, bufp
> cmp bufp, buf_lim
> jne 10b
> will run at 8 bytes/clock on modern intel cpu.
No, the latency of crc32q is still three cycles. You need three streams.
> You can write that in C with a simple asm function for the crc32
> instruction itself.
Well, the single-stream CRC32C implementation already does that; see
arch/x86/crypto/crc32c-intel_glue.c. Things are not as simple for this
multi-stream implementation, which uses pclmulqdq to combine the CRCs.
- Eric
* RE: [PATCH 3/3] crypto: x86/crc32c - eliminate jump table and excessive unrolling
2024-10-14 19:01 ` Eric Biggers
@ 2024-10-14 22:32 ` David Laight
2024-10-14 23:59 ` Eric Biggers
0 siblings, 1 reply; 10+ messages in thread
From: David Laight @ 2024-10-14 22:32 UTC (permalink / raw)
To: 'Eric Biggers'
Cc: linux-crypto@vger.kernel.org, x86@kernel.org,
linux-kernel@vger.kernel.org, Ard Biesheuvel, Josh Poimboeuf,
Peter Zijlstra
...
> > Do you need to unroll it at all?
> It looks like on most CPUs, no. On Haswell, Emerald Rapids, Zen 2 it does not
> make a significant difference. However, it helps on Zen 5.
I wonder if one of the loop instructions is using the ALU
unit you really want to be processing a crc32?
If the cpu has fused arithmetic+jump u-ops then trying to get the
decoder to use one of those may help.
Is Zen 5 actually slower than the other systems?
I've managed to get clock cycle counts using the performance counters
that more or less match the predicted values.
You can't use 'rdtsc' because the cpu frequency isn't stable.
...
> > If you are really lucky you'll get two memory reads/clock.
> > So you won't ever do more than two crc32/clock.
> > Looking at Agner's instruction latency tables I don't think
> > any cpu can do more than one per clock, or pipeline them.
> > I think that means you don't even need two (never mind 3)
> > buffers.
>
> On most Intel and AMD CPUs (I tested Haswell for old Intel, Emerald Rapids for
> new Intel, and Zen 2 for slightly-old AMD), crc32q has 3 cycle latency and 1 per
> cycle throughput. So you do need at least 3 streams.
Bah, I missed the latency column :-)
> AMD Zen 5 has much higher crc32q throughput and seems to want up to 7 streams.
> This is not implemented yet.
The copy of the tables I have is old - doesn't contain Zen-5.
Does that mean that 2 (or more) of its alu 'units' can do crc32
so you can do more than 1/clock (along with the memory reads).
One thought is how much of it is actually worth while!
If the data isn't already in the L1 data cache then the cache
loads almost certainly dominate - especially if you have to
go out to 'real memory'.
You can benchmark the loops by repeatedly accessing the same
data - but that isn't what will actually happen.
> > Most modern x86 can do 4 or 5 (or even more) ALU operations
> > per clock - depending on the combination of instructions.
> >
> > Replace the loop termination with a comparison of 'bufp'
> > against a pre-calculated limit and you get two instructions
> > (that might get merged into one u-op) for the loop overhead.
> > They'll run in parallel with the crc32q instructions.
>
> That's actually still three instructions: add, cmp, and jne.
I was really thinking of the loop I quoted later.
The one that uses negative offsets from the end of the buffer.
That has an 'add' and a 'jnz' - which might even fuse into a
single u-op.
Maybe even constrained to p6 - so won't go near p1.
(I don't have a recent AMD cpu)
It may not actually matter.
The add/subtract/cmp are only dependant on themselves.
Similarly the jne is only dependant on the result of the sub/cmp.
In principle they can all run in the same clock (for different
loop cycles) since the rest of the loop only needs one of the
ALU blocks (on Intel only P1 can do crc).
But I failed to get a 1 clock loop (using ADC - which doesn't
have a latency issue).
It might be impossible because a predicted-taken conditional jmp
has a latency of 2.
> I tried it on both Intel and AMD, and it did not help.
>
> > I've never managed to get a 1-clock loop, but two is easy.
> > You might find that just:
> > 10:
> > crc32q (bufp), crc
> > crc32q 8(bufp), crc
> > add $16, bufp
> > cmp bufp, buf_lim
> > jne 10b
> > will run at 8 bytes/clock on modern intel cpu.
>
> No, the latency of crc32q is still three cycles. You need three streams.
If you need three streams to get one crc32/clock then, in theory,
you can get two more simple ALU ops, at least one memory read and
a jump in every clock - even on Sandy bridge.
So they are unlikely to dominate the loop whatever you do.
If the loop is too long you can get a stall (probably) because a register
has to be read back from the real register file and not just forwarded
from a previous use/alu result.
I've gained a clock back by adding an extra instruction in the middle
of a loop!
But the not-unrolled (multi-stream) loop isn't long enough for that
to be an issue.
Enough rambling.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
* Re: [PATCH 3/3] crypto: x86/crc32c - eliminate jump table and excessive unrolling
2024-10-14 22:32 ` David Laight
@ 2024-10-14 23:59 ` Eric Biggers
0 siblings, 0 replies; 10+ messages in thread
From: Eric Biggers @ 2024-10-14 23:59 UTC (permalink / raw)
To: David Laight
Cc: linux-crypto@vger.kernel.org, x86@kernel.org,
linux-kernel@vger.kernel.org, Ard Biesheuvel, Josh Poimboeuf,
Peter Zijlstra
On Mon, Oct 14, 2024 at 10:32:48PM +0000, David Laight wrote:
> ...
> > > Do you need to unroll it at all?
>
> > It looks like on most CPUs, no. On Haswell, Emerald Rapids, Zen 2 it does not
> > make a significant difference. However, it helps on Zen 5.
>
> I wonder if one of the loop instructions is using the ALU
> unit you really want to be processing a crc32?
> If the cpu has fused arithmetic+jump u-ops then trying to get the
> decoder to use one of those may help.
>
> Is Zen 5 actually slower than the other systems?
> I've managed to get clock cycle counts using the performance counters
> that more or less match the predicted values.
> You can't use 'rdtsc' because the cpu frequency isn't stable.
No, Zen 5 is faster than the other CPUs. I looked more into what was happening,
and it turns out it's actually executing more than 3 crc32q in parallel on
average, by overlapping the execution of different calls to crc_pcl(). If I
chain the CRC values, that goes away and the 4x unrolling no longer helps.
Of course, whether users are chaining the CRC values or not is up to the user.
A user might be checksumming lots of small messages, or they might be
checksumming a large message in smaller pieces.
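For concreteness (illustrative only; this reuses crc_pcl()'s prototype
from patch 2 and ignores any tail smaller than one piece):

  /* chained: each call's input CRC is the previous call's output, so
   * consecutive calls cannot overlap */
  u32 crc_one_large_message(const u8 *buf, size_t total)
  {
          u32 crc = ~0u;

          for (size_t off = 0; off + 4096 <= total; off += 4096)
                  crc = crc_pcl(buf + off, 4096, crc);
          return crc;
  }

  /* independent messages: no data dependency between the calls, so a wide
   * core (e.g. Zen 5) can overlap their crc32q chains */
  void crc_many_small_messages(const u8 **msgs, unsigned int msg_len,
                               u32 *crcs, unsigned int nmsgs)
  {
          for (unsigned int i = 0; i < nmsgs; i++)
                  crcs[i] = crc_pcl(msgs[i], msg_len, ~0u);
  }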
I do think the 4x unrolling is probably worth keeping around to reduce
dependency on microarchitectural details for future-proofing. It's quite modest
compared to the 128x unrolling that was used before...
> ...
> > > If you are really lucky you'll get two memory reads/clock.
> > > So you won't ever do more than two crc32/clock.
> > > Looking at Agner's instruction latency tables I don't think
> > > any cpu can do more than one per clock, or pipeline them.
> > > I think that means you don't even need two (never mind 3)
> > > buffers.
> >
> > On most Intel and AMD CPUs (I tested Haswell for old Intel, Emerald Rapids for
> > new Intel, and Zen 2 for slightly-old AMD), crc32q has 3 cycle latency and 1 per
> > cycle throughput. So you do need at least 3 streams.
>
> Bah, I missed the latency column :-)
>
> > AMD Zen 5 has much higher crc32q throughput and seems to want up to 7 streams.
> > This is not implemented yet.
>
> The copy of the tables I have is old - doesn't contain Zen-5.
> Does that mean that 2 (or more) of its alu 'units' can do crc32
> so you can do more than 1/clock (along with the memory reads).
That's correct. It seems that 3 ALUs on Zen 5 can do crc32.
> One thought is how much of it is actually worth while!
> If the data isn't already in the L1 data cache then the cache
> loads almost certainly dominate - especially if you have to
> go out to 'real memory'.
> You can benchmark the loops by repeatedly accessing the same
> data - but that isn't what will actually happen.
>
Well, data is rarely checksummed on its own but rather immediately before using
it or right after generating it. In those cases it needs to be pulled into L1
cache, or has already been pulled into L1 cache, anyway.
> > > Most modern x86 can do 4 or 5 (or even more) ALU operations
> > > per clock - depending on the combination of instructions.
> > >
> > > Replace the loop termination with a comparison of 'bufp'
> > > against a pre-calculated limit and you get two instructions
> > > (that might get merged into one u-op) for the loop overhead.
> > > They'll run in parallel with the crc32q instructions.
> >
> > That's actually still three instructions: add, cmp, and jne.
>
> I was really thinking of the loop I quoted later.
> The one that uses negative offsets from the end of the buffer.
> That has an 'add' and a 'jnz' - which might even fuse into a
> single u-op.
> Maybe even constrained to p6 - so won't go near p1.
> (I don't have a recent AMD cpu)
>
> It may not actually matter.
> The add/subtract/cmp are only dependant on themselves.
> Similarly the jne is only dependant on the result of the sub/cmp.
> In principle they can all run in the same clock (for different
> loop cycles) since the rest of the loop only needs one of the
> ALU blocks (on Intel only P1 can do crc).
> But I failed to get a 1 clock loop (using ADC - which doesn't
> have a latency issue).
> It might be impossible because a predicted-taken conditional jmp
> has a latency of 2.
Yes, it's an interesting idea. There would need to be a separate bufend pointer
for each chunk set up.
- Eric
* Re: [PATCH 0/3] crypto: x86/crc32c - jump table elimination and other cleanups
2024-10-14 4:24 [PATCH 0/3] crypto: x86/crc32c - jump table elimination and other cleanups Eric Biggers
` (2 preceding siblings ...)
2024-10-14 4:24 ` [PATCH 3/3] crypto: x86/crc32c - eliminate jump table and excessive unrolling Eric Biggers
@ 2024-10-15 10:55 ` Ard Biesheuvel
2024-10-26 6:53 ` Herbert Xu
4 siblings, 0 replies; 10+ messages in thread
From: Ard Biesheuvel @ 2024-10-15 10:55 UTC (permalink / raw)
To: Eric Biggers
Cc: linux-crypto, x86, linux-kernel, Josh Poimboeuf, Peter Zijlstra
On Mon, 14 Oct 2024 at 06:25, Eric Biggers <ebiggers@kernel.org> wrote:
>
> This series cleans up the x86_64 assembly implementation of CRC32C to
> reduce code size, improve performance, and eliminate the use of the
> outdated and problematic jump table idiom.
>
> Eric Biggers (3):
> crypto: x86/crc32c - simplify code for handling fewer than 200 bytes
> crypto: x86/crc32c - access 32-bit arguments as 32-bit
> crypto: x86/crc32c - eliminate jump table and excessive unrolling
>
Nice cleanup
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
* Re: [PATCH 0/3] crypto: x86/crc32c - jump table elimination and other cleanups
2024-10-14 4:24 [PATCH 0/3] crypto: x86/crc32c - jump table elimination and other cleanups Eric Biggers
` (3 preceding siblings ...)
2024-10-15 10:55 ` [PATCH 0/3] crypto: x86/crc32c - jump table elimination and other cleanups Ard Biesheuvel
@ 2024-10-26 6:53 ` Herbert Xu
4 siblings, 0 replies; 10+ messages in thread
From: Herbert Xu @ 2024-10-26 6:53 UTC (permalink / raw)
To: Eric Biggers; +Cc: linux-crypto, x86, linux-kernel, ardb, jpoimboe, peterz
Eric Biggers <ebiggers@kernel.org> wrote:
> This series cleans up the x86_64 assembly implementation of CRC32C to
> reduce code size, improve performance, and eliminate the use of the
> outdated and problematic jump table idiom.
>
> Eric Biggers (3):
> crypto: x86/crc32c - simplify code for handling fewer than 200 bytes
> crypto: x86/crc32c - access 32-bit arguments as 32-bit
> crypto: x86/crc32c - eliminate jump table and excessive unrolling
>
> arch/x86/crypto/crc32c-intel_glue.c | 2 +-
> arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 354 ++++++++--------------
> 2 files changed, 126 insertions(+), 230 deletions(-)
>
>
> base-commit: cfea70e835b9180029257d8b772c9e99c3305a9a
All applied. Thanks.
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt