linux-crypto.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: x86@kernel.org, linux-kernel@vger.kernel.org,
	Ard Biesheuvel <ardb@kernel.org>,
	Josh Poimboeuf <jpoimboe@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>
Subject: [PATCH 1/3] crypto: x86/crc32c - simplify code for handling fewer than 200 bytes
Date: Sun, 13 Oct 2024 21:24:45 -0700	[thread overview]
Message-ID: <20241014042447.50197-2-ebiggers@kernel.org> (raw)
In-Reply-To: <20241014042447.50197-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

The assembly code in crc32c-pcl-intel-asm_64.S is invoked only for
lengths >= 512, due to the overhead of saving and restoring FPU state.
Therefore, it is unnecessary for this code to be excessively "optimized"
for lengths < 200.  Eliminate the excessive unrolling of this part of
the code and use a more straightforward qword-at-a-time loop.

Note: the part of the code in question is not entirely redundant, as it
is still used to process any remainder mod 24, as well as any remaining
data when fewer than 200 bytes remain after least one 3072-byte chunk.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 116 ++++++----------------
 1 file changed, 33 insertions(+), 83 deletions(-)

diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index bbcff1fb78cb2..466cea4943963 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -54,24 +54,14 @@
 
 .macro JMPTBL_ENTRY i
 .quad .Lcrc_\i
 .endm
 
-.macro JNC_LESS_THAN j
-	jnc .Lless_than_\j
-.endm
-
-# Define threshold where buffers are considered "small" and routed to more
-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
-# SMALL_SIZE can be no larger than 255.
-
+# Define threshold below which buffers are considered "small" and routed to
+# regular CRC code that does not interleave the CRC instructions.
 #define SMALL_SIZE 200
 
-.if (SMALL_SIZE > 255)
-.error "SMALL_ SIZE must be < 256"
-.endif
-
 # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
 
 .text
 SYM_FUNC_START(crc_pcl)
 #define    bufp		rdi
@@ -98,29 +88,22 @@ SYM_FUNC_START(crc_pcl)
 	pushq   %rsi
 
 	## Move crc_init for Linux to a different
 	mov     crc_init_arg, crc_init
 
+	mov	%bufp, bufptmp		# rdi = *buf
+	cmp	$SMALL_SIZE, len
+	jb	.Lsmall
+
 	################################################################
 	## 1) ALIGN:
 	################################################################
-
-	mov     %bufp, bufptmp		# rdi = *buf
 	neg     %bufp
 	and     $7, %bufp		# calculate the unalignment amount of
 					# the address
 	je      .Lproc_block		# Skip if aligned
 
-	## If len is less than 8 and we're unaligned, we need to jump
-	## to special code to avoid reading beyond the end of the buffer
-	cmp     $8, len
-	jae     .Ldo_align
-	# less_than_8 expects length in upper 3 bits of len_dw
-	# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
-	shl     $32-3+1, len_dw
-	jmp     .Lless_than_8_post_shl1
-
 .Ldo_align:
 	#### Calculate CRC of unaligned bytes of the buffer (if any)
 	movq    (bufptmp), tmp		# load a quadward from the buffer
 	add     %bufp, bufptmp		# align buffer pointer for quadword
 					# processing
@@ -142,13 +125,10 @@ SYM_FUNC_START(crc_pcl)
 
 	cmpq    $128*24, len
 	jae     .Lfull_block
 
 .Lcontinue_block:
-	cmpq    $SMALL_SIZE, len
-	jb      .Lsmall
-
 	## len < 128*24
 	movq    $2731, %rax		# 2731 = ceil(2^16 / 24)
 	mul     len_dw
 	shrq    $16, %rax
 
@@ -241,72 +221,42 @@ LABEL crc_ %i
 LABEL crc_ 0
 	ENDBR
 	mov     tmp, len
 	cmp     $128*24, tmp
 	jae     .Lfull_block
-	cmp     $24, tmp
+	cmp	$SMALL_SIZE, tmp
 	jae     .Lcontinue_block
 
-.Lless_than_24:
-	shl     $32-4, len_dw			# less_than_16 expects length
-						# in upper 4 bits of len_dw
-	jnc     .Lless_than_16
-	crc32q  (bufptmp), crc_init
-	crc32q  8(bufptmp), crc_init
-	jz      .Ldo_return
-	add     $16, bufptmp
-	# len is less than 8 if we got here
-	# less_than_8 expects length in upper 3 bits of len_dw
-	# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
-	shl     $2, len_dw
-	jmp     .Lless_than_8_post_shl1
-
 	#######################################################################
-	## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
+	## 6) Process any remainder without interleaving:
 	#######################################################################
 .Lsmall:
-	shl $32-8, len_dw		# Prepare len_dw for less_than_256
-	j=256
-.rept 5					# j = {256, 128, 64, 32, 16}
-.altmacro
-LABEL less_than_ %j			# less_than_j: Length should be in
-					# upper lg(j) bits of len_dw
-	j=(j/2)
-	shl     $1, len_dw		# Get next MSB
-	JNC_LESS_THAN %j
-.noaltmacro
-	i=0
-.rept (j/8)
-	crc32q  i(bufptmp), crc_init	# Compute crc32 of 8-byte data
-	i=i+8
-.endr
-	jz      .Ldo_return		# Return if remaining length is zero
-	add     $j, bufptmp		# Advance buf
-.endr
-
-.Lless_than_8:				# Length should be stored in
-					# upper 3 bits of len_dw
-	shl     $1, len_dw
-.Lless_than_8_post_shl1:
-	jnc     .Lless_than_4
-	crc32l  (bufptmp), crc_init_dw	# CRC of 4 bytes
-	jz      .Ldo_return		# return if remaining data is zero
-	add     $4, bufptmp
-.Lless_than_4:				# Length should be stored in
-					# upper 2 bits of len_dw
-	shl     $1, len_dw
-	jnc     .Lless_than_2
-	crc32w  (bufptmp), crc_init_dw	# CRC of 2 bytes
-	jz      .Ldo_return		# return if remaining data is zero
-	add     $2, bufptmp
-.Lless_than_2:				# Length should be stored in the MSB
-					# of len_dw
-	shl     $1, len_dw
-	jnc     .Lless_than_1
-	crc32b  (bufptmp), crc_init_dw	# CRC of 1 byte
-.Lless_than_1:				# Length should be zero
-.Ldo_return:
+	test	len, len
+	jz	.Ldone
+	mov	len_dw, %eax
+	shr	$3, %eax
+	jz	.Ldo_dword
+.Ldo_qwords:
+	crc32q	(bufptmp), crc_init
+	add	$8, bufptmp
+	dec	%eax
+	jnz	.Ldo_qwords
+.Ldo_dword:
+	test	$4, len_dw
+	jz	.Ldo_word
+	crc32l	(bufptmp), crc_init_dw
+	add	$4, bufptmp
+.Ldo_word:
+	test	$2, len_dw
+	jz	.Ldo_byte
+	crc32w	(bufptmp), crc_init_dw
+	add	$2, bufptmp
+.Ldo_byte:
+	test	$1, len_dw
+	jz	.Ldone
+	crc32b	(bufptmp), crc_init_dw
+.Ldone:
 	movq    crc_init, %rax
 	popq    %rsi
 	popq    %rdi
 	popq    %rbx
         RET
-- 
2.47.0


  reply	other threads:[~2024-10-14  4:25 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-10-14  4:24 [PATCH 0/3] crypto: x86/crc32c - jump table elimination and other cleanups Eric Biggers
2024-10-14  4:24 ` Eric Biggers [this message]
2024-10-14  4:24 ` [PATCH 2/3] crypto: x86/crc32c - access 32-bit arguments as 32-bit Eric Biggers
2024-10-14  4:24 ` [PATCH 3/3] crypto: x86/crc32c - eliminate jump table and excessive unrolling Eric Biggers
2024-10-14 16:30   ` David Laight
2024-10-14 19:01     ` Eric Biggers
2024-10-14 22:32       ` David Laight
2024-10-14 23:59         ` Eric Biggers
2024-10-15 10:55 ` [PATCH 0/3] crypto: x86/crc32c - jump table elimination and other cleanups Ard Biesheuvel
2024-10-26  6:53 ` Herbert Xu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241014042447.50197-2-ebiggers@kernel.org \
    --to=ebiggers@kernel.org \
    --cc=ardb@kernel.org \
    --cc=jpoimboe@kernel.org \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=peterz@infradead.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).