From: Eric Biggers <ebiggers@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: linux-arch@vger.kernel.org, linux-arm-kernel@lists.infradead.org,
linux-crypto@vger.kernel.org, linux-ext4@vger.kernel.org,
linux-f2fs-devel@lists.sourceforge.net,
linux-mips@vger.kernel.org, linux-riscv@lists.infradead.org,
linux-s390@vger.kernel.org, linux-scsi@vger.kernel.org,
linuxppc-dev@lists.ozlabs.org, loongarch@lists.linux.dev,
sparclinux@vger.kernel.org, x86@kernel.org,
Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH v3 11/18] x86/crc32: update prototype for crc_pcl()
Date: Sun, 3 Nov 2024 14:31:47 -0800 [thread overview]
Message-ID: <20241103223154.136127-12-ebiggers@kernel.org> (raw)
In-Reply-To: <20241103223154.136127-1-ebiggers@kernel.org>
From: Eric Biggers <ebiggers@google.com>
- Change the len parameter from unsigned int to size_t, so that the
library function which takes a size_t can safely use this code.
- Rename to crc32c_x86_3way() which is much clearer.
- Move the crc parameter to the front, as this is the usual convention.
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/crc32c-intel_glue.c | 7 ++-
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 63 ++++++++++++-----------
2 files changed, 35 insertions(+), 35 deletions(-)
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 52c5d47ef5a1..603d159de400 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -39,12 +39,11 @@
* size is >= 512 to account
* for fpu state save/restore overhead.
*/
#define CRC32C_PCL_BREAKEVEN 512
-asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
- unsigned int crc_init);
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
#endif /* CONFIG_X86_64 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
while (length--) {
@@ -157,11 +156,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *crcp = crc_pcl(data, len, *crcp);
+ *crcp = crc32c_x86_3way(*crcp, data, len);
kernel_fpu_end();
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
@@ -169,11 +168,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
+ *(__le32 *)out = ~cpu_to_le32(crc32c_x86_3way(*crcp, data, len));
kernel_fpu_end();
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 752812bc4991..9b8770503bbc 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -50,19 +50,20 @@
# Define threshold below which buffers are considered "small" and routed to
# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200
-# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
+# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
.text
-SYM_FUNC_START(crc_pcl)
-#define bufp %rdi
-#define bufp_d %edi
-#define len %esi
-#define crc_init %edx
-#define crc_init_q %rdx
+SYM_FUNC_START(crc32c_x86_3way)
+#define crc0 %edi
+#define crc0_q %rdi
+#define bufp %rsi
+#define bufp_d %esi
+#define len %rdx
+#define len_dw %edx
#define n_misaligned %ecx /* overlaps chunk_bytes! */
#define n_misaligned_q %rcx
#define chunk_bytes %ecx /* overlaps n_misaligned! */
#define chunk_bytes_q %rcx
#define crc1 %r8
@@ -83,13 +84,13 @@ SYM_FUNC_START(crc_pcl)
# Process 1 <= n_misaligned <= 7 bytes individually in order to align
# the remaining data to an 8-byte boundary.
.Ldo_align:
movq (bufp), %rax
add n_misaligned_q, bufp
- sub n_misaligned, len
+ sub n_misaligned_q, len
.Lalign_loop:
- crc32b %al, crc_init # compute crc32 of 1-byte
+ crc32b %al, crc0 # compute crc32 of 1-byte
shr $8, %rax # get next byte
dec n_misaligned
jne .Lalign_loop
.Laligned:
@@ -100,11 +101,11 @@ SYM_FUNC_START(crc_pcl)
cmp $128*24, len
jae .Lfull_block
.Lpartial_block:
# Compute floor(len / 24) to get num qwords to process from each lane.
- imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
+ imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
shr $16, %eax
jmp .Lcrc_3lanes
.Lfull_block:
# Processing 128 qwords from each lane.
@@ -123,20 +124,20 @@ SYM_FUNC_START(crc_pcl)
jl .Lcrc_3lanes_4x_done
# Unroll the loop by a factor of 4 to reduce the overhead of the loop
# bookkeeping instructions, which can compete with crc32q for the ALUs.
.Lcrc_3lanes_4x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
- crc32q 8(bufp), crc_init_q
+ crc32q 8(bufp), crc0_q
crc32q 8(bufp,chunk_bytes_q), crc1
crc32q 8(bufp,chunk_bytes_q,2), crc2
- crc32q 16(bufp), crc_init_q
+ crc32q 16(bufp), crc0_q
crc32q 16(bufp,chunk_bytes_q), crc1
crc32q 16(bufp,chunk_bytes_q,2), crc2
- crc32q 24(bufp), crc_init_q
+ crc32q 24(bufp), crc0_q
crc32q 24(bufp,chunk_bytes_q), crc1
crc32q 24(bufp,chunk_bytes_q,2), crc2
add $32, bufp
sub $4, %eax
jge .Lcrc_3lanes_4x_loop
@@ -144,42 +145,42 @@ SYM_FUNC_START(crc_pcl)
.Lcrc_3lanes_4x_done:
add $4, %eax
jz .Lcrc_3lanes_last_qword
.Lcrc_3lanes_1x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
add $8, bufp
dec %eax
jnz .Lcrc_3lanes_1x_loop
.Lcrc_3lanes_last_qword:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
################################################################
## 4) Combine three results:
################################################################
lea (K_table-8)(%rip), %rax # first entry is for idx 1
pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
- sub %eax, len # len -= chunk_bytes * 3
+ sub %rax, len # len -= chunk_bytes * 3
- movq crc_init_q, %xmm1 # CRC for block 1
+ movq crc0_q, %xmm1 # CRC for block 1
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
movq crc1, %xmm2 # CRC for block 2
pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
pxor %xmm2,%xmm1
movq %xmm1, %rax
xor (bufp,chunk_bytes_q,2), %rax
- mov crc2, crc_init_q
- crc32 %rax, crc_init_q
+ mov crc2, crc0_q
+ crc32 %rax, crc0_q
lea 8(bufp,chunk_bytes_q,2), bufp
################################################################
## 5) If more blocks remain, goto (2):
################################################################
@@ -191,38 +192,38 @@ SYM_FUNC_START(crc_pcl)
#######################################################################
## 6) Process any remainder without interleaving:
#######################################################################
.Lsmall:
- test len, len
+ test len_dw, len_dw
jz .Ldone
- mov len, %eax
+ mov len_dw, %eax
shr $3, %eax
jz .Ldo_dword
.Ldo_qwords:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
add $8, bufp
dec %eax
jnz .Ldo_qwords
.Ldo_dword:
- test $4, len
+ test $4, len_dw
jz .Ldo_word
- crc32l (bufp), crc_init
+ crc32l (bufp), crc0
add $4, bufp
.Ldo_word:
- test $2, len
+ test $2, len_dw
jz .Ldo_byte
- crc32w (bufp), crc_init
+ crc32w (bufp), crc0
add $2, bufp
.Ldo_byte:
- test $1, len
+ test $1, len_dw
jz .Ldone
- crc32b (bufp), crc_init
+ crc32b (bufp), crc0
.Ldone:
- mov crc_init, %eax
+ mov crc0, %eax
RET
-SYM_FUNC_END(crc_pcl)
+SYM_FUNC_END(crc32c_x86_3way)
.section .rodata, "a", @progbits
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each
--
2.47.0
next prev parent reply other threads:[~2024-11-03 22:32 UTC|newest]
Thread overview: 31+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-11-03 22:31 [PATCH v3 00/18] Wire up CRC32 library functions to arch-optimized code Eric Biggers
2024-11-03 22:31 ` [PATCH v3 01/18] lib/crc32: drop leading underscores from __crc32c_le_base Eric Biggers
2024-11-03 22:31 ` [PATCH v3 02/18] lib/crc32: improve support for arch-specific overrides Eric Biggers
2024-11-03 22:31 ` [PATCH v3 03/18] lib/crc32: expose whether the lib is really optimized at runtime Eric Biggers
2024-11-04 10:55 ` Ard Biesheuvel
2024-11-03 22:31 ` [PATCH v3 04/18] crypto: crc32 - don't unnecessarily register arch algorithms Eric Biggers
2024-11-04 10:56 ` Ard Biesheuvel
2024-11-03 22:31 ` [PATCH v3 05/18] arm/crc32: expose CRC32 functions through lib Eric Biggers
2024-11-04 18:10 ` Eric Biggers
2024-11-03 22:31 ` [PATCH v3 06/18] loongarch/crc32: " Eric Biggers
2024-11-03 22:31 ` [PATCH v3 07/18] mips/crc32: " Eric Biggers
2024-11-03 22:31 ` [PATCH v3 08/18] powerpc/crc32: " Eric Biggers
2024-11-05 9:22 ` Michael Ellerman
2024-11-03 22:31 ` [PATCH v3 09/18] s390/crc32: " Eric Biggers
2024-11-03 22:31 ` [PATCH v3 10/18] sparc/crc32: " Eric Biggers
2024-11-03 22:31 ` Eric Biggers [this message]
2024-11-03 22:31 ` [PATCH v3 12/18] x86/crc32: update prototype for crc32_pclmul_le_16() Eric Biggers
2024-11-03 22:31 ` [PATCH v3 13/18] x86/crc32: expose CRC32 functions through lib Eric Biggers
2024-11-03 22:31 ` [PATCH v3 14/18] lib/crc32: make crc32c() go directly to lib Eric Biggers
2024-11-03 22:31 ` [PATCH v3 15/18] ext4: switch to using the crc32c library Eric Biggers
2024-11-04 15:59 ` Darrick J. Wong
2024-11-04 17:48 ` Eric Biggers
2024-11-04 16:17 ` Theodore Ts'o
2024-11-03 22:31 ` [PATCH v3 16/18] jbd2: " Eric Biggers
2024-11-04 16:01 ` Darrick J. Wong
2024-11-04 18:02 ` Eric Biggers
2024-11-04 16:17 ` Theodore Ts'o
2024-11-03 22:31 ` [PATCH v3 17/18] f2fs: switch to using the crc32 library Eric Biggers
2024-11-05 1:34 ` [f2fs-dev] " Chao Yu
2024-11-03 22:31 ` [PATCH v3 18/18] scsi: target: iscsi: switch to using the crc32c library Eric Biggers
2024-11-05 1:33 ` Martin K. Petersen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241103223154.136127-12-ebiggers@kernel.org \
--to=ebiggers@kernel.org \
--cc=ardb@kernel.org \
--cc=linux-arch@vger.kernel.org \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-crypto@vger.kernel.org \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-f2fs-devel@lists.sourceforge.net \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mips@vger.kernel.org \
--cc=linux-riscv@lists.infradead.org \
--cc=linux-s390@vger.kernel.org \
--cc=linux-scsi@vger.kernel.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=loongarch@lists.linux.dev \
--cc=sparclinux@vger.kernel.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).