From: Eric Biggers <ebiggers@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: linux-arch@vger.kernel.org, linux-arm-kernel@lists.infradead.org,
linux-crypto@vger.kernel.org, linux-ext4@vger.kernel.org,
linux-f2fs-devel@lists.sourceforge.net,
linux-mips@vger.kernel.org, linux-riscv@lists.infradead.org,
linux-s390@vger.kernel.org, linux-scsi@vger.kernel.org,
linuxppc-dev@lists.ozlabs.org, loongarch@lists.linux.dev,
sparclinux@vger.kernel.org, x86@kernel.org,
Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH v3 11/18] x86/crc32: update prototype for crc_pcl()
Date: Sun, 3 Nov 2024 14:31:47 -0800 [thread overview]
Message-ID: <20241103223154.136127-12-ebiggers@kernel.org> (raw)
In-Reply-To: <20241103223154.136127-1-ebiggers@kernel.org>
From: Eric Biggers <ebiggers@google.com>
- Change the len parameter from unsigned int to size_t, so that the
library function which takes a size_t can safely use this code.
- Rename crc_pcl() to crc32c_x86_3way(), which is much clearer.
- Move the crc parameter to the front, as this is the usual convention.
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/crc32c-intel_glue.c | 7 ++-
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 63 ++++++++++++-----------
2 files changed, 35 insertions(+), 35 deletions(-)
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 52c5d47ef5a1..603d159de400 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -39,12 +39,11 @@
* size is >= 512 to account
* for fpu state save/restore overhead.
*/
#define CRC32C_PCL_BREAKEVEN 512
-asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
- unsigned int crc_init);
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
#endif /* CONFIG_X86_64 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
while (length--) {
@@ -157,11 +156,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *crcp = crc_pcl(data, len, *crcp);
+ *crcp = crc32c_x86_3way(*crcp, data, len);
kernel_fpu_end();
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
@@ -169,11 +168,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
+ *(__le32 *)out = ~cpu_to_le32(crc32c_x86_3way(*crcp, data, len));
kernel_fpu_end();
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 752812bc4991..9b8770503bbc 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -50,19 +50,20 @@
# Define threshold below which buffers are considered "small" and routed to
# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200
-# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
+# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
.text
-SYM_FUNC_START(crc_pcl)
-#define bufp %rdi
-#define bufp_d %edi
-#define len %esi
-#define crc_init %edx
-#define crc_init_q %rdx
+SYM_FUNC_START(crc32c_x86_3way)
+#define crc0 %edi
+#define crc0_q %rdi
+#define bufp %rsi
+#define bufp_d %esi
+#define len %rdx
+#define len_dw %edx
#define n_misaligned %ecx /* overlaps chunk_bytes! */
#define n_misaligned_q %rcx
#define chunk_bytes %ecx /* overlaps n_misaligned! */
#define chunk_bytes_q %rcx
#define crc1 %r8
@@ -83,13 +84,13 @@ SYM_FUNC_START(crc_pcl)
# Process 1 <= n_misaligned <= 7 bytes individually in order to align
# the remaining data to an 8-byte boundary.
.Ldo_align:
movq (bufp), %rax
add n_misaligned_q, bufp
- sub n_misaligned, len
+ sub n_misaligned_q, len
.Lalign_loop:
- crc32b %al, crc_init # compute crc32 of 1-byte
+ crc32b %al, crc0 # compute crc32 of 1-byte
shr $8, %rax # get next byte
dec n_misaligned
jne .Lalign_loop
.Laligned:
@@ -100,11 +101,11 @@ SYM_FUNC_START(crc_pcl)
cmp $128*24, len
jae .Lfull_block
.Lpartial_block:
# Compute floor(len / 24) to get num qwords to process from each lane.
- imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
+ imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
shr $16, %eax
jmp .Lcrc_3lanes
.Lfull_block:
# Processing 128 qwords from each lane.
@@ -123,20 +124,20 @@ SYM_FUNC_START(crc_pcl)
jl .Lcrc_3lanes_4x_done
# Unroll the loop by a factor of 4 to reduce the overhead of the loop
# bookkeeping instructions, which can compete with crc32q for the ALUs.
.Lcrc_3lanes_4x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
- crc32q 8(bufp), crc_init_q
+ crc32q 8(bufp), crc0_q
crc32q 8(bufp,chunk_bytes_q), crc1
crc32q 8(bufp,chunk_bytes_q,2), crc2
- crc32q 16(bufp), crc_init_q
+ crc32q 16(bufp), crc0_q
crc32q 16(bufp,chunk_bytes_q), crc1
crc32q 16(bufp,chunk_bytes_q,2), crc2
- crc32q 24(bufp), crc_init_q
+ crc32q 24(bufp), crc0_q
crc32q 24(bufp,chunk_bytes_q), crc1
crc32q 24(bufp,chunk_bytes_q,2), crc2
add $32, bufp
sub $4, %eax
jge .Lcrc_3lanes_4x_loop
@@ -144,42 +145,42 @@ SYM_FUNC_START(crc_pcl)
.Lcrc_3lanes_4x_done:
add $4, %eax
jz .Lcrc_3lanes_last_qword
.Lcrc_3lanes_1x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
add $8, bufp
dec %eax
jnz .Lcrc_3lanes_1x_loop
.Lcrc_3lanes_last_qword:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
################################################################
## 4) Combine three results:
################################################################
lea (K_table-8)(%rip), %rax # first entry is for idx 1
pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
- sub %eax, len # len -= chunk_bytes * 3
+ sub %rax, len # len -= chunk_bytes * 3
- movq crc_init_q, %xmm1 # CRC for block 1
+ movq crc0_q, %xmm1 # CRC for block 1
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
movq crc1, %xmm2 # CRC for block 2
pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
pxor %xmm2,%xmm1
movq %xmm1, %rax
xor (bufp,chunk_bytes_q,2), %rax
- mov crc2, crc_init_q
- crc32 %rax, crc_init_q
+ mov crc2, crc0_q
+ crc32 %rax, crc0_q
lea 8(bufp,chunk_bytes_q,2), bufp
################################################################
## 5) If more blocks remain, goto (2):
################################################################
@@ -191,38 +192,38 @@ SYM_FUNC_START(crc_pcl)
#######################################################################
## 6) Process any remainder without interleaving:
#######################################################################
.Lsmall:
- test len, len
+ test len_dw, len_dw
jz .Ldone
- mov len, %eax
+ mov len_dw, %eax
shr $3, %eax
jz .Ldo_dword
.Ldo_qwords:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
add $8, bufp
dec %eax
jnz .Ldo_qwords
.Ldo_dword:
- test $4, len
+ test $4, len_dw
jz .Ldo_word
- crc32l (bufp), crc_init
+ crc32l (bufp), crc0
add $4, bufp
.Ldo_word:
- test $2, len
+ test $2, len_dw
jz .Ldo_byte
- crc32w (bufp), crc_init
+ crc32w (bufp), crc0
add $2, bufp
.Ldo_byte:
- test $1, len
+ test $1, len_dw
jz .Ldone
- crc32b (bufp), crc_init
+ crc32b (bufp), crc0
.Ldone:
- mov crc_init, %eax
+ mov crc0, %eax
RET
-SYM_FUNC_END(crc_pcl)
+SYM_FUNC_END(crc32c_x86_3way)
.section .rodata, "a", @progbits
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each
--
2.47.0
WARNING: multiple messages have this Message-ID (diff)
From: Eric Biggers via Linux-f2fs-devel <linux-f2fs-devel@lists.sourceforge.net>
To: linux-kernel@vger.kernel.org
Cc: linux-arch@vger.kernel.org, linux-s390@vger.kernel.org,
linux-scsi@vger.kernel.org, x86@kernel.org,
linux-mips@vger.kernel.org,
linux-f2fs-devel@lists.sourceforge.net,
linux-crypto@vger.kernel.org, loongarch@lists.linux.dev,
sparclinux@vger.kernel.org, linux-riscv@lists.infradead.org,
linux-ext4@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
Ard Biesheuvel <ardb@kernel.org>,
linux-arm-kernel@lists.infradead.org
Subject: [f2fs-dev] [PATCH v3 11/18] x86/crc32: update prototype for crc_pcl()
Date: Sun, 3 Nov 2024 14:31:47 -0800 [thread overview]
Message-ID: <20241103223154.136127-12-ebiggers@kernel.org> (raw)
In-Reply-To: <20241103223154.136127-1-ebiggers@kernel.org>
From: Eric Biggers <ebiggers@google.com>
- Change the len parameter from unsigned int to size_t, so that the
library function which takes a size_t can safely use this code.
- Rename crc_pcl() to crc32c_x86_3way(), which is much clearer.
- Move the crc parameter to the front, as this is the usual convention.
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/crc32c-intel_glue.c | 7 ++-
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 63 ++++++++++++-----------
2 files changed, 35 insertions(+), 35 deletions(-)
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 52c5d47ef5a1..603d159de400 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -39,12 +39,11 @@
* size is >= 512 to account
* for fpu state save/restore overhead.
*/
#define CRC32C_PCL_BREAKEVEN 512
-asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
- unsigned int crc_init);
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
#endif /* CONFIG_X86_64 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
while (length--) {
@@ -157,11 +156,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *crcp = crc_pcl(data, len, *crcp);
+ *crcp = crc32c_x86_3way(*crcp, data, len);
kernel_fpu_end();
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
@@ -169,11 +168,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
+ *(__le32 *)out = ~cpu_to_le32(crc32c_x86_3way(*crcp, data, len));
kernel_fpu_end();
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 752812bc4991..9b8770503bbc 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -50,19 +50,20 @@
# Define threshold below which buffers are considered "small" and routed to
# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200
-# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
+# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
.text
-SYM_FUNC_START(crc_pcl)
-#define bufp %rdi
-#define bufp_d %edi
-#define len %esi
-#define crc_init %edx
-#define crc_init_q %rdx
+SYM_FUNC_START(crc32c_x86_3way)
+#define crc0 %edi
+#define crc0_q %rdi
+#define bufp %rsi
+#define bufp_d %esi
+#define len %rdx
+#define len_dw %edx
#define n_misaligned %ecx /* overlaps chunk_bytes! */
#define n_misaligned_q %rcx
#define chunk_bytes %ecx /* overlaps n_misaligned! */
#define chunk_bytes_q %rcx
#define crc1 %r8
@@ -83,13 +84,13 @@ SYM_FUNC_START(crc_pcl)
# Process 1 <= n_misaligned <= 7 bytes individually in order to align
# the remaining data to an 8-byte boundary.
.Ldo_align:
movq (bufp), %rax
add n_misaligned_q, bufp
- sub n_misaligned, len
+ sub n_misaligned_q, len
.Lalign_loop:
- crc32b %al, crc_init # compute crc32 of 1-byte
+ crc32b %al, crc0 # compute crc32 of 1-byte
shr $8, %rax # get next byte
dec n_misaligned
jne .Lalign_loop
.Laligned:
@@ -100,11 +101,11 @@ SYM_FUNC_START(crc_pcl)
cmp $128*24, len
jae .Lfull_block
.Lpartial_block:
# Compute floor(len / 24) to get num qwords to process from each lane.
- imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
+ imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
shr $16, %eax
jmp .Lcrc_3lanes
.Lfull_block:
# Processing 128 qwords from each lane.
@@ -123,20 +124,20 @@ SYM_FUNC_START(crc_pcl)
jl .Lcrc_3lanes_4x_done
# Unroll the loop by a factor of 4 to reduce the overhead of the loop
# bookkeeping instructions, which can compete with crc32q for the ALUs.
.Lcrc_3lanes_4x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
- crc32q 8(bufp), crc_init_q
+ crc32q 8(bufp), crc0_q
crc32q 8(bufp,chunk_bytes_q), crc1
crc32q 8(bufp,chunk_bytes_q,2), crc2
- crc32q 16(bufp), crc_init_q
+ crc32q 16(bufp), crc0_q
crc32q 16(bufp,chunk_bytes_q), crc1
crc32q 16(bufp,chunk_bytes_q,2), crc2
- crc32q 24(bufp), crc_init_q
+ crc32q 24(bufp), crc0_q
crc32q 24(bufp,chunk_bytes_q), crc1
crc32q 24(bufp,chunk_bytes_q,2), crc2
add $32, bufp
sub $4, %eax
jge .Lcrc_3lanes_4x_loop
@@ -144,42 +145,42 @@ SYM_FUNC_START(crc_pcl)
.Lcrc_3lanes_4x_done:
add $4, %eax
jz .Lcrc_3lanes_last_qword
.Lcrc_3lanes_1x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
add $8, bufp
dec %eax
jnz .Lcrc_3lanes_1x_loop
.Lcrc_3lanes_last_qword:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
################################################################
## 4) Combine three results:
################################################################
lea (K_table-8)(%rip), %rax # first entry is for idx 1
pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
- sub %eax, len # len -= chunk_bytes * 3
+ sub %rax, len # len -= chunk_bytes * 3
- movq crc_init_q, %xmm1 # CRC for block 1
+ movq crc0_q, %xmm1 # CRC for block 1
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
movq crc1, %xmm2 # CRC for block 2
pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
pxor %xmm2,%xmm1
movq %xmm1, %rax
xor (bufp,chunk_bytes_q,2), %rax
- mov crc2, crc_init_q
- crc32 %rax, crc_init_q
+ mov crc2, crc0_q
+ crc32 %rax, crc0_q
lea 8(bufp,chunk_bytes_q,2), bufp
################################################################
## 5) If more blocks remain, goto (2):
################################################################
@@ -191,38 +192,38 @@ SYM_FUNC_START(crc_pcl)
#######################################################################
## 6) Process any remainder without interleaving:
#######################################################################
.Lsmall:
- test len, len
+ test len_dw, len_dw
jz .Ldone
- mov len, %eax
+ mov len_dw, %eax
shr $3, %eax
jz .Ldo_dword
.Ldo_qwords:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
add $8, bufp
dec %eax
jnz .Ldo_qwords
.Ldo_dword:
- test $4, len
+ test $4, len_dw
jz .Ldo_word
- crc32l (bufp), crc_init
+ crc32l (bufp), crc0
add $4, bufp
.Ldo_word:
- test $2, len
+ test $2, len_dw
jz .Ldo_byte
- crc32w (bufp), crc_init
+ crc32w (bufp), crc0
add $2, bufp
.Ldo_byte:
- test $1, len
+ test $1, len_dw
jz .Ldone
- crc32b (bufp), crc_init
+ crc32b (bufp), crc0
.Ldone:
- mov crc_init, %eax
+ mov crc0, %eax
RET
-SYM_FUNC_END(crc_pcl)
+SYM_FUNC_END(crc32c_x86_3way)
.section .rodata, "a", @progbits
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each
--
2.47.0
_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
WARNING: multiple messages have this Message-ID (diff)
From: Eric Biggers <ebiggers@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: linux-arch@vger.kernel.org, linux-arm-kernel@lists.infradead.org,
linux-crypto@vger.kernel.org, linux-ext4@vger.kernel.org,
linux-f2fs-devel@lists.sourceforge.net,
linux-mips@vger.kernel.org, linux-riscv@lists.infradead.org,
linux-s390@vger.kernel.org, linux-scsi@vger.kernel.org,
linuxppc-dev@lists.ozlabs.org, loongarch@lists.linux.dev,
sparclinux@vger.kernel.org, x86@kernel.org,
Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH v3 11/18] x86/crc32: update prototype for crc_pcl()
Date: Sun, 3 Nov 2024 14:31:47 -0800 [thread overview]
Message-ID: <20241103223154.136127-12-ebiggers@kernel.org> (raw)
In-Reply-To: <20241103223154.136127-1-ebiggers@kernel.org>
From: Eric Biggers <ebiggers@google.com>
- Change the len parameter from unsigned int to size_t, so that the
library function which takes a size_t can safely use this code.
- Rename crc_pcl() to crc32c_x86_3way(), which is much clearer.
- Move the crc parameter to the front, as this is the usual convention.
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/crc32c-intel_glue.c | 7 ++-
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 63 ++++++++++++-----------
2 files changed, 35 insertions(+), 35 deletions(-)
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 52c5d47ef5a1..603d159de400 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -39,12 +39,11 @@
* size is >= 512 to account
* for fpu state save/restore overhead.
*/
#define CRC32C_PCL_BREAKEVEN 512
-asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
- unsigned int crc_init);
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
#endif /* CONFIG_X86_64 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
while (length--) {
@@ -157,11 +156,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *crcp = crc_pcl(data, len, *crcp);
+ *crcp = crc32c_x86_3way(*crcp, data, len);
kernel_fpu_end();
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
@@ -169,11 +168,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
+ *(__le32 *)out = ~cpu_to_le32(crc32c_x86_3way(*crcp, data, len));
kernel_fpu_end();
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 752812bc4991..9b8770503bbc 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -50,19 +50,20 @@
# Define threshold below which buffers are considered "small" and routed to
# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200
-# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
+# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
.text
-SYM_FUNC_START(crc_pcl)
-#define bufp %rdi
-#define bufp_d %edi
-#define len %esi
-#define crc_init %edx
-#define crc_init_q %rdx
+SYM_FUNC_START(crc32c_x86_3way)
+#define crc0 %edi
+#define crc0_q %rdi
+#define bufp %rsi
+#define bufp_d %esi
+#define len %rdx
+#define len_dw %edx
#define n_misaligned %ecx /* overlaps chunk_bytes! */
#define n_misaligned_q %rcx
#define chunk_bytes %ecx /* overlaps n_misaligned! */
#define chunk_bytes_q %rcx
#define crc1 %r8
@@ -83,13 +84,13 @@ SYM_FUNC_START(crc_pcl)
# Process 1 <= n_misaligned <= 7 bytes individually in order to align
# the remaining data to an 8-byte boundary.
.Ldo_align:
movq (bufp), %rax
add n_misaligned_q, bufp
- sub n_misaligned, len
+ sub n_misaligned_q, len
.Lalign_loop:
- crc32b %al, crc_init # compute crc32 of 1-byte
+ crc32b %al, crc0 # compute crc32 of 1-byte
shr $8, %rax # get next byte
dec n_misaligned
jne .Lalign_loop
.Laligned:
@@ -100,11 +101,11 @@ SYM_FUNC_START(crc_pcl)
cmp $128*24, len
jae .Lfull_block
.Lpartial_block:
# Compute floor(len / 24) to get num qwords to process from each lane.
- imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
+ imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
shr $16, %eax
jmp .Lcrc_3lanes
.Lfull_block:
# Processing 128 qwords from each lane.
@@ -123,20 +124,20 @@ SYM_FUNC_START(crc_pcl)
jl .Lcrc_3lanes_4x_done
# Unroll the loop by a factor of 4 to reduce the overhead of the loop
# bookkeeping instructions, which can compete with crc32q for the ALUs.
.Lcrc_3lanes_4x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
- crc32q 8(bufp), crc_init_q
+ crc32q 8(bufp), crc0_q
crc32q 8(bufp,chunk_bytes_q), crc1
crc32q 8(bufp,chunk_bytes_q,2), crc2
- crc32q 16(bufp), crc_init_q
+ crc32q 16(bufp), crc0_q
crc32q 16(bufp,chunk_bytes_q), crc1
crc32q 16(bufp,chunk_bytes_q,2), crc2
- crc32q 24(bufp), crc_init_q
+ crc32q 24(bufp), crc0_q
crc32q 24(bufp,chunk_bytes_q), crc1
crc32q 24(bufp,chunk_bytes_q,2), crc2
add $32, bufp
sub $4, %eax
jge .Lcrc_3lanes_4x_loop
@@ -144,42 +145,42 @@ SYM_FUNC_START(crc_pcl)
.Lcrc_3lanes_4x_done:
add $4, %eax
jz .Lcrc_3lanes_last_qword
.Lcrc_3lanes_1x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
add $8, bufp
dec %eax
jnz .Lcrc_3lanes_1x_loop
.Lcrc_3lanes_last_qword:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
################################################################
## 4) Combine three results:
################################################################
lea (K_table-8)(%rip), %rax # first entry is for idx 1
pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
- sub %eax, len # len -= chunk_bytes * 3
+ sub %rax, len # len -= chunk_bytes * 3
- movq crc_init_q, %xmm1 # CRC for block 1
+ movq crc0_q, %xmm1 # CRC for block 1
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
movq crc1, %xmm2 # CRC for block 2
pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
pxor %xmm2,%xmm1
movq %xmm1, %rax
xor (bufp,chunk_bytes_q,2), %rax
- mov crc2, crc_init_q
- crc32 %rax, crc_init_q
+ mov crc2, crc0_q
+ crc32 %rax, crc0_q
lea 8(bufp,chunk_bytes_q,2), bufp
################################################################
## 5) If more blocks remain, goto (2):
################################################################
@@ -191,38 +192,38 @@ SYM_FUNC_START(crc_pcl)
#######################################################################
## 6) Process any remainder without interleaving:
#######################################################################
.Lsmall:
- test len, len
+ test len_dw, len_dw
jz .Ldone
- mov len, %eax
+ mov len_dw, %eax
shr $3, %eax
jz .Ldo_dword
.Ldo_qwords:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
add $8, bufp
dec %eax
jnz .Ldo_qwords
.Ldo_dword:
- test $4, len
+ test $4, len_dw
jz .Ldo_word
- crc32l (bufp), crc_init
+ crc32l (bufp), crc0
add $4, bufp
.Ldo_word:
- test $2, len
+ test $2, len_dw
jz .Ldo_byte
- crc32w (bufp), crc_init
+ crc32w (bufp), crc0
add $2, bufp
.Ldo_byte:
- test $1, len
+ test $1, len_dw
jz .Ldone
- crc32b (bufp), crc_init
+ crc32b (bufp), crc0
.Ldone:
- mov crc_init, %eax
+ mov crc0, %eax
RET
-SYM_FUNC_END(crc_pcl)
+SYM_FUNC_END(crc32c_x86_3way)
.section .rodata, "a", @progbits
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each
--
2.47.0
_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv
next prev parent reply other threads:[~2024-11-03 22:32 UTC|newest]
Thread overview: 93+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-11-03 22:31 [PATCH v3 00/18] Wire up CRC32 library functions to arch-optimized code Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 01/18] lib/crc32: drop leading underscores from __crc32c_le_base Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 02/18] lib/crc32: improve support for arch-specific overrides Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 03/18] lib/crc32: expose whether the lib is really optimized at runtime Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-04 10:55 ` Ard Biesheuvel
2024-11-04 10:55 ` Ard Biesheuvel
2024-11-04 10:55 ` [f2fs-dev] " Ard Biesheuvel via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 04/18] crypto: crc32 - don't unnecessarily register arch algorithms Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-04 10:56 ` Ard Biesheuvel
2024-11-04 10:56 ` Ard Biesheuvel
2024-11-04 10:56 ` [f2fs-dev] " Ard Biesheuvel via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 05/18] arm/crc32: expose CRC32 functions through lib Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-04 18:10 ` Eric Biggers
2024-11-04 18:10 ` Eric Biggers
2024-11-04 18:10 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 06/18] loongarch/crc32: " Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 07/18] mips/crc32: " Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 08/18] powerpc/crc32: " Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-05 9:22 ` Michael Ellerman
2024-11-05 9:22 ` Michael Ellerman
2024-11-05 9:22 ` [f2fs-dev] " Michael Ellerman
2024-11-03 22:31 ` [PATCH v3 09/18] s390/crc32: " Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 10/18] sparc/crc32: " Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-03 22:31 ` Eric Biggers [this message]
2024-11-03 22:31 ` [PATCH v3 11/18] x86/crc32: update prototype for crc_pcl() Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 12/18] x86/crc32: update prototype for crc32_pclmul_le_16() Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 13/18] x86/crc32: expose CRC32 functions through lib Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 14/18] lib/crc32: make crc32c() go directly to lib Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 15/18] ext4: switch to using the crc32c library Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-04 15:59 ` Darrick J. Wong
2024-11-04 15:59 ` Darrick J. Wong
2024-11-04 15:59 ` [f2fs-dev] " Darrick J. Wong via Linux-f2fs-devel
2024-11-04 17:48 ` Eric Biggers
2024-11-04 17:48 ` Eric Biggers
2024-11-04 17:48 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-04 16:17 ` Theodore Ts'o
2024-11-04 16:17 ` Theodore Ts'o
2024-11-04 16:17 ` [f2fs-dev] " Theodore Ts'o
2024-11-03 22:31 ` [PATCH v3 16/18] jbd2: " Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-04 16:01 ` Darrick J. Wong
2024-11-04 16:01 ` Darrick J. Wong
2024-11-04 16:01 ` [f2fs-dev] " Darrick J. Wong via Linux-f2fs-devel
2024-11-04 18:02 ` Eric Biggers
2024-11-04 18:02 ` Eric Biggers
2024-11-04 18:02 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-04 16:17 ` Theodore Ts'o
2024-11-04 16:17 ` Theodore Ts'o
2024-11-04 16:17 ` [f2fs-dev] " Theodore Ts'o
2024-11-03 22:31 ` [PATCH v3 17/18] f2fs: switch to using the crc32 library Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-05 1:34 ` Chao Yu
2024-11-05 1:34 ` Chao Yu
2024-11-05 1:34 ` Chao Yu via Linux-f2fs-devel
2024-11-03 22:31 ` [PATCH v3 18/18] scsi: target: iscsi: switch to using the crc32c library Eric Biggers
2024-11-03 22:31 ` Eric Biggers
2024-11-03 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-05 1:33 ` Martin K. Petersen
2024-11-05 1:33 ` Martin K. Petersen
2024-11-05 1:33 ` [f2fs-dev] " Martin K. Petersen via Linux-f2fs-devel
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241103223154.136127-12-ebiggers@kernel.org \
--to=ebiggers@kernel.org \
--cc=ardb@kernel.org \
--cc=linux-arch@vger.kernel.org \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-crypto@vger.kernel.org \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-f2fs-devel@lists.sourceforge.net \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mips@vger.kernel.org \
--cc=linux-riscv@lists.infradead.org \
--cc=linux-s390@vger.kernel.org \
--cc=linux-scsi@vger.kernel.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=loongarch@lists.linux.dev \
--cc=sparclinux@vger.kernel.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.