From: Eric Biggers <ebiggers@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: linux-arch@vger.kernel.org, linux-arm-kernel@lists.infradead.org,
linux-crypto@vger.kernel.org, linux-ext4@vger.kernel.org,
linux-f2fs-devel@lists.sourceforge.net,
linux-mips@vger.kernel.org, linux-riscv@lists.infradead.org,
linux-s390@vger.kernel.org, linux-scsi@vger.kernel.org,
linuxppc-dev@lists.ozlabs.org, loongarch@lists.linux.dev,
sparclinux@vger.kernel.org, x86@kernel.org,
Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH v2 11/18] x86/crc32: update prototype for crc_pcl()
Date: Fri, 25 Oct 2024 12:14:47 -0700 [thread overview]
Message-ID: <20241025191454.72616-12-ebiggers@kernel.org> (raw)
In-Reply-To: <20241025191454.72616-1-ebiggers@kernel.org>
From: Eric Biggers <ebiggers@google.com>
- Change the len parameter from unsigned int to size_t, so that the
library function which takes a size_t can safely use this code.
- Rename the function to crc32c_x86_3way(), which is much clearer.
- Move the crc parameter to the front, as this is the usual convention.
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/crc32c-intel_glue.c | 7 ++-
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 63 ++++++++++++-----------
2 files changed, 35 insertions(+), 35 deletions(-)
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 52c5d47ef5a1..603d159de400 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -39,12 +39,11 @@
* size is >= 512 to account
* for fpu state save/restore overhead.
*/
#define CRC32C_PCL_BREAKEVEN 512
-asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
- unsigned int crc_init);
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
#endif /* CONFIG_X86_64 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
while (length--) {
@@ -157,11 +156,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *crcp = crc_pcl(data, len, *crcp);
+ *crcp = crc32c_x86_3way(*crcp, data, len);
kernel_fpu_end();
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
@@ -169,11 +168,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
+ *(__le32 *)out = ~cpu_to_le32(crc32c_x86_3way(*crcp, data, len));
kernel_fpu_end();
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 752812bc4991..9b8770503bbc 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -50,19 +50,20 @@
# Define threshold below which buffers are considered "small" and routed to
# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200
-# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
+# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
.text
-SYM_FUNC_START(crc_pcl)
-#define bufp %rdi
-#define bufp_d %edi
-#define len %esi
-#define crc_init %edx
-#define crc_init_q %rdx
+SYM_FUNC_START(crc32c_x86_3way)
+#define crc0 %edi
+#define crc0_q %rdi
+#define bufp %rsi
+#define bufp_d %esi
+#define len %rdx
+#define len_dw %edx
#define n_misaligned %ecx /* overlaps chunk_bytes! */
#define n_misaligned_q %rcx
#define chunk_bytes %ecx /* overlaps n_misaligned! */
#define chunk_bytes_q %rcx
#define crc1 %r8
@@ -83,13 +84,13 @@ SYM_FUNC_START(crc_pcl)
# Process 1 <= n_misaligned <= 7 bytes individually in order to align
# the remaining data to an 8-byte boundary.
.Ldo_align:
movq (bufp), %rax
add n_misaligned_q, bufp
- sub n_misaligned, len
+ sub n_misaligned_q, len
.Lalign_loop:
- crc32b %al, crc_init # compute crc32 of 1-byte
+ crc32b %al, crc0 # compute crc32 of 1-byte
shr $8, %rax # get next byte
dec n_misaligned
jne .Lalign_loop
.Laligned:
@@ -100,11 +101,11 @@ SYM_FUNC_START(crc_pcl)
cmp $128*24, len
jae .Lfull_block
.Lpartial_block:
# Compute floor(len / 24) to get num qwords to process from each lane.
- imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
+ imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
shr $16, %eax
jmp .Lcrc_3lanes
.Lfull_block:
# Processing 128 qwords from each lane.
@@ -123,20 +124,20 @@ SYM_FUNC_START(crc_pcl)
jl .Lcrc_3lanes_4x_done
# Unroll the loop by a factor of 4 to reduce the overhead of the loop
# bookkeeping instructions, which can compete with crc32q for the ALUs.
.Lcrc_3lanes_4x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
- crc32q 8(bufp), crc_init_q
+ crc32q 8(bufp), crc0_q
crc32q 8(bufp,chunk_bytes_q), crc1
crc32q 8(bufp,chunk_bytes_q,2), crc2
- crc32q 16(bufp), crc_init_q
+ crc32q 16(bufp), crc0_q
crc32q 16(bufp,chunk_bytes_q), crc1
crc32q 16(bufp,chunk_bytes_q,2), crc2
- crc32q 24(bufp), crc_init_q
+ crc32q 24(bufp), crc0_q
crc32q 24(bufp,chunk_bytes_q), crc1
crc32q 24(bufp,chunk_bytes_q,2), crc2
add $32, bufp
sub $4, %eax
jge .Lcrc_3lanes_4x_loop
@@ -144,42 +145,42 @@ SYM_FUNC_START(crc_pcl)
.Lcrc_3lanes_4x_done:
add $4, %eax
jz .Lcrc_3lanes_last_qword
.Lcrc_3lanes_1x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
add $8, bufp
dec %eax
jnz .Lcrc_3lanes_1x_loop
.Lcrc_3lanes_last_qword:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
################################################################
## 4) Combine three results:
################################################################
lea (K_table-8)(%rip), %rax # first entry is for idx 1
pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
- sub %eax, len # len -= chunk_bytes * 3
+ sub %rax, len # len -= chunk_bytes * 3
- movq crc_init_q, %xmm1 # CRC for block 1
+ movq crc0_q, %xmm1 # CRC for block 1
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
movq crc1, %xmm2 # CRC for block 2
pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
pxor %xmm2,%xmm1
movq %xmm1, %rax
xor (bufp,chunk_bytes_q,2), %rax
- mov crc2, crc_init_q
- crc32 %rax, crc_init_q
+ mov crc2, crc0_q
+ crc32 %rax, crc0_q
lea 8(bufp,chunk_bytes_q,2), bufp
################################################################
## 5) If more blocks remain, goto (2):
################################################################
@@ -191,38 +192,38 @@ SYM_FUNC_START(crc_pcl)
#######################################################################
## 6) Process any remainder without interleaving:
#######################################################################
.Lsmall:
- test len, len
+ test len_dw, len_dw
jz .Ldone
- mov len, %eax
+ mov len_dw, %eax
shr $3, %eax
jz .Ldo_dword
.Ldo_qwords:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
add $8, bufp
dec %eax
jnz .Ldo_qwords
.Ldo_dword:
- test $4, len
+ test $4, len_dw
jz .Ldo_word
- crc32l (bufp), crc_init
+ crc32l (bufp), crc0
add $4, bufp
.Ldo_word:
- test $2, len
+ test $2, len_dw
jz .Ldo_byte
- crc32w (bufp), crc_init
+ crc32w (bufp), crc0
add $2, bufp
.Ldo_byte:
- test $1, len
+ test $1, len_dw
jz .Ldone
- crc32b (bufp), crc_init
+ crc32b (bufp), crc0
.Ldone:
- mov crc_init, %eax
+ mov crc0, %eax
RET
-SYM_FUNC_END(crc_pcl)
+SYM_FUNC_END(crc32c_x86_3way)
.section .rodata, "a", @progbits
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each
--
2.47.0
WARNING: multiple messages have this Message-ID (diff)
From: Eric Biggers via Linux-f2fs-devel <linux-f2fs-devel@lists.sourceforge.net>
To: linux-kernel@vger.kernel.org
Cc: linux-arch@vger.kernel.org, linux-s390@vger.kernel.org,
linux-scsi@vger.kernel.org, x86@kernel.org,
linux-mips@vger.kernel.org,
linux-f2fs-devel@lists.sourceforge.net,
linux-crypto@vger.kernel.org, loongarch@lists.linux.dev,
sparclinux@vger.kernel.org, linux-riscv@lists.infradead.org,
linux-ext4@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
Ard Biesheuvel <ardb@kernel.org>,
linux-arm-kernel@lists.infradead.org
Subject: [f2fs-dev] [PATCH v2 11/18] x86/crc32: update prototype for crc_pcl()
Date: Fri, 25 Oct 2024 12:14:47 -0700 [thread overview]
Message-ID: <20241025191454.72616-12-ebiggers@kernel.org> (raw)
In-Reply-To: <20241025191454.72616-1-ebiggers@kernel.org>
From: Eric Biggers <ebiggers@google.com>
- Change the len parameter from unsigned int to size_t, so that the
library function which takes a size_t can safely use this code.
- Rename the function to crc32c_x86_3way(), which is much clearer.
- Move the crc parameter to the front, as this is the usual convention.
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/crc32c-intel_glue.c | 7 ++-
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 63 ++++++++++++-----------
2 files changed, 35 insertions(+), 35 deletions(-)
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 52c5d47ef5a1..603d159de400 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -39,12 +39,11 @@
* size is >= 512 to account
* for fpu state save/restore overhead.
*/
#define CRC32C_PCL_BREAKEVEN 512
-asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
- unsigned int crc_init);
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
#endif /* CONFIG_X86_64 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
while (length--) {
@@ -157,11 +156,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *crcp = crc_pcl(data, len, *crcp);
+ *crcp = crc32c_x86_3way(*crcp, data, len);
kernel_fpu_end();
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
@@ -169,11 +168,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
+ *(__le32 *)out = ~cpu_to_le32(crc32c_x86_3way(*crcp, data, len));
kernel_fpu_end();
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 752812bc4991..9b8770503bbc 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -50,19 +50,20 @@
# Define threshold below which buffers are considered "small" and routed to
# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200
-# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
+# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
.text
-SYM_FUNC_START(crc_pcl)
-#define bufp %rdi
-#define bufp_d %edi
-#define len %esi
-#define crc_init %edx
-#define crc_init_q %rdx
+SYM_FUNC_START(crc32c_x86_3way)
+#define crc0 %edi
+#define crc0_q %rdi
+#define bufp %rsi
+#define bufp_d %esi
+#define len %rdx
+#define len_dw %edx
#define n_misaligned %ecx /* overlaps chunk_bytes! */
#define n_misaligned_q %rcx
#define chunk_bytes %ecx /* overlaps n_misaligned! */
#define chunk_bytes_q %rcx
#define crc1 %r8
@@ -83,13 +84,13 @@ SYM_FUNC_START(crc_pcl)
# Process 1 <= n_misaligned <= 7 bytes individually in order to align
# the remaining data to an 8-byte boundary.
.Ldo_align:
movq (bufp), %rax
add n_misaligned_q, bufp
- sub n_misaligned, len
+ sub n_misaligned_q, len
.Lalign_loop:
- crc32b %al, crc_init # compute crc32 of 1-byte
+ crc32b %al, crc0 # compute crc32 of 1-byte
shr $8, %rax # get next byte
dec n_misaligned
jne .Lalign_loop
.Laligned:
@@ -100,11 +101,11 @@ SYM_FUNC_START(crc_pcl)
cmp $128*24, len
jae .Lfull_block
.Lpartial_block:
# Compute floor(len / 24) to get num qwords to process from each lane.
- imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
+ imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
shr $16, %eax
jmp .Lcrc_3lanes
.Lfull_block:
# Processing 128 qwords from each lane.
@@ -123,20 +124,20 @@ SYM_FUNC_START(crc_pcl)
jl .Lcrc_3lanes_4x_done
# Unroll the loop by a factor of 4 to reduce the overhead of the loop
# bookkeeping instructions, which can compete with crc32q for the ALUs.
.Lcrc_3lanes_4x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
- crc32q 8(bufp), crc_init_q
+ crc32q 8(bufp), crc0_q
crc32q 8(bufp,chunk_bytes_q), crc1
crc32q 8(bufp,chunk_bytes_q,2), crc2
- crc32q 16(bufp), crc_init_q
+ crc32q 16(bufp), crc0_q
crc32q 16(bufp,chunk_bytes_q), crc1
crc32q 16(bufp,chunk_bytes_q,2), crc2
- crc32q 24(bufp), crc_init_q
+ crc32q 24(bufp), crc0_q
crc32q 24(bufp,chunk_bytes_q), crc1
crc32q 24(bufp,chunk_bytes_q,2), crc2
add $32, bufp
sub $4, %eax
jge .Lcrc_3lanes_4x_loop
@@ -144,42 +145,42 @@ SYM_FUNC_START(crc_pcl)
.Lcrc_3lanes_4x_done:
add $4, %eax
jz .Lcrc_3lanes_last_qword
.Lcrc_3lanes_1x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
add $8, bufp
dec %eax
jnz .Lcrc_3lanes_1x_loop
.Lcrc_3lanes_last_qword:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
################################################################
## 4) Combine three results:
################################################################
lea (K_table-8)(%rip), %rax # first entry is for idx 1
pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
- sub %eax, len # len -= chunk_bytes * 3
+ sub %rax, len # len -= chunk_bytes * 3
- movq crc_init_q, %xmm1 # CRC for block 1
+ movq crc0_q, %xmm1 # CRC for block 1
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
movq crc1, %xmm2 # CRC for block 2
pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
pxor %xmm2,%xmm1
movq %xmm1, %rax
xor (bufp,chunk_bytes_q,2), %rax
- mov crc2, crc_init_q
- crc32 %rax, crc_init_q
+ mov crc2, crc0_q
+ crc32 %rax, crc0_q
lea 8(bufp,chunk_bytes_q,2), bufp
################################################################
## 5) If more blocks remain, goto (2):
################################################################
@@ -191,38 +192,38 @@ SYM_FUNC_START(crc_pcl)
#######################################################################
## 6) Process any remainder without interleaving:
#######################################################################
.Lsmall:
- test len, len
+ test len_dw, len_dw
jz .Ldone
- mov len, %eax
+ mov len_dw, %eax
shr $3, %eax
jz .Ldo_dword
.Ldo_qwords:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
add $8, bufp
dec %eax
jnz .Ldo_qwords
.Ldo_dword:
- test $4, len
+ test $4, len_dw
jz .Ldo_word
- crc32l (bufp), crc_init
+ crc32l (bufp), crc0
add $4, bufp
.Ldo_word:
- test $2, len
+ test $2, len_dw
jz .Ldo_byte
- crc32w (bufp), crc_init
+ crc32w (bufp), crc0
add $2, bufp
.Ldo_byte:
- test $1, len
+ test $1, len_dw
jz .Ldone
- crc32b (bufp), crc_init
+ crc32b (bufp), crc0
.Ldone:
- mov crc_init, %eax
+ mov crc0, %eax
RET
-SYM_FUNC_END(crc_pcl)
+SYM_FUNC_END(crc32c_x86_3way)
.section .rodata, "a", @progbits
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each
--
2.47.0
_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
WARNING: multiple messages have this Message-ID (diff)
From: Eric Biggers <ebiggers@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: linux-arch@vger.kernel.org, linux-arm-kernel@lists.infradead.org,
linux-crypto@vger.kernel.org, linux-ext4@vger.kernel.org,
linux-f2fs-devel@lists.sourceforge.net,
linux-mips@vger.kernel.org, linux-riscv@lists.infradead.org,
linux-s390@vger.kernel.org, linux-scsi@vger.kernel.org,
linuxppc-dev@lists.ozlabs.org, loongarch@lists.linux.dev,
sparclinux@vger.kernel.org, x86@kernel.org,
Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH v2 11/18] x86/crc32: update prototype for crc_pcl()
Date: Fri, 25 Oct 2024 12:14:47 -0700 [thread overview]
Message-ID: <20241025191454.72616-12-ebiggers@kernel.org> (raw)
In-Reply-To: <20241025191454.72616-1-ebiggers@kernel.org>
From: Eric Biggers <ebiggers@google.com>
- Change the len parameter from unsigned int to size_t, so that the
library function which takes a size_t can safely use this code.
- Rename the function to crc32c_x86_3way(), which is much clearer.
- Move the crc parameter to the front, as this is the usual convention.
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/crc32c-intel_glue.c | 7 ++-
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 63 ++++++++++++-----------
2 files changed, 35 insertions(+), 35 deletions(-)
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 52c5d47ef5a1..603d159de400 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -39,12 +39,11 @@
* size is >= 512 to account
* for fpu state save/restore overhead.
*/
#define CRC32C_PCL_BREAKEVEN 512
-asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
- unsigned int crc_init);
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
#endif /* CONFIG_X86_64 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
while (length--) {
@@ -157,11 +156,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *crcp = crc_pcl(data, len, *crcp);
+ *crcp = crc32c_x86_3way(*crcp, data, len);
kernel_fpu_end();
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
@@ -169,11 +168,11 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
- *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
+ *(__le32 *)out = ~cpu_to_le32(crc32c_x86_3way(*crcp, data, len));
kernel_fpu_end();
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 752812bc4991..9b8770503bbc 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -50,19 +50,20 @@
# Define threshold below which buffers are considered "small" and routed to
# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200
-# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
+# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
.text
-SYM_FUNC_START(crc_pcl)
-#define bufp %rdi
-#define bufp_d %edi
-#define len %esi
-#define crc_init %edx
-#define crc_init_q %rdx
+SYM_FUNC_START(crc32c_x86_3way)
+#define crc0 %edi
+#define crc0_q %rdi
+#define bufp %rsi
+#define bufp_d %esi
+#define len %rdx
+#define len_dw %edx
#define n_misaligned %ecx /* overlaps chunk_bytes! */
#define n_misaligned_q %rcx
#define chunk_bytes %ecx /* overlaps n_misaligned! */
#define chunk_bytes_q %rcx
#define crc1 %r8
@@ -83,13 +84,13 @@ SYM_FUNC_START(crc_pcl)
# Process 1 <= n_misaligned <= 7 bytes individually in order to align
# the remaining data to an 8-byte boundary.
.Ldo_align:
movq (bufp), %rax
add n_misaligned_q, bufp
- sub n_misaligned, len
+ sub n_misaligned_q, len
.Lalign_loop:
- crc32b %al, crc_init # compute crc32 of 1-byte
+ crc32b %al, crc0 # compute crc32 of 1-byte
shr $8, %rax # get next byte
dec n_misaligned
jne .Lalign_loop
.Laligned:
@@ -100,11 +101,11 @@ SYM_FUNC_START(crc_pcl)
cmp $128*24, len
jae .Lfull_block
.Lpartial_block:
# Compute floor(len / 24) to get num qwords to process from each lane.
- imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
+ imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
shr $16, %eax
jmp .Lcrc_3lanes
.Lfull_block:
# Processing 128 qwords from each lane.
@@ -123,20 +124,20 @@ SYM_FUNC_START(crc_pcl)
jl .Lcrc_3lanes_4x_done
# Unroll the loop by a factor of 4 to reduce the overhead of the loop
# bookkeeping instructions, which can compete with crc32q for the ALUs.
.Lcrc_3lanes_4x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
- crc32q 8(bufp), crc_init_q
+ crc32q 8(bufp), crc0_q
crc32q 8(bufp,chunk_bytes_q), crc1
crc32q 8(bufp,chunk_bytes_q,2), crc2
- crc32q 16(bufp), crc_init_q
+ crc32q 16(bufp), crc0_q
crc32q 16(bufp,chunk_bytes_q), crc1
crc32q 16(bufp,chunk_bytes_q,2), crc2
- crc32q 24(bufp), crc_init_q
+ crc32q 24(bufp), crc0_q
crc32q 24(bufp,chunk_bytes_q), crc1
crc32q 24(bufp,chunk_bytes_q,2), crc2
add $32, bufp
sub $4, %eax
jge .Lcrc_3lanes_4x_loop
@@ -144,42 +145,42 @@ SYM_FUNC_START(crc_pcl)
.Lcrc_3lanes_4x_done:
add $4, %eax
jz .Lcrc_3lanes_last_qword
.Lcrc_3lanes_1x_loop:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
add $8, bufp
dec %eax
jnz .Lcrc_3lanes_1x_loop
.Lcrc_3lanes_last_qword:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
crc32q (bufp,chunk_bytes_q), crc1
# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
################################################################
## 4) Combine three results:
################################################################
lea (K_table-8)(%rip), %rax # first entry is for idx 1
pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
- sub %eax, len # len -= chunk_bytes * 3
+ sub %rax, len # len -= chunk_bytes * 3
- movq crc_init_q, %xmm1 # CRC for block 1
+ movq crc0_q, %xmm1 # CRC for block 1
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
movq crc1, %xmm2 # CRC for block 2
pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
pxor %xmm2,%xmm1
movq %xmm1, %rax
xor (bufp,chunk_bytes_q,2), %rax
- mov crc2, crc_init_q
- crc32 %rax, crc_init_q
+ mov crc2, crc0_q
+ crc32 %rax, crc0_q
lea 8(bufp,chunk_bytes_q,2), bufp
################################################################
## 5) If more blocks remain, goto (2):
################################################################
@@ -191,38 +192,38 @@ SYM_FUNC_START(crc_pcl)
#######################################################################
## 6) Process any remainder without interleaving:
#######################################################################
.Lsmall:
- test len, len
+ test len_dw, len_dw
jz .Ldone
- mov len, %eax
+ mov len_dw, %eax
shr $3, %eax
jz .Ldo_dword
.Ldo_qwords:
- crc32q (bufp), crc_init_q
+ crc32q (bufp), crc0_q
add $8, bufp
dec %eax
jnz .Ldo_qwords
.Ldo_dword:
- test $4, len
+ test $4, len_dw
jz .Ldo_word
- crc32l (bufp), crc_init
+ crc32l (bufp), crc0
add $4, bufp
.Ldo_word:
- test $2, len
+ test $2, len_dw
jz .Ldo_byte
- crc32w (bufp), crc_init
+ crc32w (bufp), crc0
add $2, bufp
.Ldo_byte:
- test $1, len
+ test $1, len_dw
jz .Ldone
- crc32b (bufp), crc_init
+ crc32b (bufp), crc0
.Ldone:
- mov crc_init, %eax
+ mov crc0, %eax
RET
-SYM_FUNC_END(crc_pcl)
+SYM_FUNC_END(crc32c_x86_3way)
.section .rodata, "a", @progbits
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each
--
2.47.0
_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv
next prev parent reply other threads:[~2024-10-25 19:15 UTC|newest]
Thread overview: 122+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-25 19:14 [PATCH v2 00/18] Wire up CRC32 library functions to arch-optimized code Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 01/18] lib/crc32: drop leading underscores from __crc32c_le_base Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 02/18] lib/crc32: improve support for arch-specific overrides Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 03/18] lib/crc32: expose whether the lib is really optimized at runtime Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 20:32 ` Ard Biesheuvel
2024-10-25 20:32 ` Ard Biesheuvel
2024-10-25 20:32 ` [f2fs-dev] " Ard Biesheuvel via Linux-f2fs-devel
2024-10-25 21:32 ` Eric Biggers
2024-10-25 21:32 ` Eric Biggers
2024-10-25 21:32 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 21:37 ` Ard Biesheuvel
2024-10-25 21:37 ` Ard Biesheuvel
2024-10-25 21:37 ` [f2fs-dev] " Ard Biesheuvel via Linux-f2fs-devel
2024-10-25 22:31 ` Eric Biggers
2024-10-25 22:31 ` Eric Biggers
2024-10-25 22:31 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 04/18] crypto: crc32 - don't unnecessarily register arch algorithms Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 20:47 ` Ard Biesheuvel
2024-10-25 20:47 ` Ard Biesheuvel
2024-10-25 20:47 ` [f2fs-dev] " Ard Biesheuvel via Linux-f2fs-devel
2024-10-25 22:02 ` Eric Biggers
2024-10-25 22:02 ` Eric Biggers
2024-10-25 22:02 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-26 4:09 ` Eric Biggers
2024-10-26 4:09 ` Eric Biggers
2024-10-26 4:09 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-27 8:14 ` Ard Biesheuvel
2024-10-27 8:14 ` Ard Biesheuvel
2024-10-27 8:14 ` [f2fs-dev] " Ard Biesheuvel via Linux-f2fs-devel
2024-11-02 9:45 ` Herbert Xu
2024-11-02 9:45 ` Herbert Xu
2024-11-02 9:45 ` [f2fs-dev] " Herbert Xu via Linux-f2fs-devel
2024-11-02 9:58 ` Ard Biesheuvel
2024-11-02 9:58 ` Ard Biesheuvel
2024-11-02 9:58 ` [f2fs-dev] " Ard Biesheuvel via Linux-f2fs-devel
2024-11-02 10:19 ` Herbert Xu
2024-11-02 10:19 ` Herbert Xu
2024-11-02 10:19 ` [f2fs-dev] " Herbert Xu via Linux-f2fs-devel
2024-11-02 10:46 ` Ard Biesheuvel
2024-11-02 10:46 ` Ard Biesheuvel
2024-11-02 10:46 ` [f2fs-dev] " Ard Biesheuvel via Linux-f2fs-devel
2024-11-02 11:05 ` Ard Biesheuvel
2024-11-02 11:05 ` Ard Biesheuvel
2024-11-02 11:05 ` [f2fs-dev] " Ard Biesheuvel via Linux-f2fs-devel
2024-11-02 11:08 ` Herbert Xu
2024-11-02 11:08 ` Herbert Xu
2024-11-02 11:08 ` [f2fs-dev] " Herbert Xu via Linux-f2fs-devel
2024-11-02 16:36 ` Eric Biggers
2024-11-02 16:36 ` Eric Biggers
2024-11-02 16:36 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-02 16:46 ` Ard Biesheuvel
2024-11-02 16:46 ` Ard Biesheuvel
2024-11-02 16:46 ` [f2fs-dev] " Ard Biesheuvel via Linux-f2fs-devel
2024-11-02 17:21 ` Milan Broz
2024-11-02 17:21 ` Milan Broz
2024-11-02 17:21 ` [f2fs-dev] " Milan Broz
2024-10-25 19:14 ` [PATCH v2 05/18] arm/crc32: expose CRC32 functions through lib Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 06/18] loongarch/crc32: " Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-03 13:36 ` WangYuli
2024-11-03 13:36 ` WangYuli
2024-11-03 13:36 ` [f2fs-dev] " WangYuli
2024-11-03 13:57 ` Eric Biggers
2024-11-03 13:57 ` Eric Biggers
2024-11-03 13:57 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-04 2:34 ` WangYuli
2024-11-04 2:34 ` WangYuli
2024-10-25 19:14 ` [PATCH v2 07/18] mips/crc32: " Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 08/18] powerpc/crc32: " Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 09/18] s390/crc32: " Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 10/18] sparc/crc32: " Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` Eric Biggers [this message]
2024-10-25 19:14 ` [PATCH v2 11/18] x86/crc32: update prototype for crc_pcl() Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 12/18] x86/crc32: update prototype for crc32_pclmul_le_16() Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 13/18] x86/crc32: expose CRC32 functions through lib Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 14/18] lib/crc32: make crc32c() go directly to lib Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 15/18] ext4: switch to using the crc32c library Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-11-02 22:26 ` Theodore Ts'o
2024-11-02 22:26 ` Theodore Ts'o
2024-11-02 22:26 ` [f2fs-dev] " Theodore Ts'o
2024-10-25 19:14 ` [PATCH v2 16/18] jbd2: " Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 17/18] f2fs: switch to using the crc32 library Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 19:14 ` [PATCH v2 18/18] scsi: target: iscsi: switch to using the crc32c library Eric Biggers
2024-10-25 19:14 ` Eric Biggers
2024-10-25 19:14 ` [f2fs-dev] " Eric Biggers via Linux-f2fs-devel
2024-10-25 22:14 ` Ard Biesheuvel
2024-10-25 22:14 ` Ard Biesheuvel
2024-10-25 22:14 ` [f2fs-dev] " Ard Biesheuvel via Linux-f2fs-devel
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241025191454.72616-12-ebiggers@kernel.org \
--to=ebiggers@kernel.org \
--cc=ardb@kernel.org \
--cc=linux-arch@vger.kernel.org \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-crypto@vger.kernel.org \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-f2fs-devel@lists.sourceforge.net \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mips@vger.kernel.org \
--cc=linux-riscv@lists.infradead.org \
--cc=linux-s390@vger.kernel.org \
--cc=linux-scsi@vger.kernel.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=loongarch@lists.linux.dev \
--cc=sparclinux@vger.kernel.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.