linux-crypto.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Keith Busch <kbusch@kernel.org>
To: linux-nvme@lists.infradead.org, linux-block@vger.kernel.org,
	linux-crypto@vger.kernel.org, x86@kernel.org,
	linux-kernel@vger.kernel.org
Cc: axboe@kernel.dk, hch@lst.de, martin.petersen@oracle.com,
	colyli@suse.de, Keith Busch <kbusch@kernel.org>
Subject: [PATCHv3 10/10] x86/crypto: add pclmul acceleration for crc64
Date: Tue, 22 Feb 2022 08:31:44 -0800	[thread overview]
Message-ID: <20220222163144.1782447-11-kbusch@kernel.org> (raw)
In-Reply-To: <20220222163144.1782447-1-kbusch@kernel.org>

The crc64 table lookup method is inefficient, using a significant number
of CPU cycles in the block stack per IO. If available on x86, use a
PCLMULQDQ implementation to accelerate the calculation.

The assembly in this patch was mostly generated by gcc from a C
program written with the x86 intrinsics library functions, and it
measures ~20x faster than the table lookup.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 arch/x86/crypto/Makefile                  |   3 +
 arch/x86/crypto/crc64-rocksoft-pcl-asm.S  | 215 ++++++++++++++++++++++
 arch/x86/crypto/crc64-rocksoft-pcl_glue.c | 117 ++++++++++++
 crypto/Kconfig                            |  11 ++
 4 files changed, 346 insertions(+)
 create mode 100644 arch/x86/crypto/crc64-rocksoft-pcl-asm.S
 create mode 100644 arch/x86/crypto/crc64-rocksoft-pcl_glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index c3af959648e6..036520c59f0e 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -79,6 +79,9 @@ crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
 crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
 
+obj-$(CONFIG_CRYPTO_CRC64_ROCKSOFT_PCLMUL) += crc64-rocksoft-pclmul.o
+crc64-rocksoft-pclmul-y := crc64-rocksoft-pcl-asm.o crc64-rocksoft-pcl_glue.o
+
 obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
 poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
 targets += poly1305-x86_64-cryptogams.S
diff --git a/arch/x86/crypto/crc64-rocksoft-pcl-asm.S b/arch/x86/crypto/crc64-rocksoft-pcl-asm.S
new file mode 100644
index 000000000000..e3b633a776a9
--- /dev/null
+++ b/arch/x86/crypto/crc64-rocksoft-pcl-asm.S
@@ -0,0 +1,215 @@
+########################################################################
+# Implement fast Rocksoft CRC-64 computation with SSE and PCLMULQDQ instructions
+#
+
+#include <linux/linkage.h>
+
+SYM_FUNC_START(crc_rocksoft_pcl)	# u64 crc_rocksoft_pcl(u64 init_crc, const u8 *buf, size_t len); SysV AMD64: rdi, rsi, rdx in, rax out
+	leaq	(%rsi,%rdx), %rcx	# rcx = buf + len (one past the last input byte)
+	movq	%rsi, %r10
+	andl	$15, %esi	# rsi = buf & 15, offset of buf inside its 16-byte block
+	movq	%rdi, %xmm3	# xmm3 = init_crc in the low qword, high qword zero
+	leaq	15(%rcx), %rax
+	andq	$-16, %r10	# r10 = buf rounded down to a 16-byte boundary
+	pxor	%xmm1, %xmm1
+	andq	$-16, %rax	# rax = input end rounded up to a 16-byte boundary
+	movdqa	%xmm1, %xmm5	# xmm5 = 0, blend destination for the first block
+	movq	%rax, %r8
+	subq	%r10, %rax	# rax = byte size of the aligned span covering the input
+	subq	%rcx, %r8	# r8 = bytes between the input end and the rounded-up end (0..15)
+	movl	$16, %ecx
+	movq	%rax, %r11
+	movq	%rcx, %r9
+	sarq	$4, %r11	# r11 = number of 16-byte blocks in the aligned span
+	subq	%rsi, %r9	# r9 = 16 - (buf & 15)
+	movdqu	shuffleMasks(%r9), %xmm4	# head mask; NOTE(review): absolute, not RIP-relative, table address - confirm this is intended
+	movdqa	%xmm4, %xmm0	# pblendvb takes its byte-select mask implicitly from xmm0
+	pblendvb	%xmm0, (%r10), %xmm5	# xmm5 = first aligned block (load may start before buf), keeping bytes whose mask MSB is set
+	cmpq	$16, %rax
+	je	.L12	# input lies within a single aligned block
+	movdqa	16(%r10), %xmm2	# xmm2 = second aligned block
+	cmpq	$2, %r11
+	je	.L13	# input spans exactly two aligned blocks
+	pcmpeqd	%xmm1, %xmm1	# xmm1 = all ones
+	leaq	-16(%rsi,%rdx), %rdi	# rdi = input bytes remaining after the first aligned block
+	leaq	16(%r10), %r9	# r9 -> second aligned block
+	pxor	%xmm1, %xmm4	# xmm4 = inverted head mask
+	movdqa	%xmm3, %xmm1
+	pshufb	%xmm0, %xmm3	# init_crc shuffled by the head mask
+	pshufb	%xmm4, %xmm1	# init_crc shuffled by the inverted head mask
+	movdqa	%xmm3, %xmm0
+	movdqa	.LC0(%rip), %xmm3	# xmm3 = .LC0 multiplier pair, kept for every fold on this path
+	pxor	%xmm5, %xmm1	# mix the shuffled crc into the masked first block
+	movdqa	%xmm1, %xmm4
+	pclmulqdq	$0, %xmm3, %xmm1	# low qword x low qword of .LC0
+	pclmulqdq	$17, %xmm3, %xmm4	# high qword x high qword of .LC0
+	pxor	%xmm4, %xmm1
+	pxor	%xmm1, %xmm0	# xmm0 = running 128-bit accumulator
+	cmpq	$31, %rdi
+	jbe	.L6	# 31 or fewer bytes remain: skip the main loop
+	leaq	-32(%rdi), %rax
+	movq	%rax, %rsi
+	andq	$-16, %rax
+	leaq	32(%r10,%rax), %rcx	# rcx = cursor value at which the loop stops
+	shrq	$4, %rsi	# rsi = (rdi - 32) / 16, one less than the iteration count
+	movq	%r9, %rax	# rax = block cursor
+	.p2align 4,,10
+	.p2align 3
+.L7:
+	pxor	%xmm2, %xmm0	# absorb the block loaded on the previous step
+	movq	%rax, %rdx
+	addq	$16, %rax
+	movdqa	%xmm0, %xmm1
+	pclmulqdq	$0, %xmm3, %xmm0
+	movdqa	16(%rdx), %xmm2	# load the next aligned block
+	pclmulqdq	$17, %xmm3, %xmm1
+	pxor	%xmm1, %xmm0	# accumulator = (lo x lo) xor (hi x hi)
+	cmpq	%rcx, %rax
+	jne	.L7
+	movq	%rsi, %rax
+	addq	$1, %rsi
+	negq	%rax
+	salq	$4, %rsi	# rsi = 16 * number of loop iterations
+	salq	$4, %rax
+	addq	%rsi, %r9	# advance r9 past the blocks the loop consumed
+	leaq	-16(%rdi,%rax), %rdi	# rdi = bytes still to process (16..31)
+.L6:
+	pxor	%xmm2, %xmm0	# absorb the most recently loaded block
+	cmpq	$16, %rdi
+	je	.L9	# rdi == 16: that block was the last one, no partial tail
+	movl	$16, %eax
+	pcmpeqd	%xmm2, %xmm2
+	movdqa	%xmm0, %xmm7
+	subq	%r8, %rax	# rax = 16 - r8, count of input bytes in the final block
+	movdqu	shuffleMasks(%rax), %xmm4	# tail mask
+	pxor	%xmm4, %xmm2	# xmm2 = inverted tail mask
+	pshufb	%xmm4, %xmm0
+	movdqa	16(%r9), %xmm4	# final aligned block (aligned load may extend past the input end)
+	pshufb	%xmm2, %xmm7
+	pshufb	%xmm2, %xmm4
+	movdqa	%xmm7, %xmm1
+	movdqa	%xmm4, %xmm2
+	movdqa	%xmm7, %xmm4
+	pclmulqdq	$0, %xmm3, %xmm1
+	pclmulqdq	$17, %xmm3, %xmm4
+	por	%xmm2, %xmm0	# merge the shuffled tail bytes with the shuffled accumulator
+	pxor	%xmm4, %xmm1
+	pxor	%xmm1, %xmm0
+.L9:
+	movdqa	%xmm0, %xmm2
+	pclmulqdq	$16, %xmm3, %xmm0	# low qword of xmm0 x high qword of .LC0
+	psrldq	$8, %xmm2
+	pxor	%xmm2, %xmm0	# xor in the former high qword, shifted down by 8 bytes
+.L3:	# common exit shared by all paths
+	movdqa	.LC1(%rip), %xmm2	# .LC1 pair; presumably the final reduction constants - TODO confirm
+	movdqa	%xmm0, %xmm1
+	pclmulqdq	$0, %xmm2, %xmm1
+	movdqa	%xmm1, %xmm3
+	pclmulqdq	$16, %xmm2, %xmm1
+	pslldq	$8, %xmm3
+	pxor	%xmm3, %xmm1
+	pxor	%xmm1, %xmm0
+	pextrd	$3, %xmm0, %eax
+	salq	$32, %rax
+	movq	%rax, %rdx
+	pextrd	$2, %xmm0, %eax
+	orq	%rdx, %rax	# rax = high qword of xmm0
+	notq	%rax	# result is complemented; NOTE(review): init_crc is not complemented on entry - confirm this matches the generic helper
+	ret	# NOTE(review): trees that provide the upper-case return macro in linkage.h expect it here - confirm against the base tree
+	.p2align 4,,10
+	.p2align 3
+.L13:	# input spans exactly two aligned blocks
+	subq	%r8, %rcx	# rcx = 16 - r8 (rcx still holds 16 here)
+	pcmpeqd	%xmm1, %xmm1
+	movdqu	shuffleMasks(%rcx), %xmm7	# tail mask
+	movdqa	%xmm7, %xmm6
+	pxor	%xmm1, %xmm6	# xmm6 = inverted tail mask
+	cmpq	$7, %rdx
+	ja	.L5	# len > 7
+	movdqa	%xmm1, %xmm4	# len <= 7 below; NOTE(review): the glue in this patch only calls with len >= 16
+	pshufb	%xmm7, %xmm5
+	movdqa	%xmm3, %xmm1
+	movdqu	shuffleMasks(%rdx), %xmm8	# mask indexed by len
+	pshufb	%xmm6, %xmm2
+	pxor	%xmm8, %xmm4
+	pxor	%xmm5, %xmm2
+	pshufb	%xmm8, %xmm3
+	pshufb	%xmm4, %xmm1
+	movdqa	%xmm3, %xmm0
+	pxor	%xmm1, %xmm2
+	pslldq	$8, %xmm0
+	movdqa	%xmm2, %xmm3
+	pclmulqdq	$16, .LC0(%rip), %xmm2
+	psrldq	$8, %xmm3
+	pxor	%xmm3, %xmm0
+	pxor	%xmm2, %xmm0
+	jmp	.L3
+	.p2align 4,,10
+	.p2align 3
+.L12:	# input lies within a single aligned block
+	movdqu	shuffleMasks(%rdx), %xmm2	# mask indexed by len
+	subq	%r8, %rcx	# rcx = 16 - r8 (rcx still holds 16 here)
+	movdqa	%xmm3, %xmm6
+	pcmpeqd	%xmm4, %xmm4	# xmm4 = all ones
+	movdqa	%xmm2, %xmm0
+	pshufb	%xmm2, %xmm3
+	movdqu	shuffleMasks(%rcx), %xmm2	# tail mask
+	pxor	%xmm4, %xmm0	# xmm0 = inverted len mask
+	pslldq	$8, %xmm3
+	pxor	%xmm4, %xmm2	# xmm2 = inverted tail mask
+	pshufb	%xmm0, %xmm6
+	pshufb	%xmm2, %xmm5
+	movdqa	%xmm5, %xmm1
+	pxor	%xmm6, %xmm1
+	movdqa	%xmm1, %xmm0
+	pclmulqdq	$16, .LC0(%rip), %xmm1
+	psrldq	$8, %xmm0
+	pxor	%xmm3, %xmm0
+	pxor	%xmm1, %xmm0
+	jmp	.L3
+	.p2align 4,,10
+	.p2align 3
+.L5:	# two aligned blocks, len > 7
+	pxor	%xmm1, %xmm4	# xmm4 = inverted head mask (xmm1 is all ones here)
+	movdqa	%xmm3, %xmm1
+	pshufb	%xmm0, %xmm3	# init_crc shuffled by the head mask
+	pshufb	%xmm4, %xmm1	# init_crc shuffled by the inverted head mask
+	pxor	%xmm3, %xmm2
+	movdqa	.LC0(%rip), %xmm3	# xmm3 = .LC0 multiplier pair
+	pxor	%xmm5, %xmm1
+	pshufb	%xmm6, %xmm2
+	movdqa	%xmm1, %xmm5
+	pshufb	%xmm7, %xmm1
+	pshufb	%xmm6, %xmm5
+	pxor	%xmm2, %xmm1
+	movdqa	%xmm5, %xmm4
+	movdqa	%xmm5, %xmm0
+	pclmulqdq	$17, %xmm3, %xmm0
+	pclmulqdq	$0, %xmm3, %xmm4
+	pxor	%xmm0, %xmm4
+	pxor	%xmm4, %xmm1
+	movdqa	%xmm1, %xmm0
+	pclmulqdq	$16, %xmm3, %xmm1	# low qword of xmm1 x high qword of .LC0
+	psrldq	$8, %xmm0
+	pxor	%xmm1, %xmm0
+	jmp	.L3
+SYM_FUNC_END(crc_rocksoft_pcl)
+
+.section	.rodata
+.align 32
+.type	shuffleMasks, @object
+.size	shuffleMasks, 32
+shuffleMasks:	# 32-byte table: 0x00..0x0f followed by 0x8f down to 0x80; read as 16-byte windows at a variable offset
+	.string	""	# emits the single byte 0x00, the first table entry
+	.ascii	"\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\217\216\215"	# 0x01..0x0f, then 0x8f, 0x8e, 0x8d
+	.ascii	"\214\213\212\211\210\207\206\205\204\203\202\201\200"	# 0x8c down to 0x80
+
+.section	.rodata.cst16,"aM",@progbits,16
+.align 16
+.LC0:	# multiplier pair for the pclmulqdq folds; presumably derived from the CRC polynomial - TODO confirm derivation
+	.quad	-1523270018343381984
+	.quad	2443614144669557164
+	.align 16
+.LC1:	# multiplier pair used on the common exit path at .L3; derivation not shown here - TODO confirm
+	.quad	2876949357237608311
+	.quad	3808117099328934763
diff --git a/arch/x86/crypto/crc64-rocksoft-pcl_glue.c b/arch/x86/crypto/crc64-rocksoft-pcl_glue.c
new file mode 100644
index 000000000000..996780aa3d93
--- /dev/null
+++ b/arch/x86/crypto/crc64-rocksoft-pcl_glue.c
@@ -0,0 +1,117 @@
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc64.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <asm/cpufeatures.h>
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
+
+asmlinkage u64 crc_rocksoft_pcl(u64 init_crc, const u8 *buf, size_t len);	/* assembly routine defined in crc64-rocksoft-pcl-asm.S */
+
+struct chksum_desc_ctx {	/* per-descriptor state, sized via .descsize in alg */
+	u64 crc;	/* running CRC carried between update calls */
+};
+
+static int chksum_init(struct shash_desc *desc)	/* .init hook: reset the running CRC, always returns 0 */
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	ctx->crc = 0;	/* each new hash starts from a zero seed */
+
+	return 0;
+}
+
+static int chksum_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{	/* .update hook: fold length bytes of data into ctx->crc, always returns 0 */
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	if (length >= 16 && crypto_simd_usable()) {	/* asm path only for 16+ bytes and when SIMD may be used */
+		kernel_fpu_begin();	/* the asm routine uses xmm registers */
+		ctx->crc = crc_rocksoft_pcl(ctx->crc, data, length);
+		kernel_fpu_end();
+	} else	/* short input or SIMD unavailable: table-based helper */
+		ctx->crc = crc64_rocksoft_generic(ctx->crc, data, length);
+	return 0;
+}
+
+static int chksum_final(struct shash_desc *desc, u8 *out)	/* .final hook: write the running CRC to out, always returns 0 */
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	*(u64 *)out = ctx->crc;	/* raw native-order 8-byte store; NOTE(review): assumes out may be written as a u64 - confirm alignment/byte-order expectations */
+	return 0;
+}
+
+static int __chksum_finup(u64 crc, const u8 *data, unsigned int len, u8 *out)	/* shared helper: CRC of data seeded with crc, stored to out; always returns 0 */
+{
+	if (len >= 16 && crypto_simd_usable()) {	/* same dispatch rule as chksum_update */
+		kernel_fpu_begin();	/* the asm routine uses xmm registers */
+		*(u64 *)out = crc_rocksoft_pcl(crc, data, len);
+		kernel_fpu_end();
+	} else	/* short input or SIMD unavailable: table-based helper */
+		*(u64 *)out = crc64_rocksoft_generic(crc, data, len);
+	return 0;
+}
+
+static int chksum_finup(struct shash_desc *desc, const u8 *data,
+			unsigned int len, u8 *out)
+{	/* .finup hook: process the last chunk and emit the digest in one call */
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	return __chksum_finup(ctx->crc, data, len, out);	/* continue from the running CRC; ctx->crc itself is not updated */
+}
+
+static int chksum_digest(struct shash_desc *desc, const u8 *data,
+			 unsigned int length, u8 *out)
+{	/* .digest hook: one-shot CRC of data; desc is not read */
+	return __chksum_finup(0, data, length, out);	/* seed 0 matches what chksum_init sets */
+}
+
+static struct shash_alg alg = {	/* shash descriptor registered by the module init function */
+	.digestsize	= 	8,	/* digest is the 8-byte u64 CRC */
+	.init		=	chksum_init,
+	.update		=	chksum_update,
+	.final		=	chksum_final,
+	.finup		=	chksum_finup,
+	.digest		=	chksum_digest,
+	.descsize	=	sizeof(struct chksum_desc_ctx),	/* per-descriptor context holds only the running CRC */
+	.base		=	{
+		.cra_name		=	CRC64_ROCKSOFT_STRING,	/* algorithm name; macro is defined outside this file */
+		.cra_driver_name	=	"crc64-rocksoft-pclmul",
+		.cra_priority		=	200,	/* presumably chosen to outrank the generic driver - TODO confirm its priority */
+		.cra_blocksize		=	1,
+		.cra_module		=	THIS_MODULE,
+	}
+};
+
+static const struct x86_cpu_id crc64_rocksoft_cpu_id[] = {	/* CPU match table, also exported for module autoloading */
+	X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),	/* NOTE(review): the asm also uses pshufb, pblendvb and pextrd - confirm PCLMULQDQ alone is a sufficient check */
+	{}	/* empty terminator entry */
+};
+MODULE_DEVICE_TABLE(x86cpu, crc64_rocksoft_cpu_id);
+
+static int __init crc64_rocksoft_x86_mod_init(void)	/* module init: register alg when the CPU matches */
+{
+	if (!x86_match_cpu(crc64_rocksoft_cpu_id))	/* refuse to load on CPUs without the matched feature */
+		return -ENODEV;
+
+	return crypto_register_shash(&alg);	/* registration result is returned unchanged */
+}
+
+static void __exit crc64_rocksoft_x86_mod_fini(void)	/* module exit: undo the registration done at init */
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(crc64_rocksoft_x86_mod_init);	/* entry point run at module load */
+module_exit(crc64_rocksoft_x86_mod_fini);	/* entry point run at module unload */
+
+MODULE_AUTHOR("Keith Busch <kbusch@kernel.org>");
+MODULE_DESCRIPTION("Rocksoft CRC64 calculation accelerated with PCLMULQDQ.");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("crc64-rocksoft-pclmul");	/* NOTE(review): only the driver name is aliased; confirm whether an alias for CRC64_ROCKSOFT_STRING is also wanted */
diff --git a/crypto/Kconfig b/crypto/Kconfig
index e343147b9f8f..d8861138f117 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -744,6 +744,17 @@ config CRYPTO_CRC64_ROCKSOFT
 	  transform. This allows for faster crc64 transforms to be used
 	  if they are available.
 
+config CRYPTO_CRC64_ROCKSOFT_PCLMUL
+	tristate "Rocksoft model CRC64 PCLMULQDQ hardware acceleration"
+	depends on X86 && 64BIT && CRC64
+	select CRYPTO_HASH
+	help
+	  For x86_64 processors that support SSE4.2 and PCLMULQDQ, the
+	  Rocksoft CRC64 computation can be hardware accelerated with the
+	  PCLMULQDQ instruction. This option will create the
+	  'crc64-rocksoft-pclmul' module, which computes crc64 checksums
+	  faster than the generic table implementation.
+
 config CRYPTO_VPMSUM_TESTER
 	tristate "Powerpc64 vpmsum hardware acceleration tester"
 	depends on CRYPTO_CRCT10DIF_VPMSUM && CRYPTO_CRC32C_VPMSUM
-- 
2.25.4


  parent reply	other threads:[~2022-02-22 16:32 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-02-22 16:31 [PATCHv3 00/10] 64-bit data integrity field support Keith Busch
2022-02-22 16:31 ` [PATCHv3 01/10] block: support pi with extended metadata Keith Busch
2022-02-25 16:01   ` Christoph Hellwig
2022-02-22 16:31 ` [PATCHv3 02/10] nvme: allow integrity on extended metadata formats Keith Busch
2022-02-25 16:02   ` Christoph Hellwig
2022-02-22 16:31 ` [PATCHv3 03/10] asm-generic: introduce be48 unaligned accessors Keith Busch
2022-02-22 16:52   ` Chaitanya Kulkarni
2022-02-25 16:03   ` Christoph Hellwig
2022-02-25 17:53     ` Joe Perches
2022-02-25 17:59       ` Keith Busch
2022-02-22 16:31 ` [PATCHv3 04/10] linux/kernel: introduce lower_48_bits macro Keith Busch
2022-02-22 16:45   ` Joe Perches
2022-02-22 16:50     ` Christoph Hellwig
2022-02-22 16:56       ` Keith Busch
2022-02-22 18:43         ` Joe Perches
2022-02-22 20:09           ` David Laight
2022-02-22 20:31             ` Joe Perches
2022-02-22 21:12               ` Keith Busch
2022-02-22 21:17                 ` Joe Perches
2022-02-22 16:58       ` Joe Perches
2022-02-22 17:09       ` David Laight
2022-02-22 17:14       ` Chaitanya Kulkarni
2022-02-22 16:48   ` Chaitanya Kulkarni
2022-02-22 16:31 ` [PATCHv3 05/10] lib: add rocksoft model crc64 Keith Busch
2022-02-25 16:04   ` Christoph Hellwig
2022-02-22 16:31 ` [PATCHv3 06/10] crypto: add rocksoft 64b crc framework Keith Busch
2022-02-22 19:50   ` Eric Biggers
2022-02-22 19:54     ` Eric Biggers
2022-02-22 20:09     ` Keith Busch
2022-02-25 16:11       ` Christoph Hellwig
2022-02-22 19:56   ` Eric Biggers
2022-02-22 16:31 ` [PATCHv3 07/10] lib: add crc64 tests Keith Busch
2022-02-22 16:50   ` Chaitanya Kulkarni
2022-02-25 16:05   ` Christoph Hellwig
2022-02-25 16:12     ` Keith Busch
2022-02-25 16:19       ` Christoph Hellwig
2022-02-22 16:31 ` [PATCHv3 08/10] block: add pi for nvme enhanced integrity Keith Busch
2022-02-25 16:14   ` Christoph Hellwig
2022-03-02  3:15     ` Martin K. Petersen
2022-02-22 16:31 ` [PATCHv3 09/10] nvme: add support for enhanced metadata Keith Busch
2022-02-25 16:17   ` Christoph Hellwig
2022-03-02  3:18   ` Martin K. Petersen
2022-02-22 16:31 ` Keith Busch [this message]
2022-02-22 17:02   ` [PATCHv3 10/10] x86/crypto: add pclmul acceleration for crc64 David Laight
2022-02-22 17:14     ` Keith Busch
2022-02-22 20:06       ` Eric Biggers
2022-02-22 20:51         ` Keith Busch

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220222163144.1782447-11-kbusch@kernel.org \
    --to=kbusch@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=colyli@suse.de \
    --cc=hch@lst.de \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=martin.petersen@oracle.com \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).