All of lore.kernel.org
 help / color / mirror / Atom feed
From: Eric Biggers <ebiggers@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: linux-crypto@vger.kernel.org, Ard Biesheuvel <ardb@kernel.org>,
	Zhihang Shao <zhihang.shao.iscas@gmail.com>,
	linux-riscv@lists.infradead.org
Subject: [PATCH v4] riscv/crc-t10dif: Optimize crct10dif with zbc extension
Date: Wed, 12 Feb 2025 12:07:23 -0800	[thread overview]
Message-ID: <20250212200723.135894-1-ebiggers@kernel.org> (raw)

From: Zhihang Shao <zhihang.shao.iscas@gmail.com>

The current CRC-T10DIF implementation on the RISC-V platform is based on
table lookups.  Given the previous work on optimizing crc32 calculations
with the Zbc extension, the same approach is expected to be equally
effective for accelerating crc-t10dif.

Therefore this patch adds an implementation of crc-t10dif using the Zbc
extension.  It detects at runtime whether the CPU supports Zbc and, if
so, uses it to accelerate crc-t10dif calculations.

This patch has been updated to account for the rework of the kernel's
CRC-T10DIF library in 6.14, which was done by Eric Biggers.  Also, I
used crc_kunit.c to test the performance of crc-t10dif optimized by the
Zbc extension.

Signed-off-by: Zhihang Shao <zhihang.shao.iscas@gmail.com>
[EB: fixed 32-bit build, added comments that explain the algorithm used,
     and various other cleanups]
Signed-off-by: Eric Biggers <ebiggers@google.com>
---

This patch applies to 
https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git/log/?h=crc-next

 arch/riscv/Kconfig                |   1 +
 arch/riscv/lib/Makefile           |   1 +
 arch/riscv/lib/crc-t10dif-riscv.c | 131 ++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+)
 create mode 100644 arch/riscv/lib/crc-t10dif-riscv.c

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 7612c52e9b1e3..db1cf9666dfdd 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -23,10 +23,11 @@ config RISCV
 	select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
 	select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
 	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
 	select ARCH_HAS_BINFMT_FLAT
 	select ARCH_HAS_CRC32 if RISCV_ISA_ZBC
+	select ARCH_HAS_CRC_T10DIF if RISCV_ISA_ZBC
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DEBUG_VIRTUAL if MMU
 	select ARCH_HAS_DEBUG_VM_PGTABLE
 	select ARCH_HAS_DEBUG_WX
 	select ARCH_HAS_FAST_MULTIPLIER
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 79368a895feed..d1d1f3d880e32 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -14,8 +14,9 @@ lib-$(CONFIG_RISCV_ISA_V)	+= uaccess_vector.o
 endif
 lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
 lib-$(CONFIG_RISCV_ISA_ZICBOZ)	+= clear_page.o
 obj-$(CONFIG_CRC32_ARCH)	+= crc32-riscv.o
+obj-$(CONFIG_CRC_T10DIF_ARCH)	+= crc-t10dif-riscv.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 lib-$(CONFIG_RISCV_ISA_V)	+= xor.o
 lib-$(CONFIG_RISCV_ISA_V)	+= riscv_v_helpers.o
diff --git a/arch/riscv/lib/crc-t10dif-riscv.c b/arch/riscv/lib/crc-t10dif-riscv.c
new file mode 100644
index 0000000000000..2e9c3dcba8a0e
--- /dev/null
+++ b/arch/riscv/lib/crc-t10dif-riscv.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC-T10DIF implementation with RISC-V Zbc extension.
+ *
+ * Copyright (C) 2024 Institute of Software, CAS.
+ */
+
+#include <asm/alternative-macros.h>
+#include <asm/byteorder.h>
+#include <asm/hwcap.h>
+
+#include <linux/crc-t10dif.h>
+#include <linux/module.h>
+
+/*
+ * CRC-T10DIF is a 16-bit CRC that uses most-significant-bit-first bit order,
+ * i.e. bit i contains the coefficient of x^i (not reflected).
+ */
+
+#define CRCT10DIF_POLY		0x18bb7 /* The generator polynomial G (degree 16) */
+
+#if __riscv_xlen == 64
+#define CRCT10DIF_QUOTIENT_POLY	0xf65a57f81d33a48a /* floor(x^80 / G) - x^64 */
+#define load_be_long(x)		be64_to_cpup(x)	/* big-endian 64-bit load */
+#elif __riscv_xlen == 32
+#define CRCT10DIF_QUOTIENT_POLY	0xf65a57f8	   /* floor(x^48 / G) - x^32 */
+#define load_be_long(x)		be32_to_cpup(x)	/* big-endian 32-bit load */
+#else
+#error "Unsupported __riscv_xlen"
+#endif
+
+/*
+ * Multiply the XLEN-bit message polynomial @m by x^16 and reduce it modulo the
+ * generator polynomial G.  This gives the CRC of the message polynomial @m.
+ */
+static inline u16 crct10dif_zbc(unsigned long m)
+{
+	u16 crc;
+
+	asm volatile(".option push\n"
+		     ".option arch,+zbc\n"
+		     /*
+		      * First step of Barrett reduction with integrated
+		      * multiplication by x^16:
+		      *
+		      *    %0 := floor((m * floor(x^(XLEN+16) / G)) / x^XLEN)
+		      *
+		      * The resulting value is equal to floor((m * x^16) / G).
+		      *
+		      * The constant floor(x^(XLEN+16) / G) has degree XLEN,
+		      * i.e. it has XLEN+1 bits.  The clmulh instruction
+		      * multiplies m by the x^0 through x^(XLEN-1) terms of this
+		      * constant and does the floored division by x^XLEN.  The
+		      * xor instruction handles the x^XLEN term of the constant
+		      * by adding an additional (m * x^XLEN) / x^XLEN = m.
+		      */
+		     "clmulh %0, %1, %2\n"
+		     "xor    %0, %0, %1\n"
+		     /*
+		      * Second step of Barrett reduction:
+		      *
+		      *    crc := (m * x^16) + (G * floor((m * x^16) / G))
+		      *
+		      * This reduces (m * x^16) modulo G by adding the
+		      * appropriate multiple of G to it.  The result uses only
+		      * the x^0 through x^15 terms.  HOWEVER, since the
+		      * unreduced value (m * x^16) is zero in those terms in the
+		      * first place, it is more efficient to do the equivalent:
+		      *
+		      *    crc := (G * floor((m * x^16) / G)) mod x^16
+		      */
+		     "clmul  %0, %0, %3\n"
+		     ".option pop\n"
+		     : "=&r" (crc)	/* %0: early-clobber, %1 is read after %0 is written */
+		     : "r" (m),		/* %1: message polynomial */
+		     "r" (CRCT10DIF_QUOTIENT_POLY),	/* %2: quotient constant without its top term */
+		     "r" (CRCT10DIF_POLY));		/* %3: generator polynomial G */
+	return crc;	/* presumably the u16 type supplies the "mod x^16" - confirm */
+}
+
+static inline u16 crct10dif_unaligned(u16 crc, const u8 *p, size_t len)
+{
+	unsigned long m;	/* callers pass 1 <= len < sizeof(long), so the bytes fit */
+	size_t i;
+
+	if (len == 1)	/* single byte: it only mixes with the top 8 bits of crc */
+		return crct10dif_zbc(p[0] ^ (crc >> 8)) ^ (crc << 8);
+
+	/* len >= 2 here: XOR crc into the first two bytes, most significant first */
+	m = crc ^ (p[0] << 8) ^ p[1];
+	for (i = 2; i < len; i++)	/* append the remaining bytes below them */
+		m = (m << 8) ^ p[i];
+	return crct10dif_zbc(m);
+}
+
+u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)	/* update crc over p[0..len) */
+{
+	size_t align;
+	unsigned long m;
+
+	asm goto(ALTERNATIVE("j %l[fallback]", "nop", 0,
+			     RISCV_ISA_EXT_ZBC, 1) : : : : fallback);	/* jump stays unless Zbc found */
+
+	align = -(unsigned long)p % sizeof(unsigned long);	/* bytes up to the next long boundary */
+	if (align && len) {	/* consume the unaligned head first */
+		align = min(align, len);	/* the buffer may end before the boundary */
+		crc = crct10dif_unaligned(crc, p, align);
+		p += align;
+		len -= align;
+	}
+
+	while (len >= sizeof(unsigned long)) {	/* whole words; p is long-aligned here */
+		m = ((unsigned long)crc << (8 * sizeof(unsigned long) - 16)) ^
+		    load_be_long((const void *)p);	/* crc lands on the word's top 16 bits */
+		crc = crct10dif_zbc(m);
+		p += sizeof(unsigned long);
+		len -= sizeof(unsigned long);
+	}
+
+	if (len)	/* tail shorter than one word */
+		crc = crct10dif_unaligned(crc, p, len);
+
+	return crc;
+
+fallback:	/* target of the "j" in the ALTERNATIVE above */
+	return crc_t10dif_generic(crc, p, len);
+}
+EXPORT_SYMBOL(crc_t10dif_arch);
+
+MODULE_DESCRIPTION("CRC-T10DIF using RISC-V ZBC Extension");
+MODULE_LICENSE("GPL");

base-commit: 4ffd50862d41e5aaf2e749efa354afaa1317c309
-- 
2.48.1


WARNING: multiple messages have this Message-ID (diff)
From: Eric Biggers <ebiggers@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: linux-crypto@vger.kernel.org, Ard Biesheuvel <ardb@kernel.org>,
	Zhihang Shao <zhihang.shao.iscas@gmail.com>,
	linux-riscv@lists.infradead.org
Subject: [PATCH v4] riscv/crc-t10dif: Optimize crct10dif with zbc extension
Date: Wed, 12 Feb 2025 12:07:23 -0800	[thread overview]
Message-ID: <20250212200723.135894-1-ebiggers@kernel.org> (raw)

From: Zhihang Shao <zhihang.shao.iscas@gmail.com>

The current CRC-T10DIF implementation on the RISC-V platform is based on
table lookups.  Given the previous work on optimizing crc32 calculations
with the Zbc extension, the same approach is expected to be equally
effective for accelerating crc-t10dif.

Therefore this patch adds an implementation of crc-t10dif using the Zbc
extension.  It detects at runtime whether the CPU supports Zbc and, if
so, uses it to accelerate crc-t10dif calculations.

This patch has been updated to account for the rework of the kernel's
CRC-T10DIF library in 6.14, which was done by Eric Biggers.  Also, I
used crc_kunit.c to test the performance of crc-t10dif optimized by the
Zbc extension.

Signed-off-by: Zhihang Shao <zhihang.shao.iscas@gmail.com>
[EB: fixed 32-bit build, added comments that explain the algorithm used,
     and various other cleanups]
Signed-off-by: Eric Biggers <ebiggers@google.com>
---

This patch applies to 
https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git/log/?h=crc-next

 arch/riscv/Kconfig                |   1 +
 arch/riscv/lib/Makefile           |   1 +
 arch/riscv/lib/crc-t10dif-riscv.c | 131 ++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+)
 create mode 100644 arch/riscv/lib/crc-t10dif-riscv.c

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 7612c52e9b1e3..db1cf9666dfdd 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -23,10 +23,11 @@ config RISCV
 	select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
 	select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
 	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
 	select ARCH_HAS_BINFMT_FLAT
 	select ARCH_HAS_CRC32 if RISCV_ISA_ZBC
+	select ARCH_HAS_CRC_T10DIF if RISCV_ISA_ZBC
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DEBUG_VIRTUAL if MMU
 	select ARCH_HAS_DEBUG_VM_PGTABLE
 	select ARCH_HAS_DEBUG_WX
 	select ARCH_HAS_FAST_MULTIPLIER
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 79368a895feed..d1d1f3d880e32 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -14,8 +14,9 @@ lib-$(CONFIG_RISCV_ISA_V)	+= uaccess_vector.o
 endif
 lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
 lib-$(CONFIG_RISCV_ISA_ZICBOZ)	+= clear_page.o
 obj-$(CONFIG_CRC32_ARCH)	+= crc32-riscv.o
+obj-$(CONFIG_CRC_T10DIF_ARCH)	+= crc-t10dif-riscv.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 lib-$(CONFIG_RISCV_ISA_V)	+= xor.o
 lib-$(CONFIG_RISCV_ISA_V)	+= riscv_v_helpers.o
diff --git a/arch/riscv/lib/crc-t10dif-riscv.c b/arch/riscv/lib/crc-t10dif-riscv.c
new file mode 100644
index 0000000000000..2e9c3dcba8a0e
--- /dev/null
+++ b/arch/riscv/lib/crc-t10dif-riscv.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC-T10DIF implementation with RISC-V Zbc extension.
+ *
+ * Copyright (C) 2024 Institute of Software, CAS.
+ */
+
+#include <asm/alternative-macros.h>
+#include <asm/byteorder.h>
+#include <asm/hwcap.h>
+
+#include <linux/crc-t10dif.h>
+#include <linux/module.h>
+
+/*
+ * CRC-T10DIF is a 16-bit CRC that uses most-significant-bit-first bit order,
+ * i.e. bit i contains the coefficient of x^i (not reflected).
+ */
+
+#define CRCT10DIF_POLY		0x18bb7 /* The generator polynomial G (degree 16) */
+
+#if __riscv_xlen == 64
+#define CRCT10DIF_QUOTIENT_POLY	0xf65a57f81d33a48a /* floor(x^80 / G) - x^64 */
+#define load_be_long(x)		be64_to_cpup(x)	/* big-endian 64-bit load */
+#elif __riscv_xlen == 32
+#define CRCT10DIF_QUOTIENT_POLY	0xf65a57f8	   /* floor(x^48 / G) - x^32 */
+#define load_be_long(x)		be32_to_cpup(x)	/* big-endian 32-bit load */
+#else
+#error "Unsupported __riscv_xlen"
+#endif
+
+/*
+ * Multiply the XLEN-bit message polynomial @m by x^16 and reduce it modulo the
+ * generator polynomial G.  This gives the CRC of the message polynomial @m.
+ */
+static inline u16 crct10dif_zbc(unsigned long m)
+{
+	u16 crc;
+
+	asm volatile(".option push\n"
+		     ".option arch,+zbc\n"
+		     /*
+		      * First step of Barrett reduction with integrated
+		      * multiplication by x^16:
+		      *
+		      *    %0 := floor((m * floor(x^(XLEN+16) / G)) / x^XLEN)
+		      *
+		      * The resulting value is equal to floor((m * x^16) / G).
+		      *
+		      * The constant floor(x^(XLEN+16) / G) has degree XLEN,
+		      * i.e. it has XLEN+1 bits.  The clmulh instruction
+		      * multiplies m by the x^0 through x^(XLEN-1) terms of this
+		      * constant and does the floored division by x^XLEN.  The
+		      * xor instruction handles the x^XLEN term of the constant
+		      * by adding an additional (m * x^XLEN) / x^XLEN = m.
+		      */
+		     "clmulh %0, %1, %2\n"
+		     "xor    %0, %0, %1\n"
+		     /*
+		      * Second step of Barrett reduction:
+		      *
+		      *    crc := (m * x^16) + (G * floor((m * x^16) / G))
+		      *
+		      * This reduces (m * x^16) modulo G by adding the
+		      * appropriate multiple of G to it.  The result uses only
+		      * the x^0 through x^15 terms.  HOWEVER, since the
+		      * unreduced value (m * x^16) is zero in those terms in the
+		      * first place, it is more efficient to do the equivalent:
+		      *
+		      *    crc := (G * floor((m * x^16) / G)) mod x^16
+		      */
+		     "clmul  %0, %0, %3\n"
+		     ".option pop\n"
+		     : "=&r" (crc)	/* %0: early-clobber, %1 is read after %0 is written */
+		     : "r" (m),		/* %1: message polynomial */
+		     "r" (CRCT10DIF_QUOTIENT_POLY),	/* %2: quotient constant without its top term */
+		     "r" (CRCT10DIF_POLY));		/* %3: generator polynomial G */
+	return crc;	/* presumably the u16 type supplies the "mod x^16" - confirm */
+}
+
+static inline u16 crct10dif_unaligned(u16 crc, const u8 *p, size_t len)
+{
+	unsigned long m;	/* callers pass 1 <= len < sizeof(long), so the bytes fit */
+	size_t i;
+
+	if (len == 1)	/* single byte: it only mixes with the top 8 bits of crc */
+		return crct10dif_zbc(p[0] ^ (crc >> 8)) ^ (crc << 8);
+
+	/* len >= 2 here: XOR crc into the first two bytes, most significant first */
+	m = crc ^ (p[0] << 8) ^ p[1];
+	for (i = 2; i < len; i++)	/* append the remaining bytes below them */
+		m = (m << 8) ^ p[i];
+	return crct10dif_zbc(m);
+}
+
+u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)	/* update crc over p[0..len) */
+{
+	size_t align;
+	unsigned long m;
+
+	asm goto(ALTERNATIVE("j %l[fallback]", "nop", 0,
+			     RISCV_ISA_EXT_ZBC, 1) : : : : fallback);	/* jump stays unless Zbc found */
+
+	align = -(unsigned long)p % sizeof(unsigned long);	/* bytes up to the next long boundary */
+	if (align && len) {	/* consume the unaligned head first */
+		align = min(align, len);	/* the buffer may end before the boundary */
+		crc = crct10dif_unaligned(crc, p, align);
+		p += align;
+		len -= align;
+	}
+
+	while (len >= sizeof(unsigned long)) {	/* whole words; p is long-aligned here */
+		m = ((unsigned long)crc << (8 * sizeof(unsigned long) - 16)) ^
+		    load_be_long((const void *)p);	/* crc lands on the word's top 16 bits */
+		crc = crct10dif_zbc(m);
+		p += sizeof(unsigned long);
+		len -= sizeof(unsigned long);
+	}
+
+	if (len)	/* tail shorter than one word */
+		crc = crct10dif_unaligned(crc, p, len);
+
+	return crc;
+
+fallback:	/* target of the "j" in the ALTERNATIVE above */
+	return crc_t10dif_generic(crc, p, len);
+}
+EXPORT_SYMBOL(crc_t10dif_arch);
+
+MODULE_DESCRIPTION("CRC-T10DIF using RISC-V ZBC Extension");
+MODULE_LICENSE("GPL");

base-commit: 4ffd50862d41e5aaf2e749efa354afaa1317c309
-- 
2.48.1


_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

             reply	other threads:[~2025-02-12 20:07 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-12 20:07 Eric Biggers [this message]
2025-02-12 20:07 ` [PATCH v4] riscv/crc-t10dif: Optimize crct10dif with zbc extension Eric Biggers

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250212200723.135894-1-ebiggers@kernel.org \
    --to=ebiggers@kernel.org \
    --cc=ardb@kernel.org \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-riscv@lists.infradead.org \
    --cc=zhihang.shao.iscas@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.