* [RFC PATCH] crypto: riscv: scalar accelerated GHASH
@ 2025-04-17 6:49 Qingfang Deng
2025-04-17 6:57 ` Ard Biesheuvel
` (2 more replies)
0 siblings, 3 replies; 14+ messages in thread
From: Qingfang Deng @ 2025-04-17 6:49 UTC (permalink / raw)
To: Herbert Xu, David S. Miller, Paul Walmsley, Palmer Dabbelt,
Albert Ou, Alexandre Ghiti, linux-crypto, linux-riscv,
linux-kernel
Cc: Christoph Müllner, Heiko Stuebner, Qingfang Deng
From: Qingfang Deng <qingfang.deng@siflower.com.cn>
Add a scalar implementation of GHASH for RISC-V using the Zbc (carry-less
multiplication) and Zbb (bit-manipulation) extensions. This implementation
is adapted from OpenSSL but rewritten in plain C for clarity.
Unlike the OpenSSL one, which relies on bit-reflection of the data, this
version uses a pre-computed (reflected and multiplied) key, inspired by
the approach used in Intel's CLMUL driver, to avoid reflections during
runtime.
Signed-off-by: Qingfang Deng <qingfang.deng@siflower.com.cn>
---
arch/riscv/crypto/Kconfig | 16 +-
arch/riscv/crypto/Makefile | 2 +
arch/riscv/crypto/ghash-riscv64-clmul.c | 270 ++++++++++++++++++++++++
3 files changed, 287 insertions(+), 1 deletion(-)
create mode 100644 arch/riscv/crypto/ghash-riscv64-clmul.c
diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig
index 6392e1e11bc9..03b74d4116cb 100644
--- a/arch/riscv/crypto/Kconfig
+++ b/arch/riscv/crypto/Kconfig
@@ -26,7 +26,7 @@ config CRYPTO_CHACHA_RISCV64
default CRYPTO_LIB_CHACHA_INTERNAL
config CRYPTO_GHASH_RISCV64
- tristate "Hash functions: GHASH"
+ tristate "Hash functions: GHASH (vector accelerated)"
depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
select CRYPTO_GCM
help
@@ -35,6 +35,20 @@ config CRYPTO_GHASH_RISCV64
Architecture: riscv64 using:
- Zvkg vector crypto extension
+config CRYPTO_GHASH_RISCV64_CLMUL
+ tristate "Hash functions: GHASH (CLMUL scalar accelerated)"
+ depends on 64BIT && TOOLCHAIN_HAS_ZBB && TOOLCHAIN_HAS_ZBC
+ select CRYPTO_GCM
+ help
+ GCM GHASH function (NIST SP 800-38D)
+
+ Architecture: riscv64 using:
+ - Zbb Bitmanipulation extension
+ - Zbc Carry-less multiplication
+ OR
+ - Zbkb Bit-manipulation for Cryptography
+ - Zbkc Carry-less multiplication for Cryptography
+
config CRYPTO_SHA256_RISCV64
tristate "Hash functions: SHA-224 and SHA-256"
depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile
index 247c7bc7288c..b5dc497d398c 100644
--- a/arch/riscv/crypto/Makefile
+++ b/arch/riscv/crypto/Makefile
@@ -10,6 +10,8 @@ chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o
obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o
ghash-riscv64-y := ghash-riscv64-glue.o ghash-riscv64-zvkg.o
+obj-$(CONFIG_CRYPTO_GHASH_RISCV64_CLMUL) += ghash-riscv64-clmul.o
+
obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o
sha256-riscv64-y := sha256-riscv64-glue.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o
diff --git a/arch/riscv/crypto/ghash-riscv64-clmul.c b/arch/riscv/crypto/ghash-riscv64-clmul.c
new file mode 100644
index 000000000000..4777aa8e94cb
--- /dev/null
+++ b/arch/riscv/crypto/ghash-riscv64-clmul.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * GHASH using the RISC-V Zbc/Zbkc (CLMUL) extension
+ *
+ * Copyright (C) 2023 VRULL GmbH
+ * Author: Christoph Müllner <christoph.muellner@vrull.eu>
+ *
+ * Copyright (C) 2025 Siflower Communications Ltd
+ * Author: Qingfang Deng <qingfang.deng@siflower.com.cn>
+ */
+
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <crypto/ghash.h>
+#include <crypto/internal/hash.h>
+
+#define GHASH_MOD_POLY 0xc200000000000000
+
+struct riscv64_clmul_ghash_ctx {
+ __uint128_t key;
+};
+
+struct riscv64_clmul_ghash_desc_ctx {
+ __uint128_t shash;
+ u8 buffer[GHASH_DIGEST_SIZE];
+ int bytes;
+};
+
+static __always_inline u64 riscv_zbb_swab64(u64 val)
+{
+ asm (".option push\n"
+ ".option arch,+zbb\n"
+ "rev8 %0, %1\n"
+ ".option pop\n"
+ : "=r" (val) : "r" (val));
+ return val;
+}
+
+static __always_inline __uint128_t get_unaligned_be128(const u8 *p)
+{
+ __uint128_t val;
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ val = *(__uint128_t *)p;
+ val = riscv_zbb_swab64(val >> 64) | (__uint128_t)riscv_zbb_swab64(val) << 64;
+#else
+ val = (__uint128_t)p[0] << 120;
+ val |= (__uint128_t)p[1] << 112;
+ val |= (__uint128_t)p[2] << 104;
+ val |= (__uint128_t)p[3] << 96;
+ val |= (__uint128_t)p[4] << 88;
+ val |= (__uint128_t)p[5] << 80;
+ val |= (__uint128_t)p[6] << 72;
+ val |= (__uint128_t)p[7] << 64;
+ val |= (__uint128_t)p[8] << 56;
+ val |= (__uint128_t)p[9] << 48;
+ val |= (__uint128_t)p[10] << 40;
+ val |= (__uint128_t)p[11] << 32;
+ val |= (__uint128_t)p[12] << 24;
+ val |= (__uint128_t)p[13] << 16;
+ val |= (__uint128_t)p[14] << 8;
+ val |= (__uint128_t)p[15];
+#endif
+ return val;
+}
+
+static __always_inline void put_unaligned_be128(__uint128_t val, u8 *p)
+{
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ *(__uint128_t *)p = riscv_zbb_swab64(val >> 64) | (__uint128_t)riscv_zbb_swab64(val) << 64;
+#else
+ p[0] = val >> 120;
+ p[1] = val >> 112;
+ p[2] = val >> 104;
+ p[3] = val >> 96;
+ p[4] = val >> 88;
+ p[5] = val >> 80;
+ p[6] = val >> 72;
+ p[7] = val >> 64;
+ p[8] = val >> 56;
+ p[9] = val >> 48;
+ p[10] = val >> 40;
+ p[11] = val >> 32;
+ p[12] = val >> 24;
+ p[13] = val >> 16;
+ p[14] = val >> 8;
+ p[15] = val;
+#endif
+}
+
+static __always_inline __attribute_const__
+__uint128_t clmul128(u64 a, u64 b)
+{
+ u64 hi, lo;
+
+ asm(".option push\n"
+ ".option arch,+zbc\n"
+ "clmul %0, %2, %3\n"
+ "clmulh %1, %2, %3\n"
+ ".option pop\n"
+ : "=&r" (lo), "=&r" (hi) : "r" (a), "r" (b));
+ return (__uint128_t)hi << 64 | lo;
+}
+
+static int riscv64_clmul_ghash_init(struct shash_desc *desc)
+{
+ struct riscv64_clmul_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ dctx->bytes = 0;
+ dctx->shash = 0;
+ return 0;
+}
+
+/* Compute GMULT (Xi*H mod f) using the Zbc (clmul) extensions.
+ * Using the no-Karatsuba approach and clmul for the final reduction.
+ * This results in an implementation with minimized number of instructions.
+ * HW with clmul latencies higher than 2 cycles might observe a performance
+ * improvement with Karatsuba. HW with clmul latencies higher than 6 cycles
+ * might observe a performance improvement with additionally converting the
+ * reduction to shift&xor. For a full discussion of these estimates, see
+ * https://github.com/riscv/riscv-crypto/blob/master/doc/supp/gcm-mode-cmul.adoc
+ */
+static void gcm_ghash_rv64i_zbc(__uint128_t *Xi, __uint128_t k, const u8 *inp, size_t len)
+{
+ u64 k_hi = k >> 64, k_lo = k, p_hi, p_lo;
+ __uint128_t hash = *Xi, p;
+
+ do {
+ __uint128_t t0, t1, t2, t3, lo, mid, hi;
+
+ /* Load the input data, byte-reverse them, and XOR them with Xi */
+ p = get_unaligned_be128(inp);
+
+ inp += GHASH_BLOCK_SIZE;
+ len -= GHASH_BLOCK_SIZE;
+
+ p ^= hash;
+ p_hi = p >> 64;
+ p_lo = p;
+
+ /* Multiplication (without Karatsuba) */
+ t0 = clmul128(p_lo, k_lo);
+ t1 = clmul128(p_lo, k_hi);
+ t2 = clmul128(p_hi, k_lo);
+ t3 = clmul128(p_hi, k_hi);
+ mid = t1 ^ t2;
+ lo = t0 ^ (mid << 64);
+ hi = t3 ^ (mid >> 64);
+
+ /* Reduction with clmul */
+ mid = clmul128(lo, GHASH_MOD_POLY);
+ lo ^= mid << 64;
+ hi ^= lo ^ (mid >> 64);
+ hi ^= clmul128(lo >> 64, GHASH_MOD_POLY);
+ hash = hi;
+ } while (len);
+
+ *Xi = hash;
+}
+
+static int riscv64_clmul_ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen)
+{
+ struct riscv64_clmul_ghash_ctx *ctx = crypto_shash_ctx(tfm);
+ __uint128_t k;
+
+ if (keylen != GHASH_BLOCK_SIZE)
+ return -EINVAL;
+
+ k = get_unaligned_be128(key);
+ k = (k << 1 | k >> 127) ^ (k >> 127 ? (__uint128_t)GHASH_MOD_POLY << 64 : 0);
+ ctx->key = k;
+
+ return 0;
+}
+
+static int riscv64_clmul_ghash_update(struct shash_desc *desc, const u8 *src, unsigned int srclen)
+{
+ struct riscv64_clmul_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+ struct riscv64_clmul_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+ unsigned int len;
+
+ if (dctx->bytes) {
+ if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
+ memcpy(dctx->buffer + dctx->bytes, src, srclen);
+ dctx->bytes += srclen;
+ return 0;
+ }
+ memcpy(dctx->buffer + dctx->bytes, src, GHASH_DIGEST_SIZE - dctx->bytes);
+
+ gcm_ghash_rv64i_zbc(&dctx->shash, ctx->key, dctx->buffer, GHASH_DIGEST_SIZE);
+
+ src += GHASH_DIGEST_SIZE - dctx->bytes;
+ srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
+ dctx->bytes = 0;
+ }
+
+ len = round_down(srclen, GHASH_BLOCK_SIZE);
+ if (len) {
+ gcm_ghash_rv64i_zbc(&dctx->shash, ctx->key, src, len);
+ src += len;
+ srclen -= len;
+ }
+
+ if (srclen) {
+ memcpy(dctx->buffer, src, srclen);
+ dctx->bytes = srclen;
+ }
+ return 0;
+}
+
+static int riscv64_clmul_ghash_final(struct shash_desc *desc, u8 out[GHASH_DIGEST_SIZE])
+{
+ struct riscv64_clmul_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+ struct riscv64_clmul_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+ int i;
+
+ if (dctx->bytes) {
+ for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
+ dctx->buffer[i] = 0;
+ gcm_ghash_rv64i_zbc(&dctx->shash, ctx->key, dctx->buffer, GHASH_DIGEST_SIZE);
+ dctx->bytes = 0;
+ }
+ put_unaligned_be128(dctx->shash, out);
+ return 0;
+}
+
+struct shash_alg riscv64_clmul_ghash_alg = {
+ .init = riscv64_clmul_ghash_init,
+ .update = riscv64_clmul_ghash_update,
+ .final = riscv64_clmul_ghash_final,
+ .setkey = riscv64_clmul_ghash_setkey,
+ .descsize = sizeof(struct riscv64_clmul_ghash_desc_ctx),
+ .digestsize = GHASH_DIGEST_SIZE,
+ .base = {
+ .cra_blocksize = GHASH_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct riscv64_clmul_ghash_ctx),
+ .cra_priority = 250,
+ .cra_name = "ghash",
+ .cra_driver_name = "ghash-riscv64-clmul",
+ .cra_module = THIS_MODULE,
+ },
+};
+
+static int __init riscv64_clmul_ghash_mod_init(void)
+{
+ bool has_clmul, has_rev8;
+
+ has_clmul = riscv_isa_extension_available(NULL, ZBC) ||
+ riscv_isa_extension_available(NULL, ZBKC);
+ has_rev8 = riscv_isa_extension_available(NULL, ZBB) ||
+ riscv_isa_extension_available(NULL, ZBKB);
+ if (has_clmul && has_rev8)
+ return crypto_register_shash(&riscv64_clmul_ghash_alg);
+
+ return -ENODEV;
+}
+
+static void __exit riscv64_clmul_ghash_mod_fini(void)
+{
+ crypto_unregister_shash(&riscv64_clmul_ghash_alg);
+}
+
+module_init(riscv64_clmul_ghash_mod_init);
+module_exit(riscv64_clmul_ghash_mod_fini);
+
+MODULE_DESCRIPTION("GHASH (RISC-V CLMUL accelerated)");
+MODULE_AUTHOR("Qingfang Deng <dqfext@gmail.com>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("ghash");
--
2.43.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] crypto: riscv: scalar accelerated GHASH
2025-04-17 6:49 [RFC PATCH] crypto: riscv: scalar accelerated GHASH Qingfang Deng
@ 2025-04-17 6:57 ` Ard Biesheuvel
2025-04-17 7:25 ` Qingfang Deng
2025-04-17 7:21 ` Herbert Xu
2025-04-17 17:09 ` Eric Biggers
2 siblings, 1 reply; 14+ messages in thread
From: Ard Biesheuvel @ 2025-04-17 6:57 UTC (permalink / raw)
To: Qingfang Deng, Eric Biggers
Cc: Herbert Xu, David S. Miller, Paul Walmsley, Palmer Dabbelt,
Albert Ou, Alexandre Ghiti, linux-crypto, linux-riscv,
linux-kernel, Christoph Müllner, Heiko Stuebner,
Qingfang Deng
(cc Eric)
On Thu, 17 Apr 2025 at 08:49, Qingfang Deng <dqfext@gmail.com> wrote:
>
> From: Qingfang Deng <qingfang.deng@siflower.com.cn>
>
> Add a scalar implementation of GHASH for RISC-V using the Zbc (carry-less
> multiplication) and Zbb (bit-manipulation) extensions. This implementation
> is adapted from OpenSSL but rewritten in plain C for clarity.
>
> Unlike the OpenSSL one that rely on bit-reflection of the data, this
> version uses a pre-computed (reflected and multiplied) key, inspired by
> the approach used in Intel's CLMUL driver, to avoid reflections during
> runtime.
>
> Signed-off-by: Qingfang Deng <qingfang.deng@siflower.com.cn>
What is the use case for this? AIUI, the scalar AES instructions were
never implemented by anyone, so how do you expect this to be used in
practice?
> ---
> arch/riscv/crypto/Kconfig | 16 +-
> arch/riscv/crypto/Makefile | 2 +
> arch/riscv/crypto/ghash-riscv64-clmul.c | 270 ++++++++++++++++++++++++
> 3 files changed, 287 insertions(+), 1 deletion(-)
> create mode 100644 arch/riscv/crypto/ghash-riscv64-clmul.c
>
> diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig
> index 6392e1e11bc9..03b74d4116cb 100644
> --- a/arch/riscv/crypto/Kconfig
> +++ b/arch/riscv/crypto/Kconfig
> @@ -26,7 +26,7 @@ config CRYPTO_CHACHA_RISCV64
> default CRYPTO_LIB_CHACHA_INTERNAL
>
> config CRYPTO_GHASH_RISCV64
> - tristate "Hash functions: GHASH"
> + tristate "Hash functions: GHASH (vector accelarated)"
> depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
> select CRYPTO_GCM
> help
> @@ -35,6 +35,20 @@ config CRYPTO_GHASH_RISCV64
> Architecture: riscv64 using:
> - Zvkg vector crypto extension
>
> +config CRYPTO_GHASH_RISCV64_CLMUL
> + tristate "Hash functions: GHASH (CLMUL scalar accelerated)"
> + depends on 64BIT && TOOLCHAIN_HAS_ZBB && TOOLCHAIN_HAS_ZBC
> + select CRYPTO_GCM
> + help
> + GCM GHASH function (NIST SP 800-38D)
> +
> + Architecture: riscv64 using:
> + - Zbb Bitmanipulation extension
> + - Zbc Carry-less multiplication
> + OR
> + - Zbkb Bit-manipulation for Cryptography
> + - Zbkc Carry-less multiplication for Cryptography
> +
> config CRYPTO_SHA256_RISCV64
> tristate "Hash functions: SHA-224 and SHA-256"
> depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
> diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile
> index 247c7bc7288c..b5dc497d398c 100644
> --- a/arch/riscv/crypto/Makefile
> +++ b/arch/riscv/crypto/Makefile
> @@ -10,6 +10,8 @@ chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o
> obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o
> ghash-riscv64-y := ghash-riscv64-glue.o ghash-riscv64-zvkg.o
>
> +obj-$(CONFIG_CRYPTO_GHASH_RISCV64_CLMUL) += ghash-riscv64-clmul.o
> +
> obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o
> sha256-riscv64-y := sha256-riscv64-glue.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o
>
> diff --git a/arch/riscv/crypto/ghash-riscv64-clmul.c b/arch/riscv/crypto/ghash-riscv64-clmul.c
> new file mode 100644
> index 000000000000..4777aa8e94cb
> --- /dev/null
> +++ b/arch/riscv/crypto/ghash-riscv64-clmul.c
> @@ -0,0 +1,270 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * GHASH using the RISC-V Zbc/Zbkc (CLMUL) extension
> + *
> + * Copyright (C) 2023 VRULL GmbH
> + * Author: Christoph Müllner <christoph.muellner@vrull.eu>
> + *
> + * Copyright (C) 2025 Siflower Communications Ltd
> + * Author: Qingfang Deng <qingfang.deng@siflower.com.cn>
> + */
> +
> +#include <linux/crypto.h>
> +#include <linux/err.h>
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <crypto/ghash.h>
> +#include <crypto/internal/hash.h>
> +
> +#define GHASH_MOD_POLY 0xc200000000000000
> +
> +struct riscv64_clmul_ghash_ctx {
> + __uint128_t key;
> +};
> +
> +struct riscv64_clmul_ghash_desc_ctx {
> + __uint128_t shash;
> + u8 buffer[GHASH_DIGEST_SIZE];
> + int bytes;
> +};
> +
> +static __always_inline u64 riscv_zbb_swab64(u64 val)
> +{
> + asm (".option push\n"
> + ".option arch,+zbb\n"
> + "rev8 %0, %1\n"
> + ".option pop\n"
> + : "=r" (val) : "r" (val));
> + return val;
> +}
> +
> +static __always_inline __uint128_t get_unaligned_be128(const u8 *p)
> +{
> + __uint128_t val;
> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS means that get_unaligned_xxx()
helpers are cheap. Casting a void* to an aligned type is still UB as
per the C standard.
So better to drop the #ifdef entirely, and just use the
get_unaligned_be64() helpers for both cases.
(same below)
Also, do you need to test for int128 support? Or is that guaranteed
for all compilers that are supported by the RISC-V port?
> + val = *(__uint128_t *)p;
> + val = riscv_zbb_swab64(val >> 64) | (__uint128_t)riscv_zbb_swab64(val) << 64;
> +#else
> + val = (__uint128_t)p[0] << 120;
> + val |= (__uint128_t)p[1] << 112;
> + val |= (__uint128_t)p[2] << 104;
> + val |= (__uint128_t)p[3] << 96;
> + val |= (__uint128_t)p[4] << 88;
> + val |= (__uint128_t)p[5] << 80;
> + val |= (__uint128_t)p[6] << 72;
> + val |= (__uint128_t)p[7] << 64;
> + val |= (__uint128_t)p[8] << 56;
> + val |= (__uint128_t)p[9] << 48;
> + val |= (__uint128_t)p[10] << 40;
> + val |= (__uint128_t)p[11] << 32;
> + val |= (__uint128_t)p[12] << 24;
> + val |= (__uint128_t)p[13] << 16;
> + val |= (__uint128_t)p[14] << 8;
> + val |= (__uint128_t)p[15];
> +#endif
> + return val;
> +}
> +
> +static __always_inline void put_unaligned_be128(__uint128_t val, u8 *p)
> +{
> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> + *(__uint128_t *)p = riscv_zbb_swab64(val >> 64) | (__uint128_t)riscv_zbb_swab64(val) << 64;
> +#else
> + p[0] = val >> 120;
> + p[1] = val >> 112;
> + p[2] = val >> 104;
> + p[3] = val >> 96;
> + p[4] = val >> 88;
> + p[5] = val >> 80;
> + p[6] = val >> 72;
> + p[7] = val >> 64;
> + p[8] = val >> 56;
> + p[9] = val >> 48;
> + p[10] = val >> 40;
> + p[11] = val >> 32;
> + p[12] = val >> 24;
> + p[13] = val >> 16;
> + p[14] = val >> 8;
> + p[15] = val;
> +#endif
> +}
> +
> +static __always_inline __attribute_const__
> +__uint128_t clmul128(u64 a, u64 b)
> +{
> + u64 hi, lo;
> +
> + asm(".option push\n"
> + ".option arch,+zbc\n"
> + "clmul %0, %2, %3\n"
> + "clmulh %1, %2, %3\n"
> + ".option pop\n"
> + : "=&r" (lo), "=&r" (hi) : "r" (a), "r" (b));
> + return (__uint128_t)hi << 64 | lo;
> +}
> +
> +static int riscv64_clmul_ghash_init(struct shash_desc *desc)
> +{
> + struct riscv64_clmul_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> +
> + dctx->bytes = 0;
> + dctx->shash = 0;
> + return 0;
> +}
> +
> +/* Compute GMULT (Xi*H mod f) using the Zbc (clmul) extensions.
> + * Using the no-Karatsuba approach and clmul for the final reduction.
> + * This results in an implementation with minimized number of instructions.
> + * HW with clmul latencies higher than 2 cycles might observe a performance
> + * improvement with Karatsuba. HW with clmul latencies higher than 6 cycles
> + * might observe a performance improvement with additionally converting the
> + * reduction to shift&xor. For a full discussion of this estimates see
> + * https://github.com/riscv/riscv-crypto/blob/master/doc/supp/gcm-mode-cmul.adoc
> + */
> +static void gcm_ghash_rv64i_zbc(__uint128_t *Xi, __uint128_t k, const u8 *inp, size_t len)
> +{
> + u64 k_hi = k >> 64, k_lo = k, p_hi, p_lo;
> + __uint128_t hash = *Xi, p;
> +
> + do {
> + __uint128_t t0, t1, t2, t3, lo, mid, hi;
> +
> + /* Load the input data, byte-reverse them, and XOR them with Xi */
> + p = get_unaligned_be128(inp);
> +
> + inp += GHASH_BLOCK_SIZE;
> + len -= GHASH_BLOCK_SIZE;
> +
> + p ^= hash;
> + p_hi = p >> 64;
> + p_lo = p;
> +
> + /* Multiplication (without Karatsuba) */
> + t0 = clmul128(p_lo, k_lo);
> + t1 = clmul128(p_lo, k_hi);
> + t2 = clmul128(p_hi, k_lo);
> + t3 = clmul128(p_hi, k_hi);
> + mid = t1 ^ t2;
> + lo = t0 ^ (mid << 64);
> + hi = t3 ^ (mid >> 64);
> +
> + /* Reduction with clmul */
> + mid = clmul128(lo, GHASH_MOD_POLY);
> + lo ^= mid << 64;
> + hi ^= lo ^ (mid >> 64);
> + hi ^= clmul128(lo >> 64, GHASH_MOD_POLY);
> + hash = hi;
> + } while (len);
> +
> + *Xi = hash;
> +}
> +
> +static int riscv64_clmul_ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen)
> +{
> + struct riscv64_clmul_ghash_ctx *ctx = crypto_shash_ctx(tfm);
> + __uint128_t k;
> +
> + if (keylen != GHASH_BLOCK_SIZE)
> + return -EINVAL;
> +
> + k = get_unaligned_be128(key);
> + k = (k << 1 | k >> 127) ^ (k >> 127 ? (__uint128_t)GHASH_MOD_POLY << 64 : 0);
> + ctx->key = k;
> +
> + return 0;
> +}
> +
> +static int riscv64_clmul_ghash_update(struct shash_desc *desc, const u8 *src, unsigned int srclen)
> +{
> + struct riscv64_clmul_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
> + struct riscv64_clmul_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> + unsigned int len;
> +
> + if (dctx->bytes) {
> + if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
> + memcpy(dctx->buffer + dctx->bytes, src, srclen);
> + dctx->bytes += srclen;
> + return 0;
> + }
> + memcpy(dctx->buffer + dctx->bytes, src, GHASH_DIGEST_SIZE - dctx->bytes);
> +
> + gcm_ghash_rv64i_zbc(&dctx->shash, ctx->key, dctx->buffer, GHASH_DIGEST_SIZE);
> +
> + src += GHASH_DIGEST_SIZE - dctx->bytes;
> + srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
> + dctx->bytes = 0;
> + }
> +
> + len = round_down(srclen, GHASH_BLOCK_SIZE);
> + if (len) {
> + gcm_ghash_rv64i_zbc(&dctx->shash, ctx->key, src, len);
> + src += len;
> + srclen -= len;
> + }
> +
> + if (srclen) {
> + memcpy(dctx->buffer, src, srclen);
> + dctx->bytes = srclen;
> + }
> + return 0;
> +}
> +
> +static int riscv64_clmul_ghash_final(struct shash_desc *desc, u8 out[GHASH_DIGEST_SIZE])
> +{
> + struct riscv64_clmul_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
> + struct riscv64_clmul_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> + int i;
> +
> + if (dctx->bytes) {
> + for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
> + dctx->buffer[i] = 0;
> + gcm_ghash_rv64i_zbc(&dctx->shash, ctx->key, dctx->buffer, GHASH_DIGEST_SIZE);
> + dctx->bytes = 0;
> + }
> + put_unaligned_be128(dctx->shash, out);
> + return 0;
> +}
> +
> +struct shash_alg riscv64_clmul_ghash_alg = {
> + .init = riscv64_clmul_ghash_init,
> + .update = riscv64_clmul_ghash_update,
> + .final = riscv64_clmul_ghash_final,
> + .setkey = riscv64_clmul_ghash_setkey,
> + .descsize = sizeof(struct riscv64_clmul_ghash_desc_ctx),
> + .digestsize = GHASH_DIGEST_SIZE,
> + .base = {
> + .cra_blocksize = GHASH_BLOCK_SIZE,
> + .cra_ctxsize = sizeof(struct riscv64_clmul_ghash_ctx),
> + .cra_priority = 250,
> + .cra_name = "ghash",
> + .cra_driver_name = "ghash-riscv64-clmul",
> + .cra_module = THIS_MODULE,
> + },
> +};
> +
> +static int __init riscv64_clmul_ghash_mod_init(void)
> +{
> + bool has_clmul, has_rev8;
> +
> + has_clmul = riscv_isa_extension_available(NULL, ZBC) ||
> + riscv_isa_extension_available(NULL, ZBKC);
> + has_rev8 = riscv_isa_extension_available(NULL, ZBB) ||
> + riscv_isa_extension_available(NULL, ZBKB);
> + if (has_clmul && has_rev8)
> + return crypto_register_shash(&riscv64_clmul_ghash_alg);
> +
> + return -ENODEV;
> +}
> +
> +static void __exit riscv64_clmul_ghash_mod_fini(void)
> +{
> + crypto_unregister_shash(&riscv64_clmul_ghash_alg);
> +}
> +
> +module_init(riscv64_clmul_ghash_mod_init);
> +module_exit(riscv64_clmul_ghash_mod_fini);
> +
> +MODULE_DESCRIPTION("GHASH (RISC-V CLMUL accelerated)");
> +MODULE_AUTHOR("Qingfang Deng <dqfext@gmail.com>");
> +MODULE_LICENSE("GPL");
> +MODULE_ALIAS_CRYPTO("ghash");
> --
> 2.43.0
>
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] crypto: riscv: scalar accelerated GHASH
2025-04-17 6:49 [RFC PATCH] crypto: riscv: scalar accelerated GHASH Qingfang Deng
2025-04-17 6:57 ` Ard Biesheuvel
@ 2025-04-17 7:21 ` Herbert Xu
2025-04-17 17:09 ` Eric Biggers
2 siblings, 0 replies; 14+ messages in thread
From: Herbert Xu @ 2025-04-17 7:21 UTC (permalink / raw)
To: Qingfang Deng
Cc: David S. Miller, Paul Walmsley, Palmer Dabbelt, Albert Ou,
Alexandre Ghiti, linux-crypto, linux-riscv, linux-kernel,
Christoph Müllner, Heiko Stuebner, Qingfang Deng
On Thu, Apr 17, 2025 at 02:49:38PM +0800, Qingfang Deng wrote:
>
> +static int riscv64_clmul_ghash_update(struct shash_desc *desc, const u8 *src, unsigned int srclen)
> +{
> + struct riscv64_clmul_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
> + struct riscv64_clmul_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> + unsigned int len;
> +
> + if (dctx->bytes) {
> + if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
> + memcpy(dctx->buffer + dctx->bytes, src, srclen);
> + dctx->bytes += srclen;
> + return 0;
> + }
> + memcpy(dctx->buffer + dctx->bytes, src, GHASH_DIGEST_SIZE - dctx->bytes);
> +
> + gcm_ghash_rv64i_zbc(&dctx->shash, ctx->key, dctx->buffer, GHASH_DIGEST_SIZE);
> +
> + src += GHASH_DIGEST_SIZE - dctx->bytes;
> + srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
> + dctx->bytes = 0;
> + }
If this progresses beyond an RFC, you will need to do convert this
into a block-only algorithm on top of:
https://patchwork.kernel.org/project/linux-crypto/patch/b2eb753b083c029785c5e18238ca6cf06f48c86a.1744784515.git.herbert@gondor.apana.org.au/
Cheers,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] crypto: riscv: scalar accelerated GHASH
2025-04-17 6:57 ` Ard Biesheuvel
@ 2025-04-17 7:25 ` Qingfang Deng
2025-04-17 7:39 ` Jeffrey Walton
2025-04-17 7:57 ` Ard Biesheuvel
0 siblings, 2 replies; 14+ messages in thread
From: Qingfang Deng @ 2025-04-17 7:25 UTC (permalink / raw)
To: Ard Biesheuvel
Cc: Eric Biggers, Herbert Xu, David S. Miller, Paul Walmsley,
Palmer Dabbelt, Albert Ou, Alexandre Ghiti, linux-crypto,
linux-riscv, linux-kernel, Christoph Müllner, Heiko Stuebner,
Qingfang Deng
Hi Ard,
On Thu, Apr 17, 2025 at 2:58 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>
> (cc Eric)
>
> On Thu, 17 Apr 2025 at 08:49, Qingfang Deng <dqfext@gmail.com> wrote:
> >
> > From: Qingfang Deng <qingfang.deng@siflower.com.cn>
> >
> > Add a scalar implementation of GHASH for RISC-V using the Zbc (carry-less
> > multiplication) and Zbb (bit-manipulation) extensions. This implementation
> > is adapted from OpenSSL but rewritten in plain C for clarity.
> >
> > Unlike the OpenSSL one that rely on bit-reflection of the data, this
> > version uses a pre-computed (reflected and multiplied) key, inspired by
> > the approach used in Intel's CLMUL driver, to avoid reflections during
> > runtime.
> >
> > Signed-off-by: Qingfang Deng <qingfang.deng@siflower.com.cn>
>
> What is the use case for this? AIUI, the scalar AES instructions were
> never implemented by anyone, so how do you expect this to be used in
> practice?
The use case _is_ AES-GCM, as you mentioned. Without this, computing
GHASH can take a considerable amount of CPU time (monitored by perf).
> ...
> > +static __always_inline __uint128_t get_unaligned_be128(const u8 *p)
> > +{
> > + __uint128_t val;
> > +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
>
> CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS means that get_unaligned_xxx()
> helpers are cheap. Casting a void* to an aligned type is still UB as
> per the C standard.
Technically an unaligned access is UB but this pattern is widely used
in networking code.
>
> So better to drop the #ifdef entirely, and just use the
> get_unaligned_be64() helpers for both cases.
Currently those helpers won't generate rev8 instructions, even if
HAVE_EFFICIENT_UNALIGNED_ACCESS and RISCV_ISA_ZBB is set, so I have to
implement my own version of this to reduce the number of instructions,
and to align with the original OpenSSL implementation.
>
> (same below)
>
> Also, do you need to test for int128 support? Or is that guaranteed
> for all compilers that are supported by the RISC-V port?
I believe int128 support is available for all 64-bit targets.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] crypto: riscv: scalar accelerated GHASH
2025-04-17 7:25 ` Qingfang Deng
@ 2025-04-17 7:39 ` Jeffrey Walton
2025-04-17 7:45 ` Qingfang Deng
2025-04-17 7:57 ` Ard Biesheuvel
1 sibling, 1 reply; 14+ messages in thread
From: Jeffrey Walton @ 2025-04-17 7:39 UTC (permalink / raw)
To: Qingfang Deng
Cc: Ard Biesheuvel, Eric Biggers, Herbert Xu, David S. Miller,
Paul Walmsley, Palmer Dabbelt, Albert Ou, Alexandre Ghiti,
linux-crypto, linux-riscv, linux-kernel, Christoph Müllner,
Heiko Stuebner, Qingfang Deng
On Thu, Apr 17, 2025 at 3:25 AM Qingfang Deng <dqfext@gmail.com> wrote:
>
> Hi Ard,
>
> On Thu, Apr 17, 2025 at 2:58 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > [...]
> >
> > Also, do you need to test for int128 support? Or is that guaranteed
> > for all compilers that are supported by the RISC-V port?
>
> I believe int128 support is available for all 64-bit targets.
You can verify the compiler supports int128 with the following macro:
#if (__SIZEOF_INT128__ >= 16)
...
#endif
Also see <https://gcc.gnu.org/pipermail/gcc-help/2015-August/124862.html>.
Jeff
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] crypto: riscv: scalar accelerated GHASH
2025-04-17 7:39 ` Jeffrey Walton
@ 2025-04-17 7:45 ` Qingfang Deng
0 siblings, 0 replies; 14+ messages in thread
From: Qingfang Deng @ 2025-04-17 7:45 UTC (permalink / raw)
To: noloader
Cc: Ard Biesheuvel, Eric Biggers, Herbert Xu, David S. Miller,
Paul Walmsley, Palmer Dabbelt, Albert Ou, Alexandre Ghiti,
linux-crypto, linux-riscv, linux-kernel, Christoph Müllner,
Qingfang Deng
Hi Jeffrey,
On Thu, Apr 17, 2025 at 3:40 PM Jeffrey Walton <noloader@gmail.com> wrote:
>
> On Thu, Apr 17, 2025 at 3:25 AM Qingfang Deng <dqfext@gmail.com> wrote:
> >
> > Hi Ard,
> >
> > On Thu, Apr 17, 2025 at 2:58 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > > [...]
> > >
> > > Also, do you need to test for int128 support? Or is that guaranteed
> > > for all compilers that are supported by the RISC-V port?
> >
> > I believe int128 support is available for all 64-bit targets.
>
> You can verify the compiler supports int128 with the following macro:
>
> #if (__SIZEOF_INT128__ >= 16)
> ...
> #endif
>
> Also see <https://gcc.gnu.org/pipermail/gcc-help/2015-August/124862.html>.
There is a Kconfig symbol ARCH_SUPPORTS_INT128. I may switch to that.
>
> Jeff
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] crypto: riscv: scalar accelerated GHASH
2025-04-17 7:25 ` Qingfang Deng
2025-04-17 7:39 ` Jeffrey Walton
@ 2025-04-17 7:57 ` Ard Biesheuvel
2025-04-17 8:42 ` Qingfang Deng
1 sibling, 1 reply; 14+ messages in thread
From: Ard Biesheuvel @ 2025-04-17 7:57 UTC (permalink / raw)
To: Qingfang Deng
Cc: Eric Biggers, Herbert Xu, David S. Miller, Paul Walmsley,
Palmer Dabbelt, Albert Ou, Alexandre Ghiti, linux-crypto,
linux-riscv, linux-kernel, Christoph Müllner, Heiko Stuebner,
Qingfang Deng
On Thu, 17 Apr 2025 at 09:25, Qingfang Deng <dqfext@gmail.com> wrote:
>
> Hi Ard,
>
> On Thu, Apr 17, 2025 at 2:58 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > (cc Eric)
> >
> > On Thu, 17 Apr 2025 at 08:49, Qingfang Deng <dqfext@gmail.com> wrote:
> > >
> > > From: Qingfang Deng <qingfang.deng@siflower.com.cn>
> > >
> > > Add a scalar implementation of GHASH for RISC-V using the Zbc (carry-less
> > > multiplication) and Zbb (bit-manipulation) extensions. This implementation
> > > is adapted from OpenSSL but rewritten in plain C for clarity.
> > >
> > > Unlike the OpenSSL one that rely on bit-reflection of the data, this
> > > version uses a pre-computed (reflected and multiplied) key, inspired by
> > > the approach used in Intel's CLMUL driver, to avoid reflections during
> > > runtime.
> > >
> > > Signed-off-by: Qingfang Deng <qingfang.deng@siflower.com.cn>
> >
> > What is the use case for this? AIUI, the scalar AES instructions were
> > never implemented by anyone, so how do you expect this to be used in
> > practice?
>
> The use case _is_ AES-GCM, as you mentioned. Without this, computing
> GHASH can take a considerable amount of CPU time (monitored by perf).
>
I see. But do you have a particular configuration in mind? Does it
have scalar AES too? I looked into that a while ago but I was told
that nobody actually incorporates that. So what about these
extensions? Are they commonly implemented?
[0] https://web.git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/log/?h=riscv-scalar-aes
> > ...
> > > +static __always_inline __uint128_t get_unaligned_be128(const u8 *p)
> > > +{
> > > + __uint128_t val;
> > > +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> >
> > CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS means that get_unaligned_xxx()
> > helpers are cheap. Casting a void* to an aligned type is still UB as
> > per the C standard.
>
> Technically an unaligned access is UB but this pattern is widely used
> in networking code.
>
Of course. But that is no reason to keep doing it.
> >
> > So better to drop the #ifdef entirely, and just use the
> > get_unaligned_be64() helpers for both cases.
>
> Currently those helpers won't generate rev8 instructions, even if
> HAVE_EFFICIENT_UNALIGNED_ACCESS and RISCV_ISA_ZBB is set, so I have to
> implement my own version of this to reduce the number of instructions,
> and to align with the original OpenSSL implementation.
>
So fix the helpers.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] crypto: riscv: scalar accelerated GHASH
2025-04-17 7:57 ` Ard Biesheuvel
@ 2025-04-17 8:42 ` Qingfang Deng
2025-04-17 14:15 ` Ard Biesheuvel
2025-04-17 16:58 ` Eric Biggers
0 siblings, 2 replies; 14+ messages in thread
From: Qingfang Deng @ 2025-04-17 8:42 UTC (permalink / raw)
To: Ard Biesheuvel
Cc: Eric Biggers, Herbert Xu, David S. Miller, Paul Walmsley,
Palmer Dabbelt, Albert Ou, Alexandre Ghiti, linux-crypto,
linux-riscv, linux-kernel, Christoph Müllner, Heiko Stuebner,
Qingfang Deng
On Thu, Apr 17, 2025 at 3:58 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > >
> > > What is the use case for this? AIUI, the scalar AES instructions were
> > > never implemented by anyone, so how do you expect this to be used in
> > > practice?
> >
> > The use case _is_ AES-GCM, as you mentioned. Without this, computing
> > GHASH can take a considerable amount of CPU time (monitored by perf).
> >
>
> I see. But do you have a particular configuration in mind? Does it
> have scalar AES too? I looked into that a while ago but I was told
> that nobody actually incorporates that. So what about these
> extensions? Are they commonly implemented?
It's aes-generic.c (LUT-based) with accelerated GHASH.
>
> [0] https://web.git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/log/?h=riscv-scalar-aes
>
> > > ...
> > > > +static __always_inline __uint128_t get_unaligned_be128(const u8 *p)
> > > > +{
> > > > + __uint128_t val;
> > > > +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > >
> > > CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS means that get_unaligned_xxx()
> > > helpers are cheap. Casting a void* to an aligned type is still UB as
> > > per the C standard.
> >
> > Technically an unaligned access is UB but this pattern is widely used
> > in networking code.
> >
>
> Of course. But that is no reason to keep doing it.
>
> > >
> > > So better to drop the #ifdef entirely, and just use the
> > > get_unaligned_be64() helpers for both cases.
> >
> > Currently those helpers won't generate rev8 instructions, even if
> > HAVE_EFFICIENT_UNALIGNED_ACCESS and RISCV_ISA_ZBB is set, so I have to
> > implement my own version of this to reduce the number of instructions,
> > and to align with the original OpenSSL implementation.
> >
>
> So fix the helpers.
The issue is that RISC-V GCC doesn’t emit efficient unaligned loads by default:
- Not all RISC-V CPUs support unaligned access efficiently, so GCC
falls back to conservative byte-wise code.
- There’s no clean way to force the optimized path - GCC only emits
fast unaligned loads if tuned for a specific CPU (e.g., -mtune=size or
-mtune=thead-c906), which the kernel doesn't typically do, even with
HAVE_EFFICIENT_UNALIGNED_ACCESS.
Maybe we should raise this with the GCC maintainers. An explicit
option to enable optimized unaligned access could help.
As for rev8, there's a patch pending to implement the swab macros.
-- Qingfang
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] crypto: riscv: scalar accelerated GHASH
2025-04-17 8:42 ` Qingfang Deng
@ 2025-04-17 14:15 ` Ard Biesheuvel
2025-04-17 14:39 ` Qingfang Deng
2025-04-17 16:58 ` Eric Biggers
1 sibling, 1 reply; 14+ messages in thread
From: Ard Biesheuvel @ 2025-04-17 14:15 UTC (permalink / raw)
To: Qingfang Deng
Cc: Eric Biggers, Herbert Xu, David S. Miller, Paul Walmsley,
Palmer Dabbelt, Albert Ou, Alexandre Ghiti, linux-crypto,
linux-riscv, linux-kernel, Christoph Müllner, Heiko Stuebner,
Qingfang Deng
On Thu, 17 Apr 2025 at 10:42, Qingfang Deng <dqfext@gmail.com> wrote:
>
> On Thu, Apr 17, 2025 at 3:58 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > > >
> > > > What is the use case for this? AIUI, the scalar AES instructions were
> > > > never implemented by anyone, so how do you expect this to be used in
> > > > practice?
> > >
> > > The use case _is_ AES-GCM, as you mentioned. Without this, computing
> > > GHASH can take a considerable amount of CPU time (monitored by perf).
> > >
> >
> > I see. But do you have a particular configuration in mind? Does it
> > have scalar AES too? I looked into that a while ago but I was told
> > that nobody actually incorporates that. So what about these
> > extensions? Are they commonly implemented?
>
> It's aes-generic.c (LUT-based) with accelerated GHASH.
>
> >
> > [0] https://web.git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/log/?h=riscv-scalar-aes
> >
> > > > ...
> > > > > +static __always_inline __uint128_t get_unaligned_be128(const u8 *p)
> > > > > +{
> > > > > + __uint128_t val;
> > > > > +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > > >
> > > > CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS means that get_unaligned_xxx()
> > > > helpers are cheap. Casting a void* to an aligned type is still UB as
> > > > per the C standard.
> > >
> > > Technically an unaligned access is UB but this pattern is widely used
> > > in networking code.
> > >
> >
> > Of course. But that is no reason to keep doing it.
> >
> > > >
> > > > So better to drop the #ifdef entirely, and just use the
> > > > get_unaligned_be64() helpers for both cases.
> > >
> > > Currently those helpers won't generate rev8 instructions, even if
> > > HAVE_EFFICIENT_UNALIGNED_ACCESS and RISCV_ISA_ZBB is set, so I have to
> > > implement my own version of this to reduce the number of instructions,
> > > and to align with the original OpenSSL implementation.
> > >
> >
> > So fix the helpers.
>
> The issue is that RISC-V GCC doesn’t emit efficient unaligned loads by default:
> - Not all RISC-V CPUs support unaligned access efficiently, so GCC
> falls back to conservative byte-wise code.
That makes sense.
> - There’s no clean way to force the optimized path - GCC only emits
> fast unaligned loads if tuned for a specific CPU (e.g., -mtune=size or
> -mtune=thead-c906), which the kernel doesn't typically do, even with
> HAVE_EFFICIENT_UNALIGNED_ACCESS.
>
> Maybe we should raise this with the GCC maintainers. An explicit
> option to enable optimized unaligned access could help.
>
HAVE_EFFICIENT_UNALIGNED_ACCESS is a build time setting, so the
resulting kernel only runs correctly on hardware that implements
unaligned accesses in hardware.
So that means you could pass this -mtune= option too in that case, no?
Then, you can just use a packed struct or an __aligned(1) annotation
and the compiler will emit the correct code for you, depending on
whether unaligned accesses are permitted.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] crypto: riscv: scalar accelerated GHASH
2025-04-17 14:15 ` Ard Biesheuvel
@ 2025-04-17 14:39 ` Qingfang Deng
0 siblings, 0 replies; 14+ messages in thread
From: Qingfang Deng @ 2025-04-17 14:39 UTC (permalink / raw)
To: Ard Biesheuvel
Cc: Eric Biggers, Herbert Xu, David S. Miller, Paul Walmsley,
Palmer Dabbelt, Albert Ou, Alexandre Ghiti, linux-crypto,
linux-riscv, linux-kernel, Christoph Müllner, Qingfang Deng
On Thu, Apr 17, 2025 at 10:15 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > - There’s no clean way to force the optimized path - GCC only emits
> > fast unaligned loads if tuned for a specific CPU (e.g., -mtune=size or
> > -mtune=thead-c906), which the kernel doesn't typically do, even with
> > HAVE_EFFICIENT_UNALIGNED_ACCESS.
> >
> > Maybe we should raise this with the GCC maintainers. An explicit
> > option to enable optimized unaligned access could help.
> >
>
> HAVE_EFFICIENT_UNALIGNED_ACCESS is a build time setting, so the
> resulting kernel only runs correctly on hardware that implements
> unaligned accesses in hardware.
>
> So that means you could pass this -mtune= option too in that case, no?
GCC docs say -mtune=size is internal to -Os and not meant for direct
use. So while it enables optimized unaligned access, relying on it
feels a bit hacky.
Clang is more explicit here: -mno-strict-align cleanly enables
optimized unaligned accesses. It'd be great if GCC had something
similar.
[1] https://gcc.gnu.org/onlinedocs/gcc-14.2.0/gcc/RISC-V-Options.html#index-mtune-12
> Then, you can just use a packed struct or an __aligned(1) annotation
> and the compiler will emit the correct code for you, depending on
> whether unaligned accesses are permitted.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] crypto: riscv: scalar accelerated GHASH
2025-04-17 8:42 ` Qingfang Deng
2025-04-17 14:15 ` Ard Biesheuvel
@ 2025-04-17 16:58 ` Eric Biggers
2025-04-18 1:48 ` Qingfang Deng
1 sibling, 1 reply; 14+ messages in thread
From: Eric Biggers @ 2025-04-17 16:58 UTC (permalink / raw)
To: Qingfang Deng
Cc: Ard Biesheuvel, Herbert Xu, David S. Miller, Paul Walmsley,
Palmer Dabbelt, Albert Ou, Alexandre Ghiti, linux-crypto,
linux-riscv, linux-kernel, Christoph Müllner, Heiko Stuebner,
Qingfang Deng
On Thu, Apr 17, 2025 at 04:42:46PM +0800, Qingfang Deng wrote:
> > I see. But do you have a particular configuration in mind? Does it
> > have scalar AES too? I looked into that a while ago but I was told
> > that nobody actually incorporates that. So what about these
> > extensions? Are they commonly implemented?
>
> It's aes-generic.c (LUT-based) with accelerated GHASH.
That's an odd combination. Normally accelerated AES and GHASH go together.
Presumably you're targeting some specific RISC-V CPU. Is this going to be a
one-off thing for that specific CPU? Or can we expect many RISC-V CPUs to have
GHASH acceleration without AES? And if so, why?
- Eric
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] crypto: riscv: scalar accelerated GHASH
2025-04-17 6:49 [RFC PATCH] crypto: riscv: scalar accelerated GHASH Qingfang Deng
2025-04-17 6:57 ` Ard Biesheuvel
2025-04-17 7:21 ` Herbert Xu
@ 2025-04-17 17:09 ` Eric Biggers
2025-04-18 2:49 ` Qingfang Deng
2 siblings, 1 reply; 14+ messages in thread
From: Eric Biggers @ 2025-04-17 17:09 UTC (permalink / raw)
To: Qingfang Deng
Cc: Herbert Xu, David S. Miller, Paul Walmsley, Palmer Dabbelt,
Albert Ou, Alexandre Ghiti, linux-crypto, linux-riscv,
linux-kernel, Christoph Müllner, Heiko Stuebner,
Qingfang Deng
On Thu, Apr 17, 2025 at 02:49:38PM +0800, Qingfang Deng wrote:
> +static __always_inline u64 riscv_zbb_swab64(u64 val)
> +{
> + asm (".option push\n"
> + ".option arch,+zbb\n"
> + "rev8 %0, %1\n"
> + ".option pop\n"
> + : "=r" (val) : "r" (val));
> + return val;
> +}
> +
> +static __always_inline __uint128_t get_unaligned_be128(const u8 *p)
> +{
> + __uint128_t val;
> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> + val = *(__uint128_t *)p;
> + val = riscv_zbb_swab64(val >> 64) | (__uint128_t)riscv_zbb_swab64(val) << 64;
> +#else
> + val = (__uint128_t)p[0] << 120;
> + val |= (__uint128_t)p[1] << 112;
> + val |= (__uint128_t)p[2] << 104;
> + val |= (__uint128_t)p[3] << 96;
> + val |= (__uint128_t)p[4] << 88;
> + val |= (__uint128_t)p[5] << 80;
> + val |= (__uint128_t)p[6] << 72;
> + val |= (__uint128_t)p[7] << 64;
> + val |= (__uint128_t)p[8] << 56;
> + val |= (__uint128_t)p[9] << 48;
> + val |= (__uint128_t)p[10] << 40;
> + val |= (__uint128_t)p[11] << 32;
> + val |= (__uint128_t)p[12] << 24;
> + val |= (__uint128_t)p[13] << 16;
> + val |= (__uint128_t)p[14] << 8;
> + val |= (__uint128_t)p[15];
> +#endif
> + return val;
> +}
> +
> +static __always_inline void put_unaligned_be128(__uint128_t val, u8 *p)
> +{
> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> + *(__uint128_t *)p = riscv_zbb_swab64(val >> 64) | (__uint128_t)riscv_zbb_swab64(val) << 64;
> +#else
> + p[0] = val >> 120;
> + p[1] = val >> 112;
> + p[2] = val >> 104;
> + p[3] = val >> 96;
> + p[4] = val >> 88;
> + p[5] = val >> 80;
> + p[6] = val >> 72;
> + p[7] = val >> 64;
> + p[8] = val >> 56;
> + p[9] = val >> 48;
> + p[10] = val >> 40;
> + p[11] = val >> 32;
> + p[12] = val >> 24;
> + p[13] = val >> 16;
> + p[14] = val >> 8;
> + p[15] = val;
> +#endif
> +}
Please help properly optimize swab*() and {get,put}_unaligned_* for RISC-V
first, before considering random hacks like this.
https://lore.kernel.org/r/20250403-riscv-swab-v3-0-3bf705d80e33@iencinas.com
is working on swab*().
> + /* Multiplication (without Karatsuba) */
> + t0 = clmul128(p_lo, k_lo);
> + t1 = clmul128(p_lo, k_hi);
> + t2 = clmul128(p_hi, k_lo);
> + t3 = clmul128(p_hi, k_hi);
> + mid = t1 ^ t2;
> + lo = t0 ^ (mid << 64);
> + hi = t3 ^ (mid >> 64);
There is no need to explicitly XOR 'mid << 64' into lo and 'mid >> 64' into hi.
Take a look at how arch/x86/crypto/aes-gcm-*.S do it.
Also, since this is only doing one block at a time and does not use Karatsuba
multiplication, the single-step reduction would work well here. See
aes-gcm-aesni-x86_64.S.
- Eric
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] crypto: riscv: scalar accelerated GHASH
2025-04-17 16:58 ` Eric Biggers
@ 2025-04-18 1:48 ` Qingfang Deng
0 siblings, 0 replies; 14+ messages in thread
From: Qingfang Deng @ 2025-04-18 1:48 UTC (permalink / raw)
To: Eric Biggers
Cc: Ard Biesheuvel, Herbert Xu, David S. Miller, Paul Walmsley,
Palmer Dabbelt, Albert Ou, Alexandre Ghiti, linux-crypto,
linux-riscv, linux-kernel, Christoph Müllner, Qingfang Deng,
Guo Ren
Hi Eric,
On Fri, Apr 18, 2025 at 12:59 AM Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Thu, Apr 17, 2025 at 04:42:46PM +0800, Qingfang Deng wrote:
> > > I see. But do you have a particular configuration in mind? Does it
> > > have scalar AES too? I looked into that a while ago but I was told
> > > that nobody actually incorporates that. So what about these
> > > extensions? Are they commonly implemented?
> >
> > It's aes-generic.c (LUT-based) with accelerated GHASH.
>
> That's an odd combination. Normally accelerated AES and GHASH go together.
> Presumably you're targeting some specific RISC-V CPU. Is this going to be a
> one-off thing for that specific CPU? Or can we expect many RISC-V CPUs to have
> GHASH acceleration without AES? And if so, why?
There are a few RISC-V CPUs that support the B extensions
(Zba+Zbb+Zbc+Zbs) but lack both scalar and vector crypto extensions.
One such CPU already upstreamed in the kernel is the SpacemiT K1.
Another example and the one I'm currently running this code on is the
XuanTie C908, as found in the CanMV K230.
(+Cc Guo Ren)
Guo, do you know if future XuanTie CPUs will support the RISC-V
scalar/vector crypto extensions?
[1] https://www.xrvm.com/product/xuantie/C908
[2] https://www.xrvm.com/product/xuantie/C907
[3] https://docs.banana-pi.org/en/BPI-F3/SpacemiT_K1_datasheet
>
> - Eric
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] crypto: riscv: scalar accelerated GHASH
2025-04-17 17:09 ` Eric Biggers
@ 2025-04-18 2:49 ` Qingfang Deng
0 siblings, 0 replies; 14+ messages in thread
From: Qingfang Deng @ 2025-04-18 2:49 UTC (permalink / raw)
To: Eric Biggers
Cc: Herbert Xu, David S. Miller, Paul Walmsley, Palmer Dabbelt,
Albert Ou, Alexandre Ghiti, linux-crypto, linux-riscv,
linux-kernel, Christoph Müllner, Qingfang Deng
Hi Eric,
On Fri, Apr 18, 2025 at 1:09 AM Eric Biggers <ebiggers@kernel.org> wrote:
>
> Please help properly optimize swab*() and {get,put}_unaligned_* for RISC-V
> first, before considering random hacks like this.
>
> https://lore.kernel.org/r/20250403-riscv-swab-v3-0-3bf705d80e33@iencinas.com
> is working on swab*().
Indeed — in fact, our downstream NONPORTABLE version currently uses
{get,put}_unaligned_be64, as we've modified the Makefile to ensure the
compiler optimizes for both unaligned access and efficient swab*()
handling.
>
> > + /* Multiplication (without Karatsuba) */
> > + t0 = clmul128(p_lo, k_lo);
> > + t1 = clmul128(p_lo, k_hi);
> > + t2 = clmul128(p_hi, k_lo);
> > + t3 = clmul128(p_hi, k_hi);
> > + mid = t1 ^ t2;
> > + lo = t0 ^ (mid << 64);
> > + hi = t3 ^ (mid >> 64);
>
> There is no need to explicitly XOR 'mid << 64' into lo and 'mid >> 64' into hi.
> Take a look at how arch/x86/crypto/aes-gcm-*.S do it.
Thanks, I saw your comments in aes-gcm-avx10-x86_64.S and now
understand what you meant.
However, since we're working with 64-bit scalar registers on RISC-V
(as opposed to 128-bit SIMD registers on x86), there's no reduction in
the number of XOR instructions. Regardless of whether we explicitly
compute mid and shift it, or directly XOR the intermediate results, we
still end up with 8 individual 64-bit XORs to combine t0, t1, t2, and
t3.
So while the optimization helps on x86 due to wider registers and
vector instructions, it doesn't offer a benefit in our scalar RISC-V
implementation.
>
> Also, since this is only doing one block at a time and does not use Karatsuba
> multiplication, the single-step reduction would work well here. See
> aes-gcm-aesni-x86_64.S.
I saw the pre-compute key step. Is it the same as the step mentioned
on page 12 of this PDF?
[1] https://builders.intel.com/docs/networkbuilders/advanced-encryption-standard-galois-counter-mode-optimized-ghash-function-technology-guide-1693300747.pdf
>
> - Eric
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2025-04-18 2:49 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-04-17 6:49 [RFC PATCH] crypto: riscv: scalar accelerated GHASH Qingfang Deng
2025-04-17 6:57 ` Ard Biesheuvel
2025-04-17 7:25 ` Qingfang Deng
2025-04-17 7:39 ` Jeffrey Walton
2025-04-17 7:45 ` Qingfang Deng
2025-04-17 7:57 ` Ard Biesheuvel
2025-04-17 8:42 ` Qingfang Deng
2025-04-17 14:15 ` Ard Biesheuvel
2025-04-17 14:39 ` Qingfang Deng
2025-04-17 16:58 ` Eric Biggers
2025-04-18 1:48 ` Qingfang Deng
2025-04-17 7:21 ` Herbert Xu
2025-04-17 17:09 ` Eric Biggers
2025-04-18 2:49 ` Qingfang Deng
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox