public inbox for linux-crypto@vger.kernel.org
* [PATCH V2] crypto: aegis128: Add RISC-V vector SIMD implementation
@ 2026-01-26  9:24 Chunyan Zhang
  2026-02-06 10:03 ` Herbert Xu
  0 siblings, 1 reply; 3+ messages in thread
From: Chunyan Zhang @ 2026-01-26  9:24 UTC (permalink / raw)
  To: Paul Walmsley, Palmer Dabbelt, Albert Ou, Alexandre Ghiti,
	Herbert Xu, David S . Miller
  Cc: linux-riscv, linux-crypto, linux-kernel, Chunyan Zhang

Add a RISC-V vector-accelerated implementation of aegis128 by
wiring it into the generic SIMD hooks.

This implementation supports vlen values of 512, 256, and 128.

Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
---
V2:
- Add a config dependency on RISCV_ISA_V to fix the issue reported by the kernel test robot;
- Add return values to preload_round_data() and aegis128_round().

V1: https://lore.kernel.org/all/20260121101923.64657-1-zhangchunyan@iscas.ac.cn/
---
 crypto/Kconfig              |   4 +-
 crypto/Makefile             |   4 +
 crypto/aegis-rvv.h          |  19 +
 crypto/aegis128-rvv-inner.c | 783 ++++++++++++++++++++++++++++++++++++
 crypto/aegis128-rvv.c       |  67 +++
 5 files changed, 875 insertions(+), 2 deletions(-)
 create mode 100644 crypto/aegis-rvv.h
 create mode 100644 crypto/aegis128-rvv-inner.c
 create mode 100644 crypto/aegis128-rvv.c

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 2e5b195b1b06..9766b3596049 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -777,8 +777,8 @@ config CRYPTO_AEGIS128
 	  AEGIS-128 AEAD algorithm
 
 config CRYPTO_AEGIS128_SIMD
-	bool "AEGIS-128 (arm NEON, arm64 NEON)"
-	depends on CRYPTO_AEGIS128 && ((ARM || ARM64) && KERNEL_MODE_NEON)
+	bool "AEGIS-128 (arm NEON, arm64 NEON, RISC-V vector)"
+	depends on CRYPTO_AEGIS128 && (((ARM || ARM64) && KERNEL_MODE_NEON) || (RISCV && RISCV_ISA_V))
 	default y
 	help
 	  AEGIS-128 AEAD algorithm
diff --git a/crypto/Makefile b/crypto/Makefile
index 16a35649dd91..3d94cae9eeba 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -121,6 +121,10 @@ endif
 # Enable <arm_neon.h>
 CFLAGS_aegis128-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include)
 
+ifeq ($(ARCH),riscv)
+aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-rvv.o aegis128-rvv-inner.o
+endif
+
 obj-$(CONFIG_CRYPTO_PCRYPT) += pcrypt.o
 obj-$(CONFIG_CRYPTO_CRYPTD) += cryptd.o
 obj-$(CONFIG_CRYPTO_DES) += des_generic.o
diff --git a/crypto/aegis-rvv.h b/crypto/aegis-rvv.h
new file mode 100644
index 000000000000..02bd646e4467
--- /dev/null
+++ b/crypto/aegis-rvv.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2026 Institute of Software, CAS
+ */
+
+#ifndef _AEGIS_RVV_H
+#define _AEGIS_RVV_H
+
+extern const u8 crypto_aes_sbox[];
+
+void crypto_aegis128_init_rvv(void *state, const void *key, const void *iv);
+void crypto_aegis128_update_rvv(void *state, const void *msg);
+void crypto_aegis128_encrypt_chunk_rvv(void *state, void *dst, const void *src,
+				       unsigned int size);
+void crypto_aegis128_decrypt_chunk_rvv(void *state, void *dst, const void *src,
+				       unsigned int size);
+int crypto_aegis128_final_rvv(void *state, void *tag_xor, unsigned int assoclen,
+			      unsigned int cryptlen, unsigned int authsize);
+#endif
diff --git a/crypto/aegis128-rvv-inner.c b/crypto/aegis128-rvv-inner.c
new file mode 100644
index 000000000000..2d7439769d77
--- /dev/null
+++ b/crypto/aegis128-rvv-inner.c
@@ -0,0 +1,783 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2026 Institute of Software, CAS
+ * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+ *
+ * Based on aegis128-neon-inner.c:
+ *	Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/vector.h>
+#include <linux/errno.h>
+#include <linux/printk.h>
+#include <linux/types.h>
+
+#include "aegis-rvv.h"
+#include "aegis.h"
+
+#define AEGIS128_STATE_BLOCKS 5
+#define RVV_VLEN	riscv_vector_vlen()
+
+typedef u8 aegis128_block_bytes[AEGIS_BLOCK_SIZE];
+struct aegis_state {
+	aegis128_block_bytes blocks[AEGIS128_STATE_BLOCKS];
+};
+
+/* Load all 256 S-box bytes with a single vle8.v into the register group starting at r0 */
+#define preload_sbox_1(m, r0) do {				\
+	unsigned long vl;					\
+	asm volatile (".option	push\n"				\
+		      ".option	arch,+v\n"			\
+		      "vsetvli	%0, x0, e8, "m", ta, ma\n"	\
+		      "vle8.v	"r0", (%1)\n"			\
+		      ".option	pop\n"				\
+		      : "=&r" (vl)				\
+		      :						\
+		      "r" (crypto_aes_sbox)			\
+	:);							\
+} while (0)
+
+/* Load the 256 S-box bytes with two vle8.v loads into the register groups starting at r0 and r1 */
+#define preload_sbox_2(m, r0, r1) do {				\
+	unsigned long vl;					\
+	asm volatile (".option	push\n"				\
+		      ".option	arch,+v\n"			\
+		      "vsetvli	%0, x0, e8, "m", ta, ma\n"	\
+		      "vle8.v	"r0", (%1)\n"			\
+		      "vle8.v	"r1", (%2)\n"			\
+		      ".option	pop\n"				\
+		      : "=&r" (vl)				\
+		      :						\
+		      "r" (crypto_aes_sbox),			\
+		      "r" (crypto_aes_sbox + 0x80)		\
+	:);							\
+} while (0)
+
+/* v16 - v31: crypto_aes_sbox[0-255] */
+#define preload_sbox_128() preload_sbox_2("m8", "v16", "v24")
+
+/* v16 - v23: crypto_aes_sbox[0-255] */
+#define preload_sbox_256() preload_sbox_1("m8", "v16")
+
+/* v16 - v19: crypto_aes_sbox[0-255] */
+#define preload_sbox_512() preload_sbox_1("m4", "v16")
+
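+/*
+ * Load the vrgather index tables into v13-v15 and the AES S-box into
+ * v16 and up; the round helpers below assume these registers stay live.
+ */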
+static __always_inline
+int preload_round_data(void)
+{
+	static const u8 rev32qu16[] = {
+		0x2, 0x3, 0x0, 0x1, 0x6, 0x7, 0x4, 0x5,
+		0xa, 0xb, 0x8, 0x9, 0xe, 0xf, 0xc, 0xd,
+	};
+
+	static const u8 shift_rows[] = {
+		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+	};
+
+	static const u8 ror32by8[] = {
+		0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+		0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+	};
+
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v13, (%[rev32qu16])\n"
+		      "vle8.v	v14, (%[shift_rows])\n"
+		      "vle8.v	v15, (%[ror32by8])\n"
+		      ".option	pop\n"
+		      : :
+		      [rev32qu16]"r"(rev32qu16),
+		      [shift_rows]"r"(shift_rows),
+		      [ror32by8]"r"(ror32by8)
+	:);
+
+	switch (RVV_VLEN) {
+	case 128:
+		preload_sbox_128();
+		break;
+	case 256:
+		preload_sbox_256();
+		break;
+	case 512:
+		preload_sbox_512();
+		break;
+	default:
+		pr_err("ERROR: %d is not a supported vector length!\n", RVV_VLEN);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
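+/*
+ * AEGIS128_ROUND_PART1/PART3 bracket the vlen-specific S-box gathers:
+ * PART1 applies the AES ShiftRows permutation to the input block, and
+ * PART3 applies MixColumns via shift/xor arithmetic and xors the
+ * result into *dst.
+ */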
+#define AEGIS128_ROUND_PART1				\
+	".option	push\n"				\
+	".option	arch,+v\n"			\
+	"vsetivli	zero, 0x10, e8, m1, ta, ma\n"	\
+	/* s = vqtbl1q_u8(s, vld1q_u8(shift_rows)) */	\
+	"vle8.v		v0, (%[s])\n"			\
+	"vrgather.vv	v1, v0, v14\n" /* v14: shift_rows */
+
+#define AEGIS128_ROUND_PART3						\
+	/* s = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b) */	\
+	"vsetivli	zero, 0x10, e8, m1, ta, ma\n"			\
+	"vsra.vi	v3, v2, 7\n" /* ((int8x16_t)v >> 7) */		\
+	"vand.vx	v3, v3, %[x1b]\n"				\
+	"vsll.vi	v4, v2, 1\n" /* (v << 1) */			\
+	"vxor.vv	v3, v4, v3\n"					\
+	/* s ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v) */		\
+	"vrgather.vv	v4, v2, v13\n" /* v13: rev32qu16 */		\
+	"vxor.vv	v3, v3, v4\n"					\
+	/* s ^= vqtbl1q_u8(v ^ s, vld1q_u8(ror32by8)); */		\
+	"vxor.vv	v4, v3, v2\n" /* v ^ s */			\
+	"vrgather.vv	v5, v4, v15\n" /* v15: ror32by8 */		\
+	"vxor.vv	v3, v3, v5\n"					\
+	"vle8.v		v4, (%[d])\n"					\
+	"vxor.vv	v3, v3, v4\n" /* dst ^= v3 */			\
+	"vse8.v		v3, (%[d])\n"					\
+	".option	pop\n"
+
+/*
+ * v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + step), s - step);
+ * r: the vector register holding this step's slice of the S-box
+ */
+#define gather_sbox(r)				\
+	"vsub.vx	v1, v1, %[step]\n"	\
+	"vrgather.vv	v3, "r", v1\n"		\
+	"vor.vv		v2, v2, v3\n"
+
+static __always_inline
+void aegis128_round_128(u8 *dst, const u8 *src)
+{
+	unsigned long vl;
+
+	/* v16 - v31: crypto_aes_sbox[0-255] */
+	asm volatile (AEGIS128_ROUND_PART1
+		      /* v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), s); */
+		      "vsetvli		%0, x0, e8, m1, ta, ma\n"
+		      "vrgather.vv	v2, v16, v1\n"
+		      /* v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x10), s - 0x10); */
+		      gather_sbox("v17")
+		      gather_sbox("v18")
+		      gather_sbox("v19")
+		      gather_sbox("v20")
+		      gather_sbox("v21")
+		      gather_sbox("v22")
+		      gather_sbox("v23")
+		      gather_sbox("v24")
+		      gather_sbox("v25")
+		      gather_sbox("v26")
+		      gather_sbox("v27")
+		      gather_sbox("v28")
+		      gather_sbox("v29")
+		      gather_sbox("v30")
+		      gather_sbox("v31")
+		      AEGIS128_ROUND_PART3
+		      : "=&r" (vl) :
+		      [s]"r"(src),
+		      [step]"r"(0x10),
+		      [x1b]"r"(0x1b),
+		      [d]"r"(dst)
+		      : "memory"
+	);
+}
+
+static __always_inline
+void aegis128_round_256(u8 *dst, const u8 *src)
+{
+	unsigned long vl;
+
+	/* v16 - v23: crypto_aes_sbox[0-255] */
+	asm volatile (AEGIS128_ROUND_PART1
+		      /* v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), s); */
+		      "vsetvli		%0, x0, e8, m1, ta, ma\n"
+		      "vrgather.vv	v2, v16, v1\n"
+		      /* v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x20), s - 0x20); */
+		      gather_sbox("v17")
+		      gather_sbox("v18")
+		      gather_sbox("v19")
+		      gather_sbox("v20")
+		      gather_sbox("v21")
+		      gather_sbox("v22")
+		      gather_sbox("v23")
+		      AEGIS128_ROUND_PART3
+		      : "=&r" (vl) :
+		      [s]"r"(src),
+		      [step]"r"(0x20),
+		      [x1b]"r"(0x1b),
+		      [d]"r"(dst)
+		      : "memory"
+	);
+}
+
+static __always_inline
+void aegis128_round_512(u8 *dst, const u8 *src)
+{
+	unsigned long vl;
+
+	/* v16 - v19: crypto_aes_sbox[0-255] */
+	asm volatile (AEGIS128_ROUND_PART1
+		      /* v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), s); */
+		      "vsetvli		%0, x0, e8, m1, ta, ma\n"
+		      "vrgather.vv	v2, v16, v1\n"
+		      /* v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), s - 0x40); */
+		      gather_sbox("v17")
+		      /* v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), s - 0x80); */
+		      gather_sbox("v18")
+		      /* v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), s - 0xc0); */
+		      gather_sbox("v19")
+		      AEGIS128_ROUND_PART3
+		      : "=&r" (vl) :
+		      [s]"r"(src),
+		      [step]"r"(0x40),
+		      [x1b]"r"(0x1b),
+		      [d]"r"(dst)
+		      : "memory"
+	);
+}
+
+static __always_inline
+int aegis128_round(u8 *dst, const u8 *src)
+{
+	switch (RVV_VLEN) {
+	case 128:
+		aegis128_round_128(dst, src);
+		break;
+	case 256:
+		aegis128_round_256(dst, src);
+		break;
+	case 512:
+		aegis128_round_512(dst, src);
+		break;
+	default:
+		pr_err("ERROR: %d is not a supported vector length!\n", RVV_VLEN);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
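+/*
+ * One AEGIS-128 state update: each state block is xored with the keyless
+ * AES round of its predecessor, and the input block is folded into S[0].
+ */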
+static __always_inline
+void aegis128_update_rvv(struct aegis_state *state, const void *key)
+{
+	u8 k[AEGIS_BLOCK_SIZE];
+
+	/* copy the 16-byte input block into k[] */
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v1, (%[key])\n"
+		      "vse8.v	v1, (%[k])\n"
+		      ".option	pop\n"
+		      : :
+		      [key]"r"(key),
+		      [k]"r"(k)
+	:);
+
+	aegis128_round(k, state->blocks[4]);
+	aegis128_round(state->blocks[4], state->blocks[3]);
+	aegis128_round(state->blocks[3], state->blocks[2]);
+	aegis128_round(state->blocks[2], state->blocks[1]);
+	aegis128_round(state->blocks[1], state->blocks[0]);
+
+	/* state->blocks[0] ^= key */
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v1, (%[k])\n"
+		      "vle8.v	v2, (%[block0])\n"
+		      "vxor.vv	v2, v2, v1\n"
+		      "vse8.v	v2, (%[block0])\n"
+		      ".option	pop\n"
+		      : :
+		      [k]"r"(k),
+		      [block0]"r"(state->blocks[0])
+	:);
+}
+
+void crypto_aegis128_init_rvv(void *state, const void *key, const void *iv)
+{
+	struct aegis_state *st = state;
+	u8 kiv[AEGIS_BLOCK_SIZE];
+
+	static const u8 const0[] = {
+		0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d,
+		0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62,
+	};
+	static const u8 const1[] = {
+		0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1,
+		0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd,
+	};
+
+	/*
+	 * kiv = key ^ iv
+	 * struct aegis128_state st = {{
+	 *	kiv,
+	 *	vld1q_u8(const1),
+	 *	vld1q_u8(const0),
+	 *	key ^ vld1q_u8(const0),
+	 *	key ^ vld1q_u8(const1),
+	 * }};
+	 */
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v0, (%[const0])\n"
+		      "vle8.v	v1, (%[const1])\n"
+		      "vse8.v	v0, (%[block2])\n"
+		      "vse8.v	v1, (%[block1])\n"
+		      "vle8.v	v2, (%[iv])\n"
+		      "vle8.v	v3, (%[key])\n"
+		      "vxor.vv	v0, v0, v3\n"
+		      "vxor.vv	v1, v1, v3\n"
+		      "vxor.vv	v2, v2, v3\n"
+		      "vse8.v	v2, (%[block0])\n"
+		      "vse8.v	v2, (%[kiv])\n"
+		      "vse8.v	v0, (%[block3])\n"
+		      "vse8.v	v1, (%[block4])\n"
+		      ".option	pop\n"
+		      : :
+		      [const0]"r"(const0),
+		      [const1]"r"(const1),
+		      [block1]"r"(st->blocks[1]),
+		      [block2]"r"(st->blocks[2]),
+		      [iv]"r"(iv),
+		      [key]"r"(key),
+		      [block0]"r"(st->blocks[0]),
+		      [kiv]"r"(kiv),
+		      [block3]"r"(st->blocks[3]),
+		      [block4]"r"(st->blocks[4])
+	:);
+
+	if (preload_round_data())
+		return;
+
+	for (int i = 0; i < 5; i++) {
+		aegis128_update_rvv(st, key);
+		aegis128_update_rvv(st, kiv);
+	}
+}
+
+void crypto_aegis128_update_rvv(void *state, const void *msg)
+{
+	struct aegis_state *st = state;
+
+	if (preload_round_data())
+		return;
+
+	aegis128_update_rvv(st, msg);
+}
+
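+/*
+ * vrgather index table used to shift partial blocks: the 0xff (-1)
+ * entries are out-of-range indices, which vrgather.vv maps to zero
+ * bytes, mirroring the behaviour of NEON's vqtbl1q_u8.
+ */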
+static const u8 permute[] __aligned(64) = {
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+void crypto_aegis128_encrypt_chunk_rvv(void *state, void *dst, const void *src,
+				       unsigned int size)
+{
+	struct aegis_state *st = state;
+	const int short_input = size < AEGIS_BLOCK_SIZE;
+	u8 s[AEGIS_BLOCK_SIZE];
+	u8 msg[AEGIS_BLOCK_SIZE];
+
+	if (preload_round_data())
+		return;
+
+	while (size >= AEGIS_BLOCK_SIZE) {
+		/* s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; */
+		asm volatile (".option	push\n"
+			      ".option	arch,+v\n"
+			      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v	v1, (%[block1])\n"
+			      "vle8.v	v2, (%[block2])\n"
+			      "vle8.v	v3, (%[block3])\n"
+			      "vle8.v	v4, (%[block4])\n"
+			      "vxor.vv	v1, v1, v4\n"
+			      "vand.vv	v2, v2, v3\n"
+			      "vxor.vv	v1, v1, v2\n"
+			      "vse8.v	v1, (%[s])\n"
+			      ".option	pop\n"
+			      : :
+			      [block1]"r"(st->blocks[1]),
+			      [block2]"r"(st->blocks[2]),
+			      [block3]"r"(st->blocks[3]),
+			      [block4]"r"(st->blocks[4]),
+			      [s]"r"(s)
+		:);
+
+		aegis128_update_rvv(st, src);
+		/* dst = s ^ src */
+		asm volatile (".option	push\n"
+			      ".option	arch,+v\n"
+			      "vle8.v	v1, (%[s])\n"
+			      "vle8.v	v2, (%[src])\n"
+			      "vxor.vv	v1, v1, v2\n"
+			      "vse8.v	v1, (%[dst])\n"
+			      "vse8.v	v1, (%[msg])\n"
+			      ".option	pop\n"
+			      : :
+			      [s]"r"(s),
+			      [src]"r"(src),
+			      [dst]"r"(dst),
+			      [msg]"r"(msg)
+		:);
+
+		size -= AEGIS_BLOCK_SIZE;
+		src += AEGIS_BLOCK_SIZE;
+		dst += AEGIS_BLOCK_SIZE;
+	}
+
+	if (size > 0) {
+		u8 buf[AEGIS_BLOCK_SIZE];
+		const void *in = src;
+		void *out = dst;
+		u8 m[AEGIS_BLOCK_SIZE];
+
+		/* s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; */
+		asm volatile (".option	push\n"
+			      ".option	arch,+v\n"
+			      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v	v1, (%[block1])\n"
+			      "vle8.v	v2, (%[block2])\n"
+			      "vle8.v	v3, (%[block3])\n"
+			      "vle8.v	v4, (%[block4])\n"
+			      "vxor.vv	v1, v1, v4\n" /* st.v[1] ^ st.v[4] */
+			      "vand.vv	v2, v2, v3\n" /* st.v[2] & st.v[3] */
+			      "vxor.vv	v1, v1, v2\n"
+			      "vse8.v	v1, (%[s])\n"
+			      ".option	pop\n"
+			      : :
+			      [block1]"r"(st->blocks[1]),
+			      [block2]"r"(st->blocks[2]),
+			      [block3]"r"(st->blocks[3]),
+			      [block4]"r"(st->blocks[4]),
+			      [s]"r"(s)
+		:);
+
+		if (__builtin_expect(short_input, 0))
+			in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
+
+		/*
+		 * m = vqtbl1q_u8(vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
+		 *		  vld1q_u8(permute + 32 - size));
+		 */
+		asm volatile (".option		push\n"
+			      ".option		arch,+v\n"
+			      "vle8.v		v1, (%[in])\n"
+			      "vle8.v		v2, (%[p])\n"
+			      "vrgather.vv	v3, v1, v2\n"
+			      "vse8.v		v3, (%[m])\n"
+			      ".option		pop\n"
+			      : :
+			      [in]"r"(in + size - AEGIS_BLOCK_SIZE),
+			      [p]"r"(permute + 32 - size),
+			      [m]"r"(m)
+		:);
+
+		aegis128_update_rvv(st, m);
+
+		/*
+		 * vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
+		 *			vqtbl1q_u8(m ^ s, vld1q_u8(permute + size)));
+		 */
+		asm volatile (".option		push\n"
+			      ".option		arch,+v\n"
+			      "vsetivli		zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v		v1, (%[m])\n"
+			      "vle8.v		v2, (%[s])\n"
+			      "vxor.vv		v1, v1, v2\n"
+			      "vle8.v		v2, (%[p])\n"
+			      "vrgather.vv	v3, v1, v2\n"
+			      "vse8.v		v3, (%[out])\n"
+			      ".option		pop\n"
+			      : :
+			      [m]"r"(m),
+			      [s]"r"(s),
+			      [p]"r"(permute + size),
+			      [out]"r"(out + size - AEGIS_BLOCK_SIZE)
+		:);
+
+		if (__builtin_expect(short_input, 0)) {
+			memcpy(dst, out, size);
+		} else {
+			/* vst1q_u8(out - AEGIS_BLOCK_SIZE, msg); */
+			asm volatile (".option	push\n"
+				      ".option	arch,+v\n"
+				      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+				      "vle8.v	v1, (%[msg])\n"
+				      "vse8.v	v1, (%[out])\n"
+				      ".option	pop\n"
+				      : :
+				      [msg]"r"(msg),
+				      [out]"r"(out - AEGIS_BLOCK_SIZE)
+			:);
+		}
+	}
+}
+
+void crypto_aegis128_decrypt_chunk_rvv(void *state, void *dst, const void *src,
+				       unsigned int size)
+{
+	struct aegis_state *st = state;
+	const int short_input = size < AEGIS_BLOCK_SIZE;
+	u8 s[AEGIS_BLOCK_SIZE];
+	u8 msg[AEGIS_BLOCK_SIZE];
+
+	if (preload_round_data())
+		return;
+
+	while (size >= AEGIS_BLOCK_SIZE) {
+		/* s = vld1q_u8(src) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; */
+		asm volatile (".option	push\n"
+			      ".option	arch,+v\n"
+			      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v	v1, (%[block1])\n"
+			      "vle8.v	v2, (%[block2])\n"
+			      "vle8.v	v3, (%[block3])\n"
+			      "vle8.v	v4, (%[block4])\n"
+			      "vle8.v	v5, (%[src])\n"
+			      "vxor.vv	v1, v1, v4\n"
+			      "vand.vv	v2, v2, v3\n"
+			      "vxor.vv	v1, v1, v2\n"
+			      "vxor.vv	v1, v1, v5\n"
+			      "vse8.v	v1, (%[msg])\n"
+			      ".option	pop\n"
+			      : :
+			      [block1]"r"(st->blocks[1]),
+			      [block2]"r"(st->blocks[2]),
+			      [block3]"r"(st->blocks[3]),
+			      [block4]"r"(st->blocks[4]),
+			      [src]"r"(src),
+			      [msg]"r"(msg)
+		:);
+
+		aegis128_update_rvv(st, msg);
+
+		asm volatile (".option	push\n"
+			      ".option	arch,+v\n"
+			      "vle8.v	v1, (%[msg])\n"
+			      "vse8.v	v1, (%[dst])\n"
+			      ".option	pop\n"
+			      : :
+			      [msg]"r"(msg),
+			      [dst]"r"(dst)
+		:);
+
+		size -= AEGIS_BLOCK_SIZE;
+		src += AEGIS_BLOCK_SIZE;
+		dst += AEGIS_BLOCK_SIZE;
+	}
+
+	if (size > 0) {
+		u8 buf[AEGIS_BLOCK_SIZE];
+		const void *in = src;
+		void *out = dst;
+		u8 m[AEGIS_BLOCK_SIZE];
+
+		/* s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; */
+		asm volatile (".option	push\n"
+			      ".option	arch,+v\n"
+			      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v	v1, (%[block1])\n"
+			      "vle8.v	v2, (%[block2])\n"
+			      "vle8.v	v3, (%[block3])\n"
+			      "vle8.v	v4, (%[block4])\n"
+			      "vxor.vv	v1, v1, v4\n"
+			      "vand.vv	v2, v2, v3\n"
+			      "vxor.vv	v1, v1, v2\n"
+			      "vse8.v	v1, (%[s])\n"
+			      ".option	pop\n"
+			      : :
+			      [block1]"r"(st->blocks[1]),
+			      [block2]"r"(st->blocks[2]),
+			      [block3]"r"(st->blocks[3]),
+			      [block4]"r"(st->blocks[4]),
+			      [s]"r"(s)
+		:);
+
+		if (__builtin_expect(short_input, 0))
+			in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
+
+		/*
+		 * m = s ^ vqtbx1q_u8(s, vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
+		 *		      vld1q_u8(permute + 32 - size));
+		 */
+		asm volatile (".option		push\n"
+			      ".option		arch,+v\n"
+			      "vle8.v		v1, (%[in])\n"
+			      "vle8.v		v2, (%[p])\n"
+			      "vrgather.vv	v3, v1, v2\n"
+			      "vle8.v		v4, (%[s])\n"
+			      "vmsltu.vx	v0, v2, %[x10]\n" /* set mask where index < 0x10 */
+			      "vmerge.vvm	v3, v4, v3, v0\n"
+			      "vxor.vv		v3, v4, v3\n"
+			      "vse8.v		v3, (%[m])\n"
+			      ".option		pop\n"
+			      : :
+			      [in]"r"(in + size - AEGIS_BLOCK_SIZE),
+			      [p]"r"(permute + 32 - size),
+			      [s]"r"(s),
+			      [x10]"r"(0x10),
+			      [m]"r"(m)
+		:);
+
+		aegis128_update_rvv(st, m);
+
+		/*
+		 * vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
+		 *	    vqtbl1q_u8(m, vld1q_u8(permute + size)));
+		 */
+		asm volatile (".option		push\n"
+			      ".option		arch,+v\n"
+			      "vsetivli		zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v		v1, (%[m])\n"
+			      "vle8.v		v2, (%[p])\n"
+			      "vrgather.vv	v3, v1, v2\n"
+			      "vse8.v		v3, (%[out])\n"
+			      ".option		pop\n"
+			      : :
+			      [m]"r"(m),
+			      [p]"r"(permute + size),
+			      [out]"r"(out + size - AEGIS_BLOCK_SIZE)
+		:);
+
+		if (__builtin_expect(short_input, 0)) {
+			memcpy(dst, out, size);
+		} else {
+			/* vst1q_u8(out - AEGIS_BLOCK_SIZE, msg); */
+			asm volatile (".option	push\n"
+				      ".option	arch,+v\n"
+				      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+				      "vle8.v	v1, (%[msg])\n"
+				      "vse8.v	v1, (%[out])\n"
+				      ".option	pop\n"
+				      : :
+				      [msg]"r"(msg),
+				      [out]"r"(out - AEGIS_BLOCK_SIZE)
+			:);
+		}
+	}
+}
+
+int crypto_aegis128_final_rvv(void *state, void *tag_xor, unsigned int assoclen,
+			      unsigned int cryptlen, unsigned int authsize)
+{
+	struct aegis_state *st = state;
+	u64 v[2];
+	int i;
+	int ret;
+
+	ret = preload_round_data();
+	if (ret)
+		return ret;
+
+	/*
+	 * v = st.v[3] ^ (uint8x16_t)vcombine_u64(vmov_n_u64(8ULL * assoclen),
+	 *					   vmov_n_u64(8ULL * cryptlen));
+	 */
+	v[0] = 8ULL * assoclen;
+	v[1] = 8ULL * cryptlen;
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v0, (%[v])\n"
+		      "vle8.v	v1, (%[block3])\n"
+		      "vxor.vv	v0, v0, v1\n"
+		      "vse8.v	v0, (%[v])\n"
+		      ".option	pop\n"
+		      : :
+		      [v]"r"(v),
+		      [block3]"r"(st->blocks[3])
+	:);
+
+	for (i = 0; i < 7; i++)
+		aegis128_update_rvv(st, v);
+
+	/* v = st.v[0] ^ st.v[1] ^ st.v[2] ^ st.v[3] ^ st.v[4]; */
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v0, (%[block0])\n"
+		      "vle8.v	v1, (%[block1])\n"
+		      "vle8.v	v2, (%[block2])\n"
+		      "vle8.v	v3, (%[block3])\n"
+		      "vle8.v	v4, (%[block4])\n"
+		      "vxor.vv	v0, v0, v1\n"
+		      "vxor.vv	v2, v2, v3\n"
+		      "vxor.vv	v0, v0, v2\n"
+		      "vxor.vv	v0, v0, v4\n"
+		      "vse8.v	v0, (%[v])\n"
+		      ".option	pop\n"
+		      : :
+		      [block0]"r"(st->blocks[0]),
+		      [block1]"r"(st->blocks[1]),
+		      [block2]"r"(st->blocks[2]),
+		      [block3]"r"(st->blocks[3]),
+		      [block4]"r"(st->blocks[4]),
+		      [v]"r"(v)
+	:);
+
+	if (authsize > 0) {
+		/*
+		 * v = vqtbl1q_u8(~vceqq_u8(v, vld1q_u8(tag_xor)),
+		 *			    vld1q_u8(permute + authsize));
+		 */
+		asm volatile (".option		push\n"
+			      ".option		arch,+v\n"
+			      "vsetivli		zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v		v0, (%[v])\n"
+			      "vle8.v		v1, (%[tag_xor])\n"
+			      "vmseq.vv		v0, v0, v1\n" /* vceqq_u8(v, vld1q_u8(tag_xor)) */
+			      "vmv.v.i		v1, 0\n" /* set v1 = 0 */
+			      "vmerge.vxm	v1, v1, %[xff], v0\n"
+			      "vxor.vi		v1, v1, -1\n" /* bitwise NOT of v1 */
+			      "vle8.v		v0, (%[pa])\n"
+			      "vrgather.vv	v2, v1, v0\n"
+			      "vredmin.vs	v2, v2, v2\n" /* vminvq_s8((int8x16_t)v) */
+			      "vse8.v		v2, (%[v])\n"
+			      ".option		pop\n"
+			      : :
+			      [v]"r"(v),
+			      [tag_xor]"r"(tag_xor),
+			      [xff]"r"(0xff),
+			      [pa]"r"(permute + authsize)
+		:);
+
+		return *((s8 *)v);
+	}
+
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v0, (%[v])\n"
+		      "vse8.v	v0, (%[tag_xor])\n"
+		      ".option	pop\n"
+		      : :
+		      [v]"r"(v),
+		      [tag_xor]"r"(tag_xor)
+	:);
+
+	return 0;
+}
diff --git a/crypto/aegis128-rvv.c b/crypto/aegis128-rvv.c
new file mode 100644
index 000000000000..5a6647722d82
--- /dev/null
+++ b/crypto/aegis128-rvv.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2026 Institute of Software, CAS
+ * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+ */
+
+#include <asm/vector.h>
+
+#include "aegis.h"
+#include "aegis-rvv.h"
+
+bool crypto_aegis128_have_simd(void)
+{
+	return IS_ENABLED(CONFIG_RISCV_ISA_V) && has_vector();
+}
+
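+/*
+ * Thin wrappers around the RVV routines: kernel_vector_begin() and
+ * kernel_vector_end() bracket all in-kernel use of the vector unit.
+ */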
+void crypto_aegis128_init_simd(struct aegis_state *state,
+			       const union aegis_block *key,
+			       const u8 *iv)
+{
+	kernel_vector_begin();
+	crypto_aegis128_init_rvv(state, key, iv);
+	kernel_vector_end();
+}
+
+void crypto_aegis128_update_simd(struct aegis_state *state, const void *msg)
+{
+	kernel_vector_begin();
+	crypto_aegis128_update_rvv(state, msg);
+	kernel_vector_end();
+}
+
+void crypto_aegis128_encrypt_chunk_simd(struct aegis_state *state, u8 *dst,
+					const u8 *src, unsigned int size)
+{
+	kernel_vector_begin();
+	crypto_aegis128_encrypt_chunk_rvv(state, dst, src, size);
+	kernel_vector_end();
+}
+
+void crypto_aegis128_decrypt_chunk_simd(struct aegis_state *state, u8 *dst,
+					const u8 *src, unsigned int size)
+{
+	kernel_vector_begin();
+	crypto_aegis128_decrypt_chunk_rvv(state, dst, src, size);
+	kernel_vector_end();
+}
+
+int crypto_aegis128_final_simd(struct aegis_state *state,
+			       union aegis_block *tag_xor,
+			       unsigned int assoclen,
+			       unsigned int cryptlen,
+			       unsigned int authsize)
+{
+	int ret;
+
+	kernel_vector_begin();
+	ret = crypto_aegis128_final_rvv(state, tag_xor, assoclen, cryptlen,
+					authsize);
+	kernel_vector_end();
+
+	return ret;
+}
-- 
2.34.1



* Re: [PATCH V2] crypto: aegis128: Add RISC-V vector SIMD implementation
  2026-01-26  9:24 [PATCH V2] crypto: aegis128: Add RISC-V vector SIMD implementation Chunyan Zhang
@ 2026-02-06 10:03 ` Herbert Xu
  2026-02-13  0:13   ` Eric Biggers
  0 siblings, 1 reply; 3+ messages in thread
From: Herbert Xu @ 2026-02-06 10:03 UTC (permalink / raw)
  To: Chunyan Zhang
  Cc: Paul Walmsley, Palmer Dabbelt, Albert Ou, Alexandre Ghiti,
	David S . Miller, linux-riscv, linux-crypto, linux-kernel,
	Chunyan Zhang, Eric Biggers

On Mon, Jan 26, 2026 at 05:24:11PM +0800, Chunyan Zhang wrote:
> Add a RISC-V vector-accelerated implementation of aegis128 by
> wiring it into the generic SIMD hooks.
> 
> This implementation supports vlen values of 512, 256, and 128.
> 
> Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
> ---
> V2:
> - Add a config dependency on RISCV_ISA_V to fix the issue reported by the kernel test robot;
> - Add return values to preload_round_data() and aegis128_round().
> 
> V1: https://lore.kernel.org/all/20260121101923.64657-1-zhangchunyan@iscas.ac.cn/
> ---
>  crypto/Kconfig              |   4 +-
>  crypto/Makefile             |   4 +
>  crypto/aegis-rvv.h          |  19 +
>  crypto/aegis128-rvv-inner.c | 783 ++++++++++++++++++++++++++++++++++++
>  crypto/aegis128-rvv.c       |  67 +++
>  5 files changed, 875 insertions(+), 2 deletions(-)
>  create mode 100644 crypto/aegis-rvv.h
>  create mode 100644 crypto/aegis128-rvv-inner.c
>  create mode 100644 crypto/aegis128-rvv.c

In light of the recent move of aes from crypto to lib/crypto,
perhaps the same should be done for aegis?

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH V2] crypto: aegis128: Add RISC-V vector SIMD implementation
  2026-02-06 10:03 ` Herbert Xu
@ 2026-02-13  0:13   ` Eric Biggers
  0 siblings, 0 replies; 3+ messages in thread
From: Eric Biggers @ 2026-02-13  0:13 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Chunyan Zhang, Paul Walmsley, Palmer Dabbelt, Albert Ou,
	Alexandre Ghiti, David S . Miller, linux-riscv, linux-crypto,
	linux-kernel, Chunyan Zhang

On Fri, Feb 06, 2026 at 06:03:08PM +0800, Herbert Xu wrote:
> On Mon, Jan 26, 2026 at 05:24:11PM +0800, Chunyan Zhang wrote:
> > Add a RISC-V vector-accelerated implementation of aegis128 by
> > wiring it into the generic SIMD hooks.
> > 
> > This implementation supports vlen values of 512, 256, and 128.
> > 
> > Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
> > ---
> > V2:
> > - Add a config dependency on RISCV_ISA_V to fix the issue reported by the kernel test robot;
> > - Add return values to preload_round_data() and aegis128_round().
> > 
> > V1: https://lore.kernel.org/all/20260121101923.64657-1-zhangchunyan@iscas.ac.cn/
> > ---
> >  crypto/Kconfig              |   4 +-
> >  crypto/Makefile             |   4 +
> >  crypto/aegis-rvv.h          |  19 +
> >  crypto/aegis128-rvv-inner.c | 783 ++++++++++++++++++++++++++++++++++++
> >  crypto/aegis128-rvv.c       |  67 +++
> >  5 files changed, 875 insertions(+), 2 deletions(-)
> >  create mode 100644 crypto/aegis-rvv.h
> >  create mode 100644 crypto/aegis128-rvv-inner.c
> >  create mode 100644 crypto/aegis128-rvv.c
> 
> In light of the recent move of aes from crypto to lib/crypto,
> perhaps the same should be done for aegis?

Yes, I'll be focusing on AES modes next, but it will make sense to move
AEGIS too.

Regardless of that though, this patch needs a proper review.  I'll try
to find time, but maybe others in the RISC-V community can help too.

- Eric

