From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, Ard Biesheuvel <ardb@kernel.org>,
"Jason A . Donenfeld" <Jason@zx2c4.com>,
Herbert Xu <herbert@gondor.apana.org.au>,
linux-arm-kernel@lists.infradead.org,
linuxppc-dev@lists.ozlabs.org, linux-riscv@lists.infradead.org,
linux-s390@vger.kernel.org, x86@kernel.org,
Eric Biggers <ebiggers@kernel.org>
Subject: [PATCH 06/19] crypto: arm/ghash - Move NEON GHASH assembly into its own file
Date: Wed, 18 Mar 2026 23:17:07 -0700 [thread overview]
Message-ID: <20260319061723.1140720-7-ebiggers@kernel.org> (raw)
In-Reply-To: <20260319061723.1140720-1-ebiggers@kernel.org>
arch/arm/crypto/ghash-ce-core.S implements pmull_ghash_update_p8(),
which is used only by a crypto_shash implementation of GHASH. It also
implements pmull_ghash_update_p64() and several other functions, which
are used only by a crypto_aead implementation of AES-GCM.
While some code is shared between pmull_ghash_update_p8() and
pmull_ghash_update_p64(), it's not very much. Since
pmull_ghash_update_p8() will also need to be migrated into lib/crypto/
to achieve parity in the standalone GHASH support, let's move it into a
separate file ghash-neon-core.S.
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
arch/arm/crypto/Makefile | 2 +-
arch/arm/crypto/ghash-ce-core.S | 171 ++----------------------
arch/arm/crypto/ghash-neon-core.S | 207 ++++++++++++++++++++++++++++++
3 files changed, 222 insertions(+), 158 deletions(-)
create mode 100644 arch/arm/crypto/ghash-neon-core.S
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index e73099e120b3..cedce94d5ee5 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -8,6 +8,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
-ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
+ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o ghash-neon-core.o
diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S
index 858c0d66798b..a449525d61f8 100644
--- a/arch/arm/crypto/ghash-ce-core.S
+++ b/arch/arm/crypto/ghash-ce-core.S
@@ -1,8 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
+ * Accelerated AES-GCM implementation with ARMv8 Crypto Extensions.
*
* Copyright (C) 2015 - 2017 Linaro Ltd.
* Copyright (C) 2023 Google LLC. <ardb@google.com>
*/
@@ -27,43 +27,14 @@
XL_H .req d5
XM_L .req d6
XM_H .req d7
XH_L .req d8
- t0l .req d10
- t0h .req d11
- t1l .req d12
- t1h .req d13
- t2l .req d14
- t2h .req d15
- t3l .req d16
- t3h .req d17
- t4l .req d18
- t4h .req d19
-
- t0q .req q5
- t1q .req q6
- t2q .req q7
- t3q .req q8
- t4q .req q9
XH2 .req q9
- s1l .req d20
- s1h .req d21
- s2l .req d22
- s2h .req d23
- s3l .req d24
- s3h .req d25
- s4l .req d26
- s4h .req d27
-
MASK .req d28
- SHASH2_p8 .req d28
- k16 .req d29
- k32 .req d30
- k48 .req d31
SHASH2_p64 .req d31
HH .req q10
HH3 .req q11
HH4 .req q12
@@ -91,76 +62,10 @@
T3_L .req d16
T3_H .req d17
.text
- .macro __pmull_p64, rd, rn, rm, b1, b2, b3, b4
- vmull.p64 \rd, \rn, \rm
- .endm
-
- /*
- * This implementation of 64x64 -> 128 bit polynomial multiplication
- * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
- * "Fast Software Polynomial Multiplication on ARM Processors Using
- * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
- * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
- *
- * It has been slightly tweaked for in-order performance, and to allow
- * 'rq' to overlap with 'ad' or 'bd'.
- */
- .macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
- vext.8 t0l, \ad, \ad, #1 @ A1
- .ifc \b1, t4l
- vext.8 t4l, \bd, \bd, #1 @ B1
- .endif
- vmull.p8 t0q, t0l, \bd @ F = A1*B
- vext.8 t1l, \ad, \ad, #2 @ A2
- vmull.p8 t4q, \ad, \b1 @ E = A*B1
- .ifc \b2, t3l
- vext.8 t3l, \bd, \bd, #2 @ B2
- .endif
- vmull.p8 t1q, t1l, \bd @ H = A2*B
- vext.8 t2l, \ad, \ad, #3 @ A3
- vmull.p8 t3q, \ad, \b2 @ G = A*B2
- veor t0q, t0q, t4q @ L = E + F
- .ifc \b3, t4l
- vext.8 t4l, \bd, \bd, #3 @ B3
- .endif
- vmull.p8 t2q, t2l, \bd @ J = A3*B
- veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8
- veor t1q, t1q, t3q @ M = G + H
- .ifc \b4, t3l
- vext.8 t3l, \bd, \bd, #4 @ B4
- .endif
- vmull.p8 t4q, \ad, \b3 @ I = A*B3
- veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16
- vmull.p8 t3q, \ad, \b4 @ K = A*B4
- vand t0h, t0h, k48
- vand t1h, t1h, k32
- veor t2q, t2q, t4q @ N = I + J
- veor t0l, t0l, t0h
- veor t1l, t1l, t1h
- veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24
- vand t2h, t2h, k16
- veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32
- vmov.i64 t3h, #0
- vext.8 t0q, t0q, t0q, #15
- veor t2l, t2l, t2h
- vext.8 t1q, t1q, t1q, #14
- vmull.p8 \rq, \ad, \bd @ D = A*B
- vext.8 t2q, t2q, t2q, #13
- vext.8 t3q, t3q, t3q, #12
- veor t0q, t0q, t1q
- veor t2q, t2q, t3q
- veor \rq, \rq, t0q
- veor \rq, \rq, t2q
- .endm
-
- //
- // PMULL (64x64->128) based reduction for CPUs that can do
- // it in a single instruction.
- //
.macro __pmull_reduce_p64
vmull.p64 T1, XL_L, MASK
veor XH_L, XH_L, XM_H
vext.8 T1, T1, T1, #8
@@ -168,34 +73,11 @@
veor T1, T1, XL
vmull.p64 XL, T1_H, MASK
.endm
- //
- // Alternative reduction for CPUs that lack support for the
- // 64x64->128 PMULL instruction
- //
- .macro __pmull_reduce_p8
- veor XL_H, XL_H, XM_L
- veor XH_L, XH_L, XM_H
-
- vshl.i64 T1, XL, #57
- vshl.i64 T2, XL, #62
- veor T1, T1, T2
- vshl.i64 T2, XL, #63
- veor T1, T1, T2
- veor XL_H, XL_H, T1_L
- veor XH_L, XH_L, T1_H
-
- vshr.u64 T1, XL, #1
- veor XH, XH, XL
- veor XL, XL, T1
- vshr.u64 T1, T1, #6
- vshr.u64 XL, XL, #1
- .endm
-
- .macro ghash_update, pn, enc, aggregate=1, head=1
+ .macro ghash_update, enc, aggregate=1, head=1
vld1.64 {XL}, [r1]
.if \head
/* do the head block first, if supplied */
ldr ip, [sp]
@@ -204,12 +86,11 @@
vld1.64 {T1}, [ip]
teq r0, #0
b 3f
.endif
-0: .ifc \pn, p64
- .if \aggregate
+0: .if \aggregate
tst r0, #3 // skip until #blocks is a
bne 2f // round multiple of 4
vld1.8 {XL2-XM2}, [r2]!
1: vld1.8 {T2-T3}, [r2]!
@@ -286,11 +167,10 @@
veor T1, T1, XH
veor XL, XL, T1
b 1b
.endif
- .endif
2: vld1.8 {T1}, [r2]!
.ifnb \enc
\enc\()_1x T1
@@ -306,29 +186,29 @@
vext.8 IN1, T1, T1, #8
veor T1_L, T1_L, XL_H
veor XL, XL, IN1
- __pmull_\pn XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1
+ vmull.p64 XH, XL_H, SHASH_H @ a1 * b1
veor T1, T1, XL
- __pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
- __pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0)
+ vmull.p64 XL, XL_L, SHASH_L @ a0 * b0
+ vmull.p64 XM, T1_L, SHASH2_p64 @ (a1+a0)(b1+b0)
4: veor T1, XL, XH
veor XM, XM, T1
- __pmull_reduce_\pn
+ __pmull_reduce_p64
veor T1, T1, XH
veor XL, XL, T1
bne 0b
.endm
/*
- * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
- * struct ghash_key const *k, const char *head)
+ * void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
+ * u64 const h[4][2], const char *head)
*/
ENTRY(pmull_ghash_update_p64)
vld1.64 {SHASH}, [r3]!
vld1.64 {HH}, [r3]!
vld1.64 {HH3-HH4}, [r3]
@@ -339,39 +219,16 @@ ENTRY(pmull_ghash_update_p64)
veor HH34_H, HH4_L, HH4_H
vmov.i8 MASK, #0xe1
vshl.u64 MASK, MASK, #57
- ghash_update p64
+ ghash_update
vst1.64 {XL}, [r1]
bx lr
ENDPROC(pmull_ghash_update_p64)
-ENTRY(pmull_ghash_update_p8)
- vld1.64 {SHASH}, [r3]
- veor SHASH2_p8, SHASH_L, SHASH_H
-
- vext.8 s1l, SHASH_L, SHASH_L, #1
- vext.8 s2l, SHASH_L, SHASH_L, #2
- vext.8 s3l, SHASH_L, SHASH_L, #3
- vext.8 s4l, SHASH_L, SHASH_L, #4
- vext.8 s1h, SHASH_H, SHASH_H, #1
- vext.8 s2h, SHASH_H, SHASH_H, #2
- vext.8 s3h, SHASH_H, SHASH_H, #3
- vext.8 s4h, SHASH_H, SHASH_H, #4
-
- vmov.i64 k16, #0xffff
- vmov.i64 k32, #0xffffffff
- vmov.i64 k48, #0xffffffffffff
-
- ghash_update p8
- vst1.64 {XL}, [r1]
-
- bx lr
-ENDPROC(pmull_ghash_update_p8)
-
e0 .req q9
e1 .req q10
e2 .req q11
e3 .req q12
e0l .req d18
@@ -534,11 +391,11 @@ ENTRY(pmull_gcm_encrypt)
ldrd r4, r5, [sp, #24]
ldrd r6, r7, [sp, #32]
vld1.64 {SHASH}, [r3]
- ghash_update p64, enc, head=0
+ ghash_update enc, head=0
vst1.64 {XL}, [r1]
pop {r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)
@@ -552,11 +409,11 @@ ENTRY(pmull_gcm_decrypt)
ldrd r4, r5, [sp, #24]
ldrd r6, r7, [sp, #32]
vld1.64 {SHASH}, [r3]
- ghash_update p64, dec, head=0
+ ghash_update dec, head=0
vst1.64 {XL}, [r1]
pop {r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)
@@ -601,11 +458,11 @@ ENTRY(pmull_gcm_enc_final)
vmov.i8 MASK, #0xe1
veor SHASH2_p64, SHASH_L, SHASH_H
vshl.u64 MASK, MASK, #57
mov r0, #1
bne 3f // process head block first
- ghash_update p64, aggregate=0, head=0
+ ghash_update aggregate=0, head=0
vrev64.8 XL, XL
vext.8 XL, XL, XL, #8
veor XL, XL, e1
@@ -658,11 +515,11 @@ ENTRY(pmull_gcm_dec_final)
vmov.i8 MASK, #0xe1
veor SHASH2_p64, SHASH_L, SHASH_H
vshl.u64 MASK, MASK, #57
mov r0, #1
bne 3f // process head block first
- ghash_update p64, aggregate=0, head=0
+ ghash_update aggregate=0, head=0
vrev64.8 XL, XL
vext.8 XL, XL, XL, #8
veor XL, XL, e1
diff --git a/arch/arm/crypto/ghash-neon-core.S b/arch/arm/crypto/ghash-neon-core.S
new file mode 100644
index 000000000000..bdf6fb6d063c
--- /dev/null
+++ b/arch/arm/crypto/ghash-neon-core.S
@@ -0,0 +1,207 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Accelerated GHASH implementation with NEON vmull.p8 instructions.
+ *
+ * Copyright (C) 2015 - 2017 Linaro Ltd.
+ * Copyright (C) 2023 Google LLC. <ardb@google.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .fpu neon
+
+ SHASH .req q0 /* key h, loaded from [r3] by pmull_ghash_update_p8 */
+ T1 .req q1 /* input block, then scratch */
+ XL .req q2 /* accumulator: loaded from and stored back to [r1] */
+ XM .req q3 /* middle product (a1+a0)(b1+b0) */
+ XH .req q4 /* high product a1 * b1 */
+ IN1 .req q4 /* shares q4 with XH */
+ /* d-register halves of the q registers above (qN = d(2N), d(2N+1)) */
+ SHASH_L .req d0
+ SHASH_H .req d1
+ T1_L .req d2
+ T1_H .req d3
+ XL_L .req d4
+ XL_H .req d5
+ XM_L .req d6
+ XM_H .req d7
+ XH_L .req d8
+ /* __pmull_p8 scratch: tNl/tNh are the two halves of tNq declared below */
+ t0l .req d10
+ t0h .req d11
+ t1l .req d12
+ t1h .req d13
+ t2l .req d14
+ t2h .req d15
+ t3l .req d16
+ t3h .req d17
+ t4l .req d18
+ t4h .req d19
+
+ t0q .req q5
+ t1q .req q6
+ t2q .req q7
+ t3q .req q8
+ t4q .req q9
+ /* SHASH_L/SHASH_H rotated by 1-4 bytes; filled in by pmull_ghash_update_p8 */
+ s1l .req d20
+ s1h .req d21
+ s2l .req d22
+ s2h .req d23
+ s3l .req d24
+ s3h .req d25
+ s4l .req d26
+ s4h .req d27
+ /* SHASH_L ^ SHASH_H: key operand of the (a1+a0)(b1+b0) product */
+ SHASH2_p8 .req d28
+ /* masks with the low 16/32/48 bits set, used by __pmull_p8 */
+ k16 .req d29
+ k32 .req d30
+ k48 .req d31
+ /* scratch for __pmull_reduce_p8; shares q7 with t2q */
+ T2 .req q7
+
+ .text
+
+ /*
+ * 64x64 -> 128 bit polynomial multiplication built from vmull.p8
+ * (8x8 -> 16) instructions, taken from the paper "Fast Software
+ * Polynomial Multiplication on ARM Processors Using the NEON Engine"
+ * by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+ * (https://hal.inria.fr/hal-01506572). Slightly tweaked for in-order
+ * performance, and to allow 'rq' to overlap with 'ad' or 'bd'.
+ * rq = ad * bd. b1-b4 may name 'bd' pre-rotated by 1-4 bytes; if left
+ * at their defaults they are computed here. Clobbers t0q-t4q; reads k16/k32/k48.
+ */
+ .macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
+ vext.8 t0l, \ad, \ad, #1 @ A1
+ .ifc \b1, t4l
+ vext.8 t4l, \bd, \bd, #1 @ B1
+ .endif
+ vmull.p8 t0q, t0l, \bd @ F = A1*B
+ vext.8 t1l, \ad, \ad, #2 @ A2
+ vmull.p8 t4q, \ad, \b1 @ E = A*B1
+ .ifc \b2, t3l
+ vext.8 t3l, \bd, \bd, #2 @ B2
+ .endif
+ vmull.p8 t1q, t1l, \bd @ H = A2*B
+ vext.8 t2l, \ad, \ad, #3 @ A3
+ vmull.p8 t3q, \ad, \b2 @ G = A*B2
+ veor t0q, t0q, t4q @ L = E + F
+ .ifc \b3, t4l
+ vext.8 t4l, \bd, \bd, #3 @ B3
+ .endif
+ vmull.p8 t2q, t2l, \bd @ J = A3*B
+ veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8
+ veor t1q, t1q, t3q @ M = G + H
+ .ifc \b4, t3l
+ vext.8 t3l, \bd, \bd, #4 @ B4
+ .endif
+ vmull.p8 t4q, \ad, \b3 @ I = A*B3
+ veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16
+ vmull.p8 t3q, \ad, \b4 @ K = A*B4
+ vand t0h, t0h, k48 @ keep the low 48 bits
+ vand t1h, t1h, k32 @ keep the low 32 bits
+ veor t2q, t2q, t4q @ N = I + J
+ veor t0l, t0l, t0h
+ veor t1l, t1l, t1h
+ veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24
+ vand t2h, t2h, k16 @ keep the low 16 bits
+ veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 t3h, #0
+ vext.8 t0q, t0q, t0q, #15 @ rotate t0 up by 1 byte
+ veor t2l, t2l, t2h
+ vext.8 t1q, t1q, t1q, #14 @ rotate t1 up by 2 bytes
+ vmull.p8 \rq, \ad, \bd @ D = A*B
+ vext.8 t2q, t2q, t2q, #13 @ rotate t2 up by 3 bytes
+ vext.8 t3q, t3q, t3q, #12 @ rotate t3 up by 4 bytes
+ veor t0q, t0q, t1q
+ veor t2q, t2q, t3q
+ veor \rq, \rq, t0q @ rq = D + rotated partial products
+ veor \rq, \rq, t2q
+ .endm
+ /* Reduction using only shifts and XORs (no vmull.p64); T1, T2 are scratch */
+ .macro __pmull_reduce_p8
+ veor XL_H, XL_H, XM_L @ fold XM into the middle 128 bits of XH:XL
+ veor XH_L, XH_L, XM_H
+ /* T1 = (XL << 57) ^ (XL << 62) ^ (XL << 63) in each 64-bit lane */
+ vshl.i64 T1, XL, #57
+ vshl.i64 T2, XL, #62
+ veor T1, T1, T2
+ vshl.i64 T2, XL, #63
+ veor T1, T1, T2
+ veor XL_H, XL_H, T1_L
+ veor XH_L, XH_L, T1_H
+ /* results are left in T1, XH and XL; the user of this macro XORs them */
+ vshr.u64 T1, XL, #1
+ veor XH, XH, XL
+ veor XL, XL, T1
+ vshr.u64 T1, T1, #6
+ vshr.u64 XL, XL, #1
+ .endm
+ /* GHASH loop. In: r0 = #blocks, r1 = dg, r2 = src, [sp] = head; SHASH, s*, k* preset */
+ .macro ghash_update
+ vld1.64 {XL}, [r1] @ XL = dg[]
+
+ /* do the head block first, if supplied */
+ ldr ip, [sp]
+ teq ip, #0
+ beq 0f
+ vld1.64 {T1}, [ip]
+ teq r0, #0 @ Z = (blocks == 0), tested by the final 'bne 0b'
+ b 3f
+
+0:
+ vld1.8 {T1}, [r2]! @ next 16-byte block, src advances
+ subs r0, r0, #1 @ Z set once the last block has been fetched
+
+3: /* multiply XL by SHASH in GF(2^128) */
+ vrev64.8 T1, T1
+ /* XL ^= byte-reversed input block; T1_L ends up as a1 + a0 */
+ vext.8 IN1, T1, T1, #8
+ veor T1_L, T1_L, XL_H
+ veor XL, XL, IN1
+ /* three 64x64 products; s1h..s4l are the pre-rotated key halves */
+ __pmull_p8 XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1
+ veor T1, T1, XL
+ __pmull_p8 XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
+ __pmull_p8 XM, T1_L, SHASH2_p8 @ (a1+a0)(b1+b0)
+ /* XM ^= XL ^ XH: isolate the middle term of the three-product multiply */
+ veor T1, XL, XH
+ veor XM, XM, T1
+
+ __pmull_reduce_p8
+ /* combine the pieces left by __pmull_reduce_p8 into the new XL */
+ veor T1, T1, XH
+ veor XL, XL, T1
+ /* the NEON instructions above leave the flags from teq/subs untouched */
+ bne 0b
+ .endm
+
+ /*
+ * void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
+ * u64 const h[1][2], const char *head)
+ */
+ENTRY(pmull_ghash_update_p8)
+ vld1.64 {SHASH}, [r3] @ SHASH = h[0]
+ veor SHASH2_p8, SHASH_L, SHASH_H @ b1 + b0 for the middle product
+ /* s{1..4}{l,h} = SHASH_{L,H} rotated by 1..4 bytes, passed to __pmull_p8 */
+ vext.8 s1l, SHASH_L, SHASH_L, #1
+ vext.8 s2l, SHASH_L, SHASH_L, #2
+ vext.8 s3l, SHASH_L, SHASH_L, #3
+ vext.8 s4l, SHASH_L, SHASH_L, #4
+ vext.8 s1h, SHASH_H, SHASH_H, #1
+ vext.8 s2h, SHASH_H, SHASH_H, #2
+ vext.8 s3h, SHASH_H, SHASH_H, #3
+ vext.8 s4h, SHASH_H, SHASH_H, #4
+ /* masks read by __pmull_p8 */
+ vmov.i64 k16, #0xffff
+ vmov.i64 k32, #0xffffffff
+ vmov.i64 k48, #0xffffffffffff
+ /* ghash_update consumes r0-r2 and [sp] (head) and leaves the result in XL */
+ ghash_update
+ vst1.64 {XL}, [r1] @ write the updated dg[] back
+ /* NOTE(review): q4-q7 (d8-d15) are written but not saved here; presumably the caller's NEON context handling covers that - confirm */
+ bx lr
+ENDPROC(pmull_ghash_update_p8)
--
2.53.0
WARNING: multiple messages have this Message-ID (diff)
From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, Ard Biesheuvel <ardb@kernel.org>,
"Jason A . Donenfeld" <Jason@zx2c4.com>,
Herbert Xu <herbert@gondor.apana.org.au>,
linux-arm-kernel@lists.infradead.org,
linuxppc-dev@lists.ozlabs.org, linux-riscv@lists.infradead.org,
linux-s390@vger.kernel.org, x86@kernel.org,
Eric Biggers <ebiggers@kernel.org>
Subject: [PATCH 06/19] crypto: arm/ghash - Move NEON GHASH assembly into its own file
Date: Wed, 18 Mar 2026 23:17:07 -0700 [thread overview]
Message-ID: <20260319061723.1140720-7-ebiggers@kernel.org> (raw)
In-Reply-To: <20260319061723.1140720-1-ebiggers@kernel.org>
arch/arm/crypto/ghash-ce-core.S implements pmull_ghash_update_p8(),
which is used only by a crypto_shash implementation of GHASH. It also
implements pmull_ghash_update_p64() and several other functions, which
are used only by a crypto_aead implementation of AES-GCM.
While some code is shared between pmull_ghash_update_p8() and
pmull_ghash_update_p64(), it's not very much. Since
pmull_ghash_update_p8() will also need to be migrated into lib/crypto/
to achieve parity in the standalone GHASH support, let's move it into a
separate file ghash-neon-core.S.
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
arch/arm/crypto/Makefile | 2 +-
arch/arm/crypto/ghash-ce-core.S | 171 ++----------------------
arch/arm/crypto/ghash-neon-core.S | 207 ++++++++++++++++++++++++++++++
3 files changed, 222 insertions(+), 158 deletions(-)
create mode 100644 arch/arm/crypto/ghash-neon-core.S
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index e73099e120b3..cedce94d5ee5 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -8,6 +8,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
-ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
+ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o ghash-neon-core.o
diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S
index 858c0d66798b..a449525d61f8 100644
--- a/arch/arm/crypto/ghash-ce-core.S
+++ b/arch/arm/crypto/ghash-ce-core.S
@@ -1,8 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
+ * Accelerated AES-GCM implementation with ARMv8 Crypto Extensions.
*
* Copyright (C) 2015 - 2017 Linaro Ltd.
* Copyright (C) 2023 Google LLC. <ardb@google.com>
*/
@@ -27,43 +27,14 @@
XL_H .req d5
XM_L .req d6
XM_H .req d7
XH_L .req d8
- t0l .req d10
- t0h .req d11
- t1l .req d12
- t1h .req d13
- t2l .req d14
- t2h .req d15
- t3l .req d16
- t3h .req d17
- t4l .req d18
- t4h .req d19
-
- t0q .req q5
- t1q .req q6
- t2q .req q7
- t3q .req q8
- t4q .req q9
XH2 .req q9
- s1l .req d20
- s1h .req d21
- s2l .req d22
- s2h .req d23
- s3l .req d24
- s3h .req d25
- s4l .req d26
- s4h .req d27
-
MASK .req d28
- SHASH2_p8 .req d28
- k16 .req d29
- k32 .req d30
- k48 .req d31
SHASH2_p64 .req d31
HH .req q10
HH3 .req q11
HH4 .req q12
@@ -91,76 +62,10 @@
T3_L .req d16
T3_H .req d17
.text
- .macro __pmull_p64, rd, rn, rm, b1, b2, b3, b4
- vmull.p64 \rd, \rn, \rm
- .endm
-
- /*
- * This implementation of 64x64 -> 128 bit polynomial multiplication
- * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
- * "Fast Software Polynomial Multiplication on ARM Processors Using
- * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
- * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
- *
- * It has been slightly tweaked for in-order performance, and to allow
- * 'rq' to overlap with 'ad' or 'bd'.
- */
- .macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
- vext.8 t0l, \ad, \ad, #1 @ A1
- .ifc \b1, t4l
- vext.8 t4l, \bd, \bd, #1 @ B1
- .endif
- vmull.p8 t0q, t0l, \bd @ F = A1*B
- vext.8 t1l, \ad, \ad, #2 @ A2
- vmull.p8 t4q, \ad, \b1 @ E = A*B1
- .ifc \b2, t3l
- vext.8 t3l, \bd, \bd, #2 @ B2
- .endif
- vmull.p8 t1q, t1l, \bd @ H = A2*B
- vext.8 t2l, \ad, \ad, #3 @ A3
- vmull.p8 t3q, \ad, \b2 @ G = A*B2
- veor t0q, t0q, t4q @ L = E + F
- .ifc \b3, t4l
- vext.8 t4l, \bd, \bd, #3 @ B3
- .endif
- vmull.p8 t2q, t2l, \bd @ J = A3*B
- veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8
- veor t1q, t1q, t3q @ M = G + H
- .ifc \b4, t3l
- vext.8 t3l, \bd, \bd, #4 @ B4
- .endif
- vmull.p8 t4q, \ad, \b3 @ I = A*B3
- veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16
- vmull.p8 t3q, \ad, \b4 @ K = A*B4
- vand t0h, t0h, k48
- vand t1h, t1h, k32
- veor t2q, t2q, t4q @ N = I + J
- veor t0l, t0l, t0h
- veor t1l, t1l, t1h
- veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24
- vand t2h, t2h, k16
- veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32
- vmov.i64 t3h, #0
- vext.8 t0q, t0q, t0q, #15
- veor t2l, t2l, t2h
- vext.8 t1q, t1q, t1q, #14
- vmull.p8 \rq, \ad, \bd @ D = A*B
- vext.8 t2q, t2q, t2q, #13
- vext.8 t3q, t3q, t3q, #12
- veor t0q, t0q, t1q
- veor t2q, t2q, t3q
- veor \rq, \rq, t0q
- veor \rq, \rq, t2q
- .endm
-
- //
- // PMULL (64x64->128) based reduction for CPUs that can do
- // it in a single instruction.
- //
.macro __pmull_reduce_p64
vmull.p64 T1, XL_L, MASK
veor XH_L, XH_L, XM_H
vext.8 T1, T1, T1, #8
@@ -168,34 +73,11 @@
veor T1, T1, XL
vmull.p64 XL, T1_H, MASK
.endm
- //
- // Alternative reduction for CPUs that lack support for the
- // 64x64->128 PMULL instruction
- //
- .macro __pmull_reduce_p8
- veor XL_H, XL_H, XM_L
- veor XH_L, XH_L, XM_H
-
- vshl.i64 T1, XL, #57
- vshl.i64 T2, XL, #62
- veor T1, T1, T2
- vshl.i64 T2, XL, #63
- veor T1, T1, T2
- veor XL_H, XL_H, T1_L
- veor XH_L, XH_L, T1_H
-
- vshr.u64 T1, XL, #1
- veor XH, XH, XL
- veor XL, XL, T1
- vshr.u64 T1, T1, #6
- vshr.u64 XL, XL, #1
- .endm
-
- .macro ghash_update, pn, enc, aggregate=1, head=1
+ .macro ghash_update, enc, aggregate=1, head=1
vld1.64 {XL}, [r1]
.if \head
/* do the head block first, if supplied */
ldr ip, [sp]
@@ -204,12 +86,11 @@
vld1.64 {T1}, [ip]
teq r0, #0
b 3f
.endif
-0: .ifc \pn, p64
- .if \aggregate
+0: .if \aggregate
tst r0, #3 // skip until #blocks is a
bne 2f // round multiple of 4
vld1.8 {XL2-XM2}, [r2]!
1: vld1.8 {T2-T3}, [r2]!
@@ -286,11 +167,10 @@
veor T1, T1, XH
veor XL, XL, T1
b 1b
.endif
- .endif
2: vld1.8 {T1}, [r2]!
.ifnb \enc
\enc\()_1x T1
@@ -306,29 +186,29 @@
vext.8 IN1, T1, T1, #8
veor T1_L, T1_L, XL_H
veor XL, XL, IN1
- __pmull_\pn XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1
+ vmull.p64 XH, XL_H, SHASH_H @ a1 * b1
veor T1, T1, XL
- __pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
- __pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0)
+ vmull.p64 XL, XL_L, SHASH_L @ a0 * b0
+ vmull.p64 XM, T1_L, SHASH2_p64 @ (a1+a0)(b1+b0)
4: veor T1, XL, XH
veor XM, XM, T1
- __pmull_reduce_\pn
+ __pmull_reduce_p64
veor T1, T1, XH
veor XL, XL, T1
bne 0b
.endm
/*
- * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
- * struct ghash_key const *k, const char *head)
+ * void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
+ * u64 const h[4][2], const char *head)
*/
ENTRY(pmull_ghash_update_p64)
vld1.64 {SHASH}, [r3]!
vld1.64 {HH}, [r3]!
vld1.64 {HH3-HH4}, [r3]
@@ -339,39 +219,16 @@ ENTRY(pmull_ghash_update_p64)
veor HH34_H, HH4_L, HH4_H
vmov.i8 MASK, #0xe1
vshl.u64 MASK, MASK, #57
- ghash_update p64
+ ghash_update
vst1.64 {XL}, [r1]
bx lr
ENDPROC(pmull_ghash_update_p64)
-ENTRY(pmull_ghash_update_p8)
- vld1.64 {SHASH}, [r3]
- veor SHASH2_p8, SHASH_L, SHASH_H
-
- vext.8 s1l, SHASH_L, SHASH_L, #1
- vext.8 s2l, SHASH_L, SHASH_L, #2
- vext.8 s3l, SHASH_L, SHASH_L, #3
- vext.8 s4l, SHASH_L, SHASH_L, #4
- vext.8 s1h, SHASH_H, SHASH_H, #1
- vext.8 s2h, SHASH_H, SHASH_H, #2
- vext.8 s3h, SHASH_H, SHASH_H, #3
- vext.8 s4h, SHASH_H, SHASH_H, #4
-
- vmov.i64 k16, #0xffff
- vmov.i64 k32, #0xffffffff
- vmov.i64 k48, #0xffffffffffff
-
- ghash_update p8
- vst1.64 {XL}, [r1]
-
- bx lr
-ENDPROC(pmull_ghash_update_p8)
-
e0 .req q9
e1 .req q10
e2 .req q11
e3 .req q12
e0l .req d18
@@ -534,11 +391,11 @@ ENTRY(pmull_gcm_encrypt)
ldrd r4, r5, [sp, #24]
ldrd r6, r7, [sp, #32]
vld1.64 {SHASH}, [r3]
- ghash_update p64, enc, head=0
+ ghash_update enc, head=0
vst1.64 {XL}, [r1]
pop {r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)
@@ -552,11 +409,11 @@ ENTRY(pmull_gcm_decrypt)
ldrd r4, r5, [sp, #24]
ldrd r6, r7, [sp, #32]
vld1.64 {SHASH}, [r3]
- ghash_update p64, dec, head=0
+ ghash_update dec, head=0
vst1.64 {XL}, [r1]
pop {r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)
@@ -601,11 +458,11 @@ ENTRY(pmull_gcm_enc_final)
vmov.i8 MASK, #0xe1
veor SHASH2_p64, SHASH_L, SHASH_H
vshl.u64 MASK, MASK, #57
mov r0, #1
bne 3f // process head block first
- ghash_update p64, aggregate=0, head=0
+ ghash_update aggregate=0, head=0
vrev64.8 XL, XL
vext.8 XL, XL, XL, #8
veor XL, XL, e1
@@ -658,11 +515,11 @@ ENTRY(pmull_gcm_dec_final)
vmov.i8 MASK, #0xe1
veor SHASH2_p64, SHASH_L, SHASH_H
vshl.u64 MASK, MASK, #57
mov r0, #1
bne 3f // process head block first
- ghash_update p64, aggregate=0, head=0
+ ghash_update aggregate=0, head=0
vrev64.8 XL, XL
vext.8 XL, XL, XL, #8
veor XL, XL, e1
diff --git a/arch/arm/crypto/ghash-neon-core.S b/arch/arm/crypto/ghash-neon-core.S
new file mode 100644
index 000000000000..bdf6fb6d063c
--- /dev/null
+++ b/arch/arm/crypto/ghash-neon-core.S
@@ -0,0 +1,207 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Accelerated GHASH implementation with NEON vmull.p8 instructions.
+ *
+ * Copyright (C) 2015 - 2017 Linaro Ltd.
+ * Copyright (C) 2023 Google LLC. <ardb@google.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .fpu neon
+
+ SHASH .req q0 /* key h, loaded from [r3] by pmull_ghash_update_p8 */
+ T1 .req q1 /* input block, then scratch */
+ XL .req q2 /* accumulator: loaded from and stored back to [r1] */
+ XM .req q3 /* middle product (a1+a0)(b1+b0) */
+ XH .req q4 /* high product a1 * b1 */
+ IN1 .req q4 /* shares q4 with XH */
+ /* d-register halves of the q registers above (qN = d(2N), d(2N+1)) */
+ SHASH_L .req d0
+ SHASH_H .req d1
+ T1_L .req d2
+ T1_H .req d3
+ XL_L .req d4
+ XL_H .req d5
+ XM_L .req d6
+ XM_H .req d7
+ XH_L .req d8
+ /* __pmull_p8 scratch: tNl/tNh are the two halves of tNq declared below */
+ t0l .req d10
+ t0h .req d11
+ t1l .req d12
+ t1h .req d13
+ t2l .req d14
+ t2h .req d15
+ t3l .req d16
+ t3h .req d17
+ t4l .req d18
+ t4h .req d19
+
+ t0q .req q5
+ t1q .req q6
+ t2q .req q7
+ t3q .req q8
+ t4q .req q9
+ /* SHASH_L/SHASH_H rotated by 1-4 bytes; filled in by pmull_ghash_update_p8 */
+ s1l .req d20
+ s1h .req d21
+ s2l .req d22
+ s2h .req d23
+ s3l .req d24
+ s3h .req d25
+ s4l .req d26
+ s4h .req d27
+ /* SHASH_L ^ SHASH_H: key operand of the (a1+a0)(b1+b0) product */
+ SHASH2_p8 .req d28
+ /* masks with the low 16/32/48 bits set, used by __pmull_p8 */
+ k16 .req d29
+ k32 .req d30
+ k48 .req d31
+ /* scratch for __pmull_reduce_p8; shares q7 with t2q */
+ T2 .req q7
+
+ .text
+
+ /*
+ * 64x64 -> 128 bit polynomial multiplication built from vmull.p8
+ * (8x8 -> 16) instructions, taken from the paper "Fast Software
+ * Polynomial Multiplication on ARM Processors Using the NEON Engine"
+ * by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+ * (https://hal.inria.fr/hal-01506572). Slightly tweaked for in-order
+ * performance, and to allow 'rq' to overlap with 'ad' or 'bd'.
+ * rq = ad * bd. b1-b4 may name 'bd' pre-rotated by 1-4 bytes; if left
+ * at their defaults they are computed here. Clobbers t0q-t4q; reads k16/k32/k48.
+ */
+ .macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
+ vext.8 t0l, \ad, \ad, #1 @ A1
+ .ifc \b1, t4l
+ vext.8 t4l, \bd, \bd, #1 @ B1
+ .endif
+ vmull.p8 t0q, t0l, \bd @ F = A1*B
+ vext.8 t1l, \ad, \ad, #2 @ A2
+ vmull.p8 t4q, \ad, \b1 @ E = A*B1
+ .ifc \b2, t3l
+ vext.8 t3l, \bd, \bd, #2 @ B2
+ .endif
+ vmull.p8 t1q, t1l, \bd @ H = A2*B
+ vext.8 t2l, \ad, \ad, #3 @ A3
+ vmull.p8 t3q, \ad, \b2 @ G = A*B2
+ veor t0q, t0q, t4q @ L = E + F
+ .ifc \b3, t4l
+ vext.8 t4l, \bd, \bd, #3 @ B3
+ .endif
+ vmull.p8 t2q, t2l, \bd @ J = A3*B
+ veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8
+ veor t1q, t1q, t3q @ M = G + H
+ .ifc \b4, t3l
+ vext.8 t3l, \bd, \bd, #4 @ B4
+ .endif
+ vmull.p8 t4q, \ad, \b3 @ I = A*B3
+ veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16
+ vmull.p8 t3q, \ad, \b4 @ K = A*B4
+ vand t0h, t0h, k48 @ keep the low 48 bits
+ vand t1h, t1h, k32 @ keep the low 32 bits
+ veor t2q, t2q, t4q @ N = I + J
+ veor t0l, t0l, t0h
+ veor t1l, t1l, t1h
+ veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24
+ vand t2h, t2h, k16 @ keep the low 16 bits
+ veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 t3h, #0
+ vext.8 t0q, t0q, t0q, #15 @ rotate t0 up by 1 byte
+ veor t2l, t2l, t2h
+ vext.8 t1q, t1q, t1q, #14 @ rotate t1 up by 2 bytes
+ vmull.p8 \rq, \ad, \bd @ D = A*B
+ vext.8 t2q, t2q, t2q, #13 @ rotate t2 up by 3 bytes
+ vext.8 t3q, t3q, t3q, #12 @ rotate t3 up by 4 bytes
+ veor t0q, t0q, t1q
+ veor t2q, t2q, t3q
+ veor \rq, \rq, t0q @ rq = D + rotated partial products
+ veor \rq, \rq, t2q
+ .endm
+ /* Reduction using only shifts and XORs (no vmull.p64); T1, T2 are scratch */
+ .macro __pmull_reduce_p8
+ veor XL_H, XL_H, XM_L @ fold XM into the middle 128 bits of XH:XL
+ veor XH_L, XH_L, XM_H
+ /* T1 = (XL << 57) ^ (XL << 62) ^ (XL << 63) in each 64-bit lane */
+ vshl.i64 T1, XL, #57
+ vshl.i64 T2, XL, #62
+ veor T1, T1, T2
+ vshl.i64 T2, XL, #63
+ veor T1, T1, T2
+ veor XL_H, XL_H, T1_L
+ veor XH_L, XH_L, T1_H
+ /* results are left in T1, XH and XL; the user of this macro XORs them */
+ vshr.u64 T1, XL, #1
+ veor XH, XH, XL
+ veor XL, XL, T1
+ vshr.u64 T1, T1, #6
+ vshr.u64 XL, XL, #1
+ .endm
+ /* GHASH loop. In: r0 = #blocks, r1 = dg, r2 = src, [sp] = head; SHASH, s*, k* preset */
+ .macro ghash_update
+ vld1.64 {XL}, [r1] @ XL = dg[]
+
+ /* do the head block first, if supplied */
+ ldr ip, [sp]
+ teq ip, #0
+ beq 0f
+ vld1.64 {T1}, [ip]
+ teq r0, #0 @ Z = (blocks == 0), tested by the final 'bne 0b'
+ b 3f
+
+0:
+ vld1.8 {T1}, [r2]! @ next 16-byte block, src advances
+ subs r0, r0, #1 @ Z set once the last block has been fetched
+
+3: /* multiply XL by SHASH in GF(2^128) */
+ vrev64.8 T1, T1
+ /* XL ^= byte-reversed input block; T1_L ends up as a1 + a0 */
+ vext.8 IN1, T1, T1, #8
+ veor T1_L, T1_L, XL_H
+ veor XL, XL, IN1
+ /* three 64x64 products; s1h..s4l are the pre-rotated key halves */
+ __pmull_p8 XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1
+ veor T1, T1, XL
+ __pmull_p8 XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
+ __pmull_p8 XM, T1_L, SHASH2_p8 @ (a1+a0)(b1+b0)
+ /* XM ^= XL ^ XH: isolate the middle term of the three-product multiply */
+ veor T1, XL, XH
+ veor XM, XM, T1
+
+ __pmull_reduce_p8
+ /* combine the pieces left by __pmull_reduce_p8 into the new XL */
+ veor T1, T1, XH
+ veor XL, XL, T1
+ /* the NEON instructions above leave the flags from teq/subs untouched */
+ bne 0b
+ .endm
+
+ /*
+ * void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
+ * u64 const h[1][2], const char *head)
+ */
+ENTRY(pmull_ghash_update_p8)
+ vld1.64 {SHASH}, [r3] @ SHASH = h[0]
+ veor SHASH2_p8, SHASH_L, SHASH_H @ b1 + b0 for the middle product
+ /* s{1..4}{l,h} = SHASH_{L,H} rotated by 1..4 bytes, passed to __pmull_p8 */
+ vext.8 s1l, SHASH_L, SHASH_L, #1
+ vext.8 s2l, SHASH_L, SHASH_L, #2
+ vext.8 s3l, SHASH_L, SHASH_L, #3
+ vext.8 s4l, SHASH_L, SHASH_L, #4
+ vext.8 s1h, SHASH_H, SHASH_H, #1
+ vext.8 s2h, SHASH_H, SHASH_H, #2
+ vext.8 s3h, SHASH_H, SHASH_H, #3
+ vext.8 s4h, SHASH_H, SHASH_H, #4
+ /* masks read by __pmull_p8 */
+ vmov.i64 k16, #0xffff
+ vmov.i64 k32, #0xffffffff
+ vmov.i64 k48, #0xffffffffffff
+ /* ghash_update consumes r0-r2 and [sp] (head) and leaves the result in XL */
+ ghash_update
+ vst1.64 {XL}, [r1] @ write the updated dg[] back
+ /* NOTE(review): q4-q7 (d8-d15) are written but not saved here; presumably the caller's NEON context handling covers that - confirm */
+ bx lr
+ENDPROC(pmull_ghash_update_p8)
--
2.53.0
_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv
next prev parent reply other threads:[~2026-03-19 6:19 UTC|newest]
Thread overview: 44+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-19 6:17 [PATCH 00/19] GHASH library Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 01/19] lib/crypto: gf128hash: Rename polyval module to gf128hash Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 02/19] lib/crypto: gf128hash: Support GF128HASH_ARCH without all POLYVAL functions Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 03/19] lib/crypto: gf128hash: Add GHASH support Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 04/19] lib/crypto: tests: Add KUnit tests for GHASH Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 05/19] crypto: arm/ghash - Make the "ghash" crypto_shash NEON-only Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` Eric Biggers [this message]
2026-03-19 6:17 ` [PATCH 06/19] crypto: arm/ghash - Move NEON GHASH assembly into its own file Eric Biggers
2026-03-19 6:17 ` [PATCH 07/19] lib/crypto: arm/ghash: Migrate optimized code into library Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 08/19] crypto: arm64/ghash - Move NEON GHASH assembly into its own file Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 09/19] lib/crypto: arm64/ghash: Migrate optimized code into library Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 10/19] crypto: arm64/aes-gcm - Rename struct ghash_key and make fixed-sized Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 11/19] lib/crypto: powerpc/ghash: Migrate optimized code into library Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 12/19] lib/crypto: riscv/ghash: " Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 13/19] lib/crypto: s390/ghash: " Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 14/19] lib/crypto: x86/ghash: " Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 15/19] crypto: gcm - Use GHASH library instead of crypto_ahash Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 16/19] crypto: ghash - Remove ghash from crypto_shash API Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 17/19] lib/crypto: gf128mul: Remove unused 4k_lle functions Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 18/19] lib/crypto: gf128hash: Remove unused content from ghash.h Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-19 6:17 ` [PATCH 19/19] lib/crypto: aesgcm: Use GHASH library API Eric Biggers
2026-03-19 6:17 ` Eric Biggers
2026-03-23 14:14 ` [PATCH 00/19] GHASH library Ard Biesheuvel
2026-03-23 14:14 ` Ard Biesheuvel
2026-03-24 0:50 ` Eric Biggers
2026-03-24 0:50 ` Eric Biggers
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260319061723.1140720-7-ebiggers@kernel.org \
--to=ebiggers@kernel.org \
--cc=Jason@zx2c4.com \
--cc=ardb@kernel.org \
--cc=herbert@gondor.apana.org.au \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-crypto@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-riscv@lists.infradead.org \
--cc=linux-s390@vger.kernel.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.