From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, Ard Biesheuvel <ardb@kernel.org>,
"Jason A . Donenfeld" <Jason@zx2c4.com>,
Herbert Xu <herbert@gondor.apana.org.au>,
x86@kernel.org, Samuel Neves <sneves@dei.uc.pt>,
Eric Biggers <ebiggers@kernel.org>
Subject: [PATCH 4/6] lib/crypto: x86/blake2s: Improve readability
Date: Sun, 2 Nov 2025 15:42:07 -0800 [thread overview]
Message-ID: <20251102234209.62133-5-ebiggers@kernel.org> (raw)
In-Reply-To: <20251102234209.62133-1-ebiggers@kernel.org>
Various cleanups for readability. No change to the generated code:
- Add some comments
- Add #defines for arguments
- Rename some labels
- Use decimal constants instead of hex where it makes sense.
(The pshufd immediates intentionally remain as hex.)
- Add blank lines when there's a logical break
The round loop still could use some work, but this is at least a start.
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
lib/crypto/x86/blake2s-core.S | 231 ++++++++++++++++++++--------------
1 file changed, 134 insertions(+), 97 deletions(-)
diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S
index 14e487559c09..f805a49c590d 100644
--- a/lib/crypto/x86/blake2s-core.S
+++ b/lib/crypto/x86/blake2s-core.S
@@ -48,209 +48,246 @@
.byte 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
.byte 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
.byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
+#define CTX %rdi
+#define DATA %rsi
+#define NBLOCKS %rdx
+#define INC %ecx
+
.text
+//
+// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+// u32 h[8]; (inout)
+// u32 t[2]; (inout)
+// u32 f[2]; (in)
+//
SYM_FUNC_START(blake2s_compress_ssse3)
- movdqu (%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
+ movdqu (CTX),%xmm0 // Load h[0..3]
+ movdqu 16(CTX),%xmm1 // Load h[4..7]
movdqa .Lror16(%rip),%xmm12
movdqa .Lror8(%rip),%xmm13
- movdqu 0x20(%rdi),%xmm14
- movd %ecx,%xmm15
- leaq .Lsigma+0xa0(%rip),%r8
- jmp .Lbeginofloop
+ movdqu 32(CTX),%xmm14 // Load t and f
+ movd INC,%xmm15 // Load inc
+ leaq .Lsigma+160(%rip),%r8
+ jmp .Lssse3_mainloop
+
.align 32
-.Lbeginofloop:
- movdqa %xmm0,%xmm10
- movdqa %xmm1,%xmm11
- paddq %xmm15,%xmm14
- movdqa .Liv(%rip),%xmm2
+.Lssse3_mainloop:
+ // Main loop: each iteration processes one 64-byte block.
+ movdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3]
+ movdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7]
+ paddq %xmm15,%xmm14 // t += inc (64-bit addition)
+ movdqa .Liv(%rip),%xmm2 // v[8..11] = iv[0..3]
movdqa %xmm14,%xmm3
- pxor .Liv+0x10(%rip),%xmm3
+ pxor .Liv+16(%rip),%xmm3 // v[12..15] = iv[4..7] ^ [t, f]
leaq .Lsigma(%rip),%rcx
-.Lroundloop:
+
+.Lssse3_roundloop:
+ // Round loop: each iteration does 1 round (of 10 rounds total).
movzbl (%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0x1(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0x2(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x3(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
+ movd (DATA,%rax,4),%xmm4
+ movzbl 1(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ movzbl 2(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ movzbl 3(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
punpckldq %xmm5,%xmm4
punpckldq %xmm7,%xmm6
punpcklqdq %xmm6,%xmm4
paddd %xmm4,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm12,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
+ psrld $12,%xmm1
+ pslld $20,%xmm8
por %xmm8,%xmm1
- movzbl 0x4(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0x5(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x6(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0x7(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
+ movzbl 4(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ movzbl 5(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ movzbl 6(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ movzbl 7(%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
punpckldq %xmm6,%xmm5
punpckldq %xmm4,%xmm7
punpcklqdq %xmm7,%xmm5
paddd %xmm5,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm13,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
+ psrld $7,%xmm1
+ pslld $25,%xmm8
por %xmm8,%xmm1
pshufd $0x93,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
- movzbl 0x8(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x9(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0xa(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0xb(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
+ movzbl 8(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ movzbl 9(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ movzbl 10(%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
+ movzbl 11(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
punpckldq %xmm7,%xmm6
punpckldq %xmm5,%xmm4
punpcklqdq %xmm4,%xmm6
paddd %xmm6,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm12,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
+ psrld $12,%xmm1
+ pslld $20,%xmm8
por %xmm8,%xmm1
- movzbl 0xc(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0xd(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0xe(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0xf(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
+ movzbl 12(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ movzbl 13(%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
+ movzbl 14(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ movzbl 15(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
punpckldq %xmm4,%xmm7
punpckldq %xmm6,%xmm5
punpcklqdq %xmm5,%xmm7
paddd %xmm7,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm13,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
+ psrld $7,%xmm1
+ pslld $25,%xmm8
por %xmm8,%xmm1
pshufd $0x39,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x93,%xmm2,%xmm2
- addq $0x10,%rcx
+ addq $16,%rcx
cmpq %r8,%rcx
- jnz .Lroundloop
+ jnz .Lssse3_roundloop
+
+ // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
pxor %xmm2,%xmm0
pxor %xmm3,%xmm1
pxor %xmm10,%xmm0
pxor %xmm11,%xmm1
- addq $0x40,%rsi
- decq %rdx
- jnz .Lbeginofloop
- movdqu %xmm0,(%rdi)
- movdqu %xmm1,0x10(%rdi)
- movdqu %xmm14,0x20(%rdi)
+ addq $64,DATA
+ decq NBLOCKS
+ jnz .Lssse3_mainloop
+
+ movdqu %xmm0,(CTX) // Store new h[0..3]
+ movdqu %xmm1,16(CTX) // Store new h[4..7]
+ movdqu %xmm14,32(CTX) // Store new t and f
RET
SYM_FUNC_END(blake2s_compress_ssse3)
+//
+// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+// u32 h[8]; (inout)
+// u32 t[2]; (inout)
+// u32 f[2]; (in)
+//
SYM_FUNC_START(blake2s_compress_avx512)
- vmovdqu (%rdi),%xmm0
- vmovdqu 0x10(%rdi),%xmm1
- vmovdqu 0x20(%rdi),%xmm4
- vmovd %ecx,%xmm5
- vmovdqa .Liv(%rip),%xmm14
- vmovdqa .Liv+16(%rip),%xmm15
- jmp .Lblake2s_compress_avx512_mainloop
-.align 32
-.Lblake2s_compress_avx512_mainloop:
- vmovdqa %xmm0,%xmm10
- vmovdqa %xmm1,%xmm11
- vpaddq %xmm5,%xmm4,%xmm4
- vmovdqa %xmm14,%xmm2
- vpxor %xmm15,%xmm4,%xmm3
- vmovdqu (%rsi),%ymm6
- vmovdqu 0x20(%rsi),%ymm7
- addq $0x40,%rsi
+ vmovdqu (CTX),%xmm0 // Load h[0..3]
+ vmovdqu 16(CTX),%xmm1 // Load h[4..7]
+ vmovdqu 32(CTX),%xmm4 // Load t and f
+ vmovd INC,%xmm5 // Load inc
+ vmovdqa .Liv(%rip),%xmm14 // Load iv[0..3]
+ vmovdqa .Liv+16(%rip),%xmm15 // Load iv[4..7]
+ jmp .Lavx512_mainloop
+
+ .align 32
+.Lavx512_mainloop:
+ // Main loop: each iteration processes one 64-byte block.
+ vmovdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3]
+ vmovdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7]
+ vpaddq %xmm5,%xmm4,%xmm4 // t += inc (64-bit addition)
+ vmovdqa %xmm14,%xmm2 // v[8..11] = iv[0..3]
+ vpxor %xmm15,%xmm4,%xmm3 // v[12..15] = iv[4..7] ^ [t, f]
+ vmovdqu (DATA),%ymm6 // Load first 8 data words
+ vmovdqu 32(DATA),%ymm7 // Load second 8 data words
+ addq $64,DATA
leaq .Lsigma2(%rip),%rax
- movb $0xa,%cl
-.Lblake2s_compress_avx512_roundloop:
+ movb $10,%cl // Set num rounds remaining
+
+.Lavx512_roundloop:
+ // Round loop: each iteration does 1 round (of 10 rounds total).
vpmovzxbd (%rax),%ymm8
- vpmovzxbd 0x8(%rax),%ymm9
- addq $0x10,%rax
+ vpmovzxbd 8(%rax),%ymm9
+ addq $16,%rax
vpermi2d %ymm7,%ymm6,%ymm8
vpermi2d %ymm7,%ymm6,%ymm9
vmovdqa %ymm8,%ymm6
vmovdqa %ymm9,%ymm7
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
- vprord $0x10,%xmm3,%xmm3
+ vprord $16,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
- vprord $0xc,%xmm1,%xmm1
- vextracti128 $0x1,%ymm8,%xmm8
+ vprord $12,%xmm1,%xmm1
+ vextracti128 $1,%ymm8,%xmm8
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
- vprord $0x8,%xmm3,%xmm3
+ vprord $8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
- vprord $0x7,%xmm1,%xmm1
+ vprord $7,%xmm1,%xmm1
vpshufd $0x93,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x39,%xmm2,%xmm2
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
- vprord $0x10,%xmm3,%xmm3
+ vprord $16,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
- vprord $0xc,%xmm1,%xmm1
- vextracti128 $0x1,%ymm9,%xmm9
+ vprord $12,%xmm1,%xmm1
+ vextracti128 $1,%ymm9,%xmm9
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
- vprord $0x8,%xmm3,%xmm3
+ vprord $8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
- vprord $0x7,%xmm1,%xmm1
+ vprord $7,%xmm1,%xmm1
vpshufd $0x39,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x93,%xmm2,%xmm2
decb %cl
- jne .Lblake2s_compress_avx512_roundloop
+ jne .Lavx512_roundloop
+
+ // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
vpxor %xmm10,%xmm0,%xmm0
vpxor %xmm11,%xmm1,%xmm1
vpxor %xmm2,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
- decq %rdx
- jne .Lblake2s_compress_avx512_mainloop
- vmovdqu %xmm0,(%rdi)
- vmovdqu %xmm1,0x10(%rdi)
- vmovdqu %xmm4,0x20(%rdi)
+ decq NBLOCKS
+ jne .Lavx512_mainloop
+
+ vmovdqu %xmm0,(CTX) // Store new h[0..3]
+ vmovdqu %xmm1,16(CTX) // Store new h[4..7]
+ vmovdqu %xmm4,32(CTX) // Store new t and f
vzeroupper
RET
SYM_FUNC_END(blake2s_compress_avx512)
--
2.51.2
next prev parent reply other threads:[~2025-11-02 23:44 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-11-02 23:42 [PATCH 0/6] x86 BLAKE2s cleanups Eric Biggers
2025-11-02 23:42 ` [PATCH 1/6] lib/crypto: x86/blake2s: Fix 32-bit arg treated as 64-bit Eric Biggers
2025-11-02 23:42 ` [PATCH 2/6] lib/crypto: x86/blake2s: Drop check for nblocks == 0 Eric Biggers
2025-11-02 23:42 ` [PATCH 3/6] lib/crypto: x86/blake2s: Use local labels for data Eric Biggers
2025-11-02 23:42 ` Eric Biggers [this message]
2025-11-02 23:42 ` [PATCH 5/6] lib/crypto: x86/blake2s: Avoid writing back unchanged 'f' value Eric Biggers
2025-11-02 23:42 ` [PATCH 6/6] lib/crypto: x86/blake2s: Use vpternlogd for 3-input XORs Eric Biggers
2025-11-03 8:14 ` [PATCH 0/6] x86 BLAKE2s cleanups Ard Biesheuvel
2025-11-03 17:35 ` Eric Biggers
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251102234209.62133-5-ebiggers@kernel.org \
--to=ebiggers@kernel.org \
--cc=Jason@zx2c4.com \
--cc=ardb@kernel.org \
--cc=herbert@gondor.apana.org.au \
--cc=linux-crypto@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=sneves@dei.uc.pt \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).