* [PATCH 3/6] crypto: arm/chacha20 - implement NEON version based on SSE3 code
From: Ard Biesheuvel @ 2017-01-02 18:21 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1483381268-12987-1-git-send-email-ard.biesheuvel@linaro.org>
This is a straight port to ARM/NEON of the x86 SSE3 implementation
of the ChaCha20 stream cipher. It uses the new skcipher walksize
attribute to process the input in strides of 4x the block size.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
arch/arm/crypto/Kconfig | 6 +
arch/arm/crypto/Makefile | 2 +
arch/arm/crypto/chacha20-neon-core.S | 524 ++++++++++++++++++++
arch/arm/crypto/chacha20-neon-glue.c | 128 +++++
4 files changed, 660 insertions(+)
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 13f1b4c289d4..2f3339f015d3 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -130,4 +130,10 @@ config CRYPTO_CRC32_ARM_CE
depends on KERNEL_MODE_NEON && CRC32
select CRYPTO_HASH
+config CRYPTO_CHACHA20_NEON
+ tristate "NEON accelerated ChaCha20 symmetric cipher"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_CHACHA20
+
endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index b578a1820ab1..8d74e55eacd4 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -40,6 +41,7 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
+chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
quiet_cmd_perl = PERL $@
cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S
new file mode 100644
index 000000000000..ff1d337bdb4a
--- /dev/null
+++ b/arch/arm/crypto/chacha20-neon-core.S
@@ -0,0 +1,524 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+ .text
+ .fpu neon
+ .align 5
+
+ENTRY(chacha20_block_xor_neon)
+ // r0: Input state matrix, s
+ // r1: 1 data block output, o
+ // r2: 1 data block input, i
+ // Clobbers r3, ip, q0-q11 and the condition flags.
+ //
+ // This function encrypts one ChaCha20 block by loading the state matrix
+ // in four NEON registers. It performs matrix operation on four words in
+ // parallel, but requires shuffling to rearrange the words after each
+ // round.
+ //
+
+ // x0..3 = s0..3
+ add ip, r0, #0x20 // ip = upper 32 bytes of the 64 byte state
+ vld1.32 {q0-q1}, [r0]
+ vld1.32 {q2-q3}, [ip]
+
+ vmov q8, q0 // q8-q11 = untouched copy of s0..3 for the final addition
+ vmov q9, q1
+ vmov q10, q2
+ vmov q11, q3
+
+ mov r3, #10 // loop counter: 10 double rounds
+
+.Ldoubleround:
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vadd.i32 q0, q0, q1
+ veor q4, q3, q0
+ vshl.u32 q3, q4, #16 // rotl32(): shift left by n ...
+ vsri.u32 q3, q4, #16 // ... then insert the value shifted right by 32 - n
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #12
+ vsri.u32 q1, q4, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vadd.i32 q0, q0, q1
+ veor q4, q3, q0
+ vshl.u32 q3, q4, #8
+ vsri.u32 q3, q4, #24
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #7
+ vsri.u32 q1, q4, #25
+ // rotate the words of x1, x2 and x3 by one, two and three positions
+ // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vext.8 q1, q1, q1, #4
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vext.8 q2, q2, q2, #8
+ // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vext.8 q3, q3, q3, #12
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vadd.i32 q0, q0, q1
+ veor q4, q3, q0
+ vshl.u32 q3, q4, #16
+ vsri.u32 q3, q4, #16
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #12
+ vsri.u32 q1, q4, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vadd.i32 q0, q0, q1
+ veor q4, q3, q0
+ vshl.u32 q3, q4, #8
+ vsri.u32 q3, q4, #24
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #7
+ vsri.u32 q1, q4, #25
+ // rotate the words of x1, x2 and x3 back to their original positions
+ // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vext.8 q1, q1, q1, #12
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vext.8 q2, q2, q2, #8
+ // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vext.8 q3, q3, q3, #4
+
+ subs r3, r3, #1
+ bne .Ldoubleround
+
+ add ip, r2, #0x20
+ vld1.8 {q4-q5}, [r2] // q4-q7 = i0..3, the 64 input bytes
+ vld1.8 {q6-q7}, [ip]
+
+ // o0 = i0 ^ (x0 + s0)
+ vadd.i32 q0, q0, q8
+ veor q0, q0, q4
+
+ // o1 = i1 ^ (x1 + s1)
+ vadd.i32 q1, q1, q9
+ veor q1, q1, q5
+
+ // o2 = i2 ^ (x2 + s2)
+ vadd.i32 q2, q2, q10
+ veor q2, q2, q6
+
+ // o3 = i3 ^ (x3 + s3)
+ vadd.i32 q3, q3, q11
+ veor q3, q3, q7
+
+ add ip, r1, #0x20
+ vst1.8 {q0-q1}, [r1] // write the 64 output bytes
+ vst1.8 {q2-q3}, [ip]
+
+ bx lr
+ENDPROC(chacha20_block_xor_neon)
+
+ .align 5
+ENTRY(chacha20_4block_xor_neon)
+ push {r4-r6, lr} // r4-r6 receive state words via ldmia below
+ mov ip, sp // preserve the stack pointer
+ sub r3, sp, #0x20 // allocate a 32 byte buffer
+ bic r3, r3, #0x1f // aligned to 32 bytes
+ mov sp, r3
+ // the buffer at [sp] holds x8/x9 while q8-q9 serve as scratch registers
+ // r0: Input state matrix, s
+ // r1: 4 data blocks output, o
+ // r2: 4 data blocks input, i
+ // Clobbers r0-r3, ip, q0-q15 and the condition flags.
+ //
+ // This function encrypts four consecutive ChaCha20 blocks by loading
+ // the state matrix in NEON registers four times. The algorithm performs
+ // each operation on the corresponding word of each state matrix, hence
+ // requires no word shuffling. For final XORing step we transpose the
+ // matrix by interleaving 32- and then 64-bit words, which allows us to
+ // do XOR in NEON registers.
+ //
+
+ // x0..15[0-3] = s0..3[0..3]
+ add r3, r0, #0x20
+ vld1.32 {q0-q1}, [r0]
+ vld1.32 {q2-q3}, [r3]
+
+ adr r3, CTRINC // r3 = address of the {0, 1, 2, 3} table
+ vdup.32 q15, d7[1] // broadcast from the top down so q0-q3 are overwritten last
+ vdup.32 q14, d7[0]
+ vld1.32 {q11}, [r3, :128] // q11 temporarily holds the counter offsets
+ vdup.32 q13, d6[1]
+ vdup.32 q12, d6[0]
+ vadd.i32 q12, q12, q11 // x12 += counter values 0-3
+ vdup.32 q11, d5[1]
+ vdup.32 q10, d5[0]
+ vdup.32 q9, d4[1]
+ vdup.32 q8, d4[0]
+ vdup.32 q7, d3[1]
+ vdup.32 q6, d3[0]
+ vdup.32 q5, d2[1]
+ vdup.32 q4, d2[0]
+ vdup.32 q3, d1[1]
+ vdup.32 q2, d1[0]
+ vdup.32 q1, d0[1]
+ vdup.32 q0, d0[0]
+
+ mov r3, #10 // loop counter: 10 double rounds
+
+.Ldoubleround4:
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vadd.i32 q0, q0, q4
+ vadd.i32 q1, q1, q5
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+
+ veor q12, q12, q0
+ veor q13, q13, q1
+ veor q14, q14, q2
+ veor q15, q15, q3
+
+ vrev32.16 q12, q12 // swapping the 16-bit halves == rotl32 by 16
+ vrev32.16 q13, q13
+ vrev32.16 q14, q14
+ vrev32.16 q15, q15
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i32 q10, q10, q14
+ vadd.i32 q11, q11, q15
+
+ vst1.32 {q8-q9}, [sp, :256] // spill x8/x9, q8-q9 become scratch
+
+ veor q8, q4, q8
+ veor q9, q5, q9
+ vshl.u32 q4, q8, #12
+ vshl.u32 q5, q9, #12
+ vsri.u32 q4, q8, #20
+ vsri.u32 q5, q9, #20
+
+ veor q8, q6, q10
+ veor q9, q7, q11
+ vshl.u32 q6, q8, #12
+ vshl.u32 q7, q9, #12
+ vsri.u32 q6, q8, #20
+ vsri.u32 q7, q9, #20
+
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vadd.i32 q0, q0, q4
+ vadd.i32 q1, q1, q5
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+
+ veor q8, q12, q0
+ veor q9, q13, q1
+ vshl.u32 q12, q8, #8
+ vshl.u32 q13, q9, #8
+ vsri.u32 q12, q8, #24
+ vsri.u32 q13, q9, #24
+
+ veor q8, q14, q2
+ veor q9, q15, q3
+ vshl.u32 q14, q8, #8
+ vshl.u32 q15, q9, #8
+ vsri.u32 q14, q8, #24
+ vsri.u32 q15, q9, #24
+
+ vld1.32 {q8-q9}, [sp, :256] // reload x8/x9
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i32 q10, q10, q14
+ vadd.i32 q11, q11, q15
+
+ vst1.32 {q8-q9}, [sp, :256] // spill x8/x9 again
+
+ veor q8, q4, q8
+ veor q9, q5, q9
+ vshl.u32 q4, q8, #7
+ vshl.u32 q5, q9, #7
+ vsri.u32 q4, q8, #25
+ vsri.u32 q5, q9, #25
+
+ veor q8, q6, q10
+ veor q9, q7, q11
+ vshl.u32 q6, q8, #7
+ vshl.u32 q7, q9, #7
+ vsri.u32 q6, q8, #25
+ vsri.u32 q7, q9, #25
+
+ vld1.32 {q8-q9}, [sp, :256] // reload x8/x9
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vadd.i32 q0, q0, q5
+ vadd.i32 q1, q1, q6
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q4
+
+ veor q15, q15, q0
+ veor q12, q12, q1
+ veor q13, q13, q2
+ veor q14, q14, q3
+
+ vrev32.16 q15, q15
+ vrev32.16 q12, q12
+ vrev32.16 q13, q13
+ vrev32.16 q14, q14
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vadd.i32 q10, q10, q15
+ vadd.i32 q11, q11, q12
+ vadd.i32 q8, q8, q13
+ vadd.i32 q9, q9, q14
+
+ vst1.32 {q8-q9}, [sp, :256] // spill x8/x9, q8-q9 become scratch
+
+ veor q8, q7, q8
+ veor q9, q4, q9
+ vshl.u32 q7, q8, #12
+ vshl.u32 q4, q9, #12
+ vsri.u32 q7, q8, #20
+ vsri.u32 q4, q9, #20
+
+ veor q8, q5, q10
+ veor q9, q6, q11
+ vshl.u32 q5, q8, #12
+ vshl.u32 q6, q9, #12
+ vsri.u32 q5, q8, #20
+ vsri.u32 q6, q9, #20
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vadd.i32 q0, q0, q5
+ vadd.i32 q1, q1, q6
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q4
+
+ veor q8, q15, q0
+ veor q9, q12, q1
+ vshl.u32 q15, q8, #8
+ vshl.u32 q12, q9, #8
+ vsri.u32 q15, q8, #24
+ vsri.u32 q12, q9, #24
+
+ veor q8, q13, q2
+ veor q9, q14, q3
+ vshl.u32 q13, q8, #8
+ vshl.u32 q14, q9, #8
+ vsri.u32 q13, q8, #24
+ vsri.u32 q14, q9, #24
+
+ vld1.32 {q8-q9}, [sp, :256] // reload x8/x9
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vadd.i32 q10, q10, q15
+ vadd.i32 q11, q11, q12
+ vadd.i32 q8, q8, q13
+ vadd.i32 q9, q9, q14
+
+ vst1.32 {q8-q9}, [sp, :256] // spill x8/x9; after the last round they stay here
+
+ veor q8, q7, q8
+ veor q9, q4, q9
+ vshl.u32 q7, q8, #7
+ vshl.u32 q4, q9, #7
+ vsri.u32 q7, q8, #25
+ vsri.u32 q4, q9, #25
+
+ veor q8, q5, q10
+ veor q9, q6, q11
+ vshl.u32 q5, q8, #7
+ vshl.u32 q6, q9, #7
+ vsri.u32 q5, q8, #25
+ vsri.u32 q6, q9, #25
+
+ subs r3, r3, #1
+ beq 0f // done: skip the reload, q8-q9 are needed as scratch
+
+ vld1.32 {q8-q9}, [sp, :256] // reload x8/x9 for the next double round
+ b .Ldoubleround4
+
+ // x0[0-3] += s0[0]
+ // x1[0-3] += s0[1]
+ // x2[0-3] += s0[2]
+ // x3[0-3] += s0[3]
+0: ldmia r0!, {r3-r6} // r3-r6 = s0[0..3], r0 advances to s1
+ vdup.32 q8, r3
+ vdup.32 q9, r4
+ vadd.i32 q0, q0, q8
+ vadd.i32 q1, q1, q9
+ vdup.32 q8, r5
+ vdup.32 q9, r6
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+
+ // x4[0-3] += s1[0]
+ // x5[0-3] += s1[1]
+ // x6[0-3] += s1[2]
+ // x7[0-3] += s1[3]
+ ldmia r0!, {r3-r6}
+ vdup.32 q8, r3
+ vdup.32 q9, r4
+ vadd.i32 q4, q4, q8
+ vadd.i32 q5, q5, q9
+ vdup.32 q8, r5
+ vdup.32 q9, r6
+ vadd.i32 q6, q6, q8
+ vadd.i32 q7, q7, q9
+
+ // interleave 32-bit words in state n, n+1
+ vzip.32 q0, q1
+ vzip.32 q2, q3
+ vzip.32 q4, q5
+ vzip.32 q6, q7
+
+ // interleave 64-bit words in state n, n+2
+ vswp d1, d4
+ vswp d3, d6
+ vswp d9, d12
+ vswp d11, d14
+
+ // xor with corresponding input, write to output
+ vld1.8 {q8-q9}, [r2]!
+ veor q8, q8, q0
+ veor q9, q9, q4 // q0 and q4 are free for reuse from here on
+ vst1.8 {q8-q9}, [r1]!
+
+ vld1.32 {q8-q9}, [sp, :256] // reload x8/x9 spilled in the last round
+
+ // x8[0-3] += s2[0]
+ // x9[0-3] += s2[1]
+ // x10[0-3] += s2[2]
+ // x11[0-3] += s2[3]
+ ldmia r0!, {r3-r6}
+ vdup.32 q0, r3
+ vdup.32 q4, r4
+ vadd.i32 q8, q8, q0
+ vadd.i32 q9, q9, q4
+ vdup.32 q0, r5
+ vdup.32 q4, r6
+ vadd.i32 q10, q10, q0
+ vadd.i32 q11, q11, q4
+
+ // x12[0-3] += s3[0]
+ // x13[0-3] += s3[1]
+ // x14[0-3] += s3[2]
+ // x15[0-3] += s3[3]
+ ldmia r0!, {r3-r6}
+ vdup.32 q0, r3
+ vdup.32 q4, r4
+ adr r3, CTRINC // r3 (s3[0]) was already consumed by the vdup above
+ vadd.i32 q12, q12, q0
+ vld1.32 {q0}, [r3, :128]
+ vadd.i32 q13, q13, q4
+ vadd.i32 q12, q12, q0 // x12 += counter values 0-3
+
+ vdup.32 q0, r5
+ vdup.32 q4, r6
+ vadd.i32 q14, q14, q0
+ vadd.i32 q15, q15, q4
+
+ // interleave 32-bit words in state n, n+1
+ vzip.32 q8, q9
+ vzip.32 q10, q11
+ vzip.32 q12, q13
+ vzip.32 q14, q15
+
+ // interleave 64-bit words in state n, n+2
+ vswp d17, d20
+ vswp d19, d22
+ vswp d25, d28
+ vswp d27, d30
+
+ vmov q4, q1 // q0-q1 are used for I/O below, keep q1's keystream in q4
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q8
+ veor q1, q1, q12
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q2
+ veor q1, q1, q6
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q10
+ veor q1, q1, q14
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q4
+ veor q1, q1, q5
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q9
+ veor q1, q1, q13
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q3
+ veor q1, q1, q7
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]
+ veor q0, q0, q11
+ veor q1, q1, q15
+ vst1.8 {q0-q1}, [r1]
+
+ mov sp, ip // release the aligned buffer
+ pop {r4-r6, pc}
+ENDPROC(chacha20_4block_xor_neon)
+
+ .align 4 // 2^4 = 16 bytes, as the :128 qualified loads of CTRINC require
+CTRINC: .word 0, 1, 2, 3 // block counter offsets of the four parallel blocks
+
diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c
new file mode 100644
index 000000000000..592f75ae4fa1
--- /dev/null
+++ b/arch/arm/crypto/chacha20-neon-glue.c
@@ -0,0 +1,128 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha20.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+
+static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes)
+{
+ u8 buf[CHACHA20_BLOCK_SIZE]; /* bounce buffer for a trailing partial block */
+
+ while (bytes >= CHACHA20_BLOCK_SIZE * 4) { /* bulk path: four blocks per call */
+ chacha20_4block_xor_neon(state, dst, src);
+ bytes -= CHACHA20_BLOCK_SIZE * 4;
+ src += CHACHA20_BLOCK_SIZE * 4;
+ dst += CHACHA20_BLOCK_SIZE * 4;
+ state[12] += 4; /* word 12 is the block counter */
+ }
+ while (bytes >= CHACHA20_BLOCK_SIZE) { /* remaining whole blocks, one at a time */
+ chacha20_block_xor_neon(state, dst, src);
+ bytes -= CHACHA20_BLOCK_SIZE;
+ src += CHACHA20_BLOCK_SIZE;
+ dst += CHACHA20_BLOCK_SIZE;
+ state[12]++;
+ }
+ if (bytes) { /* tail shorter than one block */
+ memcpy(buf, src, bytes); /* the asm routine always reads and writes a full block */
+ chacha20_block_xor_neon(state, buf, buf);
+ memcpy(dst, buf, bytes); /* copy back only the bytes that were requested */
+ }
+}
+
+static int chacha20_neon(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ u32 state[16];
+ int err;
+ /* requests of at most one block, or contexts where SIMD is unusable, go to the generic code */
+ if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
+ return crypto_chacha20_crypt(req);
+
+ err = skcipher_walk_virt(&walk, req, true); /* NOTE(review): 'true' presumably requests an atomic walk - confirm */
+
+ crypto_chacha20_init(state, ctx, walk.iv);
+
+ kernel_neon_begin(); /* the whole walk runs between kernel_neon_begin() and kernel_neon_end() */
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total) /* not the final chunk of the request */
+ nbytes = round_down(nbytes, walk.stride); /* process whole strides only */
+
+ chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes);
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes); /* hand back the unprocessed remainder */
+ }
+ kernel_neon_end();
+
+ return err;
+}
+
+static struct skcipher_alg alg = {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1, /* stream cipher: no block granularity towards the caller */
+ .base.cra_ctxsize = sizeof(struct chacha20_ctx),
+ .base.cra_alignmask = 1,
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA20_KEY_SIZE,
+ .max_keysize = CHACHA20_KEY_SIZE,
+ .ivsize = CHACHA20_IV_SIZE,
+ .chunksize = CHACHA20_BLOCK_SIZE,
+ .walksize = 4 * CHACHA20_BLOCK_SIZE, /* matches the 4-block NEON routine */
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = chacha20_neon,
+ .decrypt = chacha20_neon, /* same routine as .encrypt */
+};
+
+static int __init chacha20_simd_mod_init(void)
+{
+ if (!(elf_hwcap & HWCAP_NEON)) /* refuse to load when the CPU does not report NEON */
+ return -ENODEV;
+
+ return crypto_register_skcipher(&alg);
+}
+
+static void __exit chacha20_simd_mod_fini(void)
+{
+ crypto_unregister_skcipher(&alg); /* drop the "chacha20-neon" registration */
+}
+
+module_init(chacha20_simd_mod_init);
+module_exit(chacha20_simd_mod_fini);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
--
2.7.4
^ permalink raw reply related
* [PATCH 4/6] crypto: arm64/chacha20 - implement NEON version based on SSE3 code
From: Ard Biesheuvel @ 2017-01-02 18:21 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1483381268-12987-1-git-send-email-ard.biesheuvel@linaro.org>
This is a straight port to arm64/NEON of the x86 SSE3 implementation
of the ChaCha20 stream cipher. It uses the new skcipher walksize
attribute to process the input in strides of 4x the block size.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
arch/arm64/crypto/Kconfig | 6 +
arch/arm64/crypto/Makefile | 3 +
arch/arm64/crypto/chacha20-neon-core.S | 450 ++++++++++++++++++++
arch/arm64/crypto/chacha20-neon-glue.c | 127 ++++++
4 files changed, 586 insertions(+)
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 450a85df041a..0bf0f531f539 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -72,4 +72,10 @@ config CRYPTO_CRC32_ARM64
depends on ARM64
select CRYPTO_HASH
+config CRYPTO_CHACHA20_NEON
+ tristate "NEON accelerated ChaCha20 symmetric cipher"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_CHACHA20
+
endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index aa8888d7b744..9d2826c5fccf 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -41,6 +41,9 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
sha512-arm64-y := sha512-glue.o sha512-core.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
+chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+
AFLAGS_aes-ce.o := -DINTERLEAVE=4
AFLAGS_aes-neon.o := -DINTERLEAVE=4
diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S
new file mode 100644
index 000000000000..13c85e272c2a
--- /dev/null
+++ b/arch/arm64/crypto/chacha20-neon-core.S
@@ -0,0 +1,450 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+ .text
+ .align 6
+
+ENTRY(chacha20_block_xor_neon)
+ // x0: Input state matrix, s
+ // x1: 1 data block output, o
+ // x2: 1 data block input, i
+ // Clobbers x3, v0-v12 and the condition flags.
+ //
+ // This function encrypts one ChaCha20 block by loading the state matrix
+ // in four NEON registers. It performs matrix operation on four words in
+ // parallel, but requires shuffling to rearrange the words after each
+ // round.
+ //
+
+ // x0..3 = s0..3
+ adr x3, ROT8
+ ld1 {v0.4s-v3.4s}, [x0] // working copy of the state
+ ld1 {v8.4s-v11.4s}, [x0] // untouched copy, added back after the rounds
+ ld1 {v12.4s}, [x3] // v12 = ROT8 byte permutation for tbl
+
+ mov x3, #10 // loop counter: 10 double rounds
+
+.Ldoubleround:
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ rev32 v3.8h, v3.8h // swapping the 16-bit halves == rotl32 by 16
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #12 // rotl32(): shift left by n ...
+ sri v1.4s, v4.4s, #20 // ... then insert the value shifted right by 32 - n
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ tbl v3.16b, {v3.16b}, v12.16b // byte permutation == rotl32 by 8
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #7
+ sri v1.4s, v4.4s, #25
+ // rotate the words of x1, x2 and x3 by one, two and three positions
+ // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ ext v1.16b, v1.16b, v1.16b, #4
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ ext v2.16b, v2.16b, v2.16b, #8
+ // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ ext v3.16b, v3.16b, v3.16b, #12
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ rev32 v3.8h, v3.8h
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #12
+ sri v1.4s, v4.4s, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ tbl v3.16b, {v3.16b}, v12.16b
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #7
+ sri v1.4s, v4.4s, #25
+ // rotate the words of x1, x2 and x3 back to their original positions
+ // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ ext v1.16b, v1.16b, v1.16b, #12
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ ext v2.16b, v2.16b, v2.16b, #8
+ // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ ext v3.16b, v3.16b, v3.16b, #4
+
+ subs x3, x3, #1
+ b.ne .Ldoubleround
+
+ ld1 {v4.16b-v7.16b}, [x2] // v4-v7 = i0..3, the 64 input bytes
+
+ // o0 = i0 ^ (x0 + s0)
+ add v0.4s, v0.4s, v8.4s
+ eor v0.16b, v0.16b, v4.16b
+
+ // o1 = i1 ^ (x1 + s1)
+ add v1.4s, v1.4s, v9.4s
+ eor v1.16b, v1.16b, v5.16b
+
+ // o2 = i2 ^ (x2 + s2)
+ add v2.4s, v2.4s, v10.4s
+ eor v2.16b, v2.16b, v6.16b
+
+ // o3 = i3 ^ (x3 + s3)
+ add v3.4s, v3.4s, v11.4s
+ eor v3.16b, v3.16b, v7.16b
+
+ st1 {v0.16b-v3.16b}, [x1] // write the 64 output bytes
+
+ ret
+ENDPROC(chacha20_block_xor_neon)
+
+ .align 6
+ENTRY(chacha20_4block_xor_neon)
+ // x0: Input state matrix, s
+ // x1: 4 data blocks output, o
+ // x2: 4 data blocks input, i
+ // Clobbers x0-x4, v0-v31 and the condition flags.
+ //
+ // This function encrypts four consecutive ChaCha20 blocks by loading
+ // the state matrix in NEON registers four times. The algorithm performs
+ // each operation on the corresponding word of each state matrix, hence
+ // requires no word shuffling. For final XORing step we transpose the
+ // matrix by interleaving 32- and then 64-bit words, which allows us to
+ // do XOR in NEON registers.
+ //
+ adr x3, CTRINC // ... and ROT8
+ ld1 {v30.4s-v31.4s}, [x3] // v30 = CTRINC, v31 = ROT8 (stored back to back)
+
+ // x0..15[0-3] = s0..3[0..3]
+ mov x4, x0 // walk a copy, x0 is needed again after the rounds
+ ld4r { v0.4s- v3.4s}, [x4], #16 // each ld4r broadcasts four consecutive state words
+ ld4r { v4.4s- v7.4s}, [x4], #16
+ ld4r { v8.4s-v11.4s}, [x4], #16
+ ld4r {v12.4s-v15.4s}, [x4]
+
+ // x12 += counter values 0-3
+ add v12.4s, v12.4s, v30.4s
+
+ mov x3, #10 // loop counter: 10 double rounds
+
+.Ldoubleround4:
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ add v0.4s, v0.4s, v4.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+
+ eor v12.16b, v12.16b, v0.16b
+ eor v13.16b, v13.16b, v1.16b
+ eor v14.16b, v14.16b, v2.16b
+ eor v15.16b, v15.16b, v3.16b
+
+ rev32 v12.8h, v12.8h // swapping the 16-bit halves == rotl32 by 16
+ rev32 v13.8h, v13.8h
+ rev32 v14.8h, v14.8h
+ rev32 v15.8h, v15.8h
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ add v8.4s, v8.4s, v12.4s
+ add v9.4s, v9.4s, v13.4s
+ add v10.4s, v10.4s, v14.4s
+ add v11.4s, v11.4s, v15.4s
+
+ eor v16.16b, v4.16b, v8.16b // v16-v19 are scratch inside the loop
+ eor v17.16b, v5.16b, v9.16b
+ eor v18.16b, v6.16b, v10.16b
+ eor v19.16b, v7.16b, v11.16b
+
+ shl v4.4s, v16.4s, #12
+ shl v5.4s, v17.4s, #12
+ shl v6.4s, v18.4s, #12
+ shl v7.4s, v19.4s, #12
+
+ sri v4.4s, v16.4s, #20
+ sri v5.4s, v17.4s, #20
+ sri v6.4s, v18.4s, #20
+ sri v7.4s, v19.4s, #20
+
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ add v0.4s, v0.4s, v4.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+
+ eor v12.16b, v12.16b, v0.16b
+ eor v13.16b, v13.16b, v1.16b
+ eor v14.16b, v14.16b, v2.16b
+ eor v15.16b, v15.16b, v3.16b
+
+ tbl v12.16b, {v12.16b}, v31.16b // ROT8 byte permutation == rotl32 by 8
+ tbl v13.16b, {v13.16b}, v31.16b
+ tbl v14.16b, {v14.16b}, v31.16b
+ tbl v15.16b, {v15.16b}, v31.16b
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ add v8.4s, v8.4s, v12.4s
+ add v9.4s, v9.4s, v13.4s
+ add v10.4s, v10.4s, v14.4s
+ add v11.4s, v11.4s, v15.4s
+
+ eor v16.16b, v4.16b, v8.16b
+ eor v17.16b, v5.16b, v9.16b
+ eor v18.16b, v6.16b, v10.16b
+ eor v19.16b, v7.16b, v11.16b
+
+ shl v4.4s, v16.4s, #7
+ shl v5.4s, v17.4s, #7
+ shl v6.4s, v18.4s, #7
+ shl v7.4s, v19.4s, #7
+
+ sri v4.4s, v16.4s, #25
+ sri v5.4s, v17.4s, #25
+ sri v6.4s, v18.4s, #25
+ sri v7.4s, v19.4s, #25
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v4.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v12.16b, v12.16b, v1.16b
+ eor v13.16b, v13.16b, v2.16b
+ eor v14.16b, v14.16b, v3.16b
+
+ rev32 v15.8h, v15.8h
+ rev32 v12.8h, v12.8h
+ rev32 v13.8h, v13.8h
+ rev32 v14.8h, v14.8h
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v12.4s
+ add v8.4s, v8.4s, v13.4s
+ add v9.4s, v9.4s, v14.4s
+
+ eor v16.16b, v5.16b, v10.16b
+ eor v17.16b, v6.16b, v11.16b
+ eor v18.16b, v7.16b, v8.16b
+ eor v19.16b, v4.16b, v9.16b
+
+ shl v5.4s, v16.4s, #12
+ shl v6.4s, v17.4s, #12
+ shl v7.4s, v18.4s, #12
+ shl v4.4s, v19.4s, #12
+
+ sri v5.4s, v16.4s, #20
+ sri v6.4s, v17.4s, #20
+ sri v7.4s, v18.4s, #20
+ sri v4.4s, v19.4s, #20
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v4.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v12.16b, v12.16b, v1.16b
+ eor v13.16b, v13.16b, v2.16b
+ eor v14.16b, v14.16b, v3.16b
+
+ tbl v15.16b, {v15.16b}, v31.16b
+ tbl v12.16b, {v12.16b}, v31.16b
+ tbl v13.16b, {v13.16b}, v31.16b
+ tbl v14.16b, {v14.16b}, v31.16b
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v12.4s
+ add v8.4s, v8.4s, v13.4s
+ add v9.4s, v9.4s, v14.4s
+
+ eor v16.16b, v5.16b, v10.16b
+ eor v17.16b, v6.16b, v11.16b
+ eor v18.16b, v7.16b, v8.16b
+ eor v19.16b, v4.16b, v9.16b
+
+ shl v5.4s, v16.4s, #7
+ shl v6.4s, v17.4s, #7
+ shl v7.4s, v18.4s, #7
+ shl v4.4s, v19.4s, #7
+
+ sri v5.4s, v16.4s, #25
+ sri v6.4s, v17.4s, #25
+ sri v7.4s, v18.4s, #25
+ sri v4.4s, v19.4s, #25
+
+ subs x3, x3, #1
+ b.ne .Ldoubleround4
+
+ ld4r {v16.4s-v19.4s}, [x0], #16 // v16-v19 = s0[0..3], each broadcast to all lanes
+ ld4r {v20.4s-v23.4s}, [x0], #16 // v20-v23 = s1[0..3]
+
+ // x12 += counter values 0-3
+ add v12.4s, v12.4s, v30.4s
+
+ // x0[0-3] += s0[0]
+ // x1[0-3] += s0[1]
+ // x2[0-3] += s0[2]
+ // x3[0-3] += s0[3]
+ add v0.4s, v0.4s, v16.4s
+ add v1.4s, v1.4s, v17.4s
+ add v2.4s, v2.4s, v18.4s
+ add v3.4s, v3.4s, v19.4s
+
+ ld4r {v24.4s-v27.4s}, [x0], #16 // v24-v27 = s2[0..3]
+ ld4r {v28.4s-v31.4s}, [x0] // v28-v31 = s3[0..3]; overwrites CTRINC/ROT8, no longer needed
+
+ // x4[0-3] += s1[0]
+ // x5[0-3] += s1[1]
+ // x6[0-3] += s1[2]
+ // x7[0-3] += s1[3]
+ add v4.4s, v4.4s, v20.4s
+ add v5.4s, v5.4s, v21.4s
+ add v6.4s, v6.4s, v22.4s
+ add v7.4s, v7.4s, v23.4s
+
+ // x8[0-3] += s2[0]
+ // x9[0-3] += s2[1]
+ // x10[0-3] += s2[2]
+ // x11[0-3] += s2[3]
+ add v8.4s, v8.4s, v24.4s
+ add v9.4s, v9.4s, v25.4s
+ add v10.4s, v10.4s, v26.4s
+ add v11.4s, v11.4s, v27.4s
+
+ // x12[0-3] += s3[0]
+ // x13[0-3] += s3[1]
+ // x14[0-3] += s3[2]
+ // x15[0-3] += s3[3]
+ add v12.4s, v12.4s, v28.4s
+ add v13.4s, v13.4s, v29.4s
+ add v14.4s, v14.4s, v30.4s
+ add v15.4s, v15.4s, v31.4s
+
+ // interleave 32-bit words in state n, n+1
+ zip1 v16.4s, v0.4s, v1.4s
+ zip2 v17.4s, v0.4s, v1.4s
+ zip1 v18.4s, v2.4s, v3.4s
+ zip2 v19.4s, v2.4s, v3.4s
+ zip1 v20.4s, v4.4s, v5.4s
+ zip2 v21.4s, v4.4s, v5.4s
+ zip1 v22.4s, v6.4s, v7.4s
+ zip2 v23.4s, v6.4s, v7.4s
+ zip1 v24.4s, v8.4s, v9.4s
+ zip2 v25.4s, v8.4s, v9.4s
+ zip1 v26.4s, v10.4s, v11.4s
+ zip2 v27.4s, v10.4s, v11.4s
+ zip1 v28.4s, v12.4s, v13.4s
+ zip2 v29.4s, v12.4s, v13.4s
+ zip1 v30.4s, v14.4s, v15.4s
+ zip2 v31.4s, v14.4s, v15.4s
+
+ // interleave 64-bit words in state n, n+2
+ zip1 v0.2d, v16.2d, v18.2d
+ zip2 v4.2d, v16.2d, v18.2d
+ zip1 v8.2d, v17.2d, v19.2d
+ zip2 v12.2d, v17.2d, v19.2d
+ ld1 {v16.16b-v19.16b}, [x2], #64 // v16-v19 are consumed, reuse them for input block 0
+
+ zip1 v1.2d, v20.2d, v22.2d
+ zip2 v5.2d, v20.2d, v22.2d
+ zip1 v9.2d, v21.2d, v23.2d
+ zip2 v13.2d, v21.2d, v23.2d
+ ld1 {v20.16b-v23.16b}, [x2], #64 // input block 1
+
+ zip1 v2.2d, v24.2d, v26.2d
+ zip2 v6.2d, v24.2d, v26.2d
+ zip1 v10.2d, v25.2d, v27.2d
+ zip2 v14.2d, v25.2d, v27.2d
+ ld1 {v24.16b-v27.16b}, [x2], #64 // input block 2
+
+ zip1 v3.2d, v28.2d, v30.2d
+ zip2 v7.2d, v28.2d, v30.2d
+ zip1 v11.2d, v29.2d, v31.2d
+ zip2 v15.2d, v29.2d, v31.2d
+ ld1 {v28.16b-v31.16b}, [x2] // input block 3
+
+ // xor with corresponding input, write to output
+ eor v16.16b, v16.16b, v0.16b
+ eor v17.16b, v17.16b, v1.16b
+ eor v18.16b, v18.16b, v2.16b
+ eor v19.16b, v19.16b, v3.16b
+ eor v20.16b, v20.16b, v4.16b
+ eor v21.16b, v21.16b, v5.16b
+ st1 {v16.16b-v19.16b}, [x1], #64
+ eor v22.16b, v22.16b, v6.16b
+ eor v23.16b, v23.16b, v7.16b
+ eor v24.16b, v24.16b, v8.16b
+ eor v25.16b, v25.16b, v9.16b
+ st1 {v20.16b-v23.16b}, [x1], #64
+ eor v26.16b, v26.16b, v10.16b
+ eor v27.16b, v27.16b, v11.16b
+ eor v28.16b, v28.16b, v12.16b
+ st1 {v24.16b-v27.16b}, [x1], #64
+ eor v29.16b, v29.16b, v13.16b
+ eor v30.16b, v30.16b, v14.16b
+ eor v31.16b, v31.16b, v15.16b
+ st1 {v28.16b-v31.16b}, [x1]
+
+ ret
+ENDPROC(chacha20_4block_xor_neon)
+
+CTRINC: .word 0, 1, 2, 3 // block counter offsets of the four parallel blocks
+ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f // tbl indices for rotl32 by 8; must directly follow CTRINC (one ld1 loads both)
diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c
new file mode 100644
index 000000000000..a7f2337d46cf
--- /dev/null
+++ b/arch/arm64/crypto/chacha20-neon-glue.c
@@ -0,0 +1,127 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha20.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+
+static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes) /* run the NEON xor routines over 'bytes' bytes from src to dst, stepping state[12] per whole block */
+{
+ u8 buf[CHACHA20_BLOCK_SIZE]; /* bounce buffer for a trailing partial block */
+
+ while (bytes >= CHACHA20_BLOCK_SIZE * 4) { /* bulk path: four blocks per call */
+ chacha20_4block_xor_neon(state, dst, src);
+ bytes -= CHACHA20_BLOCK_SIZE * 4;
+ src += CHACHA20_BLOCK_SIZE * 4;
+ dst += CHACHA20_BLOCK_SIZE * 4;
+ state[12] += 4; /* step state word 12 past the four blocks just consumed */
+ }
+ while (bytes >= CHACHA20_BLOCK_SIZE) { /* remaining whole blocks, one at a time */
+ chacha20_block_xor_neon(state, dst, src);
+ bytes -= CHACHA20_BLOCK_SIZE;
+ src += CHACHA20_BLOCK_SIZE;
+ dst += CHACHA20_BLOCK_SIZE;
+ state[12]++;
+ }
+ if (bytes) { /* tail shorter than one block */
+ memcpy(buf, src, bytes);
+ chacha20_block_xor_neon(state, buf, buf); /* a whole block is processed in place in buf */
+ memcpy(dst, buf, bytes); /* only the requested bytes are copied out */
+ } /* state[12] is not advanced for the partial block */
+}
+
+static int chacha20_neon(struct skcipher_request *req) /* request handler; the alg definition uses it for both .encrypt and .decrypt */
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ u32 state[16];
+ int err;
+
+ if (req->cryptlen <= CHACHA20_BLOCK_SIZE) /* at most one block: delegate before any NEON use */
+ return crypto_chacha20_crypt(req);
+
+ err = skcipher_walk_virt(&walk, req, true); /* NOTE(review): err is not tested here; presumably walk.nbytes is 0 on failure so the loop below is skipped - confirm */
+
+ crypto_chacha20_init(state, ctx, walk.iv);
+
+ kernel_neon_begin(); /* paired with kernel_neon_end() after the loop */
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total) /* more data follows this chunk */
+ nbytes = round_down(nbytes, walk.stride); /* so consume whole strides only */
+
+ chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes);
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes); /* pass back the count of bytes left unprocessed */
+ }
+ kernel_neon_end();
+
+ return err;
+}
+
+static struct skcipher_alg alg = { /* registered by chacha20_simd_mod_init(), removed by chacha20_simd_mod_fini() */
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1, /* byte granular: any request length is accepted */
+ .base.cra_ctxsize = sizeof(struct chacha20_ctx),
+ .base.cra_alignmask = 1,
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA20_KEY_SIZE, /* min == max: a single key size */
+ .max_keysize = CHACHA20_KEY_SIZE,
+ .ivsize = CHACHA20_IV_SIZE,
+ .chunksize = CHACHA20_BLOCK_SIZE,
+ .walksize = 4 * CHACHA20_BLOCK_SIZE, /* same size as one chacha20_4block_xor_neon() call consumes */
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = chacha20_neon, /* encrypt and decrypt share one handler */
+ .decrypt = chacha20_neon,
+};
+
+static int __init chacha20_simd_mod_init(void) /* module entry point: register the skcipher */
+{
+ if (!(elf_hwcap & HWCAP_ASIMD)) /* HWCAP_ASIMD not set in elf_hwcap: refuse to load */
+ return -ENODEV;
+
+ return crypto_register_skcipher(&alg);
+}
+
+static void __exit chacha20_simd_mod_fini(void) /* module exit point */
+{
+ crypto_unregister_skcipher(&alg); /* undo the registration done at init */
+}
+
+module_init(chacha20_simd_mod_init);
+module_exit(chacha20_simd_mod_fini);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
--
2.7.4
^ permalink raw reply related
* [PATCH 5/6] crypto: arm64/aes-blk - expose AES-CTR as synchronous cipher as well
From: Ard Biesheuvel @ 2017-01-02 18:21 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1483381268-12987-1-git-send-email-ard.biesheuvel@linaro.org>
In addition to wrapping the AES-CTR cipher into the async SIMD wrapper,
which exposes it as an async skcipher that defers processing to process
context, expose our AES-CTR implementation directly as a synchronous cipher
as well, but with a lower priority.
This makes the AES-CTR transform usable in places where synchronous
transforms are required, such as the MAC802.11 encryption code, which
executes in softirq context, where SIMD processing is allowed on arm64.
Users of the async transform will keep the existing behavior.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
arch/arm64/crypto/aes-glue.c | 25 ++++++++++++++++++--
1 file changed, 23 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index 4e3f8adb1793..5164aaf82c6a 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -327,6 +327,23 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = ctr_encrypt,
}, {
.base = {
+ .cra_name = "ctr(aes)",
+ .cra_driver_name = "ctr-aes-" MODE,
+ .cra_priority = PRIO - 1,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .cra_alignmask = 7,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .setkey = skcipher_aes_setkey,
+ .encrypt = ctr_encrypt,
+ .decrypt = ctr_encrypt,
+}, {
+ .base = {
.cra_name = "__xts(aes)",
.cra_driver_name = "__xts-aes-" MODE,
.cra_priority = PRIO,
@@ -350,8 +367,9 @@ static void aes_exit(void)
{
int i;
- for (i = 0; i < ARRAY_SIZE(aes_simd_algs) && aes_simd_algs[i]; i++)
- simd_skcipher_free(aes_simd_algs[i]);
+ for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
+ if (aes_simd_algs[i])
+ simd_skcipher_free(aes_simd_algs[i]);
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
}
@@ -370,6 +388,9 @@ static int __init aes_init(void)
return err;
for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+ if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
+ continue;
+
algname = aes_algs[i].base.cra_name + 2;
drvname = aes_algs[i].base.cra_driver_name + 2;
basename = aes_algs[i].base.cra_driver_name;
--
2.7.4
^ permalink raw reply related
* [PATCH 6/6] crypto: arm64/aes - reimplement bit-sliced ARM/NEON implementation for arm64
From: Ard Biesheuvel @ 2017-01-02 18:21 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1483381268-12987-1-git-send-email-ard.biesheuvel@linaro.org>
This is a reimplementation of the NEON version of the bit-sliced AES
algorithm. This code is heavily based on Andy Polyakov's OpenSSL version
for ARM, which is also available in the kernel. This is an alternative for
the existing NEON implementation for arm64 authored by me, which suffers
from poor performance due to its reliance on the pathologically slow four
register variant of the tbl/tbx NEON instruction.
This version is about 30% (*) faster than the generic C code, but only in
cases where the input can be 8x interleaved (this is a fundamental property
of bit slicing). For this reason, only the chaining modes ECB, XTS and CTR
are implemented. (The significance of ECB is that it could potentially be
used by other chaining modes)
* Measured on Cortex-A57. Note that this is still an order of magnitude
slower than the implementations that use the dedicated AES instructions
introduced in ARMv8, but those are part of an optional extension, and so
it is good to have a fallback.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
arch/arm64/crypto/Kconfig | 7 +
arch/arm64/crypto/Makefile | 3 +
arch/arm64/crypto/aes-neonbs-core.S | 879 ++++++++++++++++++++
arch/arm64/crypto/aes-neonbs-glue.c | 344 ++++++++
4 files changed, 1233 insertions(+)
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 0bf0f531f539..7c4249ad4935 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -78,4 +78,11 @@ config CRYPTO_CHACHA20_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
+config CRYPTO_AES_NEON_BS
+ tristate "AES in ECB/CTR/XTS modes using bit-sliced NEON algorithm"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_AES
+ select CRYPTO_SIMD
+
endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 9d2826c5fccf..df3c0584b05c 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -44,6 +44,9 @@ sha512-arm64-y := sha512-glue.o sha512-core.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+obj-$(CONFIG_CRYPTO_AES_NEON_BS) += aes-neon-bs.o
+aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
+
AFLAGS_aes-ce.o := -DINTERLEAVE=4
AFLAGS_aes-neon.o := -DINTERLEAVE=4
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S
new file mode 100644
index 000000000000..f5e1f76e8ee8
--- /dev/null
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -0,0 +1,879 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * The algorithm implemented here is described in detail by the paper
+ * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
+ * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
+ *
+ * This implementation is based primarily on the OpenSSL implementation
+ * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+
+ rounds .req x11
+ bskey .req x12
+
+ .macro in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 // in-place XOR-only transform of b0-b7; the sbox macro invokes it first
+ eor \b2, \b2, \b1
+ eor \b5, \b5, \b6
+ eor \b3, \b3, \b0
+ eor \b6, \b6, \b2
+ eor \b5, \b5, \b0
+ eor \b6, \b6, \b3
+ eor \b3, \b3, \b7
+ eor \b7, \b7, \b5
+ eor \b3, \b3, \b4
+ eor \b4, \b4, \b5
+ eor \b2, \b2, \b7
+ eor \b3, \b3, \b1
+ eor \b1, \b1, \b5 // b0 is only read, never written, by this macro
+ .endm
+
+ .macro out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 // in-place XOR-only transform of b0-b7; the sbox macro invokes it last
+ eor \b0, \b0, \b6
+ eor \b1, \b1, \b4
+ eor \b4, \b4, \b6
+ eor \b2, \b2, \b0
+ eor \b6, \b6, \b1
+ eor \b1, \b1, \b5
+ eor \b5, \b5, \b3
+ eor \b3, \b3, \b7
+ eor \b7, \b7, \b5
+ eor \b2, \b2, \b5
+ eor \b4, \b4, \b7
+ .endm
+
+ .macro inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5 // in-place XOR-only transform, first stage of inv_sbox; note the permuted parameter names
+ eor \b1, \b1, \b7
+ eor \b4, \b4, \b7
+ eor \b7, \b7, \b5
+ eor \b1, \b1, \b3
+ eor \b2, \b2, \b5
+ eor \b3, \b3, \b7
+ eor \b6, \b6, \b1
+ eor \b2, \b2, \b0
+ eor \b5, \b5, \b3
+ eor \b4, \b4, \b6
+ eor \b0, \b0, \b6
+ eor \b1, \b1, \b4
+ .endm
+
+ .macro inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2 // in-place XOR-only transform, last stage of inv_sbox; note the permuted parameter names
+ eor \b1, \b1, \b5
+ eor \b2, \b2, \b7
+ eor \b3, \b3, \b1
+ eor \b4, \b4, \b5
+ eor \b7, \b7, \b5
+ eor \b3, \b3, \b4
+ eor \b5, \b5, \b0
+ eor \b3, \b3, \b7
+ eor \b6, \b6, \b2
+ eor \b2, \b2, \b1
+ eor \b6, \b6, \b3
+ eor \b3, \b3, \b0
+ eor \b5, \b5, \b6
+ .endm
+
+ .macro mul_gf4, x0, x1, y0, y1, t0, t1 // combine (x0,x1) with (y0,y1) using AND/XOR only; result replaces x0/x1 (presumably a GF(2^2) product, per the name)
+ eor \t0, \y0, \y1
+ and \t0, \t0, \x0 // t0 = (y0 ^ y1) & x0
+ eor \x0, \x0, \x1
+ and \t1, \x1, \y0 // t1 = x1 & y0
+ and \x0, \x0, \y1 // x0 = (x0 ^ x1) & y1
+ eor \x1, \t1, \t0 // new x1 = t1 ^ t0
+ eor \x0, \x0, \t1 // new x0 = x0 ^ t1; y0/y1 are never written, t0/t1 are scratch
+ .endm
+
+ .macro mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1 // two independent AND/XOR combinations, interleaved: (x0,x1) with (y0,y1) and (x2,x3) with (y2,y3)
+ eor \t0, \y0, \y1
+ eor \t1, \y2, \y3
+ and \t0, \t0, \x0
+ and \t1, \t1, \x2
+ eor \x0, \x0, \x1
+ eor \x2, \x2, \x3
+ and \x1, \x1, \y0
+ and \x3, \x3, \y2
+ and \x0, \x0, \y1
+ and \x2, \x2, \y3
+ eor \x1, \x1, \x0 // the two halves diverge from here: first pair folds x0 into x1
+ eor \x2, \x2, \x3 // second pair folds x3 into x2
+ eor \x0, \x0, \t0
+ eor \x3, \x3, \t1 // results are in x0-x3; y0-y3 are never written, t0/t1 are scratch
+ .endm
+
+ .macro mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, t0, t1, t2, t3 // combine x0-x3 and x4-x7 with the same y0-y3 via mul_gf4 / mul_gf4_n_gf4; results replace x0-x7
+ eor \t0, \x0, \x2
+ eor \t1, \x1, \x3
+ mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3
+ eor \y0, \y0, \y2 // y0/y1 temporarily become y0^y2 / y1^y3 ...
+ eor \y1, \y1, \y3
+ mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
+ eor \x0, \x0, \t0
+ eor \x2, \x2, \t0
+ eor \x1, \x1, \t1
+ eor \x3, \x3, \t1
+ eor \t0, \x4, \x6
+ eor \t1, \x5, \x7
+ mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
+ eor \y0, \y0, \y2 // ... and the second XOR restores the original y0/y1
+ eor \y1, \y1, \y3
+ mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3
+ eor \x4, \x4, \t0
+ eor \x6, \x6, \t0
+ eor \x5, \x5, \t1
+ eor \x7, \x7, \t1
+ .endm
+
+ .macro inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
+ t0, t1, t2, t3, s0, s1, s2, s3 // nonlinear core of sbox/inv_sbox (presumably the GF(2^8) inversion, per the name); x0-x7 updated in place, t0-t3 and s0-s3 are scratch
+ eor \t3, \x4, \x6
+ eor \t0, \x5, \x7
+ eor \t1, \x1, \x3
+ eor \s1, \x7, \x6
+ eor \s0, \x0, \x2
+ eor \s3, \t3, \t0
+ orr \t2, \t0, \t1
+ and \s2, \t3, \s0
+ orr \t3, \t3, \s0
+ eor \s0, \s0, \t1
+ and \t0, \t0, \t1
+ eor \t1, \x3, \x2
+ and \s3, \s3, \s0
+ and \s1, \s1, \t1
+ eor \t1, \x4, \x5
+ eor \s0, \x1, \x0
+ eor \t3, \t3, \s1
+ eor \t2, \t2, \s1
+ and \s1, \t1, \s0
+ orr \t1, \t1, \s0
+ eor \t3, \t3, \s3
+ eor \t0, \t0, \s1
+ eor \t2, \t2, \s2
+ eor \t1, \t1, \s3
+ eor \t0, \t0, \s2
+ and \s0, \x7, \x3
+ eor \t1, \t1, \s2
+ and \s1, \x6, \x2
+ and \s2, \x5, \x1
+ orr \s3, \x4, \x0
+ eor \t3, \t3, \s0
+ eor \t1, \t1, \s2
+ eor \s0, \t0, \s3
+ eor \t2, \t2, \s1
+ and \s2, \t3, \t1
+ eor \s1, \t2, \s2
+ eor \s3, \s0, \s2
+ bsl \s1, \t1, \s0 // bitwise select: the destination acts as the selector between the two sources
+ not \t0, \s0
+ bsl \s0, \s1, \s3
+ bsl \t0, \s1, \s3
+ bsl \s3, \t3, \t2
+ eor \t3, \t3, \t2
+ and \s2, \s0, \s3
+ eor \t1, \t1, \t0
+ eor \s2, \s2, \t3
+ mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+ \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 // x0-x7 are only read above; this final step is the only one that writes them
+ .endm
+
+ .macro sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+ t0, t1, t2, t3, s0, s1, s2, s3 // b0-b7: eight registers updated in place (names without the .16b suffix); t0-t3, s0-s3: scratch
+ in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
+ \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b // stage 1: XOR-only input transform
+ inv_gf256 \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
+ \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
+ \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
+ \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b // stage 2: nonlinear step, registers passed in permuted order
+ out_bs_ch \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
+ \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b // stage 3: XOR-only output transform; aesbs_encrypt8 then uses the registers in the order b0, b1, b4, b6, b3, b7, b2, b5
+ .endm
+
+ .macro inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+ t0, t1, t2, t3, s0, s1, s2, s3 // b0-b7: eight registers updated in place (names without the .16b suffix); t0-t3, s0-s3: scratch
+ inv_in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
+ \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b // stage 1: XOR-only input transform
+ inv_gf256 \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
+ \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
+ \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
+ \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b // stage 2: same nonlinear step as sbox, different register permutation
+ inv_out_bs_ch \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
+ \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b // stage 3: XOR-only output transform; aesbs_decrypt8 then uses the registers in the order b0, b1, b6, b4, b2, b7, b3, b5
+ .endm
+
+ .macro enc_next_rk // load the next 128-byte round key into v16-v23, walking bskey forwards
+ ldp q16, q17, [bskey], #128 // post-index: bskey ends up just past this key
+ ldp q18, q19, [bskey, #-96] // remaining loads are relative to the already advanced pointer
+ ldp q20, q21, [bskey, #-64]
+ ldp q22, q23, [bskey, #-32]
+ .endm
+
+ .macro dec_next_rk // load the previous 128-byte round key into v16-v23, walking bskey backwards
+ ldp q16, q17, [bskey, #-128]! // pre-index: bskey is moved down to the start of this key first
+ ldp q18, q19, [bskey, #32]
+ ldp q20, q21, [bskey, #64]
+ ldp q22, q23, [bskey, #96]
+ .endm
+
+ .macro add_round_key, x0, x1, x2, x3, x4, x5, x6, x7 // XOR the round key held in v16-v23 into x0-x7, pairing x0 with v16 through x7 with v23
+ eor \x0\().16b, \x0\().16b, v16.16b
+ eor \x1\().16b, \x1\().16b, v17.16b
+ eor \x2\().16b, \x2\().16b, v18.16b
+ eor \x3\().16b, \x3\().16b, v19.16b
+ eor \x4\().16b, \x4\().16b, v20.16b
+ eor \x5\().16b, \x5\().16b, v21.16b
+ eor \x6\().16b, \x6\().16b, v22.16b
+ eor \x7\().16b, \x7\().16b, v23.16b // v16-v23 are expected to have been filled by enc_next_rk / dec_next_rk
+ .endm
+
+ .macro shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask // apply the same tbl byte permutation (indices in mask) to each of x0-x7, in place
+ tbl \x0\().16b, {\x0\().16b}, \mask\().16b
+ tbl \x1\().16b, {\x1\().16b}, \mask\().16b
+ tbl \x2\().16b, {\x2\().16b}, \mask\().16b
+ tbl \x3\().16b, {\x3\().16b}, \mask\().16b
+ tbl \x4\().16b, {\x4\().16b}, \mask\().16b
+ tbl \x5\().16b, {\x5\().16b}, \mask\().16b
+ tbl \x6\().16b, {\x6\().16b}, \mask\().16b
+ tbl \x7\().16b, {\x7\().16b}, \mask\().16b // mask itself is left unchanged
+ .endm
+
+ .macro mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+ t0, t1, t2, t3, t4, t5, t6, t7, inv // x0-x7 updated in place using only ext (byte rotation) and eor; t0-t7 scratch; inv is blank for encryption, non-blank when invoked from inv_mix_cols
+ ext \t0\().16b, \x0\().16b, \x0\().16b, #12
+ ext \t1\().16b, \x1\().16b, \x1\().16b, #12
+ eor \x0\().16b, \x0\().16b, \t0\().16b
+ ext \t2\().16b, \x2\().16b, \x2\().16b, #12
+ eor \x1\().16b, \x1\().16b, \t1\().16b
+ ext \t3\().16b, \x3\().16b, \x3\().16b, #12
+ eor \x2\().16b, \x2\().16b, \t2\().16b
+ ext \t4\().16b, \x4\().16b, \x4\().16b, #12
+ eor \x3\().16b, \x3\().16b, \t3\().16b
+ ext \t5\().16b, \x5\().16b, \x5\().16b, #12
+ eor \x4\().16b, \x4\().16b, \t4\().16b
+ ext \t6\().16b, \x6\().16b, \x6\().16b, #12
+ eor \x5\().16b, \x5\().16b, \t5\().16b
+ ext \t7\().16b, \x7\().16b, \x7\().16b, #12
+ eor \x6\().16b, \x6\().16b, \t6\().16b
+ eor \t1\().16b, \t1\().16b, \x0\().16b
+ eor \x7\().16b, \x7\().16b, \t7\().16b
+ ext \x0\().16b, \x0\().16b, \x0\().16b, #8
+ eor \t2\().16b, \t2\().16b, \x1\().16b
+ eor \t0\().16b, \t0\().16b, \x7\().16b
+ eor \t1\().16b, \t1\().16b, \x7\().16b
+ ext \x1\().16b, \x1\().16b, \x1\().16b, #8
+ eor \t5\().16b, \t5\().16b, \x4\().16b
+ eor \x0\().16b, \x0\().16b, \t0\().16b
+ eor \t6\().16b, \t6\().16b, \x5\().16b
+ eor \x1\().16b, \x1\().16b, \t1\().16b
+ ext \t0\().16b, \x4\().16b, \x4\().16b, #8
+ eor \t4\().16b, \t4\().16b, \x3\().16b
+ ext \t1\().16b, \x5\().16b, \x5\().16b, #8
+ eor \t7\().16b, \t7\().16b, \x6\().16b
+ ext \x4\().16b, \x3\().16b, \x3\().16b, #8
+ eor \t3\().16b, \t3\().16b, \x2\().16b
+ ext \x5\().16b, \x7\().16b, \x7\().16b, #8
+ eor \t4\().16b, \t4\().16b, \x7\().16b
+ ext \x3\().16b, \x6\().16b, \x6\().16b, #8
+ eor \t3\().16b, \t3\().16b, \x7\().16b
+ ext \x6\().16b, \x2\().16b, \x2\().16b, #8
+ eor \x7\().16b, \t1\().16b, \t5\().16b
+ .ifb \inv // inv left blank: final XORs for the encryption direction
+ eor \x2\().16b, \t0\().16b, \t4\().16b
+ eor \x4\().16b, \x4\().16b, \t3\().16b
+ eor \x5\().16b, \x5\().16b, \t7\().16b
+ eor \x3\().16b, \x3\().16b, \t6\().16b
+ eor \x6\().16b, \x6\().16b, \t2\().16b
+ .else // inv given: same terms, but assigned to different destination registers
+ eor \t3\().16b, \t3\().16b, \x4\().16b
+ eor \x5\().16b, \x5\().16b, \t7\().16b
+ eor \x2\().16b, \x3\().16b, \t6\().16b
+ eor \x3\().16b, \t0\().16b, \t4\().16b
+ eor \x4\().16b, \x6\().16b, \t2\().16b
+ mov \x6\().16b, \t3\().16b
+ .endif
+ .endm
+
+ .macro inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+ t0, t1, t2, t3, t4, t5, t6, t7 // decryption counterpart of mix_cols: an ext/eor pre-pass over x0-x7 followed by mix_cols with its inv argument set; t0-t7 scratch
+ ext \t0\().16b, \x0\().16b, \x0\().16b, #8
+ ext \t6\().16b, \x6\().16b, \x6\().16b, #8
+ ext \t7\().16b, \x7\().16b, \x7\().16b, #8
+ eor \t0\().16b, \t0\().16b, \x0\().16b // each tN becomes xN XOR (xN rotated by 8 bytes)
+ ext \t1\().16b, \x1\().16b, \x1\().16b, #8
+ eor \t6\().16b, \t6\().16b, \x6\().16b
+ ext \t2\().16b, \x2\().16b, \x2\().16b, #8
+ eor \t7\().16b, \t7\().16b, \x7\().16b
+ ext \t3\().16b, \x3\().16b, \x3\().16b, #8
+ eor \t1\().16b, \t1\().16b, \x1\().16b
+ ext \t4\().16b, \x4\().16b, \x4\().16b, #8
+ eor \t2\().16b, \t2\().16b, \x2\().16b
+ ext \t5\().16b, \x5\().16b, \x5\().16b, #8
+ eor \t3\().16b, \t3\().16b, \x3\().16b
+ eor \t4\().16b, \t4\().16b, \x4\().16b
+ eor \t5\().16b, \t5\().16b, \x5\().16b
+ eor \x0\().16b, \x0\().16b, \t6\().16b // fold the tN terms back into x0-x7
+ eor \x1\().16b, \x1\().16b, \t6\().16b
+ eor \x2\().16b, \x2\().16b, \t0\().16b
+ eor \x4\().16b, \x4\().16b, \t2\().16b
+ eor \x3\().16b, \x3\().16b, \t1\().16b
+ eor \x1\().16b, \x1\().16b, \t7\().16b
+ eor \x2\().16b, \x2\().16b, \t7\().16b
+ eor \x4\().16b, \x4\().16b, \t6\().16b
+ eor \x5\().16b, \x5\().16b, \t3\().16b
+ eor \x3\().16b, \x3\().16b, \t6\().16b
+ eor \x6\().16b, \x6\().16b, \t4\().16b
+ eor \x4\().16b, \x4\().16b, \t7\().16b
+ eor \x5\().16b, \x5\().16b, \t7\().16b
+ eor \x7\().16b, \x7\().16b, \t5\().16b
+ mix_cols \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+ \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1 // trailing 1 selects the .else branch inside mix_cols
+ .endm
+
+ .macro swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1 // for each pair (a,b): exchange the bits of a selected by mask with the bits of b selected by mask << n; two pairs interleaved
+ ushr \t0\().2d, \b0\().2d, #\n
+ ushr \t1\().2d, \b1\().2d, #\n
+ eor \t0\().16b, \t0\().16b, \a0\().16b
+ eor \t1\().16b, \t1\().16b, \a1\().16b
+ and \t0\().16b, \t0\().16b, \mask\().16b // t = ((b >> n) ^ a) & mask
+ and \t1\().16b, \t1\().16b, \mask\().16b
+ eor \a0\().16b, \a0\().16b, \t0\().16b // a ^= t
+ shl \t0\().2d, \t0\().2d, #\n
+ eor \a1\().16b, \a1\().16b, \t1\().16b
+ shl \t1\().2d, \t1\().2d, #\n
+ eor \b0\().16b, \b0\().16b, \t0\().16b // b ^= t << n
+ eor \b1\().16b, \b1\().16b, \t1\().16b // mask is preserved; t0/t1 are scratch
+ .endm
+
+ .macro bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3 // three rounds of swapmove_2x (shift 1, 2, 4) over eight registers; note the parameters are declared x7 first; t0-t3 scratch
+ movi \t0\().16b, #0x55
+ movi \t1\().16b, #0x33
+ swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3 // shift 1, mask 0x55
+ swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
+ movi \t0\().16b, #0x0f // t0 is free again; reload it for the shift-4 pass below
+ swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3 // shift 2, mask 0x33
+ swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
+ swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3 // shift 4, mask 0x0f
+ swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
+ .endm
+
+
+ .align 6
+M0: .octa 0x0004080c0105090d02060a0e03070b0f // tbl mask applied to each round key in aesbs_convert_key
+
+M0SR: .octa 0x0004080c05090d010a0e02060f03070b // tbl mask applied to the input blocks in aesbs_encrypt8
+SR: .octa 0x0f0e0d0c0a09080b0504070600030201 // shift_rows mask for the encryption rounds
+SRM0: .octa 0x01060b0c0207080d0304090e00050a0f // replaces SR for the last pass of the encryption loop
+
+M0ISR: .octa 0x0004080c0d0105090a0e0206070b0f03 // tbl mask applied to the input blocks in aesbs_decrypt8
+ISR: .octa 0x0f0e0d0c080b0a090504070602010003 // shift_rows mask for the decryption rounds
+ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f // replaces ISR for the last pass of the decryption loop
+
+ /*
+ * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds): writes round key 0 unchanged (16 bytes), then rounds - 1 bit-plane keys (128 bytes each), then the last round key XOR 0x63 (16 bytes)
+ */
+ENTRY(aesbs_convert_key)
+ ld1 {v7.4s}, [x1], #16 // load round 0 key
+ ld1 {v17.4s}, [x1], #16 // load round 1 key
+
+ movi v8.16b, #0x01 // bit masks
+ movi v9.16b, #0x02
+ movi v10.16b, #0x04
+ movi v11.16b, #0x08
+ movi v12.16b, #0x10
+ movi v13.16b, #0x20
+ movi v14.16b, #0x40
+ movi v15.16b, #0x80
+ ldr q16, M0 // byte permutation applied to every converted key
+
+ sub x2, x2, #1 // loop count: rounds - 1 (NOTE(review): uses all of x2 for an int argument - confirm upper bits are zero)
+ str q7, [x0], #16 // save round 0 key
+
+.Lkey_loop:
+ tbl v7.16b ,{v17.16b}, v16.16b // permute the current round key
+ ld1 {v17.4s}, [x1], #16 // load next round key
+
+ cmtst v0.16b, v7.16b, v8.16b // v0-v7: per byte, 0xff where the tested key bit is set, else 0
+ cmtst v1.16b, v7.16b, v9.16b
+ cmtst v2.16b, v7.16b, v10.16b
+ cmtst v3.16b, v7.16b, v11.16b
+ cmtst v4.16b, v7.16b, v12.16b
+ cmtst v5.16b, v7.16b, v13.16b
+ cmtst v6.16b, v7.16b, v14.16b
+ cmtst v7.16b, v7.16b, v15.16b
+ not v0.16b, v0.16b // invert planes 0, 1, 5 and 6 - the bit positions set in 0x63
+ not v1.16b, v1.16b
+ not v5.16b, v5.16b
+ not v6.16b, v6.16b
+
+ subs x2, x2, #1
+ stp q0, q1, [x0], #128 // 128 bytes per converted round key
+ stp q2, q3, [x0, #-96]
+ stp q4, q5, [x0, #-64]
+ stp q6, q7, [x0, #-32]
+ b.ne .Lkey_loop // flags still come from the subs above (stp does not alter them)
+
+ movi v7.16b, #0x63 // XOR 0x63 into every byte of the last round key
+ eor v17.16b, v17.16b, v7.16b
+ str q17, [x0] // last round key is stored as a plain 16-byte value
+ ret
+ENDPROC(aesbs_convert_key)
+
+ .align 4
+aesbs_encrypt8: // in: v0-v7 blocks, bskey = converted key, rounds; out: v0, v1, v4, v6, v3, v7, v2, v5 (in block order); uses v8-v24
+ ldr q9, [bskey], #16 // round 0 key
+ ldr q8, M0SR
+ ldr q24, SR // shift_rows mask used inside the loop
+
+ eor v10.16b, v0.16b, v9.16b // xor with round0 key
+ eor v11.16b, v1.16b, v9.16b
+ tbl v0.16b, {v10.16b}, v8.16b // and permute the bytes with M0SR
+ eor v12.16b, v2.16b, v9.16b
+ tbl v1.16b, {v11.16b}, v8.16b
+ eor v13.16b, v3.16b, v9.16b
+ tbl v2.16b, {v12.16b}, v8.16b
+ eor v14.16b, v4.16b, v9.16b
+ tbl v3.16b, {v13.16b}, v8.16b
+ eor v15.16b, v5.16b, v9.16b
+ tbl v4.16b, {v14.16b}, v8.16b
+ eor v10.16b, v6.16b, v9.16b
+ tbl v5.16b, {v15.16b}, v8.16b
+ eor v11.16b, v7.16b, v9.16b
+ tbl v6.16b, {v10.16b}, v8.16b
+ tbl v7.16b, {v11.16b}, v8.16b
+
+ bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+
+ sub rounds, rounds, #1
+ b .Lenc_sbox // first pass enters at the sbox, skipping shift_rows
+
+.Lenc_loop:
+ shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24
+.Lenc_sbox:
+ sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
+ v13, v14, v15
+ subs rounds, rounds, #1
+ b.cc .Lenc_done // borrow: rounds was already 0, so this was the last sbox
+
+ enc_next_rk
+
+ mix_cols v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
+ v13, v14, v15
+
+ add_round_key v0, v1, v2, v3, v4, v5, v6, v7
+
+ b.ne .Lenc_loop // flags still come from the subs above; the macros in between do not set them
+ ldr q24, SRM0 // counter just reached 0: switch masks for the final pass
+ b .Lenc_loop
+
+.Lenc_done:
+ ldr q12, [bskey] // last round key
+
+ bitslice v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
+
+ eor v0.16b, v0.16b, v12.16b // add the last round key to all eight results
+ eor v1.16b, v1.16b, v12.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v6.16b, v6.16b, v12.16b
+ eor v3.16b, v3.16b, v12.16b
+ eor v7.16b, v7.16b, v12.16b
+ eor v2.16b, v2.16b, v12.16b
+ eor v5.16b, v5.16b, v12.16b
+ ret
+ENDPROC(aesbs_encrypt8)
+
+ .align 4
+aesbs_decrypt8: // in: v0-v7 blocks, bskey = converted key, rounds; out: v0, v1, v6, v4, v2, v7, v3, v5 (in block order); uses x9 and v8-v24
+ lsl x9, rounds, #7 // rounds * 128
+ add bskey, bskey, x9
+
+ ldr q9, [bskey, #-112]! // round 0 key
+ ldr q8, M0ISR // (for decryption the first key used is the last one stored by aesbs_convert_key)
+ ldr q24, ISR
+
+ eor v10.16b, v0.16b, v9.16b // xor with round0 key
+ eor v11.16b, v1.16b, v9.16b
+ tbl v0.16b, {v10.16b}, v8.16b // and permute the bytes with M0ISR
+ eor v12.16b, v2.16b, v9.16b
+ tbl v1.16b, {v11.16b}, v8.16b
+ eor v13.16b, v3.16b, v9.16b
+ tbl v2.16b, {v12.16b}, v8.16b
+ eor v14.16b, v4.16b, v9.16b
+ tbl v3.16b, {v13.16b}, v8.16b
+ eor v15.16b, v5.16b, v9.16b
+ tbl v4.16b, {v14.16b}, v8.16b
+ eor v10.16b, v6.16b, v9.16b
+ tbl v5.16b, {v15.16b}, v8.16b
+ eor v11.16b, v7.16b, v9.16b
+ tbl v6.16b, {v10.16b}, v8.16b
+ tbl v7.16b, {v11.16b}, v8.16b
+
+ bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+
+ sub rounds, rounds, #1
+ b .Ldec_sbox // first pass enters at the inverse sbox, skipping shift_rows
+
+.Ldec_loop:
+ shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24
+.Ldec_sbox:
+ inv_sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
+ v13, v14, v15
+ subs rounds, rounds, #1
+ b.cc .Ldec_done // borrow: rounds was already 0, so this was the last inverse sbox
+
+ dec_next_rk
+
+ add_round_key v0, v1, v6, v4, v2, v7, v3, v5
+
+ inv_mix_cols v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
+ v13, v14, v15
+
+ b.ne .Ldec_loop // flags still come from the subs above; the macros in between do not set them
+ ldr q24, ISRM0 // counter just reached 0: switch masks for the final pass
+ b .Ldec_loop
+.Ldec_done:
+ ldr q12, [bskey, #-16] // last round key
+
+ bitslice v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
+
+ eor v0.16b, v0.16b, v12.16b // add the last round key to all eight results
+ eor v1.16b, v1.16b, v12.16b
+ eor v6.16b, v6.16b, v12.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v2.16b, v2.16b, v12.16b
+ eor v7.16b, v7.16b, v12.16b
+ eor v3.16b, v3.16b, v12.16b
+ eor v5.16b, v5.16b, v12.16b
+ ret
+ENDPROC(aesbs_decrypt8)
+
+ /*
+ * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks)
+ * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks)
+ */
+ .macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 // do8: 8-block routine to call; o0-o7: registers holding its results, in block order
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+99: mov x5, #1
+ lsl x5, x5, x4 // x5 = 1 << blocks
+ subs w4, w4, #8
+ csel x4, x4, xzr, pl // x4 = blocks left after this batch, or 0 if fewer than 8 remained
+ csel x5, x5, xzr, mi // keep the mask only for a short batch; bit k set means exactly k blocks remain
+
+ ld1 {v0.16b}, [x1], #16 // first block is loaded unconditionally (NOTE(review): bit 0 of x5 is never tested, so blocks == 0 looks unsupported - confirm callers never pass 0)
+ tbnz x5, #1, 0f
+ ld1 {v1.16b}, [x1], #16
+ tbnz x5, #2, 0f
+ ld1 {v2.16b}, [x1], #16
+ tbnz x5, #3, 0f
+ ld1 {v3.16b}, [x1], #16
+ tbnz x5, #4, 0f
+ ld1 {v4.16b}, [x1], #16
+ tbnz x5, #5, 0f
+ ld1 {v5.16b}, [x1], #16
+ tbnz x5, #6, 0f
+ ld1 {v6.16b}, [x1], #16
+ tbnz x5, #7, 0f
+ ld1 {v7.16b}, [x1], #16
+
+0: mov bskey, x2
+ mov rounds, x3
+ bl \do8 // always processes eight registers; unused ones hold leftover data
+
+ st1 {\o0\().16b}, [x0], #16 // store only as many results as blocks were loaded
+ tbnz x5, #1, 1f
+ st1 {\o1\().16b}, [x0], #16
+ tbnz x5, #2, 1f
+ st1 {\o2\().16b}, [x0], #16
+ tbnz x5, #3, 1f
+ st1 {\o3\().16b}, [x0], #16
+ tbnz x5, #4, 1f
+ st1 {\o4\().16b}, [x0], #16
+ tbnz x5, #5, 1f
+ st1 {\o5\().16b}, [x0], #16
+ tbnz x5, #6, 1f
+ st1 {\o6\().16b}, [x0], #16
+ tbnz x5, #7, 1f
+ st1 {\o7\().16b}, [x0], #16
+
+ cbnz x4, 99b // full batch done: loop while blocks remain
+
+1: ldp x29, x30, [sp], #16
+ ret
+ .endm
+
+ .align 4
+ENTRY(aesbs_ecb_encrypt)
+ __ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5 // register list is the order in which aesbs_encrypt8 leaves blocks 0-7
+ENDPROC(aesbs_ecb_encrypt)
+
+ .align 4
+ENTRY(aesbs_ecb_decrypt)
+ __ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5 // register list is the order in which aesbs_decrypt8 leaves blocks 0-7
+ENDPROC(aesbs_ecb_decrypt)
+
+ .macro next_tweak, out, in, const, tmp // out = in doubled as a 128-bit value, with the carry out of each 64-bit half folded into the other half through const; tmp is scratch
+ sshr \tmp\().2d, \in\().2d, #63 // all-ones in each 64-bit lane whose top bit is set
+ and \tmp\().16b, \tmp\().16b, \const\().16b // turn the lane masks into the per-lane constants
+ add \out\().2d, \in\().2d, \in\().2d // shift each 64-bit lane left by one
+ ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 // swap the two lanes so each carry lands in the opposite half
+ eor \out\().16b, \out\().16b, \tmp\().16b
+ .endm
+
+ .align 4
+.Lxts_mul_x:
+CPU_LE( .quad 1, 0x87 )
+CPU_BE( .quad 0x87, 1 )
+
+ /*
+ * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks, u8 iv[])
+ * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks, u8 iv[])
+ */
+__xts_crypt8: // helper for __xts_crypt: in v25 = first tweak, v30 = .Lxts_mul_x, x7 = routine to branch to; loads up to 8 blocks, XORs each with its tweak
+ mov x6, #1
+ lsl x6, x6, x4 // x6 = 1 << blocks
+ subs w4, w4, #8
+ csel x4, x4, xzr, pl // x4 = blocks left after this batch, or 0
+ csel x6, x6, xzr, mi // keep the mask only for a short batch; bit k set means exactly k blocks remain
+
+ ld1 {v0.16b}, [x1], #16
+ next_tweak v26, v25, v30, v31 // tweaks for blocks 0-3 stay in v25-v28
+ eor v0.16b, v0.16b, v25.16b
+ tbnz x6, #1, 0f
+
+ ld1 {v1.16b}, [x1], #16
+ next_tweak v27, v26, v30, v31
+ eor v1.16b, v1.16b, v26.16b
+ tbnz x6, #2, 0f
+
+ ld1 {v2.16b}, [x1], #16
+ next_tweak v28, v27, v30, v31
+ eor v2.16b, v2.16b, v27.16b
+ tbnz x6, #3, 0f
+
+ ld1 {v3.16b}, [x1], #16
+ next_tweak v29, v28, v30, v31
+ eor v3.16b, v3.16b, v28.16b
+ tbnz x6, #4, 0f
+
+ ld1 {v4.16b}, [x1], #16
+ str q29, [sp, #16] // tweaks for blocks 4-7 are spilled to the stack area at sp + 16 .. sp + 79
+ eor v4.16b, v4.16b, v29.16b
+ next_tweak v29, v29, v30, v31 // v29 always ends up holding the tweak after the last one used
+ tbnz x6, #5, 0f
+
+ ld1 {v5.16b}, [x1], #16
+ str q29, [sp, #32]
+ eor v5.16b, v5.16b, v29.16b
+ next_tweak v29, v29, v30, v31
+ tbnz x6, #6, 0f
+
+ ld1 {v6.16b}, [x1], #16
+ str q29, [sp, #48]
+ eor v6.16b, v6.16b, v29.16b
+ next_tweak v29, v29, v30, v31
+ tbnz x6, #7, 0f
+
+ ld1 {v7.16b}, [x1], #16
+ str q29, [sp, #64]
+ eor v7.16b, v7.16b, v29.16b
+ next_tweak v29, v29, v30, v31
+
+0: mov bskey, x2
+ mov rounds, x3
+ br x7 // tail branch: x30 is untouched, so the target returns to whoever called __xts_crypt8
+ENDPROC(__xts_crypt8)
+
+ .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 // do8: 8-block routine; o0-o7: registers holding its results, in block order
+ stp x29, x30, [sp, #-80]! // 16 bytes for x29/x30 plus 64 bytes that __xts_crypt8 uses for four spilled tweaks
+ mov x29, sp
+
+ ldr q30, .Lxts_mul_x
+ ld1 {v25.16b}, [x5] // initial tweak comes from the iv buffer
+
+99: adr x7, \do8
+ bl __xts_crypt8 // returns here via the ret at the end of do8
+
+ ldp q16, q17, [sp, #16] // reload the spilled tweaks for blocks 4-7
+ ldp q18, q19, [sp, #48]
+
+ eor \o0\().16b, \o0\().16b, v25.16b // second tweak XOR, applied to the first four results
+ eor \o1\().16b, \o1\().16b, v26.16b
+ eor \o2\().16b, \o2\().16b, v27.16b
+ eor \o3\().16b, \o3\().16b, v28.16b
+
+ st1 {\o0\().16b}, [x0], #16
+ mov v25.16b, v26.16b // v25 tracks the tweak that follows the last block stored
+ tbnz x6, #1, 1f
+ st1 {\o1\().16b}, [x0], #16
+ mov v25.16b, v27.16b
+ tbnz x6, #2, 1f
+ st1 {\o2\().16b}, [x0], #16
+ mov v25.16b, v28.16b
+ tbnz x6, #3, 1f
+ st1 {\o3\().16b}, [x0], #16
+ mov v25.16b, v29.16b // from block 4 on, v29 already holds the tweak after the last one used
+ tbnz x6, #4, 1f
+
+ eor \o4\().16b, \o4\().16b, v16.16b
+ eor \o5\().16b, \o5\().16b, v17.16b
+ eor \o6\().16b, \o6\().16b, v18.16b
+ eor \o7\().16b, \o7\().16b, v19.16b
+
+ st1 {\o4\().16b}, [x0], #16
+ tbnz x6, #5, 1f
+ st1 {\o5\().16b}, [x0], #16
+ tbnz x6, #6, 1f
+ st1 {\o6\().16b}, [x0], #16
+ tbnz x6, #7, 1f
+ st1 {\o7\().16b}, [x0], #16
+
+ cbnz x4, 99b // full batch done: loop while blocks remain
+
+1: st1 {v25.16b}, [x5] // write the next tweak back to the iv buffer
+ ldp x29, x30, [sp], #80
+ ret
+ .endm
+
+ENTRY(aesbs_xts_encrypt)
+ __xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5 // register list is the order in which aesbs_encrypt8 leaves blocks 0-7
+ENDPROC(aesbs_xts_encrypt)
+
+ENTRY(aesbs_xts_decrypt)
+ __xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5 // register list is the order in which aesbs_decrypt8 leaves blocks 0-7
+ENDPROC(aesbs_xts_decrypt)
+
+ .macro next_ctr, v // v = current 128-bit counter (x7 = high half, x8 = low half) in big-endian byte order; then increment x7:x8
+ mov \v\().d[1], x8
+ adds x8, x8, #1 // increment the low half, setting the carry flag
+ mov \v\().d[0], x7 // vector insert: does not disturb the flags
+ adc x7, x7, xzr // propagate the carry into the high half
+ rev64 \v\().16b, \v\().16b // byte-swap each 64-bit half
+ .endm
+
+ /*
+ * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+ * int rounds, int blocks, u8 iv[], bool final)
+ */
+ENTRY(aesbs_ctr_encrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ add x4, x4, x6 // do one extra block if final (NOTE(review): relies on all of x6 holding 0 or 1 for the bool argument - confirm)
+
+ ldp x7, x8, [x5] // x7 = high half, x8 = low half of the counter
+ ld1 {v0.16b}, [x5] // first counter block is the iv as stored
+CPU_LE( rev x7, x7 )
+CPU_LE( rev x8, x8 )
+ adds x8, x8, #1 // x7:x8 now hold the counter for the second block
+ adc x7, x7, xzr
+
+99: mov x9, #1
+ lsl x9, x9, x4 // x9 = 1 << (blocks + final)
+ subs w4, w4, #8
+ csel x4, x4, xzr, pl // x4 = count left after this batch, or 0
+ csel x9, x9, xzr, le // keep the mask when 8 or fewer remain (le, so an exact batch of 8 is included)
+
+ next_ctr v1 // counters for blocks 1-7; v0 was prepared before or at the end of the loop
+ next_ctr v2
+ next_ctr v3
+ next_ctr v4
+ next_ctr v5
+ next_ctr v6
+ next_ctr v7
+
+0: mov bskey, x2
+ mov rounds, x3
+ bl aesbs_encrypt8 // keystream blocks 0-7 come back in v0, v1, v4, v6, v3, v7, v2, v5
+
+ lsr x9, x9, x6 // disregard the extra block
+ tbnz x9, #0, 0f // no input blocks to process: only the keystream block in v0 is handed back
+
+ ld1 {v8.16b}, [x1], #16
+ eor v0.16b, v0.16b, v8.16b
+ st1 {v0.16b}, [x0], #16
+ tbnz x9, #1, 1f // bit k set: k input blocks done, keystream block k is the extra one
+
+ ld1 {v9.16b}, [x1], #16
+ eor v1.16b, v1.16b, v9.16b
+ st1 {v1.16b}, [x0], #16
+ tbnz x9, #2, 2f
+
+ ld1 {v10.16b}, [x1], #16
+ eor v4.16b, v4.16b, v10.16b
+ st1 {v4.16b}, [x0], #16
+ tbnz x9, #3, 3f
+
+ ld1 {v11.16b}, [x1], #16
+ eor v6.16b, v6.16b, v11.16b
+ st1 {v6.16b}, [x0], #16
+ tbnz x9, #4, 4f
+
+ ld1 {v12.16b}, [x1], #16
+ eor v3.16b, v3.16b, v12.16b
+ st1 {v3.16b}, [x0], #16
+ tbnz x9, #5, 5f
+
+ ld1 {v13.16b}, [x1], #16
+ eor v7.16b, v7.16b, v13.16b
+ st1 {v7.16b}, [x0], #16
+ tbnz x9, #6, 6f
+
+ ld1 {v14.16b}, [x1], #16
+ eor v2.16b, v2.16b, v14.16b
+ st1 {v2.16b}, [x0], #16
+ tbnz x9, #7, 7f
+
+ ld1 {v15.16b}, [x1], #16
+ eor v5.16b, v5.16b, v15.16b
+ st1 {v5.16b}, [x0], #16
+
+ next_ctr v0 // counter block for the start of the next batch
+ cbnz x4, 99b
+
+0: st1 {v0.16b}, [x5] // after full batches: next counter block; from the early exit above: keystream block 0
+8: ldp x29, x30, [sp], #16
+ ret
+
+ /*
+ * If we are handling the tail of the input (x6 == 1), return the
+ * final keystream block back to the caller via the IV buffer.
+ */
+1: cbz x6, 8b // not final: return without writing the iv buffer (NOTE(review): the counter is not written back on this path - confirm callers only pass partial batches with final set)
+ st1 {v1.16b}, [x5]
+ b 8b
+2: cbz x6, 8b
+ st1 {v4.16b}, [x5]
+ b 8b
+3: cbz x6, 8b
+ st1 {v6.16b}, [x5]
+ b 8b
+4: cbz x6, 8b
+ st1 {v3.16b}, [x5]
+ b 8b
+5: cbz x6, 8b
+ st1 {v7.16b}, [x5]
+ b 8b
+6: cbz x6, 8b
+ st1 {v2.16b}, [x5]
+ b 8b
+7: cbz x6, 8b
+ st1 {v5.16b}, [x5]
+ b 8b
+ENDPROC(aesbs_ctr_encrypt)
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
new file mode 100644
index 000000000000..45c1862f86a7
--- /dev/null
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -0,0 +1,344 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/xts.h>
+#include <linux/module.h>
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+MODULE_ALIAS_CRYPTO("ecb(aes)");
+MODULE_ALIAS_CRYPTO("ctr(aes)");
+MODULE_ALIAS_CRYPTO("xts(aes)");
+
+/*
+ * Entry points implemented in assembly. The callers in this file invoke each
+ * of them between kernel_neon_begin() and kernel_neon_end().
+ *
+ * aesbs_convert_key() takes the encryption round keys rk[] and writes the
+ * converted key into out[]; the other routines receive that converted key as
+ * their rk[] argument together with the round count and a block count.
+ */
+asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
+
+asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks);
+asmlinkage void aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks);
+
+/*
+ * When 'final' is true the routine hands one extra keystream block back to
+ * the caller through iv[]; ctr_encrypt() XORs it into the partial tail block.
+ */
+asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks, u8 iv[], bool final);
+
+/* The XTS routines additionally take the tweak in iv[]. */
+asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks, u8 iv[]);
+asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks, u8 iv[]);
+
+/*
+ * Storage for the key produced by aesbs_convert_key().
+ * NOTE(review): the size presumably covers 13 bit-sliced round keys of
+ * 8 * AES_BLOCK_SIZE bytes each plus 32 bytes for the two remaining round
+ * keys - confirm against the layout written by aesbs_convert_key().
+ */
+struct aesbs_key {
+ u8 key[13 * (8 * AES_BLOCK_SIZE) + 32];
+} __aligned(AES_BLOCK_SIZE);
+
+/* Per-transform context for the ECB and CTR modes. */
+struct aesbs_ctx {
+ struct aesbs_key bskey; /* key as converted by aesbs_convert_key() */
+ int rounds; /* 6 + key length in 32-bit words, set by aesbs_setkey() */
+};
+
+/*
+ * Per-transform context for XTS: a regular AES key schedule for the tweak
+ * plus the converted data key.
+ * NOTE(review): 'tweak' must stay first because __xts_crypt() passes the tfm
+ * to crypto_aes_encrypt(), which presumably treats the start of the tfm
+ * context as a struct crypto_aes_ctx - confirm.
+ */
+struct aesbs_xts_ctx {
+ struct crypto_aes_ctx tweak; /* keep at the beginning */
+ struct aesbs_key bskey;
+ int rounds;
+};
+
+/*
+ * setkey handler shared by the ECB and CTR algorithms.
+ *
+ * Expands @in_key into a standard AES key schedule, records the round count
+ * and converts the encryption round keys into the form consumed by the
+ * assembly routines. Returns 0 on success or the error reported by
+ * crypto_aes_expand_key().
+ */
+static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			unsigned int key_len)
+{
+	struct aesbs_ctx *bsctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx expanded;
+	int ret = crypto_aes_expand_key(&expanded, in_key, key_len);
+
+	if (ret != 0)
+		return ret;
+
+	/* key length in 32-bit words plus six */
+	bsctx->rounds = key_len / 4 + 6;
+
+	kernel_neon_begin();
+	aesbs_convert_key(bsctx->bskey.key, expanded.key_enc, bsctx->rounds);
+	kernel_neon_end();
+
+	return 0;
+}
+
+/*
+ * Common ECB walker: feeds whole AES blocks from the request to @fn.
+ *
+ * For every chunk except the last one, the block count is rounded down to a
+ * multiple of the walk stride; the bytes left over are handed back to the
+ * walk via skcipher_walk_done(). Returns the status of the last walk call.
+ */
+static int __ecb_crypt(struct skcipher_request *req,
+		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks))
+{
+	struct aesbs_ctx *bsctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
+	struct skcipher_walk walk;
+	int ret;
+
+	ret = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	for (; walk.nbytes >= AES_BLOCK_SIZE; ) {
+		unsigned int nblk = walk.nbytes / AES_BLOCK_SIZE;
+		unsigned int consumed;
+
+		/* more data follows: only process full strides now */
+		if (walk.total > walk.nbytes)
+			nblk = round_down(nblk, walk.stride / AES_BLOCK_SIZE);
+
+		fn(walk.dst.virt.addr, walk.src.virt.addr, bsctx->bskey.key,
+		   bsctx->rounds, nblk);
+
+		consumed = nblk * AES_BLOCK_SIZE;
+		ret = skcipher_walk_done(&walk, walk.nbytes - consumed);
+	}
+	kernel_neon_end();
+
+	return ret;
+}
+
+/* skcipher .encrypt hook for ECB: run the common walker with the encrypt routine. */
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return __ecb_crypt(req, aesbs_ecb_encrypt);
+}
+
+/* skcipher .decrypt hook for ECB: run the common walker with the decrypt routine. */
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return __ecb_crypt(req, aesbs_ecb_decrypt);
+}
+
+/*
+ * CTR mode handler (used for both encryption and decryption).
+ *
+ * Whole blocks are processed by aesbs_ctr_encrypt(). If the request length
+ * is not a multiple of the block size, the last call is made with 'final'
+ * set, which returns one extra keystream block in walk.iv; that block is
+ * XORed into the trailing partial block here.
+ */
+static int ctr_encrypt(struct skcipher_request *req)
+{
+	struct aesbs_ctx *bsctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
+	struct skcipher_walk walk;
+	int ret;
+
+	ret = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes > 0) {
+		unsigned int nblk = walk.nbytes / AES_BLOCK_SIZE;
+		unsigned int tail = walk.total % AES_BLOCK_SIZE;
+		bool final;
+		u8 *dst, *src;
+
+		if (walk.nbytes < walk.total) {
+			/* not the last chunk: full strides only, no tail */
+			nblk = round_down(nblk, walk.stride / AES_BLOCK_SIZE);
+			final = false;
+		} else {
+			final = tail != 0;
+		}
+
+		aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				  bsctx->bskey.key, bsctx->rounds, nblk,
+				  walk.iv, final);
+
+		if (!final) {
+			ret = skcipher_walk_done(&walk,
+					walk.nbytes - nblk * AES_BLOCK_SIZE);
+			continue;
+		}
+
+		/* walk.iv now holds the keystream for the partial block */
+		dst = walk.dst.virt.addr + nblk * AES_BLOCK_SIZE;
+		src = walk.src.virt.addr + nblk * AES_BLOCK_SIZE;
+
+		if (dst != src)
+			memcpy(dst, src, tail);
+		crypto_xor(dst, walk.iv, tail);
+
+		ret = skcipher_walk_done(&walk, 0);
+		break;
+	}
+	kernel_neon_end();
+
+	return ret;
+}
+
+/*
+ * setkey handler for XTS.
+ *
+ * After xts_verify_key() accepts the key, the second half is expanded into
+ * the tweak key schedule and the first half is expanded and converted for
+ * the data cipher. Returns 0 on success or the first error encountered.
+ */
+static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct aesbs_xts_ctx *xctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx expanded;
+	unsigned int half;
+	int ret;
+
+	ret = xts_verify_key(tfm, in_key, key_len);
+	if (ret != 0)
+		return ret;
+
+	half = key_len / 2;
+
+	/* tweak key: second half of the supplied key */
+	ret = crypto_aes_expand_key(&xctx->tweak, in_key + half, half);
+	if (ret != 0)
+		return ret;
+
+	/* data key: first half of the supplied key */
+	ret = crypto_aes_expand_key(&expanded, in_key, half);
+	if (ret != 0)
+		return ret;
+
+	xctx->rounds = half / 4 + 6;
+
+	kernel_neon_begin();
+	aesbs_convert_key(xctx->bskey.key, expanded.key_enc, xctx->rounds);
+	kernel_neon_end();
+
+	return 0;
+}
+
+/*
+ * Common XTS walker.
+ *
+ * Encrypts the IV in place with the tweak key to obtain the initial tweak,
+ * then feeds whole AES blocks to @fn. For every chunk except the last one,
+ * the block count is rounded down to a multiple of the walk stride.
+ *
+ * Returns 0 on success or the error reported by the skcipher walk.
+ */
+static int __xts_crypt(struct skcipher_request *req,
+		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]))
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+	/* do not touch the IV buffer if the walk could not be set up */
+	if (err)
+		return err;
+
+	/* relies on 'tweak' being the first member of struct aesbs_xts_ctx */
+	crypto_aes_encrypt(crypto_skcipher_tfm(tfm), walk.iv, walk.iv);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		/* more data follows: only process full strides now */
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+
+		fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->bskey.key,
+		   ctx->rounds, blocks, walk.iv);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+/* skcipher .encrypt hook for XTS: run the common walker with the encrypt routine. */
+static int xts_encrypt(struct skcipher_request *req)
+{
+ return __xts_crypt(req, aesbs_xts_encrypt);
+}
+
+/* skcipher .decrypt hook for XTS: run the common walker with the decrypt routine. */
+static int xts_decrypt(struct skcipher_request *req)
+{
+ return __xts_crypt(req, aesbs_xts_decrypt);
+}
+
+/*
+ * Algorithms registered by this module. Entries flagged CRYPTO_ALG_INTERNAL
+ * carry a "__" name prefix; aes_init() strips that prefix and creates a SIMD
+ * wrapper for each of them. All entries use a walksize of eight AES blocks.
+ */
+static struct skcipher_alg aes_algs[] = { {
+ .base.cra_name = "__ecb(aes)",
+ .base.cra_driver_name = "__ecb-aes-neonbs",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = AES_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aesbs_ctx),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .walksize = 8 * AES_BLOCK_SIZE,
+ .setkey = aesbs_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+}, {
+ /* CTR: same handler for both directions */
+ .base.cra_name = "__ctr(aes)",
+ .base.cra_driver_name = "__ctr-aes-neonbs",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct aesbs_ctx),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .walksize = 8 * AES_BLOCK_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_setkey,
+ .encrypt = ctr_encrypt,
+ .decrypt = ctr_encrypt,
+}, {
+ /*
+ * Non-internal "ctr(aes)" entry: registered under its public name and
+ * skipped by the SIMD wrapper loop in aes_init().
+ * NOTE(review): the priority of 200 - 1 presumably lets the wrapped
+ * "__ctr(aes)" variant win when both are usable - confirm.
+ */
+ .base.cra_name = "ctr(aes)",
+ .base.cra_driver_name = "ctr-aes-neonbs",
+ .base.cra_priority = 200 - 1,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct aesbs_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .walksize = 8 * AES_BLOCK_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_setkey,
+ .encrypt = ctr_encrypt,
+ .decrypt = ctr_encrypt,
+}, {
+ /* XTS: key is twice the AES key size (data key followed by tweak key) */
+ .base.cra_name = "__xts(aes)",
+ .base.cra_driver_name = "__xts-aes-neonbs",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = AES_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aesbs_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .walksize = 8 * AES_BLOCK_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_xts_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+} };
+
+/* SIMD wrappers created by aes_init(), indexed like aes_algs[]; NULL if none. */
+static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
+
+/*
+ * Module teardown, also used as the error path of aes_init(): release every
+ * SIMD wrapper that was created, then unregister the base algorithms.
+ */
+static void aes_exit(void)
+{
+	struct simd_skcipher_alg **slot = aes_simd_algs;
+	struct simd_skcipher_alg **end = aes_simd_algs + ARRAY_SIZE(aes_simd_algs);
+
+	for (; slot < end; slot++) {
+		if (!*slot)
+			continue;
+		simd_skcipher_free(*slot);
+	}
+
+	crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+/*
+ * Module init: bail out with -ENODEV when the CPU lacks ASIMD, register the
+ * algorithms in aes_algs[] and create a SIMD wrapper for every internal one.
+ * The wrapper takes the algorithm and driver names with the leading "__"
+ * removed. Any wrapper failure undoes everything through aes_exit().
+ */
+static int __init aes_init(void)
+{
+	int ret;
+	int idx;
+
+	if (!(elf_hwcap & HWCAP_ASIMD))
+		return -ENODEV;
+
+	ret = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+	if (ret)
+		return ret;
+
+	for (idx = 0; idx < ARRAY_SIZE(aes_algs); idx++) {
+		const struct skcipher_alg *alg = &aes_algs[idx];
+		struct simd_skcipher_alg *wrapper;
+
+		if (!(alg->base.cra_flags & CRYPTO_ALG_INTERNAL))
+			continue;
+
+		/* "+ 2" skips the "__" prefix of the internal names */
+		wrapper = simd_skcipher_create_compat(alg->base.cra_name + 2,
+						      alg->base.cra_driver_name + 2,
+						      alg->base.cra_driver_name);
+		if (IS_ERR(wrapper)) {
+			ret = PTR_ERR(wrapper);
+			aes_exit();
+			return ret;
+		}
+
+		aes_simd_algs[idx] = wrapper;
+	}
+
+	return 0;
+}
+
+module_init(aes_init);
+module_exit(aes_exit);
--
2.7.4
^ permalink raw reply related
* [PATCH v6 6/8] IIO: add STM32 timer trigger driver
From: Jonathan Cameron @ 2017-01-02 18:22 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <CA+M3ks7O_6cWZLAAjEvPD-DQBmU--qrsuh1HfcTa9eLqaT-+fQ@mail.gmail.com>
On 02/01/17 08:46, Benjamin Gaignard wrote:
> 2016-12-30 22:12 GMT+01:00 Jonathan Cameron <jic23@kernel.org>:
>> On 09/12/16 14:15, Benjamin Gaignard wrote:
>>> Timers IPs can be used to generate triggers for other IPs like
>>> DAC, ADC or other timers.
>>> Each trigger may result of timer internals signals like counter enable,
>>> reset or edge, this configuration could be done through "master_mode"
>>> device attribute.
>>>
>>> A timer device could be triggered by other timers, we use the trigger
>>> name and is_stm32_iio_timer_trigger() function to distinguish them
>>> and configure IP input switch.
>>>
>>> Timer may also decide on which event (edge, level) they could
>>> be activated by a trigger, this configuration is done by writing in
>>> "slave_mode" device attribute.
>>>
>>> Since triggers could also be used by DAC or ADC their names are defined
>>> in include/linux/iio/timer/stm32-timer-trigger.h so those IPs will be able
>>> to configure themselves in valid_trigger function
>>>
>>> Trigger have a "sampling_frequency" attribute which allow to configure
>>> timer sampling frequency without using PWM interface
>>>
>>> version 5:
>>> - simplify tables of triggers
>>> - only create an IIO device when needed
>>>
>>> version 4:
>>> - get triggers configuration from "reg" in DT
>>> - add tables of triggers
>>> - sampling frequency is enable/disable when writing in trigger
>>> sampling_frequency attribute
>>> - no more use of interruptions
>>>
>>> version 3:
>>> - change compatible to "st,stm32-timer-trigger"
>>> - fix attributes access right
>>> - use string instead of int for master_mode and slave_mode
>>> - document device attributes in sysfs-bus-iio-timer-stm32
>>>
>>> version 2:
>>> - keep only one compatible
>>> - use st,input-triggers-names and st,output-triggers-names
>>> to know which triggers are accepted and/or create by the device
>> Firstly, sorry it has taken me so long to get back to this.
>>
>> I'm still not keen on this use of iio_device elements just to act as
>> glue between triggers. I think we need to work out a more light weight
>> way to do this. As you are only using them for validation and to provide
>> somewhere to hang the control attributes off, there is nothing stopping us
>> moving that over to the iio_trigger instead which would avoid the messy
>> duality going on here.
>
> I have add an iio_device because each hardware can generate multiple
> triggers (up to 5: trgo, ch 1...4) and slave_mode attribute will impact all the
> triggers of a device. For me it was making sense to centralize that in an
> iio_device rather than having an attribute "shared" (from hardware
> point of view)
> on multiple triggers.
> Since master_mode attribute is only used by trgo and not impact ch1...4
> triggers I will move it to trigger instead of the iio_device.
>
> I also wanted to be able to connect triggers on a iio_device as I
> could do for an
> ADC with a command like 'echo "tim1_trgo" > iio_deviceX/trigger/current_trigger'
This is interesting, but with a bit of refactoring I would think it would
be possible to share some of that code thus allowing non IIO devices to
bind to triggers. Ultimately I want to be able to bind a trigger to
a trigger - I appreciate here the topology is more limited than that
so some complexity comes in.
My gut feeling is that representing that topology explicitly is hard
to do in a remotely general way, but let's try it and see.
We run into this sort of interdependency issue between different bits of
the hardware all the time. Setting a value somewhere affects the configuration
elsewhere - often the best plan is to just let that happen and leave it up to
userspace to check for changes if it cares.
> If I change that to parent_trigger attribute it change this behavior
> and I will have to
> duplicated what is done in iio_trigger_write_current() to find and
> validate triggers.
I get the reasoning, but we still end up with something represented
by an IIO device that isn't providing any channels at all. It's simply
using some of the infrastructure. To my mind it is 'something else'
and should be represented as such. I have no problem at all with
you registering additional elements in /sys/bus/iio/ to represent
these shared elements - we already have drivers that do that to
provide some centralized infrastructure (e.g. the sysfs-trigger)
I'm worried about the scope spread we get for an IIO device otherwise.
They serve a well defined purpose at the moment, and that isn't what
is happening here.
So my gut feeling is we are better deliberately not representing the
inter dependence and claiming all triggers we are creating are
independent. That way we can have a nice generic infrastructure
that will work in all cases (be it pushing the sanity checking to
userspace).
So each trigger has direct access to what controls it. Changing anything
can affect other triggers in weird ways.
I'm finding it hard to see anything else generalizing sufficiently
as we'll always get cases where we can't represent the topology without
diving into the complexity of something like the media controller
framework.
Jonathan
>
>> I might still be missing something though!
>>
>> You would only I think need 3 attributes
>>
>> parrent_trigger
>> and something like your master_mode and slave_mode attributes.
>>
>> The parrent_trigger would need some validation etc, but if we keep it
>> within this driver initially that won't be hard to do. Checking the device
>> parent matches will do most of it.
>>
>> Jonathan
>>>
>>> Signed-off-by: Benjamin Gaignard <benjamin.gaignard@st.com>
>>> ---
>>> .../ABI/testing/sysfs-bus-iio-timer-stm32 | 55 +++
>>> drivers/iio/Kconfig | 2 +-
>>> drivers/iio/Makefile | 1 +
>>> drivers/iio/timer/Kconfig | 13 +
>>> drivers/iio/timer/Makefile | 1 +
>>> drivers/iio/timer/stm32-timer-trigger.c | 466 +++++++++++++++++++++
>>> drivers/iio/trigger/Kconfig | 1 -
>>> include/linux/iio/timer/stm32-timer-trigger.h | 62 +++
>>> 8 files changed, 599 insertions(+), 2 deletions(-)
>>> create mode 100644 Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
>>> create mode 100644 drivers/iio/timer/Kconfig
>>> create mode 100644 drivers/iio/timer/Makefile
>>> create mode 100644 drivers/iio/timer/stm32-timer-trigger.c
>>> create mode 100644 include/linux/iio/timer/stm32-timer-trigger.h
>>>
>>> diff --git a/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32 b/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
>>> new file mode 100644
>>> index 0000000..26583dd
>>> --- /dev/null
>>> +++ b/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
>>> @@ -0,0 +1,55 @@
>>> +What: /sys/bus/iio/devices/iio:deviceX/master_mode_available
>>> +KernelVersion: 4.10
>>> +Contact: benjamin.gaignard at st.com
>>> +Description:
>>> + Reading returns the list of possible master modes, which are:
>>> + - "reset" : The UG bit from the TIMx_EGR register is used as trigger output (TRGO).
>>> + - "enable" : The Counter Enable signal CNT_EN is used as trigger output.
>>> + - "update" : The update event is selected as trigger output.
>>> + For instance a master timer can then be used as a prescaler for a slave timer.
>>> + - "compare_pulse" : The trigger output send a positive pulse when the CC1IF flag is to be set.
>>> + - "OC1REF" : OC1REF signal is used as trigger output.
>>> + - "OC2REF" : OC2REF signal is used as trigger output.
>>> + - "OC3REF" : OC3REF signal is used as trigger output.
>>> + - "OC4REF" : OC4REF signal is used as trigger output.
>>> +
>>> +What: /sys/bus/iio/devices/iio:deviceX/master_mode
>>> +KernelVersion: 4.10
>>> +Contact: benjamin.gaignard at st.com
>>> +Description:
>>> + Reading returns the current master mode.
>>> + Writing sets the master mode.
>>> +
>>> +What: /sys/bus/iio/devices/iio:deviceX/slave_mode_available
>>> +KernelVersion: 4.10
>>> +Contact: benjamin.gaignard at st.com
>>> +Description:
>>> + Reading returns the list of possible slave modes, which are:
>>> + - "disabled" : The prescaler is clocked directly by the internal clock.
>>> + - "encoder_1" : Counter counts up/down on TI2FP1 edge depending on TI1FP2 level.
>>> + - "encoder_2" : Counter counts up/down on TI1FP2 edge depending on TI2FP1 level.
>>> + - "encoder_3" : Counter counts up/down on both TI1FP1 and TI2FP2 edges depending
>>> + on the level of the other input.
>>> + - "reset" : Rising edge of the selected trigger input reinitializes the counter
>>> + and generates an update of the registers.
>>> + - "gated" : The counter clock is enabled when the trigger input is high.
>>> + The counter stops (but is not reset) as soon as the trigger becomes low.
>>> + Both start and stop of the counter are controlled.
>>> + - "trigger" : The counter starts at a rising edge of the trigger TRGI (but it is not
>>> + reset). Only the start of the counter is controlled.
>>> + - "external_clock": Rising edges of the selected trigger (TRGI) clock the counter.
>>> +
>>> +What: /sys/bus/iio/devices/iio:deviceX/slave_mode
>>> +KernelVersion: 4.10
>>> +Contact: benjamin.gaignard at st.com
>>> +Description:
>>> + Reading returns the current slave mode.
>>> + Writing sets the slave mode.
>>> +
>>> +What: /sys/bus/iio/devices/triggerX/sampling_frequency
>>> +KernelVersion: 4.10
>>> +Contact: benjamin.gaignard at st.com
>>> +Description:
>>> + Reading returns the current sampling frequency.
>>> + Writing a non-zero value sets the frequency and starts sampling.
>>> + Writing 0 stops sampling.
>>> diff --git a/drivers/iio/Kconfig b/drivers/iio/Kconfig
>>> index 6743b18..2de2a80 100644
>>> --- a/drivers/iio/Kconfig
>>> +++ b/drivers/iio/Kconfig
>>> @@ -90,5 +90,5 @@ source "drivers/iio/potentiometer/Kconfig"
>>> source "drivers/iio/pressure/Kconfig"
>>> source "drivers/iio/proximity/Kconfig"
>>> source "drivers/iio/temperature/Kconfig"
>>> -
>>> +source "drivers/iio/timer/Kconfig"
>>> endif # IIO
>>> diff --git a/drivers/iio/Makefile b/drivers/iio/Makefile
>>> index 87e4c43..b797c08 100644
>>> --- a/drivers/iio/Makefile
>>> +++ b/drivers/iio/Makefile
>>> @@ -32,4 +32,5 @@ obj-y += potentiometer/
>>> obj-y += pressure/
>>> obj-y += proximity/
>>> obj-y += temperature/
>>> +obj-y += timer/
>>> obj-y += trigger/
>>> diff --git a/drivers/iio/timer/Kconfig b/drivers/iio/timer/Kconfig
>>> new file mode 100644
>>> index 0000000..e3c21f2
>>> --- /dev/null
>>> +++ b/drivers/iio/timer/Kconfig
>>> @@ -0,0 +1,13 @@
>>> +#
>>> +# Timers drivers
>>> +
>>> +menu "Timers"
>>> +
>>> +config IIO_STM32_TIMER_TRIGGER
>>> + tristate "STM32 Timer Trigger"
>>> + depends on (ARCH_STM32 && OF && MFD_STM32_TIMERS) || COMPILE_TEST
>>> + select IIO_TRIGGERED_EVENT
>>> + help
>>> + Select this option to enable STM32 Timer Trigger
>>> +
>>> +endmenu
>>> diff --git a/drivers/iio/timer/Makefile b/drivers/iio/timer/Makefile
>>> new file mode 100644
>>> index 0000000..4ad95ec9
>>> --- /dev/null
>>> +++ b/drivers/iio/timer/Makefile
>>> @@ -0,0 +1 @@
>>> +obj-$(CONFIG_IIO_STM32_TIMER_TRIGGER) += stm32-timer-trigger.o
>>> diff --git a/drivers/iio/timer/stm32-timer-trigger.c b/drivers/iio/timer/stm32-timer-trigger.c
>>> new file mode 100644
>>> index 0000000..8d16e8f
>>> --- /dev/null
>>> +++ b/drivers/iio/timer/stm32-timer-trigger.c
>>> @@ -0,0 +1,466 @@
>>> +/*
>>> + * Copyright (C) STMicroelectronics 2016
>>> + *
>>> + * Author: Benjamin Gaignard <benjamin.gaignard@st.com>
>>> + *
>>> + * License terms: GNU General Public License (GPL), version 2
>>> + */
>>> +
>>> +#include <linux/iio/iio.h>
>>> +#include <linux/iio/sysfs.h>
>>> +#include <linux/iio/timer/stm32-timer-trigger.h>
>>> +#include <linux/iio/trigger.h>
>>> +#include <linux/iio/triggered_event.h>
>>> +#include <linux/interrupt.h>
>>> +#include <linux/mfd/stm32-timers.h>
>>> +#include <linux/module.h>
>>> +#include <linux/platform_device.h>
>>> +
>>> +#define MAX_TRIGGERS 6
>>> +#define MAX_VALIDS 5
>>> +
>>> +/*
>>> + * List the triggers created by each timer.
>>> + * The probe function selects a row with the value of the "reg" DT property.
>>> + * Rows hold fewer than MAX_TRIGGERS names, so the remaining slots are NULL
>>> + * and terminate the walk in stm32_setup_iio_triggers().
>>> + */
>>> +static const void *triggers_table[][MAX_TRIGGERS] = {
>>> + { TIM1_TRGO, TIM1_CH1, TIM1_CH2, TIM1_CH3, TIM1_CH4,},
>>> + { TIM2_TRGO, TIM2_CH1, TIM2_CH2, TIM2_CH3, TIM2_CH4,},
>>> + { TIM3_TRGO, TIM3_CH1, TIM3_CH2, TIM3_CH3, TIM3_CH4,},
>>> + { TIM4_TRGO, TIM4_CH1, TIM4_CH2, TIM4_CH3, TIM4_CH4,},
>>> + { TIM5_TRGO, TIM5_CH1, TIM5_CH2, TIM5_CH3, TIM5_CH4,},
>>> + { TIM6_TRGO,},
>>> + { TIM7_TRGO,},
>>> + { TIM8_TRGO, TIM8_CH1, TIM8_CH2, TIM8_CH3, TIM8_CH4,},
>>> + { TIM9_TRGO, TIM9_CH1, TIM9_CH2,},
>>> + { TIM12_TRGO, TIM12_CH1, TIM12_CH2,},
>>> +};
>>> +
>>> +/*
>>> + * List the triggers accepted by each timer.
>>> + * Rows are indexed like triggers_table. The position of a name inside its
>>> + * row is the value stm32_validate_trigger() writes to the TIM_SMCR TS field.
>>> + * Rows hold fewer than MAX_VALIDS names, so they are NULL terminated.
>>> + */
>>> +static const void *valids_table[][MAX_VALIDS] = {
>>> + { TIM5_TRGO, TIM2_TRGO, TIM4_TRGO, TIM3_TRGO,},
>>> + { TIM1_TRGO, TIM8_TRGO, TIM3_TRGO, TIM4_TRGO,},
>>> + { TIM1_TRGO, TIM8_TRGO, TIM5_TRGO, TIM4_TRGO,},
>>> + { TIM1_TRGO, TIM2_TRGO, TIM3_TRGO, TIM8_TRGO,},
>>> + { TIM2_TRGO, TIM3_TRGO, TIM4_TRGO, TIM8_TRGO,},
>>> + { }, /* timer 6 */
>>> + { }, /* timer 7 */
>>> + { TIM1_TRGO, TIM2_TRGO, TIM4_TRGO, TIM5_TRGO,},
>>> + { TIM2_TRGO, TIM3_TRGO,},
>>> + { TIM4_TRGO, TIM5_TRGO,},
>>> +};
>>> +
>>> +/* Driver state shared by the triggers (and optional IIO device) of one timer. */
>>> +struct stm32_timer_trigger {
>>> + struct device *dev; /* platform device of this trigger driver */
>>> + struct regmap *regmap; /* timer registers, taken from the parent's drvdata */
>>> + struct clk *clk; /* timer clock, taken from the parent's drvdata */
>>> + u32 max_arr; /* largest value accepted as timer period */
>>> + const void *triggers; /* row of triggers_table for this timer */
>>> + const void *valids; /* row of valids_table for this timer */
>>> +};
>>> +
>>> +/*
>>> + * Program the timer to generate update events at @frequency Hz and start it.
>>> + *
>>> + * @frequency must be non-zero (the caller handles 0 by stopping the timer).
>>> + * Returns 0 on success, -EINVAL if the required prescaler is out of range,
>>> + * or -EBUSY if a capture/compare channel of the timer is already enabled.
>>> + */
>>> +static int stm32_timer_start(struct stm32_timer_trigger *priv,
>>> + unsigned int frequency)
>>> +{
>>> + unsigned long long prd, div;
>>> + int prescaler = 0;
>>> + u32 ccer, cr1;
>>> +
>>> + /* Period and prescaler values depends of clock rate */
>>> + div = (unsigned long long)clk_get_rate(priv->clk);
>>> +
>>> + do_div(div, frequency);
>>> +
>>> + /* prd keeps the undivided clock-ticks-per-period count */
>>> + prd = div;
>>> +
>>> + /*
>>> + * Increase prescaler value until we get a result that fit
>>> + * with auto reload register maximum value.
>>> + */
>>> + while (div > priv->max_arr) {
>>> + prescaler++;
>>> + div = prd;
>>> + do_div(div, (prescaler + 1));
>>> + }
>>> + /* from here on prd is the period in prescaled ticks */
>>> + prd = div;
>>> +
>>> + if (prescaler > MAX_TIM_PSC) {
>>> + dev_err(priv->dev, "prescaler exceeds the maximum value\n");
>>> + return -EINVAL;
>>> + }
>>> +
>>> + /* Check if nobody else use the timer */
>>> + regmap_read(priv->regmap, TIM_CCER, &ccer);
>>> + if (ccer & TIM_CCER_CCXE)
>>> + return -EBUSY;
>>> +
>>> + /* only take a clock reference if the counter is not already running */
>>> + regmap_read(priv->regmap, TIM_CR1, &cr1);
>>> + if (!(cr1 & TIM_CR1_CEN))
>>> + clk_enable(priv->clk);
>>> +
>>> + /* the divider is prescaler + 1, and ARR is written as period - 1 */
>>> + regmap_write(priv->regmap, TIM_PSC, prescaler);
>>> + regmap_write(priv->regmap, TIM_ARR, prd - 1);
>>> + regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_ARPE, TIM_CR1_ARPE);
>>> +
>>> + /* Force master mode to update mode */
>>> + /* NOTE(review): 0x20 is presumably 2 << TIM_CR2_MMS_SHIFT ("update") - confirm */
>>> + regmap_update_bits(priv->regmap, TIM_CR2, TIM_CR2_MMS, 0x20);
>>> +
>>> + /* Make sure that registers are updated */
>>> + regmap_update_bits(priv->regmap, TIM_EGR, TIM_EGR_UG, TIM_EGR_UG);
>>> +
>>> + /* Enable controller */
>>> + regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_CEN, TIM_CR1_CEN);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +/*
>>> + * Stop the timer and clear its prescaler and period registers.
>>> + * Does nothing when a capture/compare channel is enabled, i.e. when the
>>> + * timer appears to be in use by someone else.
>>> + */
>>> +static void stm32_timer_stop(struct stm32_timer_trigger *priv)
>>> +{
>>> + u32 ccer, cr1;
>>> +
>>> + regmap_read(priv->regmap, TIM_CCER, &ccer);
>>> + if (ccer & TIM_CCER_CCXE)
>>> + return;
>>> +
>>> + /* drop the clock reference taken by stm32_timer_start() */
>>> + regmap_read(priv->regmap, TIM_CR1, &cr1);
>>> + if (cr1 & TIM_CR1_CEN)
>>> + clk_disable(priv->clk);
>>> +
>>> + /* Stop timer */
>>> + regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_CEN, 0);
>>> + regmap_write(priv->regmap, TIM_PSC, 0);
>>> + regmap_write(priv->regmap, TIM_ARR, 0);
>>> +}
>>> +
>>> +/*
>>> + * sysfs "sampling_frequency" write handler.
>>> + *
>>> + * Parses a decimal frequency: 0 stops the timer, any other value
>>> + * (re)programs and starts it. Returns @len on success or a negative errno
>>> + * from the parser or from stm32_timer_start().
>>> + */
>>> +static ssize_t stm32_tt_store_frequency(struct device *dev,
>>> +					struct device_attribute *attr,
>>> +					const char *buf, size_t len)
>>> +{
>>> +	struct stm32_timer_trigger *priv =
>>> +		iio_trigger_get_drvdata(to_iio_trigger(dev));
>>> +	unsigned int hz;
>>> +	int err;
>>> +
>>> +	err = kstrtouint(buf, 10, &hz);
>>> +	if (err)
>>> +		return err;
>>> +
>>> +	if (!hz) {
>>> +		stm32_timer_stop(priv);
>>> +		return len;
>>> +	}
>>> +
>>> +	err = stm32_timer_start(priv, hz);
>>> +	if (err)
>>> +		return err;
>>> +
>>> +	return len;
>>> +}
>>> +
>>> +/*
>>> + * sysfs "sampling_frequency" read handler.
>>> + *
>>> + * Reports 0 when the counter is disabled. Otherwise the frequency is
>>> + * derived from the registers the same way stm32_timer_start() programs
>>> + * them: the clock is divided by (PSC + 1) and the period is (ARR + 1)
>>> + * ticks. A PSC of 0 is therefore a valid running configuration.
>>> + */
>>> +static ssize_t stm32_tt_read_frequency(struct device *dev,
>>> +				       struct device_attribute *attr, char *buf)
>>> +{
>>> +	struct iio_trigger *trig = to_iio_trigger(dev);
>>> +	struct stm32_timer_trigger *priv = iio_trigger_get_drvdata(trig);
>>> +	u32 psc, arr, cr1;
>>> +	unsigned long long freq = 0;
>>> +
>>> +	regmap_read(priv->regmap, TIM_CR1, &cr1);
>>> +	regmap_read(priv->regmap, TIM_PSC, &psc);
>>> +	regmap_read(priv->regmap, TIM_ARR, &arr);
>>> +
>>> +	/*
>>> +	 * do_div() takes a 32-bit divisor: if a register reads back as
>>> +	 * all ones the "+ 1" wraps to 0, so report 0 instead of dividing.
>>> +	 */
>>> +	if ((cr1 & TIM_CR1_CEN) && (psc + 1) != 0 && (arr + 1) != 0) {
>>> +		freq = (unsigned long long)clk_get_rate(priv->clk);
>>> +		do_div(freq, psc + 1);
>>> +		do_div(freq, arr + 1);
>>> +	}
>>> +
>>> +	return sprintf(buf, "%u\n", (unsigned int)freq);
>>> +}
>>> +
>>> +/* sampling_frequency attribute, attached to every trigger through dev.groups */
>>> +static IIO_DEV_ATTR_SAMP_FREQ(0660,
>>> + stm32_tt_read_frequency,
>>> + stm32_tt_store_frequency);
>>> +
>>> +static struct attribute *stm32_trigger_attrs[] = {
>>> + &iio_dev_attr_sampling_frequency.dev_attr.attr,
>>> + NULL,
>>> +};
>>> +
>>> +static const struct attribute_group stm32_trigger_attr_group = {
>>> + .attrs = stm32_trigger_attrs,
>>> +};
>>> +
>>> +/* NULL-terminated group list assigned to trig->dev.groups */
>>> +static const struct attribute_group *stm32_trigger_attr_groups[] = {
>>> + &stm32_trigger_attr_group,
>>> + NULL,
>>> +};
>>> +
>>> +/* Master mode names; the array index is the value of the TIM_CR2 MMS field. */
>>> +static char *master_mode_table[] = {
>>> + "reset",
>>> + "enable",
>>> + "update",
>>> + "compare_pulse",
>>> + "OC1REF",
>>> + "OC2REF",
>>> + "OC3REF",
>>> + "OC4REF"
>>> +};
>>> +
>>> +/*
>>> + * sysfs "master_mode" read handler: print the name matching the MMS field
>>> + * currently set in TIM_CR2.
>>> + */
>>> +static ssize_t stm32_tt_show_master_mode(struct device *dev,
>>> +					 struct device_attribute *attr,
>>> +					 char *buf)
>>> +{
>>> +	struct stm32_timer_trigger *priv = iio_priv(dev_to_iio_dev(dev));
>>> +	u32 reg;
>>> +	u32 mode;
>>> +
>>> +	regmap_read(priv->regmap, TIM_CR2, &reg);
>>> +	mode = (reg & TIM_CR2_MMS) >> TIM_CR2_MMS_SHIFT;
>>> +
>>> +	return snprintf(buf, PAGE_SIZE, "%s\n", master_mode_table[mode]);
>>> +}
>>> +
>>> +/*
>>> + * sysfs "master_mode" write handler.
>>> + *
>>> + * The first table entry that is a prefix of @buf selects the mode, which is
>>> + * written to the MMS field of TIM_CR2. Returns @len on success or -EINVAL
>>> + * when no entry matches.
>>> + */
>>> +static ssize_t stm32_tt_store_master_mode(struct device *dev,
>>> +					  struct device_attribute *attr,
>>> +					  const char *buf, size_t len)
>>> +{
>>> +	struct stm32_timer_trigger *priv = iio_priv(dev_to_iio_dev(dev));
>>> +	int mode;
>>> +
>>> +	for (mode = 0; mode < ARRAY_SIZE(master_mode_table); mode++) {
>>> +		const char *name = master_mode_table[mode];
>>> +
>>> +		if (strncmp(name, buf, strlen(name)) != 0)
>>> +			continue;
>>> +
>>> +		regmap_update_bits(priv->regmap, TIM_CR2,
>>> +				   TIM_CR2_MMS, mode << TIM_CR2_MMS_SHIFT);
>>> +		return len;
>>> +	}
>>> +
>>> +	return -EINVAL;
>>> +}
>>> +
>>> +/* Must list the same names, in the same order, as master_mode_table. */
>>> +static IIO_CONST_ATTR(master_mode_available,
>>> + "reset enable update compare_pulse OC1REF OC2REF OC3REF OC4REF");
>>> +
>>> +/* master_mode attribute of the IIO device */
>>> +static IIO_DEVICE_ATTR(master_mode, 0660,
>>> + stm32_tt_show_master_mode,
>>> + stm32_tt_store_master_mode,
>>> + 0);
>>> +
>>> +/* Slave mode names; the array index is the value of the TIM_SMCR SMS field. */
>>> +static char *slave_mode_table[] = {
>>> + "disabled",
>>> + "encoder_1",
>>> + "encoder_2",
>>> + "encoder_3",
>>> + "reset",
>>> + "gated",
>>> + "trigger",
>>> + "external_clock",
>>> +};
>>> +
>>> +/*
>>> + * sysfs "slave_mode" read handler: print the name matching the SMS field
>>> + * currently set in TIM_SMCR.
>>> + */
>>> +static ssize_t stm32_tt_show_slave_mode(struct device *dev,
>>> +					struct device_attribute *attr,
>>> +					char *buf)
>>> +{
>>> +	struct stm32_timer_trigger *priv = iio_priv(dev_to_iio_dev(dev));
>>> +	u32 mode;
>>> +
>>> +	regmap_read(priv->regmap, TIM_SMCR, &mode);
>>> +	mode = mode & TIM_SMCR_SMS;
>>> +
>>> +	return snprintf(buf, PAGE_SIZE, "%s\n", slave_mode_table[mode]);
>>> +}
>>> +
>>> +/*
>>> + * sysfs "slave_mode" write handler.
>>> + *
>>> + * The first table entry that is a prefix of @buf selects the mode, which is
>>> + * written to the SMS field of TIM_SMCR. Returns @len on success or -EINVAL
>>> + * when no entry matches.
>>> + */
>>> +static ssize_t stm32_tt_store_slave_mode(struct device *dev,
>>> +					 struct device_attribute *attr,
>>> +					 const char *buf, size_t len)
>>> +{
>>> +	struct stm32_timer_trigger *priv = iio_priv(dev_to_iio_dev(dev));
>>> +	int mode;
>>> +
>>> +	for (mode = 0; mode < ARRAY_SIZE(slave_mode_table); mode++) {
>>> +		const char *name = slave_mode_table[mode];
>>> +
>>> +		if (strncmp(name, buf, strlen(name)) != 0)
>>> +			continue;
>>> +
>>> +		regmap_update_bits(priv->regmap,
>>> +				   TIM_SMCR, TIM_SMCR_SMS, mode);
>>> +		return len;
>>> +	}
>>> +
>>> +	return -EINVAL;
>>> +}
>>> +
>>> +/* Must list the same names, in the same order, as slave_mode_table. */
>>> +static IIO_CONST_ATTR(slave_mode_available,
>>> +"disabled encoder_1 encoder_2 encoder_3 reset gated trigger external_clock");
>>> +
>>> +/* slave_mode attribute of the IIO device */
>>> +static IIO_DEVICE_ATTR(slave_mode, 0660,
>>> + stm32_tt_show_slave_mode,
>>> + stm32_tt_store_slave_mode,
>>> + 0);
>>> +
>>> +/* Attributes of the IIO device, exposed through stm32_trigger_info.attrs */
>>> +static struct attribute *stm32_timer_attrs[] = {
>>> + &iio_dev_attr_master_mode.dev_attr.attr,
>>> + &iio_const_attr_master_mode_available.dev_attr.attr,
>>> + &iio_dev_attr_slave_mode.dev_attr.attr,
>>> + &iio_const_attr_slave_mode_available.dev_attr.attr,
>>> + NULL,
>>> +};
>>> +
>>> +static const struct attribute_group stm32_timer_attr_group = {
>>> + .attrs = stm32_timer_attrs,
>>> +};
>>> +
>>> +/*
>>> + * Trigger ops shared by all triggers of this driver. The address of this
>>> + * object is what is_stm32_timer_trigger() compares against.
>>> + */
>>> +static const struct iio_trigger_ops timer_trigger_ops = {
>>> + .owner = THIS_MODULE,
>>> +};
>>> +
>>> +/*
>>> + * Allocate and register one IIO trigger for every name in priv->triggers.
>>> + *
>>> + * All resources are device-managed. Returns 0 on success, -ENOMEM if an
>>> + * allocation fails, or the error returned by the trigger registration.
>>> + */
>>> +static int stm32_setup_iio_triggers(struct stm32_timer_trigger *priv)
>>> +{
>>> +	const char * const *name;
>>> +	int err;
>>> +
>>> +	for (name = priv->triggers; name && *name; name++) {
>>> +		struct iio_trigger *trig;
>>> +
>>> +		trig = devm_iio_trigger_alloc(priv->dev, "%s", *name);
>>> +		if (!trig)
>>> +			return -ENOMEM;
>>> +
>>> +		trig->dev.parent = priv->dev->parent;
>>> +		trig->ops = &timer_trigger_ops;
>>> +		trig->dev.groups = stm32_trigger_attr_groups;
>>> +		iio_trigger_set_drvdata(trig, priv);
>>> +
>>> +		err = devm_iio_trigger_register(priv->dev, trig);
>>> +		if (err)
>>> +			return err;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +/**
>>> + * is_stm32_timer_trigger
>>> + * @trig: trigger to be checked
>>> + *
>>> + * Return: true if @trig was created by this driver (its ops pointer is
>>> + * this driver's timer_trigger_ops), false otherwise.
>>> + */
>>> +bool is_stm32_timer_trigger(struct iio_trigger *trig)
>>> +{
>>> + return (trig->ops == &timer_trigger_ops);
>>> +}
>>> +EXPORT_SYMBOL(is_stm32_timer_trigger);
>>> +
>>> +/*
>>> + * validate_trigger callback of the IIO device.
>>> + *
>>> + * Accepts @trig only if it belongs to this driver and its name matches an
>>> + * entry of priv->valids; the index of that entry is written to the TS
>>> + * field of TIM_SMCR. Returns 0 when accepted, -EINVAL otherwise.
>>> + */
>>> +static int stm32_validate_trigger(struct iio_dev *indio_dev,
>>> +				  struct iio_trigger *trig)
>>> +{
>>> +	struct stm32_timer_trigger *priv = iio_priv(indio_dev);
>>> +	const char * const *valid;
>>> +	unsigned int sel;
>>> +
>>> +	if (!is_stm32_timer_trigger(trig))
>>> +		return -EINVAL;
>>> +
>>> +	for (valid = priv->valids, sel = 0; valid && *valid; valid++, sel++) {
>>> +		if (strncmp(trig->name, *valid, strlen(trig->name)))
>>> +			continue;
>>> +
>>> +		regmap_update_bits(priv->regmap,
>>> +				   TIM_SMCR, TIM_SMCR_TS,
>>> +				   sel << TIM_SMCR_TS_SHIFT);
>>> +		return 0;
>>> +	}
>>> +
>>> +	return -EINVAL;
>>> +}
>>> +
>>> +/* iio_info of the IIO device: trigger validation plus the mode attributes */
>>> +static const struct iio_info stm32_trigger_info = {
>>> + .driver_module = THIS_MODULE,
>>> + .validate_trigger = stm32_validate_trigger,
>>> + .attrs = &stm32_timer_attr_group,
>>> +};
>>> +
>>> +/*
>>> + * Allocate and register a channel-less IIO device whose private area holds
>>> + * the struct stm32_timer_trigger.
>>> + *
>>> + * Returns the private data, or NULL if allocation or registration fails.
>>> + * The registration error code is not propagated; the probe function turns
>>> + * a NULL result into -ENOMEM.
>>> + */
>>> +static struct stm32_timer_trigger *stm32_setup_iio_device(struct device *dev)
>>> +{
>>> + struct iio_dev *indio_dev;
>>> + int ret;
>>> +
>>> + indio_dev = devm_iio_device_alloc(dev,
>>> + sizeof(struct stm32_timer_trigger));
>>> + if (!indio_dev)
>>> + return NULL;
>>> +
>>> + indio_dev->name = dev_name(dev);
>>> + indio_dev->dev.parent = dev;
>>> + indio_dev->info = &stm32_trigger_info;
>>> + indio_dev->modes = INDIO_EVENT_TRIGGERED;
>>> + /* no channels: the device only carries attributes and trigger validation */
>>> + indio_dev->num_channels = 0;
>>> + indio_dev->dev.of_node = dev->of_node;
>>> +
>>> + ret = devm_iio_device_register(dev, indio_dev);
>>> + if (ret)
>>> + return NULL;
>>> +
>>> + return iio_priv(indio_dev);
>>> +}
>>> +
>>> +static int stm32_timer_trigger_probe(struct platform_device *pdev)
>>> +{
>>> + struct device *dev = &pdev->dev;
>>> + struct stm32_timer_trigger *priv;
>>> + struct stm32_timers *ddata = dev_get_drvdata(pdev->dev.parent);
>>> + unsigned int index;
>>> + int ret;
>>> +
>>> + if (of_property_read_u32(dev->of_node, "reg", &index))
>>> + return -EINVAL;
>>> +
>>> + if (index >= ARRAY_SIZE(triggers_table))
>>> + return -EINVAL;
>>> +
>>> + /* Create an IIO device only if we have triggers to be validated */
>>> + if (*valids_table[index])
>>> + priv = stm32_setup_iio_device(dev);
>>
>> I still don't like this. Really feels like we shouldn't be creating an
>> iio device with all the baggage that carries just to allow us to do the
>> trigger trees. We ought to have a much more light weight solution for this
>> functionality - we aren't typically even using the interrupt tree stuff
>> that the triggers for devices are all really about.
>>
>> A simpler approach of allowing each trigger the option of a parent seems like
>> it would be cleaner. Could be done entirely within this driver in the first
>> instance. Basically it would just look like your master and slave attributes
>> but have those under triggerX not iio:deviceX.
>>
>> We can work out how to make it more generic later - including perhaps the
>> option to trigger from triggers outside this driver, using some parallel
>> infrastructure to the device triggering.
>>
>>
>>> + else
>>> + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
>>> +
>>> + if (!priv)
>>> + return -ENOMEM;
>>> +
>>> + priv->dev = dev;
>>> + priv->regmap = ddata->regmap;
>>> + priv->clk = ddata->clk;
>>> + priv->max_arr = ddata->max_arr;
>>> + priv->triggers = triggers_table[index];
>>> + priv->valids = valids_table[index];
>>> +
>>> + ret = stm32_setup_iio_triggers(priv);
>>> + if (ret)
>>> + return ret;
>>> +
>>> + platform_set_drvdata(pdev, priv);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static const struct of_device_id stm32_trig_of_match[] = {
>>> + { .compatible = "st,stm32-timer-trigger", },
>>> + { /* end node */ },
>>> +};
>>> +MODULE_DEVICE_TABLE(of, stm32_trig_of_match);
>>> +
>>> +static struct platform_driver stm32_timer_trigger_driver = {
>>> + .probe = stm32_timer_trigger_probe,
>>> + .driver = {
>>> + .name = "stm32-timer-trigger",
>>> + .of_match_table = stm32_trig_of_match,
>>> + },
>>> +};
>>> +module_platform_driver(stm32_timer_trigger_driver);
>>> +
>>> +MODULE_ALIAS("platform: stm32-timer-trigger");
>>> +MODULE_DESCRIPTION("STMicroelectronics STM32 Timer Trigger driver");
>>> +MODULE_LICENSE("GPL v2");
>>> diff --git a/drivers/iio/trigger/Kconfig b/drivers/iio/trigger/Kconfig
>>> index 809b2e7..f2af4fe 100644
>>> --- a/drivers/iio/trigger/Kconfig
>>> +++ b/drivers/iio/trigger/Kconfig
>>> @@ -46,5 +46,4 @@ config IIO_SYSFS_TRIGGER
>>>
>>> To compile this driver as a module, choose M here: the
>>> module will be called iio-trig-sysfs.
>>> -
>> Clean this up.
>
> ok
>
>>> endmenu
>>> diff --git a/include/linux/iio/timer/stm32-timer-trigger.h b/include/linux/iio/timer/stm32-timer-trigger.h
>>> new file mode 100644
>>> index 0000000..55535ae
>>> --- /dev/null
>>> +++ b/include/linux/iio/timer/stm32-timer-trigger.h
>>> @@ -0,0 +1,62 @@
>>> +/*
>>> + * Copyright (C) STMicroelectronics 2016
>>> + *
>>> + * Author: Benjamin Gaignard <benjamin.gaignard@st.com>
>>> + *
>>> + * License terms: GNU General Public License (GPL), version 2
>>> + */
>>> +
>>> +#ifndef _STM32_TIMER_TRIGGER_H_
>>> +#define _STM32_TIMER_TRIGGER_H_
>>> +
>>> +#define TIM1_TRGO "tim1_trgo"
>>> +#define TIM1_CH1 "tim1_ch1"
>>> +#define TIM1_CH2 "tim1_ch2"
>>> +#define TIM1_CH3 "tim1_ch3"
>>> +#define TIM1_CH4 "tim1_ch4"
>>> +
>>> +#define TIM2_TRGO "tim2_trgo"
>>> +#define TIM2_CH1 "tim2_ch1"
>>> +#define TIM2_CH2 "tim2_ch2"
>>> +#define TIM2_CH3 "tim2_ch3"
>>> +#define TIM2_CH4 "tim2_ch4"
>>> +
>>> +#define TIM3_TRGO "tim3_trgo"
>>> +#define TIM3_CH1 "tim3_ch1"
>>> +#define TIM3_CH2 "tim3_ch2"
>>> +#define TIM3_CH3 "tim3_ch3"
>>> +#define TIM3_CH4 "tim3_ch4"
>>> +
>>> +#define TIM4_TRGO "tim4_trgo"
>>> +#define TIM4_CH1 "tim4_ch1"
>>> +#define TIM4_CH2 "tim4_ch2"
>>> +#define TIM4_CH3 "tim4_ch3"
>>> +#define TIM4_CH4 "tim4_ch4"
>>> +
>>> +#define TIM5_TRGO "tim5_trgo"
>>> +#define TIM5_CH1 "tim5_ch1"
>>> +#define TIM5_CH2 "tim5_ch2"
>>> +#define TIM5_CH3 "tim5_ch3"
>>> +#define TIM5_CH4 "tim5_ch4"
>>> +
>>> +#define TIM6_TRGO "tim6_trgo"
>>> +
>>> +#define TIM7_TRGO "tim7_trgo"
>>> +
>>> +#define TIM8_TRGO "tim8_trgo"
>>> +#define TIM8_CH1 "tim8_ch1"
>>> +#define TIM8_CH2 "tim8_ch2"
>>> +#define TIM8_CH3 "tim8_ch3"
>>> +#define TIM8_CH4 "tim8_ch4"
>>> +
>>> +#define TIM9_TRGO "tim9_trgo"
>>> +#define TIM9_CH1 "tim9_ch1"
>>> +#define TIM9_CH2 "tim9_ch2"
>>> +
>>> +#define TIM12_TRGO "tim12_trgo"
>>> +#define TIM12_CH1 "tim12_ch1"
>>> +#define TIM12_CH2 "tim12_ch2"
>>> +
>>> +bool is_stm32_timer_trigger(struct iio_trigger *trig);
>>> +
>>> +#endif
>>>
>>
>
>
>
^ permalink raw reply
* [PATCH v2 3/5] arm64: dts: exynos5433: Add PPMU dt node
From: Krzysztof Kozlowski @ 2017-01-02 18:33 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1481173091-9728-4-git-send-email-cw00.choi@samsung.com>
On Thu, Dec 08, 2016 at 01:58:09PM +0900, Chanwoo Choi wrote:
> This patch adds PPMU (Platform Performance Monitoring Unit) Device-tree node
> to measure the utilization of each IP in Exynos SoC.
>
> - PPMU_D{0|1}_CPU are used to measure the utilization of MIF (Memory Interface)
> block with VDD_MIF power source.
> - PPMU_D{0|1}_GENERAL are used to measure the utilization of INT(Internal)
> block with VDD_INT power source.
>
> Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
> Reviewed-by: Krzysztof Kozlowski <krzk@kernel.org>
> ---
> arch/arm64/boot/dts/exynos/exynos5433.dtsi | 24 ++++++++++++++++++++++++
> 1 file changed, 24 insertions(+)
>
Thanks, applied.
Best regards,
Krzysztof
^ permalink raw reply
* [PATCH v2 4/5] arm64: dts: exynos5433: Add bus dt node using VDD_INT for Exynos5433
From: Krzysztof Kozlowski @ 2017-01-02 18:35 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1481173091-9728-5-git-send-email-cw00.choi@samsung.com>
On Thu, Dec 08, 2016 at 01:58:10PM +0900, Chanwoo Choi wrote:
> This patch adds the bus nodes using VDD_INT for Exynos5433 SoC.
> Exynos5433 has the following AMBA AXI buses to translate data
> between DRAM and sub-blocks.
>
> The following list specifies the detailed correlation between sub-block and clock:
> - CLK_ACLK_G2D_{400|266} : Bus clock for G2D (2D graphic engine)
> - CLK_ACLK_MSCL_400 : Bus clock for MSCL (Memory to memory Scaler)
> - CLK_ACLK_GSCL_333 : Bus clock for GSCL (General Scaler)
> - CLK_SCLK_JPEG_MSCL : Bus clock for JPEG
> - CLK_ACLK_MFC_400 : Bus clock for MFC (Multi Format Codec)
> - CLK_ACLK_HEVC_400 : Bus clock for HEVC (High Efficient Video Codec)
> - CLK_ACLK_BUS0_400 : NoC(Network On Chip)'s bus clock for PERIC/PERIS/FSYS/MSCL
> - CLK_ACLK_BUS1_400 : NoC's bus clock for MFC/HEVC/G3D
> - CLK_ACLK_BUS2_400 : NoC's bus clock for GSCL/DISP/G2D/CAM0/CAM1/ISP
>
> Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
> ---
> arch/arm64/boot/dts/exynos/exynos5433-bus.dtsi | 197 +++++++++++++++++++++++++
> arch/arm64/boot/dts/exynos/exynos5433.dtsi | 1 +
> 2 files changed, 198 insertions(+)
> create mode 100644 arch/arm64/boot/dts/exynos/exynos5433-bus.dtsi
>
Thanks, applied with changes:
1. Subject prefix,
2. Minor adjustments in commit msg,
3. Fixed missing space in 'status = "disabled"'.
Best regards,
Krzysztof
^ permalink raw reply
* [PATCH v2 5/5] arm64: dts: exynos5433: Add support of bus frequency using VDD_INT on TM2
From: Krzysztof Kozlowski @ 2017-01-02 18:37 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1481173091-9728-6-git-send-email-cw00.choi@samsung.com>
On Thu, Dec 08, 2016 at 01:58:11PM +0900, Chanwoo Choi wrote:
> This patch adds the bus Device-tree nodes for INT (Internal) block
> to enable the bus frequency scaling.
>
> Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
> Reviewed-by: Krzysztof Kozlowski <krzk@kernel.org>
> ---
> arch/arm64/boot/dts/exynos/exynos5433-tm2.dts | 70 +++++++++++++++++++++++++++
> 1 file changed, 70 insertions(+)
>
Thanks, applied.
Best regards,
Krzysztof
^ permalink raw reply
* [PATCH] DTS: MCCMON6: IMX: Provide support for iMX6Q based Liebherr mccmon6 board
From: Vladimir Zapolskiy @ 2017-01-02 19:12 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20170102154437.63406b95@jawa>
Hi Lukasz,
please find some comments below as usual.
On 01/02/2017 04:44 PM, Lukasz Majewski wrote:
> Hi Vladimir,
>
> Thank you for review. Comments without my remarks have been applied
> already.
>
>> Hello Lukasz,
>>
>> On 12/27/2016 01:19 AM, Lukasz Majewski wrote:
>>> Signed-off-by: Lukasz Majewski <l.majewski@majess.pl>
>>
>> please add a commit message with a short description of the change.
>>
>> Also change subject line to "ARM: dts: imx6q: Add mccmon6 board
>> support".
>>
>>> ---
[snip]
>>> +/ {
>>> + model = "Monitor6 i.MX6 Quad Board";
>>
>> Missing hardware vendor name.
>>
>>> + compatible = "mccmon6", "fsl,imx6q";
>>
>> Missing hardware vendor prefix before "mccmon6".
>
> "lwn,mccmon6" ?
>
Something like that, but please ensure that you add "lwn" vendor in a separate
preceding change to Documentation/devicetree/bindings/vendor-prefixes.txt
>>
>>> +
>>> + memory {
>>> + reg = <0x10000000 0x80000000>;
>>> + };
>>> +
>>> + ethernet0 {
>>> + status = "okay";
>>> + };
>>
>> It looks like a useless device node, you have a description of &fec
>> already.
>>
>>> +
>>> + backlight_lvds: backlight {
>>> + compatible = "pwm-backlight";
>>> + pinctrl-names = "default";
>>> + pinctrl-0 = <&pinctrl_display>;
>>
>> I would recommend to rename "pinctrl_display" to "pinctrl_backlight".
>>
>>> + pwms = <&pwm2 0 5000000 PWM_POLARITY_INVERTED>;
>>
>> This should work when extension to the i.MX PWM driver is merged.
>
> Yes. The PWM -> apply is an ongoing work. But without the PWM patch the
> board is also fully operational (with reversed PWM :-) )
>
Right, I believe that the current PWM driver ignores the value passed
in the third cell, so it should be okay.
>>
>>> + brightness-levels = < 0 1 2 3 4 5 6
>>> 7 8 9
>>> + 10 11 12 13 14 15 16
>>> 17 18 19
>>> + 20 21 22 23 24 25 26
>>> 27 28 29
>>> + 30 31 32 33 34 35 36
>>> 37 38 39
>>> + 40 41 42 43 44 45 46
>>> 47 48 49
>>> + 50 51 52 53 54 55 56
>>> 57 58 59
>>> + 60 61 62 63 64 65 66
>>> 67 68 69
>>> + 70 71 72 73 74 75 76
>>> 77 78 79
>>> + 80 81 82 83 84 85 86
>>> 87 88 89
>>> + 90 91 92 93 94 95 96
>>> 97 98 99
>>> + 100 101 102 103 104 105 106
>>> 107 108 109
>>> + 110 111 112 113 114 115 116
>>> 117 118 119
>>> + 120 121 122 123 124 125 126
>>> 127 128 129
>>> + 130 131 132 133 134 135 136
>>> 137 138 139
>>> + 140 141 142 143 144 145 146
>>> 147 148 149
>>> + 150 151 152 153 154 155 156
>>> 157 158 159
>>> + 160 161 162 163 164 165 166
>>> 167 168 169
>>> + 170 171 172 173 174 175 176
>>> 177 178 179
>>> + 180 181 182 183 184 185 186
>>> 187 188 189
>>> + 190 191 192 193 194 195 196
>>> 197 198 199
>>> + 200 201 202 203 204 205 206
>>> 207 208 209
>>> + 210 211 212 213 214 215 216
>>> 217 218 219
>>> + 220 221 222 223 224 225 226
>>> 227 228 229
>>> + 230 231 232 233 234 235 236
>>> 237 238 239
>>> + 240 241 242 243 244 245 246
>>> 247 248 249
>>> + 250 251 252 253 254 255>;
>>
>> I'm not sure that you actually need such a long list of brightness levels.
>
> Such brightness-level property is so verbose on purpose - in this board
> we need fine brightness adjustment (harsh environment operation).
Okay.
>>
>>> + default-brightness-level = <50>;
>>> + enable-gpios = <&gpio1 2 GPIO_ACTIVE_LOW>;
>>> + };
>>> +
[snip]
>>> + pinctrl_display: dispgrp {
>>> + fsl,pins = <
>>> + /* BLEN_OUT */
>>> + MX6QDL_PAD_GPIO_2__GPIO1_IO02
>>> 0x1b0b0
>>> + /* LVDS_PPEN_OUT */
>>> + MX6QDL_PAD_SD1_DAT2__GPIO1_IO19
>>> 0x1b0b0
>>
>> This GPIO should be moved to a pinctrl group of regulator-lvds device
>> node.
>
> You mean to provide separate:
>
> pinctrl_reg_lvds: req_lvds_grp {
> fsl,pins = <
> /* LVDS_PPEN_OUT */
> MX6QDL_PAD_SD1_DAT2__GPIO1_IO19
> >;
>
> and then
>
> reg_lvds: regulator-lvds {
> compatible = "regulator-fixed";
> regulator-name = "lvds_ppen";
> regulator-min-microvolt = <3300000>;
> regulator-max-microvolt = <3300000>;
> regulator-boot-on;
>
> pinctrl-names = "default";
> pinctrl-0 = <&pinctrl_reg_lvds>;
>
> gpio = <&gpio1 19 GPIO_ACTIVE_HIGH>;
> enable-active-high;
> };
>
This looks correct.
[snip]
>>> +
>>> +&uart1 {
>>> + pinctrl-names = "default";
>>> + pinctrl-0 = <&pinctrl_uart1>;
>>
>> Should you add "uart-has-rtscts" property?
>
> This is a simple "console" uart without rts/cts, so this property is
> not needed.
>
You are right, my review comment is valid for UART4 only.
[snip]
--
With best wishes,
Vladimir
^ permalink raw reply
* [GIT PULL] Qualcomm ARM DT Fixes for 4.10-rc2
From: Andy Gross @ 2017-01-02 20:03 UTC (permalink / raw)
To: linux-arm-kernel
The following changes since commit 0c744ea4f77d72b3dcebb7a8f2684633ec79be88:
Linux 4.10-rc2 (2017-01-01 14:31:53 -0800)
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/agross/linux.git tags/qcom-arm-fixes-for-4.10-rc2
for you to fetch changes up to 542b9f0759ed74ca0f1a9f3ff090c95ea73eba91:
ARM: dts: qcom: apq8064: Add missing scm clock (2017-01-02 10:47:10 -0600)
----------------------------------------------------------------
Qualcomm ARM DTS Fixes for v4.10-rc2
* Add SCM clock for APQ8064 to fix boot failures
----------------------------------------------------------------
Bjorn Andersson (1):
ARM: dts: qcom: apq8064: Add missing scm clock
arch/arm/boot/dts/qcom-apq8064.dtsi | 4 ++++
1 file changed, 4 insertions(+)
^ permalink raw reply
* [PATCH 15/20] ARM/hw_breakpoint: Convert to hotplug state machine
From: Linus Walleij @ 2017-01-02 20:15 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20170102150015.GJ14217@n2100.armlinux.org.uk>
On Mon, Jan 2, 2017 at 4:00 PM, Russell King - ARM Linux
<linux@armlinux.org.uk> wrote:
> On Mon, Jan 02, 2017 at 03:34:32PM +0100, Linus Walleij wrote:
>> in the first line of arch_hw_breakpoint_init() in
>> arch/arm/kernel/hw_breakpoint.c
>>
>> I suspect that is not an acceptable solution ...
>>
>> It hangs at PC is at write_wb_reg+0x20c/0x330
>> Which is c03101dc, and looks like this in objdump -d:
>>
>> c031020c: ee001eba mcr 14, 0, r1, cr0, cr10, {5}
>> c0310210: eaffffb3 b c03100e4 <write_wb_reg+0x114>
>
> ... and this is several instructions after the address you mention above.
> Presumably c03101dc is accessing a higher numbered register?
Ah sorry. It looks like this:
c03101dc: ee001ed0 mcr 14, 0, r1, cr0, cr0, {6}
c03101e0: eaffffbf b c03100e4 <write_wb_reg+0x114>
c03101e4: ee001ebf mcr 14, 0, r1, cr0, cr15, {5}
c03101e8: eaffffbd b c03100e4 <write_wb_reg+0x114>
c03101ec: ee001ebe mcr 14, 0, r1, cr0, cr14, {5}
c03101f0: eaffffbb b c03100e4 <write_wb_reg+0x114>
c03101f4: ee001ebd mcr 14, 0, r1, cr0, cr13, {5}
c03101f8: eaffffb9 b c03100e4 <write_wb_reg+0x114>
Yours,
Linus Walleij
^ permalink raw reply
* [PATCH] ARM: multi_v7_defconfig: enable Qualcomm RPMCC
From: Andy Gross @ 2017-01-02 20:35 UTC (permalink / raw)
To: linux-arm-kernel
This patch enables the Qualcomm RPM based Clock Controller present on
A-family boards.
Signed-off-by: Andy Gross <andy.gross@linaro.org>
---
arch/arm/configs/multi_v7_defconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index b01a438..4ff6779 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -824,6 +824,7 @@ CONFIG_QCOM_SMSM=y
CONFIG_QCOM_WCNSS_CTRL=m
CONFIG_ROCKCHIP_PM_DOMAINS=y
CONFIG_COMMON_CLK_QCOM=y
+CONFIG_QCOM_CLK_RPM=y
CONFIG_CHROME_PLATFORMS=y
CONFIG_STAGING_BOARD=y
CONFIG_CROS_EC_CHARDEV=m
--
1.9.1
^ permalink raw reply related
* [PATCH 1/3] ARM: OMAP1: USB: tidy up logging output
From: Aaro Koskinen @ 2017-01-02 20:57 UTC (permalink / raw)
To: linux-arm-kernel
KERN_CONT/pr_cont is now required to continue log messages, use that.
Signed-off-by: Aaro Koskinen <aaro.koskinen@iki.fi>
---
arch/arm/mach-omap1/usb.c | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/arch/arm/mach-omap1/usb.c b/arch/arm/mach-omap1/usb.c
index 2506e59..d4aa118 100644
--- a/arch/arm/mach-omap1/usb.c
+++ b/arch/arm/mach-omap1/usb.c
@@ -95,17 +95,17 @@ omap_otg_init(struct omap_usb_config *config)
printk("USB: hmc %d", config->hmc_mode);
if (!alt_pingroup)
- printk(", usb2 alt %d wires", config->pins[2]);
+ pr_cont(", usb2 alt %d wires", config->pins[2]);
else if (config->pins[0])
- printk(", usb0 %d wires%s", config->pins[0],
+ pr_cont(", usb0 %d wires%s", config->pins[0],
is_usb0_device(config) ? " (dev)" : "");
if (config->pins[1])
- printk(", usb1 %d wires", config->pins[1]);
+ pr_cont(", usb1 %d wires", config->pins[1]);
if (!alt_pingroup && config->pins[2])
- printk(", usb2 %d wires", config->pins[2]);
+ pr_cont(", usb2 %d wires", config->pins[2]);
if (config->otg)
- printk(", Mini-AB on usb%d", config->otg - 1);
- printk("\n");
+ pr_cont(", Mini-AB on usb%d", config->otg - 1);
+ pr_cont("\n");
if (cpu_class_is_omap1()) {
u16 w;
@@ -573,13 +573,13 @@ static void __init omap_1510_usb_init(struct omap_usb_config *config)
printk("USB: hmc %d", config->hmc_mode);
if (config->pins[0])
- printk(", usb0 %d wires%s", config->pins[0],
+ pr_cont(", usb0 %d wires%s", config->pins[0],
is_usb0_device(config) ? " (dev)" : "");
if (config->pins[1])
- printk(", usb1 %d wires", config->pins[1]);
+ pr_cont(", usb1 %d wires", config->pins[1]);
if (config->pins[2])
- printk(", usb2 %d wires", config->pins[2]);
- printk("\n");
+ pr_cont(", usb2 %d wires", config->pins[2]);
+ pr_cont("\n");
/* use DPLL for 48 MHz function clock */
pr_debug("APLL %04x DPLL %04x REQ %04x\n", omap_readw(ULPD_APLL_CTRL),
--
2.9.2
^ permalink raw reply related
* [PATCH 2/3] ARM: OMAP1: USB: make omap_otg_init() static
From: Aaro Koskinen @ 2017-01-02 20:57 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20170102205705.19001-1-aaro.koskinen@iki.fi>
Make omap_otg_init() static.
Signed-off-by: Aaro Koskinen <aaro.koskinen@iki.fi>
---
arch/arm/mach-omap1/include/mach/usb.h | 2 --
arch/arm/mach-omap1/usb.c | 4 ++--
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/arch/arm/mach-omap1/include/mach/usb.h b/arch/arm/mach-omap1/include/mach/usb.h
index a7c5559..eb76628 100644
--- a/arch/arm/mach-omap1/include/mach/usb.h
+++ b/arch/arm/mach-omap1/include/mach/usb.h
@@ -10,8 +10,6 @@
#include <linux/platform_data/usb-omap1.h>
-void omap_otg_init(struct omap_usb_config *config);
-
#if IS_ENABLED(CONFIG_USB)
void omap1_usb_init(struct omap_usb_config *pdata);
#else
diff --git a/arch/arm/mach-omap1/usb.c b/arch/arm/mach-omap1/usb.c
index d4aa118..0b4ed94 100644
--- a/arch/arm/mach-omap1/usb.c
+++ b/arch/arm/mach-omap1/usb.c
@@ -58,7 +58,7 @@
#ifdef CONFIG_ARCH_OMAP_OTG
-void __init
+static void __init
omap_otg_init(struct omap_usb_config *config)
{
u32 syscon;
@@ -166,7 +166,7 @@ omap_otg_init(struct omap_usb_config *config)
}
#else
-void omap_otg_init(struct omap_usb_config *config) {}
+static void omap_otg_init(struct omap_usb_config *config) {}
#endif
#if IS_ENABLED(CONFIG_USB_OMAP)
--
2.9.2
^ permalink raw reply related
* [PATCH 3/3] ARM: OMAP1: USB: delete redundant CPU class checks
From: Aaro Koskinen @ 2017-01-02 20:57 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20170102205705.19001-1-aaro.koskinen@iki.fi>
Delete redundant CPU class checks. This code is only used
on OMAP1 nowadays.
Signed-off-by: Aaro Koskinen <aaro.koskinen@iki.fi>
---
arch/arm/mach-omap1/usb.c | 27 ++++++++++++---------------
1 file changed, 12 insertions(+), 15 deletions(-)
diff --git a/arch/arm/mach-omap1/usb.c b/arch/arm/mach-omap1/usb.c
index 0b4ed94..455e2cf 100644
--- a/arch/arm/mach-omap1/usb.c
+++ b/arch/arm/mach-omap1/usb.c
@@ -1,5 +1,5 @@
/*
- * Platform level USB initialization for FS USB OTG controller on omap1 and 24xx
+ * Platform level USB initialization for FS USB OTG controller on omap1
*
* Copyright (C) 2004 Texas Instruments, Inc.
*
@@ -63,6 +63,7 @@ omap_otg_init(struct omap_usb_config *config)
{
u32 syscon;
int alt_pingroup = 0;
+ u16 w;
/* NOTE: no bus or clock setup (yet?) */
@@ -87,9 +88,8 @@ omap_otg_init(struct omap_usb_config *config)
if (config->otg)
syscon |= OTG_EN;
#endif
- if (cpu_class_is_omap1())
- pr_debug("USB_TRANSCEIVER_CTRL = %03x\n",
- omap_readl(USB_TRANSCEIVER_CTRL));
+ pr_debug("USB_TRANSCEIVER_CTRL = %03x\n",
+ omap_readl(USB_TRANSCEIVER_CTRL));
pr_debug("OTG_SYSCON_2 = %08x\n", omap_readl(OTG_SYSCON_2));
omap_writel(syscon, OTG_SYSCON_2);
@@ -107,19 +107,16 @@ omap_otg_init(struct omap_usb_config *config)
pr_cont(", Mini-AB on usb%d", config->otg - 1);
pr_cont("\n");
- if (cpu_class_is_omap1()) {
- u16 w;
+ /* leave USB clocks/controllers off until needed */
+ w = omap_readw(ULPD_SOFT_REQ);
+ w &= ~SOFT_USB_CLK_REQ;
+ omap_writew(w, ULPD_SOFT_REQ);
- /* leave USB clocks/controllers off until needed */
- w = omap_readw(ULPD_SOFT_REQ);
- w &= ~SOFT_USB_CLK_REQ;
- omap_writew(w, ULPD_SOFT_REQ);
+ w = omap_readw(ULPD_CLOCK_CTRL);
+ w &= ~USB_MCLK_EN;
+ w |= DIS_USB_PVCI_CLK;
+ omap_writew(w, ULPD_CLOCK_CTRL);
- w = omap_readw(ULPD_CLOCK_CTRL);
- w &= ~USB_MCLK_EN;
- w |= DIS_USB_PVCI_CLK;
- omap_writew(w, ULPD_CLOCK_CTRL);
- }
syscon = omap_readl(OTG_SYSCON_1);
syscon |= HST_IDLE_EN|DEV_IDLE_EN|OTG_IDLE_EN;
--
2.9.2
^ permalink raw reply related
* [PATCH 1/5] ARM: wire up HWCAP2 feature bits to the CPU modalias
From: Ard Biesheuvel @ 2017-01-02 21:06 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20161031161319.GF1041@n2100.armlinux.org.uk>
On 31 October 2016 at 16:13, Russell King - ARM Linux
<linux@armlinux.org.uk> wrote:
> On Sat, Oct 29, 2016 at 11:08:36AM +0100, Ard Biesheuvel wrote:
>> On 18 October 2016 at 11:52, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>> > Wire up the generic support for exposing CPU feature bits via the
>> > modalias in /sys/devices/system/cpu. This allows udev to automatically
>> > load modules for things like crypto algorithms that are implemented
>> > using optional instructions.
>> >
>> > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>> > ---
>> > arch/arm/Kconfig | 1 +
>> > arch/arm/include/asm/cpufeature.h | 32 ++++++++++++++++++++
>> > 2 files changed, 33 insertions(+)
>> >
>>
>> Russell,
>>
>> do you have any concerns regarding this patch? If not, I will drop it
>> into the patch system.
>
> It's still something I need to look at... I've been offline last week,
> and sort-of offline the previous week, so I'm catching up.
>
Hi Russell,
Any thoughts yet?
Thanks,
Ard.
^ permalink raw reply
* [PATCH 00/20] i.MX Media Driver
From: Fabio Estevam @ 2017-01-02 21:09 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1483050455-10683-1-git-send-email-steve_longerbeam@mentor.com>
Hi Steve,
On Thu, Dec 29, 2016 at 8:27 PM, Steve Longerbeam <slongerbeam@gmail.com> wrote:
> This is a media driver for video capture on i.MX.
>
> Refer to Documentation/media/v4l-drivers/imx.rst for example capture
> pipelines on SabreSD, SabreAuto, and SabreLite reference platforms.
>
> This patchset includes the OF graph layout as proposed by Philipp Zabel,
> with only minor changes which are enumerated in the patch header.
Patches 13, 14 and 19 miss your Signed-off-by tag.
Tested the whole series on a mx6qsabresd:
Tested-by: Fabio Estevam <fabio.estevam@nxp.com>
^ permalink raw reply
* [PATCH v6 09/14] ACPI: platform: setup MSI domain for ACPI based platform device
From: Rafael J. Wysocki @ 2017-01-02 21:17 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1483363905-2806-10-git-send-email-hanjun.guo@linaro.org>
On Mon, Jan 2, 2017 at 2:31 PM, Hanjun Guo <hanjun.guo@linaro.org> wrote:
> With the platform msi domain created, we can set up the msi domain
> for a platform device when it's probed.
>
> In order to do that, we need to get the domain that the platform
> device connecting to, so the iort_get_platform_device_domain() is
> introduced to retrieve the domain from iort.
>
> After the domain is retrieved, we need a proper way to set the
> domain to platform device, as some platform devices such as an
> irqchip needs the msi irqdomain to be the interrupt parent domain,
> we need to get irqdomain before platform device is probed but after
> the platform device is allocated (the time slot of setting the
> msi domain also works for other cases). So simply call
> acpi_configure_pmsi_domain() in acpi_platform_notify() for
> platform devices will work.
>
> Signed-off-by: Hanjun Guo <hanjun.guo@linaro.org>
> Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
> Cc: Marc Zyngier <marc.zyngier@arm.com>
> Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
ACK for the glue.c part.
> ---
> drivers/acpi/arm64/iort.c | 43 +++++++++++++++++++++++++++++++++++++++++++
> drivers/acpi/glue.c | 6 ++++++
> include/linux/acpi_iort.h | 3 +++
> 3 files changed, 52 insertions(+)
>
> diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
> index bc68d93..6b72fcb 100644
> --- a/drivers/acpi/arm64/iort.c
> +++ b/drivers/acpi/arm64/iort.c
> @@ -527,6 +527,49 @@ struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id)
> return irq_find_matching_fwnode(handle, DOMAIN_BUS_PCI_MSI);
> }
>
> +/**
> + * iort_get_platform_device_domain() - Find MSI domain related to a
> + * platform device
> + * @dev: the dev pointer associated with the platform device
> + *
> + * Returns: the MSI domain for this device, NULL otherwise
> + */
> +static struct irq_domain *iort_get_platform_device_domain(struct device *dev)
> +{
> + struct acpi_iort_node *node, *msi_parent;
> + struct fwnode_handle *iort_fwnode;
> + struct acpi_iort_its_group *its;
> +
> + /* find its associated iort node */
> + node = iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT,
> + iort_match_node_callback, dev);
> + if (!node)
> + return NULL;
> +
> + /* then find its msi parent node */
> + msi_parent = iort_node_get_id(node, NULL, IORT_MSI_TYPE, 0);
> + if (!msi_parent)
> + return NULL;
> +
> + /* Move to ITS specific data */
> + its = (struct acpi_iort_its_group *)msi_parent->node_data;
> +
> + iort_fwnode = iort_find_domain_token(its->identifiers[0]);
> + if (!iort_fwnode)
> + return NULL;
> +
> + return irq_find_matching_fwnode(iort_fwnode, DOMAIN_BUS_PLATFORM_MSI);
> +}
> +
> +void acpi_configure_pmsi_domain(struct device *dev)
> +{
> + struct irq_domain *msi_domain;
> +
> + msi_domain = iort_get_platform_device_domain(dev);
> + if (msi_domain)
> + dev_set_msi_domain(dev, msi_domain);
> +}
> +
> static int __get_pci_rid(struct pci_dev *pdev, u16 alias, void *data)
> {
> u32 *rid = data;
> diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c
> index f8d6564..4a73f27 100644
> --- a/drivers/acpi/glue.c
> +++ b/drivers/acpi/glue.c
> @@ -6,6 +6,8 @@
> *
> * This file is released under the GPLv2.
> */
> +
> +#include <linux/acpi_iort.h>
> #include <linux/export.h>
> #include <linux/init.h>
> #include <linux/list.h>
> @@ -14,6 +16,7 @@
> #include <linux/rwsem.h>
> #include <linux/acpi.h>
> #include <linux/dma-mapping.h>
> +#include <linux/platform_device.h>
>
> #include "internal.h"
>
> @@ -315,6 +318,9 @@ static int acpi_platform_notify(struct device *dev)
> if (!adev)
> goto out;
>
> + if (dev->bus == &platform_bus_type)
> + acpi_configure_pmsi_domain(dev);
> +
> if (type && type->setup)
> type->setup(dev);
> else if (adev->handler && adev->handler->bind)
> diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
> index ef99fd52..33f5ac3 100644
> --- a/include/linux/acpi_iort.h
> +++ b/include/linux/acpi_iort.h
> @@ -38,6 +38,7 @@
> /* IOMMU interface */
> void iort_set_dma_mask(struct device *dev);
> const struct iommu_ops *iort_iommu_configure(struct device *dev);
> +void acpi_configure_pmsi_domain(struct device *dev);
> #else
> static inline void acpi_iort_init(void) { }
> static inline bool iort_node_match(u8 type) { return false; }
> @@ -58,6 +59,8 @@ static inline void iort_set_dma_mask(struct device *dev) { }
> static inline
> const struct iommu_ops *iort_iommu_configure(struct device *dev)
> { return NULL; }
> +
> +static inline void acpi_configure_pmsi_domain(struct device *dev) { }
> #endif
>
> #define IORT_ACPI_DECLARE(name, table_id, fn) \
> --
> 1.9.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo at vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* [PATCH] cpufreq: s3c64xx: remove incorrect __init annotation
From: Rafael J. Wysocki @ 2017-01-02 21:19 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20170102173651.hvsfalhcmisv4mtv@kozik-lap>
On Mon, Jan 2, 2017 at 6:36 PM, Krzysztof Kozlowski <krzk@kernel.org> wrote:
> On Mon, Jan 02, 2017 at 12:39:03PM +0530, Viresh Kumar wrote:
>> On 16-12-16, 10:06, Arnd Bergmann wrote:
>> > s3c64xx_cpufreq_config_regulator is incorrectly annotated
>> > as __init, since the caller is also not init:
>> >
>> > WARNING: vmlinux.o(.text+0x92fe1c): Section mismatch in reference from the function s3c64xx_cpufreq_driver_init() to the function .init.text:s3c64xx_cpufreq_config_regulator()
>> >
>> > With modern gcc versions, the function gets inlined, so we don't
>> > see the warning, this only happens with gcc-4.6 and older.
>> >
>> > Signed-off-by: Arnd Bergmann <arnd@arndb.de>
>> > ---
>> > drivers/cpufreq/s3c64xx-cpufreq.c | 2 +-
>> > 1 file changed, 1 insertion(+), 1 deletion(-)
>> >
>> > diff --git a/drivers/cpufreq/s3c64xx-cpufreq.c b/drivers/cpufreq/s3c64xx-cpufreq.c
>> > index 176e84cc3991..0cb9040eca49 100644
>> > --- a/drivers/cpufreq/s3c64xx-cpufreq.c
>> > +++ b/drivers/cpufreq/s3c64xx-cpufreq.c
>> > @@ -107,7 +107,7 @@ static int s3c64xx_cpufreq_set_target(struct cpufreq_policy *policy,
>> > }
>> >
>> > #ifdef CONFIG_REGULATOR
>> > -static void __init s3c64xx_cpufreq_config_regulator(void)
>> > +static void s3c64xx_cpufreq_config_regulator(void)
>> > {
>> > int count, v, i, found;
>> > struct cpufreq_frequency_table *freq;
>>
>> Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
>
> Rafael,
> Are you going to pick it up?
I thought I did, didn't I?
Thanks,
Rafael
^ permalink raw reply
* [PATCH 0/2] ARM: orion5x: Move micon code to a MFD driver
From: Andrew Lunn @ 2017-01-02 21:24 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20161229000137.5553-1-f.fainelli@gmail.com>
On Wed, Dec 28, 2016 at 04:01:35PM -0800, Florian Fainelli wrote:
> Hi all,
>
> This patch series removes some duplicate code between the Kurobox and the
> Terastation Pro 2 since they both use the same on-board microcontroller (micon)
> attached to their UART1 for system restart.
>
> Future patches will add support for the LEDs, temperature, FAN that the micro
> controller provides.
Hi Florian
Are you coordinating with Roger Shimizu <rogershimizu@gmail.com>? He
has a conflicting patchset doing something similar/different.
Andrew
^ permalink raw reply
* [PATCH v6 10/14] ACPI: ARM64: IORT: rework iort_node_get_id() for NC->SMMU->ITS case
From: Sinan Kaya @ 2017-01-02 22:30 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1483363905-2806-11-git-send-email-hanjun.guo@linaro.org>
Hi Hanjun,
On 1/2/2017 8:31 AM, Hanjun Guo wrote:
> iort_node_get_id() for now only supports NC(named component)->SMMU
> or NC->ITS cases, we also have other device topologies such as NC->
> SMMU->ITS, so rework iort_node_get_id() for those cases.
>
> Signed-off-by: Hanjun Guo <hanjun.guo@linaro.org>
> Tested-by: Majun <majun258@huawei.com>
> Tested-by: Xinwei Kong <kong.kongxinwei@hisilicon.com>
> Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
> ---
> drivers/acpi/arm64/iort.c | 61 ++++++++++++++++++++++++++---------------------
> 1 file changed, 34 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
> index 6b72fcb..99f079b 100644
> --- a/drivers/acpi/arm64/iort.c
> +++ b/drivers/acpi/arm64/iort.c
> @@ -292,22 +292,28 @@ static acpi_status iort_match_node_callback(struct acpi_iort_node *node,
> return status;
> }
>
> -static int iort_id_map(struct acpi_iort_id_mapping *map, u8 type, u32 rid_in,
> - u32 *rid_out)
> +static int iort_id_single_map(struct acpi_iort_id_mapping *map, u8 type,
> + u32 *rid_out)
> {
> /* Single mapping does not care for input id */
> if (map->flags & ACPI_IORT_ID_SINGLE_MAPPING) {
> if (type == ACPI_IORT_NODE_NAMED_COMPONENT ||
> type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX) {
> - *rid_out = map->output_base;
> + if (rid_out)
> + *rid_out = map->output_base;
> return 0;
> }
>
> pr_warn(FW_BUG "[map %p] SINGLE MAPPING flag not allowed for node type %d, skipping ID map\n",
> map, type);
> - return -ENXIO;
> }
>
> + return -ENXIO;
> +}
> +
> +static int iort_id_map(struct acpi_iort_id_mapping *map, u32 rid_in,
> + u32 *rid_out)
> +{
> if (rid_in < map->input_base ||
> (rid_in >= map->input_base + map->id_count))
> return -ENXIO;
> @@ -324,33 +330,34 @@ struct acpi_iort_node *iort_node_get_id(struct acpi_iort_node *node,
> struct acpi_iort_node *parent;
> struct acpi_iort_id_mapping *map;
>
> - if (!node->mapping_offset || !node->mapping_count ||
> - index >= node->mapping_count)
> - return NULL;
> -
> - map = ACPI_ADD_PTR(struct acpi_iort_id_mapping, node,
> - node->mapping_offset);
> + while (node) {
> + if (!node->mapping_offset || !node->mapping_count ||
> + index >= node->mapping_count)
> + return NULL;
>
> - /* Firmware bug! */
> - if (!map->output_reference) {
> - pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference\n",
> - node, node->type);
> - return NULL;
> - }
> + map = ACPI_ADD_PTR(struct acpi_iort_id_mapping, node,
> + node->mapping_offset);
>
> - parent = ACPI_ADD_PTR(struct acpi_iort_node, iort_table,
> - map->output_reference);
> + /* Firmware bug! */
> + if (!map->output_reference) {
> + pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference\n",
> + node, node->type);
> + return NULL;
> + }
>
> - if (!(IORT_TYPE_MASK(parent->type) & type_mask))
> - return NULL;
> + parent = ACPI_ADD_PTR(struct acpi_iort_node, iort_table,
> + map->output_reference);
>
> - if (map[index].flags & ACPI_IORT_ID_SINGLE_MAPPING) {
> - if (node->type == ACPI_IORT_NODE_NAMED_COMPONENT ||
> - node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX) {
> - if (id_out)
> - *id_out = map[index].output_base;
> - return parent;
> + /* go upstream to find its parent */
> + if (!(IORT_TYPE_MASK(parent->type) & type_mask)) {
> + node = parent;
> + continue;
> }
> +
> + if (iort_id_single_map(&map[index], node->type, id_out))
> + break;
> +
> + return parent;
> }
>
> return NULL;
> @@ -388,7 +395,7 @@ static struct acpi_iort_node *iort_node_map_rid(struct acpi_iort_node *node,
>
> /* Do the RID translation */
> for (i = 0; i < node->mapping_count; i++, map++) {
> - if (!iort_id_map(map, node->type, rid, &rid))
> + if (!iort_id_map(map, rid, &rid))
> break;
> }
>
>
I wanted to follow up on your note for NC->SMMU->ITS case as I do have this use case on the
Qualcomm QDF2400 server and HIDMA DMA Engine. HIDMA is capable of sending MSI interrupts
towards the GIC ITS.
I don't know if this patch is supposed to fix the NC->SMMU->ITS case as it suggests in the commit
message but it doesn't seem to be working for me. Maybe it was a to-do for you. It wasn't quite
clear from the commit.
I debugged the code and came up with the following patch. Feel free to incorporate/rework with
your existing patch.
A named node can have an output ID of 0x20 and SMMU can have an output
parameter of 0x80000. The device ID needs to be 0x80000+0x20 for this
use case.
With this patch applied on top of the first 11 patches, I'm also providing my Tested-by here
for the first 11 patches.
Tested-by: Sinan Kaya <okaya@codeaurora.org>
--
Sinan Kaya
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
-------------- next part --------------
>From c5ab7172a400bfd5b460374e70394fe78c260603 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@codeaurora.org>
Date: Mon, 2 Jan 2017 17:16:45 -0500
Subject: [PATCH] ACPI: ARM64: IORT: rework iort_node_get_id() for
NC->SMMU->ITS case part #2
Code won't collect the output ID as it traverses NC->SMMU->ITS path.
Adding support for this use case.
A named node can have an output ID of 0x20 and SMMU can have an output
parameter of 0x80000. The device ID needs to be 0x80000+0x20 for this
use case.
Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
drivers/acpi/arm64/iort.c | 58 ++++++++++++++++++++++++++---------------------
1 file changed, 32 insertions(+), 26 deletions(-)
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 882e624..19cb97a 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -296,18 +296,16 @@ static int iort_id_single_map(struct acpi_iort_id_mapping *map, u8 type,
u32 *rid_out)
{
/* Single mapping does not care for input id */
- if (map->flags & ACPI_IORT_ID_SINGLE_MAPPING) {
- if (type == ACPI_IORT_NODE_NAMED_COMPONENT ||
- type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX) {
- if (rid_out)
- *rid_out = map->output_base;
- return 0;
- }
-
- pr_warn(FW_BUG "[map %p] SINGLE MAPPING flag not allowed for node type %d, skipping ID map\n",
- map, type);
+ if (type == ACPI_IORT_NODE_NAMED_COMPONENT ||
+ type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX) {
+ if (rid_out)
+ *rid_out = map->output_base;
+ return 0;
}
+ pr_warn(FW_BUG "[map %p] SINGLE MAPPING flag not allowed for node type %d, skipping ID map\n",
+ map, type);
+
return -ENXIO;
}
@@ -327,13 +325,20 @@ struct acpi_iort_node *iort_node_get_id(struct acpi_iort_node *node,
u32 *id_out, u8 type_mask,
int index)
{
- struct acpi_iort_node *parent;
- struct acpi_iort_id_mapping *map;
+ u32 id = 0;
while (node) {
- if (!node->mapping_offset || !node->mapping_count ||
- index >= node->mapping_count)
- return NULL;
+ struct acpi_iort_id_mapping *map;
+
+ if (IORT_TYPE_MASK(node->type) & type_mask) {
+ if (id_out)
+ *id_out = id;
+
+ return node;
+ }
+
+ if (!node->mapping_offset || !node->mapping_count)
+ goto fail_map;
map = ACPI_ADD_PTR(struct acpi_iort_id_mapping, node,
node->mapping_offset);
@@ -342,24 +347,25 @@ struct acpi_iort_node *iort_node_get_id(struct acpi_iort_node *node,
if (!map->output_reference) {
pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference\n",
node, node->type);
- return NULL;
+ goto fail_map;
}
- parent = ACPI_ADD_PTR(struct acpi_iort_node, iort_table,
- map->output_reference);
-
- /* go upstream to find its parent */
- if (!(IORT_TYPE_MASK(parent->type) & type_mask)) {
- node = parent;
- continue;
+ if (map->flags & ACPI_IORT_ID_SINGLE_MAPPING) {
+ if (iort_id_single_map(&map[index], node->type, &id))
+ goto fail_map;
+ } else {
+ if (iort_id_map(map, id, &id))
+ goto fail_map;
}
- if (iort_id_single_map(&map[index], node->type, id_out))
- break;
+ if (index == node->mapping_count)
+ goto fail_map;
- return parent;
+ node = ACPI_ADD_PTR(struct acpi_iort_node, iort_table,
+ map->output_reference);
}
+fail_map:
return NULL;
}
--
1.9.1
^ permalink raw reply related
* 4.10-rc1 on Nokia N900: regression, WARN_ON() omap_l3_smx.c
From: Pavel Machek @ 2017-01-02 22:48 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20170102181058.GF9325@atomide.com>
Hi!
> > I forgot I had v4.10-rc1 running, and now I got warning on all the
> > consoles (hand-copied).
> >
> >
> > Unhandled fault: external abort on non-linefetch (0x1028) at
> > 0xfa0ab060
> > ...
> > Comm: kworker/0:0 Not tainted.
> > Workqueue: events musb_irq_work
> > ...
> > PC is at musb_default_readb().
> > ...
>
> This means the clocks are not enabled at that point.
>
> > WARNING: CPU: 0 ... at drivers/bus/omap_l3_smx.c:166
> > omap3_l3_app_irq+0xcc/...
> > Tainted: GDW.
>
> If you comment out postcore_initcall_sync(omap3_l3_init);
> in drivers/bus/omap_l3_smx.c you'll see the proper stack
> trace instead of the l3 interrupt trace. The system will
> hang at that point most likely.
>
> > I do have patches to allow nfsroot over usb. But they worked ok in
> > v4.9... Does anyone see it, too?
>
> Hmm not much has changed since v4.9. Are you sure you
> had v4.9 or some earlier v4.9-rc version?
I believe v4.9 works.
But... this may be tricky to reproduce. It has happened only once so
far... after I reconnected the N900 to the computer when it had been running
for a while. I do have a USB cable with a power meter on it, thus "flakey".
Let's see if it reappears...
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <http://lists.infradead.org/pipermail/linux-arm-kernel/attachments/20170102/09a252bc/attachment.sig>
^ permalink raw reply
* [PATCH] coresight: etm4x: Fix enabling of cycle accurate tracing in perf.
From: Mike Leach @ 2017-01-02 22:55 UTC (permalink / raw)
To: linux-arm-kernel
Using the perf record 'cycacc' option in the cs_etm event was not setting up
cycle accurate trace correctly.
Corrects bit set in TRCCONFIGR to enable cycle accurate trace.
Programs TRCCCCTLR with a valid threshold value as required by ETMv4 spec.
Signed-off-by: Mike Leach <mike.leach@linaro.org>
---
drivers/hwtracing/coresight/coresight-etm4x.c | 7 +++++--
drivers/hwtracing/coresight/coresight-etm4x.h | 1 +
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c
index 4db8d6a..07be032 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x.c
@@ -216,8 +216,11 @@ static int etm4_parse_event_config(struct etmv4_drvdata *drvdata,
goto out;
/* Go from generic option to ETMv4 specifics */
- if (attr->config & BIT(ETM_OPT_CYCACC))
- config->cfg |= ETMv4_MODE_CYCACC;
+ if (attr->config & BIT(ETM_OPT_CYCACC)) {
+ config->cfg |= BIT(4);
+ /* TRM: Must program this for cycacc to work */
+ config->ccctlr = ETM_CYC_THRESHOLD_DEFAULT;
+ }
if (attr->config & BIT(ETM_OPT_TS))
config->cfg |= ETMv4_MODE_TIMESTAMP;
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.h b/drivers/hwtracing/coresight/coresight-etm4x.h
index ba8d3f8..8a62c6c 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.h
+++ b/drivers/hwtracing/coresight/coresight-etm4x.h
@@ -146,6 +146,7 @@
#define ETM_ARCH_V4 0x40
#define ETMv4_SYNC_MASK 0x1F
#define ETM_CYC_THRESHOLD_MASK 0xFFF
+#define ETM_CYC_THRESHOLD_DEFAULT 256
#define ETMv4_EVENT_MASK 0xFF
#define ETM_CNTR_MAX_VAL 0xFFFF
#define ETM_TRACEID_MASK 0x3f
--
2.7.4
^ permalink raw reply related
* FYI: My boot farm going offline for a while
From: Olof Johansson @ 2017-01-02 23:03 UTC (permalink / raw)
To: linux-arm-kernel
Due to home improvement projects, I'll be shutting down the board farm
for a while. I'm also very tempted to bring it back in a much more
dense format, but that'll require some work on my behalf so it might
take a while.
Build reports should still be going out without much disruption.
-Olof
^ permalink raw reply
* [PATCH 0/5] arm64: sunxi: A64: enable MMC support
From: Andre Przywara @ 2017-01-02 23:03 UTC (permalink / raw)
To: linux-arm-kernel
So far the Allwinner A64/Pine64 DT was missing MMC support, because we
observed issues with that. Those have now been fixed (patch 1 and 2),
so we can enable the MMC IP block in the SoC .dtsi and the Pine64 .dts.
As this gives access to the SD card (as the only mass storage device on
most boards), this makes the kernel support actually useful.
The A64 MMC controller has more up its sleeve, but for now this level
of support is good enough.
Thanks a lot to Maxime for investigating the eMMC failure and coming up
with a nice fix for that.
Maxime: I picked that patch from some pastebin drop of yours, please holler
if there's something wrong with that (patch 2/5).
I am sending the BananaPi M64 .dts patch along with this series, as the eMMC
on that board now makes a difference.
Cheers,
Andre.
Andre Przywara (4):
drivers: mmc: sunxi: fix A64 calibration routine
arm64: dts: sun50i: add MMC nodes
arm64: dts: Pine64: add MMC support
arm64: dts: add BananaPi-M64 support
Maxime Ripard (1):
drivers: mmc: sunxi: limit A64 MMC2 to 8K DMA buffer
.../devicetree/bindings/mmc/sunxi-mmc.txt | 1 +
arch/arm64/boot/dts/allwinner/Makefile | 1 +
.../boot/dts/allwinner/sun50i-a64-bananapi-m64.dts | 125 +++++++++++++++++++++
.../arm64/boot/dts/allwinner/sun50i-a64-pine64.dts | 18 +++
arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi | 77 +++++++++++++
drivers/mmc/host/sunxi-mmc.c | 37 ++++--
6 files changed, 247 insertions(+), 12 deletions(-)
create mode 100644 arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts
--
2.8.2
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox