Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V1] pinctrl:pxa:pinctrl-pxa2xx:- No need of devm functions
From: Arvind Yadav @ 2016-12-08 14:35 UTC (permalink / raw)
  To: linux-arm-kernel

In functions pxa2xx_build_functions, the memory allocated for
'functions' is live within the function only. After the
allocation it is immediately freed with devm_kfree. There is
no need to allocate memory for 'functions' with devm function
so replace devm_kcalloc  with kcalloc and devm_kfree with kfree.

Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
---
 drivers/pinctrl/pxa/pinctrl-pxa2xx.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
index 866aa3c..47b8e3a 100644
--- a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
+++ b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
@@ -277,7 +277,7 @@ static int pxa2xx_build_functions(struct pxa_pinctrl *pctl)
 	 * alternate function, 6 * npins is an absolute high limit of the number
 	 * of functions.
 	 */
-	functions = devm_kcalloc(pctl->dev, pctl->npins * 6,
+	functions = kcalloc(pctl->npins * 6,
 				 sizeof(*functions), GFP_KERNEL);
 	if (!functions)
 		return -ENOMEM;
@@ -289,10 +289,12 @@ static int pxa2xx_build_functions(struct pxa_pinctrl *pctl)
 	pctl->functions = devm_kmemdup(pctl->dev, functions,
 				       pctl->nfuncs * sizeof(*functions),
 				       GFP_KERNEL);
-	if (!pctl->functions)
+	if (!pctl->functions) {
+		kfree(functions);
 		return -ENOMEM;
+	}
 
-	devm_kfree(pctl->dev, functions);
+	kfree(functions);
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 2/2] crypto: arm/chacha20 - implement NEON version based on SSE3 code
From: Ard Biesheuvel @ 2016-12-08 14:28 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481207339-17332-1-git-send-email-ard.biesheuvel@linaro.org>

This is a straight port to ARM/NEON of the x86 SSE3 implementation
of the ChaCha20 stream cipher.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm/crypto/Kconfig              |   6 +
 arch/arm/crypto/Makefile             |   2 +
 arch/arm/crypto/chacha20-neon-core.S | 524 ++++++++++++++++++++
 arch/arm/crypto/chacha20-neon-glue.c | 136 +++++
 4 files changed, 668 insertions(+)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 13f1b4c289d4..2f3339f015d3 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -130,4 +130,10 @@ config CRYPTO_CRC32_ARM_CE
 	depends on KERNEL_MODE_NEON && CRC32
 	select CRYPTO_HASH
 
+config CRYPTO_CHACHA20_NEON
+	tristate "NEON accelerated ChaCha20 symmetric cipher"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_CHACHA20
+
 endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index b578a1820ab1..8d74e55eacd4 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
 
 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -40,6 +41,7 @@ aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
 crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
+chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
 
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S
new file mode 100644
index 000000000000..9f315041f521
--- /dev/null
+++ b/arch/arm/crypto/chacha20-neon-core.S
@@ -0,0 +1,524 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SNEON3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.fpu		neon
+	.align		5
+
+ENTRY(chacha20_block_xor_neon)
+	// r0: Input state matrix, s
+	// r1: 1 data block output, o
+	// r2: 1 data block input, i
+
+	//
+	// This function encrypts one ChaCha20 block by loading the state matrix
+	// in four NEON registers. It performs matrix operation on four words in
+	// parallel, but requireds shuffling to rearrange the words after each
+	// round.
+	//
+
+	// x0..3 = s0..3
+	add		ip, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [ip]
+
+	vmov		q8, q0
+	vmov		q9, q1
+	vmov		q10, q2
+	vmov		q11, q3
+
+	mov		r3, #10
+
+.Ldoubleround:
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vadd.i32	q0, q0, q1
+	veor		q4, q3, q0
+	vshl.u32	q3, q4, #16
+	vsri.u32	q3, q4, #16
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #12
+	vsri.u32	q1, q4, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vadd.i32	q0, q0, q1
+	veor		q4, q3, q0
+	vshl.u32	q3, q4, #8
+	vsri.u32	q3, q4, #24
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #7
+	vsri.u32	q1, q4, #25
+
+	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vext.8		q1, q1, q1, #4
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vext.8		q2, q2, q2, #8
+	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vext.8		q3, q3, q3, #12
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vadd.i32	q0, q0, q1
+	veor		q4, q3, q0
+	vshl.u32	q3, q4, #16
+	vsri.u32	q3, q4, #16
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #12
+	vsri.u32	q1, q4, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vadd.i32	q0, q0, q1
+	veor		q4, q3, q0
+	vshl.u32	q3, q4, #8
+	vsri.u32	q3, q4, #24
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #7
+	vsri.u32	q1, q4, #25
+
+	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vext.8		q1, q1, q1, #12
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vext.8		q2, q2, q2, #8
+	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vext.8		q3, q3, q3, #4
+
+	subs		r3, r3, #1
+	bne		.Ldoubleround
+
+	add		ip, r2, #0x20
+	vld1.8		{q4-q5}, [r2]
+	vld1.8		{q6-q7}, [ip]
+
+	// o0 = i0 ^ (x0 + s0)
+	vadd.i32	q0, q0, q8
+	veor		q0, q0, q4
+
+	// o1 = i1 ^ (x1 + s1)
+	vadd.i32	q1, q1, q9
+	veor		q1, q1, q5
+
+	// o2 = i2 ^ (x2 + s2)
+	vadd.i32	q2, q2, q10
+	veor		q2, q2, q6
+
+	// o3 = i3 ^ (x3 + s3)
+	vadd.i32	q3, q3, q11
+	veor		q3, q3, q7
+
+	add		ip, r1, #0x20
+	vst1.8		{q0-q1}, [r1]
+	vst1.8		{q2-q3}, [ip]
+
+	bx		lr
+ENDPROC(chacha20_block_xor_neon)
+
+	.align		5
+ENTRY(chacha20_4block_xor_neon)
+	push		{r4-r6, lr}
+	mov		ip, sp			// preserve the stack pointer
+	sub		r3, sp, #0x20		// allocate a 32 byte buffer
+	bic		r3, r3, #0x1f		// aligned to 32 bytes
+	mov		sp, r3
+
+	// r0: Input state matrix, s
+	// r1: 4 data blocks output, o
+	// r2: 4 data blocks input, i
+
+	//
+	// This function encrypts four consecutive ChaCha20 blocks by loading
+	// the state matrix in NEON registers four times. The algorithm performs
+	// each operation on the corresponding word of each state matrix, hence
+	// requires no word shuffling. For final XORing step we transpose the
+	// matrix by interleaving 32- and then 64-bit words, which allows us to
+	// do XOR in NEON registers.
+	//
+
+	// x0..15[0-3] = s0..3[0..3]
+	add		r3, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [r3]
+
+	adr		r3, CTRINC
+	vdup.32		q15, d7[1]
+	vdup.32		q14, d7[0]
+	vld1.32		{q11}, [r3, :128]
+	vdup.32		q13, d6[1]
+	vdup.32		q12, d6[0]
+	vadd.i32	q12, q12, q11		// x12 += counter values 0-3
+	vdup.32		q11, d5[1]
+	vdup.32		q10, d5[0]
+	vdup.32		q9, d4[1]
+	vdup.32		q8, d4[0]
+	vdup.32		q7, d3[1]
+	vdup.32		q6, d3[0]
+	vdup.32		q5, d2[1]
+	vdup.32		q4, d2[0]
+	vdup.32		q3, d1[1]
+	vdup.32		q2, d1[0]
+	vdup.32		q1, d0[1]
+	vdup.32		q0, d0[0]
+
+	mov		r3, #10
+
+.Ldoubleround4:
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vadd.i32	q0, q0, q4
+	vadd.i32	q1, q1, q5
+	vadd.i32	q2, q2, q6
+	vadd.i32	q3, q3, q7
+
+	veor		q12, q12, q0
+	veor		q13, q13, q1
+	veor		q14, q14, q2
+	veor		q15, q15, q3
+
+	vrev32.16	q12, q12
+	vrev32.16	q13, q13
+	vrev32.16	q14, q14
+	vrev32.16	q15, q15
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vadd.i32	q8, q8, q12
+	vadd.i32	q9, q9, q13
+	vadd.i32	q10, q10, q14
+	vadd.i32	q11, q11, q15
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q4, q8
+	veor		q9, q5, q9
+	vshl.u32	q4, q8, #12
+	vshl.u32	q5, q9, #12
+	vsri.u32	q4, q8, #20
+	vsri.u32	q5, q9, #20
+
+	veor		q8, q6, q10
+	veor		q9, q7, q11
+	vshl.u32	q6, q8, #12
+	vshl.u32	q7, q9, #12
+	vsri.u32	q6, q8, #20
+	vsri.u32	q7, q9, #20
+
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vadd.i32	q0, q0, q4
+	vadd.i32	q1, q1, q5
+	vadd.i32	q2, q2, q6
+	vadd.i32	q3, q3, q7
+
+	veor		q8, q12, q0
+	veor		q9, q13, q1
+	vshl.u32	q12, q8, #8
+	vshl.u32	q13, q9, #8
+	vsri.u32	q12, q8, #24
+	vsri.u32	q13, q9, #24
+
+	veor		q8, q14, q2
+	veor		q9, q15, q3
+	vshl.u32	q14, q8, #8
+	vshl.u32	q15, q9, #8
+	vsri.u32	q14, q8, #24
+	vsri.u32	q15, q9, #24
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vadd.i32	q8, q8, q12
+	vadd.i32	q9, q9, q13
+	vadd.i32	q10, q10, q14
+	vadd.i32	q11, q11, q15
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q4, q8
+	veor		q9, q5, q9
+	vshl.u32	q4, q8, #7
+	vshl.u32	q5, q9, #7
+	vsri.u32	q4, q8, #25
+	vsri.u32	q5, q9, #25
+
+	veor		q8, q6, q10
+	veor		q9, q7, q11
+	vshl.u32	q6, q8, #7
+	vshl.u32	q7, q9, #7
+	vsri.u32	q6, q8, #25
+	vsri.u32	q7, q9, #25
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vadd.i32	q0, q0, q5
+	vadd.i32	q1, q1, q6
+	vadd.i32	q2, q2, q7
+	vadd.i32	q3, q3, q4
+
+	veor		q15, q15, q0
+	veor		q12, q12, q1
+	veor		q13, q13, q2
+	veor		q14, q14, q3
+
+	vrev32.16	q15, q15
+	vrev32.16	q12, q12
+	vrev32.16	q13, q13
+	vrev32.16	q14, q14
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vadd.i32	q10, q10, q15
+	vadd.i32	q11, q11, q12
+	vadd.i32	q8, q8, q13
+	vadd.i32	q9, q9, q14
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q7, q8
+	veor		q9, q4, q9
+	vshl.u32	q7, q8, #12
+	vshl.u32	q4, q9, #12
+	vsri.u32	q7, q8, #20
+	vsri.u32	q4, q9, #20
+
+	veor		q8, q5, q10
+	veor		q9, q6, q11
+	vshl.u32	q5, q8, #12
+	vshl.u32	q6, q9, #12
+	vsri.u32	q5, q8, #20
+	vsri.u32	q6, q9, #20
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vadd.i32	q0, q0, q5
+	vadd.i32	q1, q1, q6
+	vadd.i32	q2, q2, q7
+	vadd.i32	q3, q3, q4
+
+	veor		q8, q15, q0
+	veor		q9, q12, q1
+	vshl.u32	q15, q8, #8
+	vshl.u32	q12, q9, #8
+	vsri.u32	q15, q8, #24
+	vsri.u32	q12, q9, #24
+
+	veor		q8, q13, q2
+	veor		q9, q14, q3
+	vshl.u32	q13, q8, #8
+	vshl.u32	q14, q9, #8
+	vsri.u32	q13, q8, #24
+	vsri.u32	q14, q9, #24
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vadd.i32	q10, q10, q15
+	vadd.i32	q11, q11, q12
+	vadd.i32	q8, q8, q13
+	vadd.i32	q9, q9, q14
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q7, q8
+	veor		q9, q4, q9
+	vshl.u32	q7, q8, #7
+	vshl.u32	q4, q9, #7
+	vsri.u32	q7, q8, #25
+	vsri.u32	q4, q9, #25
+
+	veor		q8, q5, q10
+	veor		q9, q6, q11
+	vshl.u32	q5, q8, #7
+	vshl.u32	q6, q9, #7
+	vsri.u32	q5, q8, #25
+	vsri.u32	q6, q9, #25
+
+	subs		r3, r3, #1
+	beq		0f
+
+	vld1.32		{q8-q9}, [sp, :256]
+	b		.Ldoubleround4
+
+	// x0[0-3] += s0[0]
+	// x1[0-3] += s0[1]
+	// x2[0-3] += s0[2]
+	// x3[0-3] += s0[3]
+0:	ldmia		r0!, {r3-r6}
+	vdup.32		q8, r3
+	vdup.32		q9, r4
+	vadd.i32	q0, q0, q8
+	vadd.i32	q1, q1, q9
+	vdup.32		q8, r5
+	vdup.32		q9, r6
+	vadd.i32	q2, q2, q8
+	vadd.i32	q3, q3, q9
+
+	// x4[0-3] += s1[0]
+	// x5[0-3] += s1[1]
+	// x6[0-3] += s1[2]
+	// x7[0-3] += s1[3]
+	ldmia		r0!, {r3-r6}
+	vdup.32		q8, r3
+	vdup.32		q9, r4
+	vadd.i32	q4, q4, q8
+	vadd.i32	q5, q5, q9
+	vdup.32		q8, r5
+	vdup.32		q9, r6
+	vadd.i32	q6, q6, q8
+	vadd.i32	q7, q7, q9
+
+	// interleave 32-bit words in state n, n+1
+	vzip.32		q0, q1
+	vzip.32		q2, q3
+	vzip.32		q4, q5
+	vzip.32		q6, q7
+
+	// interleave 64-bit words in state n, n+2
+	vswp		d1, d4
+	vswp		d3, d6
+	vswp		d9, d12
+	vswp		d11, d14
+
+	// xor with corresponding input, write to output
+	vld1.8		{q8-q9}, [r2]!
+	veor		q8, q8, q0
+	veor		q9, q9, q4
+	vst1.8		{q8-q9}, [r1]!
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x8[0-3] += s2[0]
+	// x9[0-3] += s2[1]
+	// x10[0-3] += s2[2]
+	// x11[0-3] += s2[3]
+	ldmia		r0!, {r3-r6}
+	vdup.32		q0, r3
+	vdup.32		q4, r4
+	vadd.i32	q8, q8, q0
+	vadd.i32	q9, q9, q4
+	vdup.32		q0, r5
+	vdup.32		q4, r6
+	vadd.i32	q10, q10, q0
+	vadd.i32	q11, q11, q4
+
+	// x12[0-3] += s3[0]
+	// x13[0-3] += s3[1]
+	// x14[0-3] += s3[2]
+	// x15[0-3] += s3[3]
+	ldmia		r0!, {r3-r6}
+	vdup.32		q0, r3
+	vdup.32		q4, r4
+	adr		r3, CTRINC
+	vadd.i32	q12, q12, q0
+	vld1.32		{q0}, [r3, :128]
+	vadd.i32	q13, q13, q4
+	vadd.i32	q12, q12, q0		// x12 += counter values 0-3
+
+	vdup.32		q0, r5
+	vdup.32		q4, r6
+	vadd.i32	q14, q14, q0
+	vadd.i32	q15, q15, q4
+
+	// interleave 32-bit words in state n, n+1
+	vzip.32		q8, q9
+	vzip.32		q10, q11
+	vzip.32		q12, q13
+	vzip.32		q14, q15
+
+	// interleave 64-bit words in state n, n+2
+	vswp		d17, d20
+	vswp		d19, d22
+	vswp		d25, d28
+	vswp		d27, d30
+
+	vmov		q4, q1
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q8
+	veor		q1, q1, q12
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q2
+	veor		q1, q1, q6
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q10
+	veor		q1, q1, q14
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q4
+	veor		q1, q1, q5
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q9
+	veor		q1, q1, q13
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q3
+	veor		q1, q1, q7
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]
+	veor		q0, q0, q11
+	veor		q1, q1, q15
+	vst1.8		{q0-q1}, [r1]
+
+	mov		sp, ip
+	pop		{r4-r6, pc}
+ENDPROC(chacha20_4block_xor_neon)
+
+	.align		4
+CTRINC:	.word		0, 1, 2, 3
+
diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c
new file mode 100644
index 000000000000..554f7f6069da
--- /dev/null
+++ b/arch/arm/crypto/chacha20-neon-glue.c
@@ -0,0 +1,136 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha20.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+
+static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
+			    unsigned int bytes)
+{
+	u8 buf[CHACHA20_BLOCK_SIZE];
+
+	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+		chacha20_4block_xor_neon(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE * 4;
+		src += CHACHA20_BLOCK_SIZE * 4;
+		dst += CHACHA20_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
+	while (bytes >= CHACHA20_BLOCK_SIZE) {
+		chacha20_block_xor_neon(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE;
+		src += CHACHA20_BLOCK_SIZE;
+		dst += CHACHA20_BLOCK_SIZE;
+		state[12]++;
+	}
+	if (bytes) {
+		memcpy(buf, src, bytes);
+		chacha20_block_xor_neon(state, buf, buf);
+		memcpy(dst, buf, bytes);
+	}
+}
+
+static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
+			 struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	u32 state[16];
+	int err;
+
+	if (nbytes <= CHACHA20_BLOCK_SIZE || !may_use_simd())
+		return crypto_chacha20_crypt(desc, dst, src, nbytes);
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);
+
+	crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
+
+	kernel_neon_begin();
+
+	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
+		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
+				rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
+		err = blkcipher_walk_done(desc, &walk,
+					  walk.nbytes % CHACHA20_BLOCK_SIZE);
+	}
+
+	if (walk.nbytes) {
+		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
+				walk.nbytes);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+
+	kernel_neon_end();
+
+	return err;
+}
+
+static struct crypto_alg alg = {
+	.cra_name		= "chacha20",
+	.cra_driver_name	= "chacha20-neon",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_ctxsize		= sizeof(struct chacha20_ctx),
+	.cra_alignmask		= sizeof(u32) - 1,
+	.cra_module		= THIS_MODULE,
+	.cra_u			= {
+		.blkcipher = {
+			.min_keysize	= CHACHA20_KEY_SIZE,
+			.max_keysize	= CHACHA20_KEY_SIZE,
+			.ivsize		= CHACHA20_IV_SIZE,
+			.geniv		= "seqiv",
+			.setkey		= crypto_chacha20_setkey,
+			.encrypt	= chacha20_simd,
+			.decrypt	= chacha20_simd,
+		},
+	},
+};
+
+static int __init chacha20_simd_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_NEON))
+		return -ENODEV;
+
+	return crypto_register_alg(&alg);
+}
+
+static void __exit chacha20_simd_mod_fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(chacha20_simd_mod_init);
+module_exit(chacha20_simd_mod_fini);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
-- 
2.7.4

^ permalink raw reply related

* [PATCH 1/2] crypto: arm64/chacha20 - implement NEON version based on SSE3 code
From: Ard Biesheuvel @ 2016-12-08 14:28 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481207339-17332-1-git-send-email-ard.biesheuvel@linaro.org>

This is a straight port to arm64/NEON of the x86 SSE3 implementation
of the ChaCha20 stream cipher.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/Kconfig              |   6 +
 arch/arm64/crypto/Makefile             |   3 +
 arch/arm64/crypto/chacha20-neon-core.S | 480 ++++++++++++++++++++
 arch/arm64/crypto/chacha20-neon-glue.c | 131 ++++++
 4 files changed, 620 insertions(+)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 450a85df041a..0bf0f531f539 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -72,4 +72,10 @@ config CRYPTO_CRC32_ARM64
 	depends on ARM64
 	select CRYPTO_HASH
 
+config CRYPTO_CHACHA20_NEON
+	tristate "NEON accelerated ChaCha20 symmetric cipher"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_CHACHA20
+
 endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index aa8888d7b744..9d2826c5fccf 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -41,6 +41,9 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
 sha512-arm64-y := sha512-glue.o sha512-core.o
 
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
+chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+
 AFLAGS_aes-ce.o		:= -DINTERLEAVE=4
 AFLAGS_aes-neon.o	:= -DINTERLEAVE=4
 
diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S
new file mode 100644
index 000000000000..e2cd65580807
--- /dev/null
+++ b/arch/arm64/crypto/chacha20-neon-core.S
@@ -0,0 +1,480 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.align		6
+
+ENTRY(chacha20_block_xor_neon)
+	// x0: Input state matrix, s
+	// x1: 1 data block output, o
+	// x2: 1 data block input, i
+
+	//
+	// This function encrypts one ChaCha20 block by loading the state matrix
+	// in four NEON registers. It performs matrix operation on four words in
+	// parallel, but requires shuffling to rearrange the words after each
+	// round.
+	//
+
+	// x0..3 = s0..3
+	ld1		{v0.4s-v3.4s}, [x0]
+	ld1		{v8.4s-v11.4s}, [x0]
+
+	mov		x3, #10
+
+.Ldoubleround:
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	rev32		v3.8h, v3.8h
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #12
+	sri		v1.4s, v4.4s, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v4.16b, v3.16b, v0.16b
+	shl		v3.4s, v4.4s, #8
+	sri		v3.4s, v4.4s, #24
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #7
+	sri		v1.4s, v4.4s, #25
+
+	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	ext		v1.16b, v1.16b, v1.16b, #4
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	ext		v2.16b, v2.16b, v2.16b, #8
+	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	ext		v3.16b, v3.16b, v3.16b, #12
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	rev32		v3.8h, v3.8h
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #12
+	sri		v1.4s, v4.4s, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v4.16b, v3.16b, v0.16b
+	shl		v3.4s, v4.4s, #8
+	sri		v3.4s, v4.4s, #24
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #7
+	sri		v1.4s, v4.4s, #25
+
+	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	ext		v1.16b, v1.16b, v1.16b, #12
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	ext		v2.16b, v2.16b, v2.16b, #8
+	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	ext		v3.16b, v3.16b, v3.16b, #4
+
+	subs		x3, x3, #1
+	b.ne		.Ldoubleround
+
+	ld1		{v4.16b-v7.16b}, [x2]
+
+	// o0 = i0 ^ (x0 + s0)
+	add		v0.4s, v0.4s, v8.4s
+	eor		v0.16b, v0.16b, v4.16b
+
+	// o1 = i1 ^ (x1 + s1)
+	add		v1.4s, v1.4s, v9.4s
+	eor		v1.16b, v1.16b, v5.16b
+
+	// o2 = i2 ^ (x2 + s2)
+	add		v2.4s, v2.4s, v10.4s
+	eor		v2.16b, v2.16b, v6.16b
+
+	// o3 = i3 ^ (x3 + s3)
+	add		v3.4s, v3.4s, v11.4s
+	eor		v3.16b, v3.16b, v7.16b
+
+	st1		{v0.16b-v3.16b}, [x1]
+
+	ret
+ENDPROC(chacha20_block_xor_neon)
+
+	.align		6
+ENTRY(chacha20_4block_xor_neon)
+	// x0: Input state matrix, s
+	// x1: 4 data blocks output, o
+	// x2: 4 data blocks input, i
+
+	//
+	// This function encrypts four consecutive ChaCha20 blocks by loading
+	// the state matrix in NEON registers four times. The algorithm performs
+	// each operation on the corresponding word of each state matrix, hence
+	// requires no word shuffling. For final XORing step we transpose the
+	// matrix by interleaving 32- and then 64-bit words, which allows us to
+	// do XOR in NEON registers.
+	//
+	adr		x3, CTRINC
+	ld1		{v16.4s}, [x3]
+
+	// x0..15[0-3] = s0..3[0..3]
+	mov		x4, x0
+	ld4r		{ v0.4s- v3.4s}, [x4], #16
+	ld4r		{ v4.4s- v7.4s}, [x4], #16
+	ld4r		{ v8.4s-v11.4s}, [x4], #16
+	ld4r		{v12.4s-v15.4s}, [x4]
+
+	// x12 += counter values 0-3
+	add		v12.4s, v12.4s, v16.4s
+
+	mov		x3, #10
+
+.Ldoubleround4:
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	add		v0.4s, v0.4s, v4.4s
+	add		v1.4s, v1.4s, v5.4s
+	add		v2.4s, v2.4s, v6.4s
+	add		v3.4s, v3.4s, v7.4s
+
+	eor		v12.16b, v12.16b, v0.16b
+	eor		v13.16b, v13.16b, v1.16b
+	eor		v14.16b, v14.16b, v2.16b
+	eor		v15.16b, v15.16b, v3.16b
+
+	rev32		v12.8h, v12.8h
+	rev32		v13.8h, v13.8h
+	rev32		v14.8h, v14.8h
+	rev32		v15.8h, v15.8h
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	add		v8.4s, v8.4s, v12.4s
+	add		v9.4s, v9.4s, v13.4s
+	add		v10.4s, v10.4s, v14.4s
+	add		v11.4s, v11.4s, v15.4s
+
+	eor		v17.16b, v4.16b, v8.16b
+	eor		v18.16b, v5.16b, v9.16b
+	eor		v19.16b, v6.16b, v10.16b
+	eor		v20.16b, v7.16b, v11.16b
+
+	shl		v4.4s, v17.4s, #12
+	shl		v5.4s, v18.4s, #12
+	shl		v6.4s, v19.4s, #12
+	shl		v7.4s, v20.4s, #12
+
+	sri		v4.4s, v17.4s, #20
+	sri		v5.4s, v18.4s, #20
+	sri		v6.4s, v19.4s, #20
+	sri		v7.4s, v20.4s, #20
+
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	add		v0.4s, v0.4s, v4.4s
+	add		v1.4s, v1.4s, v5.4s
+	add		v2.4s, v2.4s, v6.4s
+	add		v3.4s, v3.4s, v7.4s
+
+	eor		v17.16b, v12.16b, v0.16b
+	eor		v18.16b, v13.16b, v1.16b
+	eor		v19.16b, v14.16b, v2.16b
+	eor		v20.16b, v15.16b, v3.16b
+
+	shl		v12.4s, v17.4s, #8
+	shl		v13.4s, v18.4s, #8
+	shl		v14.4s, v19.4s, #8
+	shl		v15.4s, v20.4s, #8
+
+	sri		v12.4s, v17.4s, #24
+	sri		v13.4s, v18.4s, #24
+	sri		v14.4s, v19.4s, #24
+	sri		v15.4s, v20.4s, #24
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	add		v8.4s, v8.4s, v12.4s
+	add		v9.4s, v9.4s, v13.4s
+	add		v10.4s, v10.4s, v14.4s
+	add		v11.4s, v11.4s, v15.4s
+
+	eor		v17.16b, v4.16b, v8.16b
+	eor		v18.16b, v5.16b, v9.16b
+	eor		v19.16b, v6.16b, v10.16b
+	eor		v20.16b, v7.16b, v11.16b
+
+	shl		v4.4s, v17.4s, #7
+	shl		v5.4s, v18.4s, #7
+	shl		v6.4s, v19.4s, #7
+	shl		v7.4s, v20.4s, #7
+
+	sri		v4.4s, v17.4s, #25
+	sri		v5.4s, v18.4s, #25
+	sri		v6.4s, v19.4s, #25
+	sri		v7.4s, v20.4s, #25
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	add		v0.4s, v0.4s, v5.4s
+	add		v1.4s, v1.4s, v6.4s
+	add		v2.4s, v2.4s, v7.4s
+	add		v3.4s, v3.4s, v4.4s
+
+	eor		v15.16b, v15.16b, v0.16b
+	eor		v12.16b, v12.16b, v1.16b
+	eor		v13.16b, v13.16b, v2.16b
+	eor		v14.16b, v14.16b, v3.16b
+
+	rev32		v15.8h, v15.8h
+	rev32		v12.8h, v12.8h
+	rev32		v13.8h, v13.8h
+	rev32		v14.8h, v14.8h
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	add		v10.4s, v10.4s, v15.4s
+	add		v11.4s, v11.4s, v12.4s
+	add		v8.4s, v8.4s, v13.4s
+	add		v9.4s, v9.4s, v14.4s
+
+	eor		v17.16b, v5.16b, v10.16b
+	eor		v18.16b, v6.16b, v11.16b
+	eor		v19.16b, v7.16b, v8.16b
+	eor		v20.16b, v4.16b, v9.16b
+
+	shl		v5.4s, v17.4s, #12
+	shl		v6.4s, v18.4s, #12
+	shl		v7.4s, v19.4s, #12
+	shl		v4.4s, v20.4s, #12
+
+	sri		v5.4s, v17.4s, #20
+	sri		v6.4s, v18.4s, #20
+	sri		v7.4s, v19.4s, #20
+	sri		v4.4s, v20.4s, #20
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	add		v0.4s, v0.4s, v5.4s
+	add		v1.4s, v1.4s, v6.4s
+	add		v2.4s, v2.4s, v7.4s
+	add		v3.4s, v3.4s, v4.4s
+
+	eor		v17.16b, v15.16b, v0.16b
+	eor		v18.16b, v12.16b, v1.16b
+	eor		v19.16b, v13.16b, v2.16b
+	eor		v20.16b, v14.16b, v3.16b
+
+	shl		v15.4s, v17.4s, #8
+	shl		v12.4s, v18.4s, #8
+	shl		v13.4s, v19.4s, #8
+	shl		v14.4s, v20.4s, #8
+
+	sri		v15.4s, v17.4s, #24
+	sri		v12.4s, v18.4s, #24
+	sri		v13.4s, v19.4s, #24
+	sri		v14.4s, v20.4s, #24
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	add		v10.4s, v10.4s, v15.4s
+	add		v11.4s, v11.4s, v12.4s
+	add		v8.4s, v8.4s, v13.4s
+	add		v9.4s, v9.4s, v14.4s
+
+	eor		v17.16b, v5.16b, v10.16b
+	eor		v18.16b, v6.16b, v11.16b
+	eor		v19.16b, v7.16b, v8.16b
+	eor		v20.16b, v4.16b, v9.16b
+
+	shl		v5.4s, v17.4s, #7
+	shl		v6.4s, v18.4s, #7
+	shl		v7.4s, v19.4s, #7
+	shl		v4.4s, v20.4s, #7
+
+	sri		v5.4s, v17.4s, #25
+	sri		v6.4s, v18.4s, #25
+	sri		v7.4s, v19.4s, #25
+	sri		v4.4s, v20.4s, #25
+
+	subs		x3, x3, #1
+	b.ne		.Ldoubleround4
+
+	// x0[0-3] += s0[0]
+	// x1[0-3] += s0[1]
+	// x2[0-3] += s0[2]
+	// x3[0-3] += s0[3]
+	ld4r		{v17.4s-v20.4s}, [x0], #16
+	add		v0.4s, v0.4s, v17.4s
+	add		v1.4s, v1.4s, v18.4s
+	add		v2.4s, v2.4s, v19.4s
+	add		v3.4s, v3.4s, v20.4s
+
+	// x4[0-3] += s1[0]
+	// x5[0-3] += s1[1]
+	// x6[0-3] += s1[2]
+	// x7[0-3] += s1[3]
+	ld4r		{v21.4s-v24.4s}, [x0], #16
+	add		v4.4s, v4.4s, v21.4s
+	add		v5.4s, v5.4s, v22.4s
+	add		v6.4s, v6.4s, v23.4s
+	add		v7.4s, v7.4s, v24.4s
+
+	// x8[0-3] += s2[0]
+	// x9[0-3] += s2[1]
+	// x10[0-3] += s2[2]
+	// x11[0-3] += s2[3]
+	ld4r		{v17.4s-v20.4s}, [x0], #16
+	add		v8.4s, v8.4s, v17.4s
+	add		v9.4s, v9.4s, v18.4s
+	add		v10.4s, v10.4s, v19.4s
+	add		v11.4s, v11.4s, v20.4s
+
+	// x12[0-3] += s3[0]
+	// x13[0-3] += s3[1]
+	// x14[0-3] += s3[2]
+	// x15[0-3] += s3[3]
+	ld4r		{v21.4s-v24.4s}, [x0]
+	add		v12.4s, v12.4s, v21.4s
+	add		v13.4s, v13.4s, v22.4s
+	add		v14.4s, v14.4s, v23.4s
+	add		v15.4s, v15.4s, v24.4s
+
+	// x12 += counter values 0-3
+	add		v12.4s, v12.4s, v16.4s
+
+	ld1		{v16.16b-v19.16b}, [x2], #64
+	ld1		{v20.16b-v23.16b}, [x2], #64
+
+	// interleave 32-bit words in state n, n+1
+	zip1		v24.4s, v0.4s, v1.4s
+	zip1		v25.4s, v2.4s, v3.4s
+	zip1		v26.4s, v4.4s, v5.4s
+	zip1		v27.4s, v6.4s, v7.4s
+	zip1		v28.4s, v8.4s, v9.4s
+	zip1		v29.4s, v10.4s, v11.4s
+	zip1		v30.4s, v12.4s, v13.4s
+	zip1		v31.4s, v14.4s, v15.4s
+
+	zip2		v1.4s, v0.4s, v1.4s
+	zip2		v3.4s, v2.4s, v3.4s
+	zip2		v5.4s, v4.4s, v5.4s
+	zip2		v7.4s, v6.4s, v7.4s
+	zip2		v9.4s, v8.4s, v9.4s
+	zip2		v11.4s, v10.4s, v11.4s
+	zip2		v13.4s, v12.4s, v13.4s
+	zip2		v15.4s, v14.4s, v15.4s
+
+	mov		v0.16b, v24.16b
+	mov		v2.16b, v25.16b
+	mov		v4.16b, v26.16b
+	mov		v6.16b, v27.16b
+	mov		v8.16b, v28.16b
+	mov		v10.16b, v29.16b
+	mov		v12.16b, v30.16b
+	mov		v14.16b, v31.16b
+
+	// interleave 64-bit words in state n, n+2
+	zip1		v24.2d, v0.2d, v2.2d
+	zip1		v25.2d, v1.2d, v3.2d
+	zip1		v26.2d, v4.2d, v6.2d
+	zip1		v27.2d, v5.2d, v7.2d
+	zip1		v28.2d, v8.2d, v10.2d
+	zip1		v29.2d, v9.2d, v11.2d
+	zip1		v30.2d, v12.2d, v14.2d
+	zip1		v31.2d, v13.2d, v15.2d
+
+	zip2		v2.2d, v0.2d, v2.2d
+	zip2		v3.2d, v1.2d, v3.2d
+	zip2		v6.2d, v4.2d, v6.2d
+	zip2		v7.2d, v5.2d, v7.2d
+	zip2		v10.2d, v8.2d, v10.2d
+	zip2		v11.2d, v9.2d, v11.2d
+	zip2		v14.2d, v12.2d, v14.2d
+	zip2		v15.2d, v13.2d, v15.2d
+
+	mov		v0.16b, v24.16b
+	mov		v1.16b, v25.16b
+	mov		v4.16b, v26.16b
+	mov		v5.16b, v27.16b
+
+	mov		v8.16b, v28.16b
+	mov		v9.16b, v29.16b
+	mov		v12.16b, v30.16b
+	mov		v13.16b, v31.16b
+
+	ld1		{v24.16b-v27.16b}, [x2], #64
+	ld1		{v28.16b-v31.16b}, [x2]
+
+	// xor with corresponding input, write to output
+	eor		v16.16b, v16.16b, v0.16b
+	eor		v17.16b, v17.16b, v4.16b
+	eor		v18.16b, v18.16b, v8.16b
+	eor		v19.16b, v19.16b, v12.16b
+	st1		{v16.16b-v19.16b}, [x1], #64
+
+	eor		v20.16b, v20.16b, v2.16b
+	eor		v21.16b, v21.16b, v6.16b
+	eor		v22.16b, v22.16b, v10.16b
+	eor		v23.16b, v23.16b, v14.16b
+	st1		{v20.16b-v23.16b}, [x1], #64
+
+	eor		v24.16b, v24.16b, v1.16b
+	eor		v25.16b, v25.16b, v5.16b
+	eor		v26.16b, v26.16b, v9.16b
+	eor		v27.16b, v27.16b, v13.16b
+	st1		{v24.16b-v27.16b}, [x1], #64
+
+	eor		v28.16b, v28.16b, v3.16b
+	eor		v29.16b, v29.16b, v7.16b
+	eor		v30.16b, v30.16b, v11.16b
+	eor		v31.16b, v31.16b, v15.16b
+	st1		{v28.16b-v31.16b}, [x1]
+
+	ret
+ENDPROC(chacha20_4block_xor_neon)
+
+CTRINC:	.word		0, 1, 2, 3
diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c
new file mode 100644
index 000000000000..705b42b06d00
--- /dev/null
+++ b/arch/arm64/crypto/chacha20-neon-glue.c
@@ -0,0 +1,131 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha20.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/neon.h>
+
+asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+
+static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
+			    unsigned int bytes)
+{
+	u8 buf[CHACHA20_BLOCK_SIZE];
+
+	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+		chacha20_4block_xor_neon(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE * 4;
+		src += CHACHA20_BLOCK_SIZE * 4;
+		dst += CHACHA20_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
+	while (bytes >= CHACHA20_BLOCK_SIZE) {
+		chacha20_block_xor_neon(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE;
+		src += CHACHA20_BLOCK_SIZE;
+		dst += CHACHA20_BLOCK_SIZE;
+		state[12]++;
+	}
+	if (bytes) {
+		memcpy(buf, src, bytes);
+		chacha20_block_xor_neon(state, buf, buf);
+		memcpy(dst, buf, bytes);
+	}
+}
+
+static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
+			 struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	u32 state[16];
+	int err;
+
+	if (nbytes <= CHACHA20_BLOCK_SIZE)
+		return crypto_chacha20_crypt(desc, dst, src, nbytes);
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);
+
+	crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
+
+	kernel_neon_begin();
+
+	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
+		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
+				rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
+		err = blkcipher_walk_done(desc, &walk,
+					  walk.nbytes % CHACHA20_BLOCK_SIZE);
+	}
+
+	if (walk.nbytes) {
+		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
+				walk.nbytes);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+
+	kernel_neon_end();
+
+	return err;
+}
+
+static struct crypto_alg alg = {
+	.cra_name		= "chacha20",
+	.cra_driver_name	= "chacha20-neon",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_ctxsize		= sizeof(struct chacha20_ctx),
+	.cra_alignmask		= sizeof(u32) - 1,
+	.cra_module		= THIS_MODULE,
+	.cra_u			= {
+		.blkcipher = {
+			.min_keysize	= CHACHA20_KEY_SIZE,
+			.max_keysize	= CHACHA20_KEY_SIZE,
+			.ivsize		= CHACHA20_IV_SIZE,
+			.geniv		= "seqiv",
+			.setkey		= crypto_chacha20_setkey,
+			.encrypt	= chacha20_simd,
+			.decrypt	= chacha20_simd,
+		},
+	},
+};
+
+static int __init chacha20_simd_mod_init(void)
+{
+	return crypto_register_alg(&alg);
+}
+
+static void __exit chacha20_simd_mod_fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(chacha20_simd_mod_init);
+module_exit(chacha20_simd_mod_fini);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
-- 
2.7.4

^ permalink raw reply related

* [PATCH 0/2] crypto: arm64/ARM: NEON accelerated ChaCha20
From: Ard Biesheuvel @ 2016-12-08 14:28 UTC (permalink / raw)
  To: linux-arm-kernel

Another port of existing x86 SSE code to NEON, again both for arm64 and ARM.

ChaCha20 is a stream cipher described in RFC 7539, and is intended to be
an efficient software implementable 'standby cipher', in case AES cannot
be used.

This NEON implementation is almost 2x as fast as the generic C code
(measured on Cortex-A57 using the arm64 version)

I'm aware that blkciphers are deprecated in favor of skciphers, but this
code (like the x86 version) uses the init and setkey routines of the generic
version, so it is probably better to port all implementations at once.

Ard Biesheuvel (2):
  crypto: arm64/chacha20 - implement NEON version based on SSE3 code
  crypto: arm/chacha20 - implement NEON version based on SSE3 code

 arch/arm/crypto/Kconfig                |   6 +
 arch/arm/crypto/Makefile               |   2 +
 arch/arm/crypto/chacha20-neon-core.S   | 524 ++++++++++++++++++++
 arch/arm/crypto/chacha20-neon-glue.c   | 136 +++++
 arch/arm64/crypto/Kconfig              |   6 +
 arch/arm64/crypto/Makefile             |   3 +
 arch/arm64/crypto/chacha20-neon-core.S | 480 ++++++++++++++++++
 arch/arm64/crypto/chacha20-neon-glue.c | 131 +++++
 8 files changed, 1288 insertions(+)
 create mode 100644 arch/arm/crypto/chacha20-neon-core.S
 create mode 100644 arch/arm/crypto/chacha20-neon-glue.c
 create mode 100644 arch/arm64/crypto/chacha20-neon-core.S
 create mode 100644 arch/arm64/crypto/chacha20-neon-glue.c

-- 
2.7.4

^ permalink raw reply

* [PATCH v2.1 1/4] ARM: dts: davinci: da850: VPIF: add node and muxing
From: Laurent Pinchart @ 2016-12-08 13:51 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161208001418.4469-1-khilman@baylibre.com>

Hi Kevin,

Thank you for the patch.

On Wednesday 07 Dec 2016 16:14:18 Kevin Hilman wrote:
> Add VPIF node an pins to da850 and enable on boards.  VPIF has two input
> channels described using the standard DT ports and enpoints.
> 
> Signed-off-by: Kevin Hilman <khilman@baylibre.com>

Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>

> ---
> v2 -> v2.1: moved ports from SoC .dtsi to board .dts files.
> 
>  arch/arm/boot/dts/da850-evm.dts  | 20 ++++++++++++++++++++
>  arch/arm/boot/dts/da850-lcdk.dts | 13 +++++++++++++
>  arch/arm/boot/dts/da850.dtsi     | 27 ++++++++++++++++++++++++++-
>  3 files changed, 59 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm/boot/dts/da850-evm.dts
> b/arch/arm/boot/dts/da850-evm.dts index 41de15fe15a2..cea36ee6fd07 100644
> --- a/arch/arm/boot/dts/da850-evm.dts
> +++ b/arch/arm/boot/dts/da850-evm.dts
> @@ -289,3 +289,23 @@
>  		};
>  	};
>  };
> +
> +&vpif {
> +	pinctrl-names = "default";
> +	pinctrl-0 = <&vpif_capture_pins>;
> +	status = "okay";
> +
> +	/* VPIF capture port */
> +	port {
> +		vpif_ch0: endpoint at 0 {
> +			  reg = <0>;
> +			  bus-width = <8>;
> +		};
> +
> +		vpif_ch1: endpoint at 1 {
> +			  reg = <1>;
> +			  bus-width = <8>;
> +			  data-shift = <8>;
> +		};
> +	};
> +};
> diff --git a/arch/arm/boot/dts/da850-lcdk.dts
> b/arch/arm/boot/dts/da850-lcdk.dts index 7b8ab21fed6c..5fc21528e0ba 100644
> --- a/arch/arm/boot/dts/da850-lcdk.dts
> +++ b/arch/arm/boot/dts/da850-lcdk.dts
> @@ -219,3 +219,16 @@
>  		};
>  	};
>  };
> +
> +&vpif {
> +	pinctrl-names = "default";
> +	pinctrl-0 = <&vpif_capture_pins>;
> +	status = "okay";
> +
> +	/* VPIF capture port */
> +	port {
> +		vpif_ch0: endpoint {
> +			  bus-width = <8>;
> +		};
> +	};
> +};
> diff --git a/arch/arm/boot/dts/da850.dtsi b/arch/arm/boot/dts/da850.dtsi
> index f79e1b91c680..5f0b40510b2b 100644
> --- a/arch/arm/boot/dts/da850.dtsi
> +++ b/arch/arm/boot/dts/da850.dtsi
> @@ -186,7 +186,18 @@
>  					0xc 0x88888888 0xffffffff
> 
>  				>;
> 
>  			};
> -
> +			vpif_capture_pins: vpif_capture_pins {
> +				pinctrl-single,bits = <
> +					/* VP_DIN[2..7], VP_CLKIN1, VP_CLKIN0 
*/
> +					0x38 0x11111111 0xffffffff
> +					/* VP_DIN[10..15,0..1] */
> +					0x3c 0x11111111 0xffffffff
> +					/* VP_DIN[8..9] */
> +					0x40 0x00000011 0x000000ff
> +					/* VP_CLKIN3, VP_CLKIN2 */
> +					0x4c 0x00010100 0x000f0f00
> +				>;
> +			};
>  		};
>  		edma0: edma at 0 {
>  			compatible = "ti,edma3-tpcc";
> @@ -399,7 +410,21 @@
>  				<&edma0 0 1>;
>  			dma-names = "tx", "rx";
>  		};
> +
> +		vpif: video at 217000 {
> +			compatible = "ti,da850-vpif";
> +			reg = <0x217000 0x1000>;
> +			interrupts = <92>;
> +			status = "disabled";
> +
> +			/* VPIF capture port */
> +			port {
> +				#address-cells = <1>;
> +				#size-cells = <0>;
> +			};
> +		};
>  	};
> +
>  	aemif: aemif at 68000000 {
>  		compatible = "ti,da850-aemif";
>  		#address-cells = <2>;

-- 
Regards,

Laurent Pinchart

^ permalink raw reply

* Tearing down DMA transfer setup after DMA client has finished
From: Måns Rullgård @ 2016-12-08 13:39 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <68bbe77a-c991-4e64-c189-fafbcda8e7ae@free.fr>

Mason <slash.tmp@free.fr> writes:

> On 08/12/2016 13:44, M?ns Rullg?rd wrote:
>
>> Mason <slash.tmp@free.fr> writes:
>> 
>>> On 08/12/2016 13:20, M?ns Rullg?rd wrote:
>>>
>>>> The only problem we have is that nobody envisioned hardware where the
>>>> dma engine indicates completion slightly too soon.  I suspect there's a
>>>> fifo or such somewhere, and the interrupt is triggered when the last
>>>> byte has been placed in the fifo rather than when it has been removed
>>>> which would have been more correct.
>>>
>>> As I (tried to) explain here:
>>> https://marc.info/?l=dmaengine&m=148007808418242&w=2
>>>
>>> A *read* MBUS agent raises its IRQ when it is safe for the memory
>>> to be overwritten (i.e. every byte has been pushed into the pipe).
>>>
>>> A *write* MBUS agent raises its IRQ when it is safe for another
>>> agent to read any one of the transferred bytes.
>>>
>>> The issue comes from the fact that, for a memory-to-device transfer,
>>> the system will receive the read agent's IRQ, but most devices
>>> (NFC, SATA) don't have an IRQ line to signal that their part of the
>>> operation is complete.
>> 
>> SATA does, actually.  Nevertheless, it's an unusual design.
>
> Thanks, I was mistaken about the SATA controller.
>
> On tango3 (and also tango4, I assume)
>
> IRQ 41 = Serial ATA #0
> IRQ 42 = Serial ATA DMA #0
> IRQ 54 = Serial ATA #1
> IRQ 55 = Serial ATA DMA #1
>
> But in the end, whether there is a device interrupt (SATA)
> or not (NFC), for a memory-to-device transfer, the DMA
> driver will get the read agent notification (which should
> be ignored) and the client driver should either spin until
> idle (NFC) or wait for its completion IRQ (SATA).
>
> Correct?

Yes, and when the client device is finished, the driver needs to signal
this to the dma driver so it can reuse the channel.  It's this last
piece that's missing.

-- 
M?ns Rullg?rd

^ permalink raw reply

* [RFC v3 00/10] KVM PCIe/MSI passthrough on ARM/ARM64 and IOVA reserved regions
From: Auger Eric @ 2016-12-08 13:36 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <cd16fc5c-8649-0bfa-d67d-8f257aa38bd6@arm.com>

Hi Robin,

On 08/12/2016 14:14, Robin Murphy wrote:
> On 08/12/16 09:36, Auger Eric wrote:
>> Hi,
>>
>> On 15/11/2016 14:09, Eric Auger wrote:
>>> Following LPC discussions, we now report reserved regions through
>>> iommu-group sysfs reserved_regions attribute file.
>>
>>
>> While I am respinning this series into v4, here is a tentative summary
>> of technical topics for which no consensus was reached at this point.
>>
>> 1) Shall we report the usable IOVA range instead of reserved IOVA
>>    ranges. Not discussed at/after LPC.
>>    x I currently report reserved regions. Alex expressed the need to
>>      report the full usable IOVA range instead (x86 min-max range
>>      minus MSI APIC window). I think this is meaningful for ARM
>>      too where arm-smmu might not support the full 64b range.
>>    x Any objection we report the usable IOVA regions instead?
> 
> The issue with that is that we can't actually report "the usable
> regions" at the moment, as that involves pulling together disjoint
> properties of arbitrary hardware unrelated to the IOMMU. We'd be
> reporting "the not-definitely-unusable regions, which may have some
> unusable holes in them still". That seems like an ABI nightmare - I'd
> still much rather say "here are some, but not necessarily all, regions
> you definitely can't use", because saying "here are some regions which
> you might be able to use most of, probably" is what we're already doing
> today, via a single implicit region from 0 to ULONG_MAX ;)
> 
> The address space limits are definitely useful to know, but I think it
> would be better to expose them separately to avoid the ambiguity. At
> worst, I guess it would be reasonable to express the limits via an
> "out-of-range" reserved region type for 0 to $base and $top to
> ULONG-MAX. To *safely* expose usable regions, we'd have to start out
> with a very conservative assumption (e.g. only IOVAs matching physical
> RAM), and only expand them once we're sure we can detect every possible
> bit of problematic hardware in the system - that's just too limiting to
> be useful. And if we expose something knowingly inaccurate, we risk
> having another "bogoMIPS in /proc/cpuinfo" ABI burden on our hands, and
> nobody wants that...
Makes sense to me. "out-of-range reserved region type for 0 to $base and
$top to ULONG-MAX" can be an alternative to fulfill the requirement.
> 
>> 2) Shall the kernel check collision with MSI window* when userspace
>>    calls VFIO_IOMMU_MAP_DMA?
>>    Joerg/Will No; Alex yes
>>    *for IOVA regions consumed downstream to the IOMMU: everyone says NO
> 
> If we're starting off by having the SMMU drivers expose it as a fake
> fixed region, I don't think we need to worry about this yet. We all seem
> to agree that as long as we communicate the fixed regions to userspace,
> it's then userspace's job to work around them. Let's come back to this
> one once we actually get to the point of dynamically sizing and
> allocating 'real' MSI remapping region(s).
> 
> Ultimately, the kernel *will* police collisions either way, because an
> underlying iommu_map() is going to fail if overlapping IOVAs are ever
> actually used, so it's really just a question of whether to have a more
> user-friendly failure mode.
That's true on ARM but not on x86 where the APIC MSI region is not
mapped I think.
> 
>> 3) RMRR reporting in the iommu group sysfs? Joerg: yes; Don: no
>>    My current series does not expose them in iommu group sysfs.
>>    I understand we can expose the RMRR regions in the iomm group sysfs
>>    without necessarily supporting RMRR requiring device assignment.
>>    We can also add this support later.
> 
> As you say, reporting them doesn't necessitate allowing device
> assignment, and it's information which can already be easily grovelled
> out of dmesg (for intel-iommu at least) - there doesn't seem to be any
> need to hide them, but the x86 folks can have the final word on that.
agreed

Thanks

Eric
> 
> Robin.
> 
>> Thanks
>>
>> Eric
>>
>>
>>>
>>> Reserved regions are populated through the IOMMU get_resv_region callback
>>> (former get_dm_regions), now implemented by amd-iommu, intel-iommu and
>>> arm-smmu.
>>>
>>> The intel-iommu reports the [FEE0_0000h - FEF0_000h] MSI window as an
>>> IOMMU_RESV_NOMAP reserved region.
>>>
>>> arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and
>>> 1MB large) and the PCI host bridge windows.
>>>
>>> The series integrates a not officially posted patch from Robin:
>>> "iommu/dma: Allow MSI-only cookies".
>>>
>>> This series currently does not address IRQ safety assessment.
>>>
>>> Best Regards
>>>
>>> Eric
>>>
>>> Git: complete series available at
>>> https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3
>>>
>>> History:
>>> RFC v2 -> v3:
>>> - switch to an iommu-group sysfs API
>>> - use new dummy allocator provided by Robin
>>> - dummy allocator initialized by vfio-iommu-type1 after enumerating
>>>   the reserved regions
>>> - at the moment ARM MSI base address/size is left unchanged compared
>>>   to v2
>>> - we currently report reserved regions and not usable IOVA regions as
>>>   requested by Alex
>>>
>>> RFC v1 -> v2:
>>> - fix intel_add_reserved_regions
>>> - add mutex lock/unlock in vfio_iommu_type1
>>>
>>>
>>> Eric Auger (10):
>>>   iommu/dma: Allow MSI-only cookies
>>>   iommu: Rename iommu_dm_regions into iommu_resv_regions
>>>   iommu: Add new reserved IOMMU attributes
>>>   iommu: iommu_alloc_resv_region
>>>   iommu: Do not map reserved regions
>>>   iommu: iommu_get_group_resv_regions
>>>   iommu: Implement reserved_regions iommu-group sysfs file
>>>   iommu/vt-d: Implement reserved region get/put callbacks
>>>   iommu/arm-smmu: Implement reserved region get/put callbacks
>>>   vfio/type1: Get MSI cookie
>>>
>>>  drivers/iommu/amd_iommu.c       |  20 +++---
>>>  drivers/iommu/arm-smmu.c        |  52 +++++++++++++++
>>>  drivers/iommu/dma-iommu.c       | 116 ++++++++++++++++++++++++++-------
>>>  drivers/iommu/intel-iommu.c     |  50 ++++++++++----
>>>  drivers/iommu/iommu.c           | 141 ++++++++++++++++++++++++++++++++++++----
>>>  drivers/vfio/vfio_iommu_type1.c |  26 ++++++++
>>>  include/linux/dma-iommu.h       |   7 ++
>>>  include/linux/iommu.h           |  49 ++++++++++----
>>>  8 files changed, 391 insertions(+), 70 deletions(-)
>>>
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo at vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* [PATCH] arm64: dts: h3ulcb: Provide sd0_uhs node
From: Simon Horman @ 2016-12-08 13:30 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480583246-6707-1-git-send-email-horms+renesas@verge.net.au>

On Thu, Dec 01, 2016 at 10:07:26AM +0100, Simon Horman wrote:
> Provide separaate sd0 and sd0_uhs nodes rather than duplicate sd0 nodes.
> 
> Cc: Vladimir Barinov <vladimir.barinov@cogentembedded.com>
> Cc: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
> Fixes: 93373c309a70 ("arm64: dts: h3ulcb: rename SDHI0 pins")
> Signed-off-by: Simon Horman <horms+renesas@verge.net.au>

I have queued this up as a fix for v4.10.

>  arch/arm64/boot/dts/renesas/r8a7795-h3ulcb.dts | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/boot/dts/renesas/r8a7795-h3ulcb.dts b/arch/arm64/boot/dts/renesas/r8a7795-h3ulcb.dts
> index 6ffb0517421a..dbea2c3d8f0c 100644
> --- a/arch/arm64/boot/dts/renesas/r8a7795-h3ulcb.dts
> +++ b/arch/arm64/boot/dts/renesas/r8a7795-h3ulcb.dts
> @@ -169,7 +169,7 @@
>  		power-source = <3300>;
>  	};
>  
> -	sdhi0_pins_uhs: sd0 {
> +	sdhi0_pins_uhs: sd0_uhs {
>  		groups = "sdhi0_data4", "sdhi0_ctrl";
>  		function = "sdhi0";
>  		power-source = <1800>;
> -- 
> 2.7.0.rc3.207.g0ac5344
> 

^ permalink raw reply

* Tearing down DMA transfer setup after DMA client has finished
From: Mason @ 2016-12-08 13:29 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <yw1xfulyzj2b.fsf@unicorn.mansr.com>

On 08/12/2016 13:44, M?ns Rullg?rd wrote:

> Mason <slash.tmp@free.fr> writes:
> 
>> On 08/12/2016 13:20, M?ns Rullg?rd wrote:
>>
>>> The only problem we have is that nobody envisioned hardware where the
>>> dma engine indicates completion slightly too soon.  I suspect there's a
>>> fifo or such somewhere, and the interrupt is triggered when the last
>>> byte has been placed in the fifo rather than when it has been removed
>>> which would have been more correct.
>>
>> As I (tried to) explain here:
>> https://marc.info/?l=dmaengine&m=148007808418242&w=2
>>
>> A *read* MBUS agent raises its IRQ when it is safe for the memory
>> to be overwritten (i.e. every byte has been pushed into the pipe).
>>
>> A *write* MBUS agent raises its IRQ when it is safe for another
>> agent to read any one of the transferred bytes.
>>
>> The issue comes from the fact that, for a memory-to-device transfer,
>> the system will receive the read agent's IRQ, but most devices
>> (NFC, SATA) don't have an IRQ line to signal that their part of the
>> operation is complete.
> 
> SATA does, actually.  Nevertheless, it's an unusual design.

Thanks, I was mistaken about the SATA controller.

On tango3 (and also tango4, I assume)

IRQ 41 = Serial ATA #0
IRQ 42 = Serial ATA DMA #0
IRQ 54 = Serial ATA #1
IRQ 55 = Serial ATA DMA #1

But in the end, whether there is a device interrupt (SATA)
or not (NFC), for a memory-to-device transfer, the DMA
driver will get the read agent notification (which should
be ignored) and the client driver should either spin until
idle (NFC) or wait for its completion IRQ (SATA).

Correct?

Regards.

^ permalink raw reply

* [PATCH] arm64: Work around Falkor erratum 1009
From: Catalin Marinas @ 2016-12-08 13:27 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161208114511.GC9768@leverpostej>

On Thu, Dec 08, 2016 at 11:45:12AM +0000, Mark Rutland wrote:
> On Wed, Dec 07, 2016 at 03:04:31PM -0500, Christopher Covington wrote:
> > +	asm volatile(ALTERNATIVE(
> > +		     "nop \n"
> > +		     "nop \n",
> > +		     "tlbi vmalle1is \n"
> > +		     "dsb ish \n",
> 
> As a general note, perhaps we want a C compatible NOP_ALTERNATIVE() so
> that the nop case can be implicitly generated for sequences like this.

It's also worth checking what cpus_have_const_cap() would generate for
the default (no workaround required) case.

-- 
Catalin

^ permalink raw reply

* [PATCH renesas/devel 1/2] ARM: shmobile: defconfig: Enable r8a774[35] SoCs
From: Simon Horman @ 2016-12-08 13:26 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <CAMuHMdUyCcT3k22b-N8J8aasE=YDYfQYfU5GqFKEtsdFnUnHCw@mail.gmail.com>

On Tue, Dec 06, 2016 at 03:00:53PM +0100, Geert Uytterhoeven wrote:
> On Tue, Dec 6, 2016 at 2:32 PM, Simon Horman <horms+renesas@verge.net.au> wrote:
> > Enable recently added r8a7743 (RZ/G1M) and r8a7745 (RZ/G1E) SoCs.
> >
> > Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
> 
> Acked-by: Geert Uytterhoeven <geert+renesas@glider.be>

Thanks, I have queued up this and the following patch for v4.11.

^ permalink raw reply

* [RFC v3 00/10] KVM PCIe/MSI passthrough on ARM/ARM64 and IOVA reserved regions
From: Robin Murphy @ 2016-12-08 13:14 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <c5d4efa7-699d-4aa3-44cc-4ce03d0ce185@redhat.com>

On 08/12/16 09:36, Auger Eric wrote:
> Hi,
> 
> On 15/11/2016 14:09, Eric Auger wrote:
>> Following LPC discussions, we now report reserved regions through
>> iommu-group sysfs reserved_regions attribute file.
> 
> 
> While I am respinning this series into v4, here is a tentative summary
> of technical topics for which no consensus was reached at this point.
> 
> 1) Shall we report the usable IOVA range instead of reserved IOVA
>    ranges. Not discussed at/after LPC.
>    x I currently report reserved regions. Alex expressed the need to
>      report the full usable IOVA range instead (x86 min-max range
>      minus MSI APIC window). I think this is meaningful for ARM
>      too where arm-smmu might not support the full 64b range.
>    x Any objection we report the usable IOVA regions instead?

The issue with that is that we can't actually report "the usable
regions" at the moment, as that involves pulling together disjoint
properties of arbitrary hardware unrelated to the IOMMU. We'd be
reporting "the not-definitely-unusable regions, which may have some
unusable holes in them still". That seems like an ABI nightmare - I'd
still much rather say "here are some, but not necessarily all, regions
you definitely can't use", because saying "here are some regions which
you might be able to use most of, probably" is what we're already doing
today, via a single implicit region from 0 to ULONG_MAX ;)

The address space limits are definitely useful to know, but I think it
would be better to expose them separately to avoid the ambiguity. At
worst, I guess it would be reasonable to express the limits via an
"out-of-range" reserved region type for 0 to $base and $top to
ULONG-MAX. To *safely* expose usable regions, we'd have to start out
with a very conservative assumption (e.g. only IOVAs matching physical
RAM), and only expand them once we're sure we can detect every possible
bit of problematic hardware in the system - that's just too limiting to
be useful. And if we expose something knowingly inaccurate, we risk
having another "bogoMIPS in /proc/cpuinfo" ABI burden on our hands, and
nobody wants that...

> 2) Shall the kernel check collision with MSI window* when userspace
>    calls VFIO_IOMMU_MAP_DMA?
>    Joerg/Will No; Alex yes
>    *for IOVA regions consumed downstream to the IOMMU: everyone says NO

If we're starting off by having the SMMU drivers expose it as a fake
fixed region, I don't think we need to worry about this yet. We all seem
to agree that as long as we communicate the fixed regions to userspace,
it's then userspace's job to work around them. Let's come back to this
one once we actually get to the point of dynamically sizing and
allocating 'real' MSI remapping region(s).

Ultimately, the kernel *will* police collisions either way, because an
underlying iommu_map() is going to fail if overlapping IOVAs are ever
actually used, so it's really just a question of whether to have a more
user-friendly failure mode.

> 3) RMRR reporting in the iommu group sysfs? Joerg: yes; Don: no
>    My current series does not expose them in iommu group sysfs.
>    I understand we can expose the RMRR regions in the iomm group sysfs
>    without necessarily supporting RMRR requiring device assignment.
>    We can also add this support later.

As you say, reporting them doesn't necessitate allowing device
assignment, and it's information which can already be easily grovelled
out of dmesg (for intel-iommu at least) - there doesn't seem to be any
need to hide them, but the x86 folks can have the final word on that.

Robin.

> Thanks
> 
> Eric
> 
> 
>>
>> Reserved regions are populated through the IOMMU get_resv_region callback
>> (former get_dm_regions), now implemented by amd-iommu, intel-iommu and
>> arm-smmu.
>>
>> The intel-iommu reports the [FEE0_0000h - FEF0_000h] MSI window as an
>> IOMMU_RESV_NOMAP reserved region.
>>
>> arm-smmu reports the MSI window (arbitrarily located at 0x8000000 and
>> 1MB large) and the PCI host bridge windows.
>>
>> The series integrates a not officially posted patch from Robin:
>> "iommu/dma: Allow MSI-only cookies".
>>
>> This series currently does not address IRQ safety assessment.
>>
>> Best Regards
>>
>> Eric
>>
>> Git: complete series available at
>> https://github.com/eauger/linux/tree/v4.9-rc5-reserved-rfc-v3
>>
>> History:
>> RFC v2 -> v3:
>> - switch to an iommu-group sysfs API
>> - use new dummy allocator provided by Robin
>> - dummy allocator initialized by vfio-iommu-type1 after enumerating
>>   the reserved regions
>> - at the moment ARM MSI base address/size is left unchanged compared
>>   to v2
>> - we currently report reserved regions and not usable IOVA regions as
>>   requested by Alex
>>
>> RFC v1 -> v2:
>> - fix intel_add_reserved_regions
>> - add mutex lock/unlock in vfio_iommu_type1
>>
>>
>> Eric Auger (10):
>>   iommu/dma: Allow MSI-only cookies
>>   iommu: Rename iommu_dm_regions into iommu_resv_regions
>>   iommu: Add new reserved IOMMU attributes
>>   iommu: iommu_alloc_resv_region
>>   iommu: Do not map reserved regions
>>   iommu: iommu_get_group_resv_regions
>>   iommu: Implement reserved_regions iommu-group sysfs file
>>   iommu/vt-d: Implement reserved region get/put callbacks
>>   iommu/arm-smmu: Implement reserved region get/put callbacks
>>   vfio/type1: Get MSI cookie
>>
>>  drivers/iommu/amd_iommu.c       |  20 +++---
>>  drivers/iommu/arm-smmu.c        |  52 +++++++++++++++
>>  drivers/iommu/dma-iommu.c       | 116 ++++++++++++++++++++++++++-------
>>  drivers/iommu/intel-iommu.c     |  50 ++++++++++----
>>  drivers/iommu/iommu.c           | 141 ++++++++++++++++++++++++++++++++++++----
>>  drivers/vfio/vfio_iommu_type1.c |  26 ++++++++
>>  include/linux/dma-iommu.h       |   7 ++
>>  include/linux/iommu.h           |  49 ++++++++++----
>>  8 files changed, 391 insertions(+), 70 deletions(-)
>>

^ permalink raw reply

* [PATCH 16/18] arm64: ptrace: handle ptrace_request differently for aarch32 and ilp32
From: Catalin Marinas @ 2016-12-08 13:12 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <3703608.GKr1zzErMk@wuerfel>

On Wed, Dec 07, 2016 at 09:40:13PM +0100, Arnd Bergmann wrote:
> On Wednesday, December 7, 2016 4:59:13 PM CET Catalin Marinas wrote:
> > On Tue, Dec 06, 2016 at 11:55:08AM +0530, Yury Norov wrote:
> > > On Mon, Dec 05, 2016 at 04:34:23PM +0000, Catalin Marinas wrote:
> > > > On Fri, Oct 21, 2016 at 11:33:15PM +0300, Yury Norov wrote:
> > > > > New aarch32 ptrace syscall handler is introduced to avoid run-time
> > > > > detection of the task type.
> > > > 
> > > > What's wrong with the run-time detection? If it's just to avoid a
> > > > negligible overhead, I would rather keep the code simpler by avoiding
> > > > duplicating the generic compat_sys_ptrace().
> > > 
> > > Nothing wrong. This is how Arnd asked me to do. You already asked this
> > > question: http://lkml.iu.edu/hypermail/linux/kernel/1604.3/00930.html
> > 
> > Hmm, I completely forgot about this ;). There is still an advantage to
> > doing run-time checking if we avoid touching core code (less acks to
> > gather and less code duplication).
> > 
> > Let's see what Arnd says but the initial patch looked simpler.
> 
> I don't currently have either version of the patch in my inbox
> (the archive is on a different machine), but in general I'd still
> think it's best to avoid the runtime check for aarch64-ilp32
> altogether. I'd have to look at the overall kernel source to
> see if it's worth avoiding one or two instances though, or
> if there are an overwhelming number of other checks that we
> can't avoid at all.

Just in case you haven't found them already, current version:

https://marc.info/?l=linux-arm-kernel&m=147708276818318&w=2

Original version:

https://patchwork.kernel.org/patch/7980521/

The old one looks more readable and given that ptrace is not really a
fast path, I'm not two worried about run-time checks

> Regarding ptrace, I notice that arch/tile doesn't even use
> the compat entry point for its ilp32 user space on 64-bit
> kernels, it just calls the regular 64-bit one. Would that
> help here?

I don't know whether it would work, we have incompatible siginfo_t on
AArch64/ILP32.

-- 
Catalin

^ permalink raw reply

* [PATCH v9 05/11] arm/arm64: vgic: Introduce VENG0 and VENG1 fields to vmcr struct
From: Auger Eric @ 2016-12-08 12:50 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161208122115.GG4816@cbox>

Hi Christoffer,

On 08/12/2016 13:21, Christoffer Dall wrote:
> On Thu, Dec 08, 2016 at 12:52:39PM +0100, Auger Eric wrote:
>> Hi Vijay,
>>
>> On 28/11/2016 15:28, Christoffer Dall wrote:
>>> On Wed, Nov 23, 2016 at 06:31:52PM +0530, vijay.kilari at gmail.com wrote:
>>>> From: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
>>>>
>>>> ICC_VMCR_EL2 supports virtual access to ICC_IGRPEN1_EL1.Enable
>>>> and ICC_IGRPEN0_EL1.Enable fields. Add grpen0 and grpen1 member
>>>> variables to struct vmcr to support read and write of these fields.
>>>>
>>>> Also refactor vgic_set_vmcr and vgic_get_vmcr() code.
>>>> Drop ICH_VMCR_CTLR_SHIFT and ICH_VMCR_CTLR_MASK macros and instead
>>>> use ICH_VMCR_EOI* and ICH_VMCR_CBPR* macros
>>>> .
>>>> Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
>>>> ---
>>>>  include/linux/irqchip/arm-gic-v3.h |  2 --
>>>>  virt/kvm/arm/vgic/vgic-mmio-v2.c   | 16 ----------------
>>>>  virt/kvm/arm/vgic/vgic-mmio.c      | 16 ++++++++++++++++
>>>>  virt/kvm/arm/vgic/vgic-v3.c        | 22 ++++++++++++++++++++--
>>>>  virt/kvm/arm/vgic/vgic.h           |  5 +++++
>>>>  5 files changed, 41 insertions(+), 20 deletions(-)
>>>>
>>>> diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
>>>> index b4f8287..406fc3e 100644
>>>> --- a/include/linux/irqchip/arm-gic-v3.h
>>>> +++ b/include/linux/irqchip/arm-gic-v3.h
>>>> @@ -404,8 +404,6 @@
>>>>  #define ICH_HCR_EN			(1 << 0)
>>>>  #define ICH_HCR_UIE			(1 << 1)
>>>>  
>>>> -#define ICH_VMCR_CTLR_SHIFT		0
>>>> -#define ICH_VMCR_CTLR_MASK		(0x21f << ICH_VMCR_CTLR_SHIFT)
>>>>  #define ICH_VMCR_CBPR_SHIFT		4
>>>>  #define ICH_VMCR_CBPR_MASK		(1 << ICH_VMCR_CBPR_SHIFT)
>>>>  #define ICH_VMCR_EOIM_SHIFT		9
>>>> diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
>>>> index 2cb04b7..ad353b5 100644
>>>> --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
>>>> +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
>>>> @@ -212,22 +212,6 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
>>>>  	}
>>>>  }
>>>>  
>>>> -static void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
>>>> -{
>>>> -	if (kvm_vgic_global_state.type == VGIC_V2)
>>>> -		vgic_v2_set_vmcr(vcpu, vmcr);
>>>> -	else
>>>> -		vgic_v3_set_vmcr(vcpu, vmcr);
>>>> -}
>>>> -
>>>> -static void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
>>>> -{
>>>> -	if (kvm_vgic_global_state.type == VGIC_V2)
>>>> -		vgic_v2_get_vmcr(vcpu, vmcr);
>>>> -	else
>>>> -		vgic_v3_get_vmcr(vcpu, vmcr);
>>>> -}
>>>> -
>>>>  #define GICC_ARCH_VERSION_V2	0x2
>>>>  
>>>>  /* These are for userland accesses only, there is no guest-facing emulation. */
>>>> diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
>>>> index 0d1bc98..f81e0e5 100644
>>>> --- a/virt/kvm/arm/vgic/vgic-mmio.c
>>>> +++ b/virt/kvm/arm/vgic/vgic-mmio.c
>>>> @@ -416,6 +416,22 @@ int vgic_validate_mmio_region_addr(struct kvm_device *dev,
>>>>  	return -ENXIO;
>>>>  }
>>>>  
>>>> +void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
>>>> +{
>>>> +	if (kvm_vgic_global_state.type == VGIC_V2)
>>>> +		vgic_v2_set_vmcr(vcpu, vmcr);
>>>> +	else
>>>> +		vgic_v3_set_vmcr(vcpu, vmcr);
>>>> +}
>>>> +
>>>> +void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
>>>> +{
>>>> +	if (kvm_vgic_global_state.type == VGIC_V2)
>>>> +		vgic_v2_get_vmcr(vcpu, vmcr);
>>>> +	else
>>>> +		vgic_v3_get_vmcr(vcpu, vmcr);
>>>> +}
>>>> +
>>>>  /*
>>>>   * kvm_mmio_read_buf() returns a value in a format where it can be converted
>>>>   * to a byte array and be directly observed as the guest wanted it to appear
>>>> diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
>>>> index 9f0dae3..a3ff04b 100644
>>>> --- a/virt/kvm/arm/vgic/vgic-v3.c
>>>> +++ b/virt/kvm/arm/vgic/vgic-v3.c
>>>> @@ -175,10 +175,19 @@ void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
>>>>  {
>>>>  	u32 vmcr;
>>>>  
>>>> -	vmcr  = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK;
>>>> +	/*
>>>> +	 * Ignore the FIQen bit, because GIC emulation always implies
>>>> +	 * SRE=1 which means the vFIQEn bit is also RES1.
>>>> +	 */
>>>> +	vmcr = (vmcrp->ctlr & ICC_CTLR_EL1_EOImode_MASK) >>
>>>> +		ICC_CTLR_EL1_EOImode_SHIFT;
>>>> +	vmcr = (vmcr << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK;
> 
>> I am not able to understand why we use ICC_CTLR _*macros here? Please
>> could you explain it to me? Besides if we want to ignore the FIQen bit
>> can't we change the ICH_VMCR_CTLR_MASK value?
>>
> This first statement is setting the vmcr to the ctlr's bit[1], but
> placed in bit[0], and then the next statement is moving that bit value
> to the corresponding place in the vmcr.  I think this is correct
> (although a little opaque).
Yes the question was more about the semantic of the vmcrp->ctlr field. I
thought it was supposed to store ICH_VMCR_EL2 ctrl bits as they are and
not with a different layout.
> 
> There's also a newer series on the list, just so you know.

Argh OK I missed it. I will check the diffs in the AArch64 related patches.

Thanks

Eric
> 
> Thanks,
> -Christoffer
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
> 

^ permalink raw reply

* Tearing down DMA transfer setup after DMA client has finished
From: Måns Rullgård @ 2016-12-08 12:44 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <a865e7d0-4247-657e-25db-9093731005d3@free.fr>

Mason <slash.tmp@free.fr> writes:

> On 08/12/2016 13:20, M?ns Rullg?rd wrote:
>
>> The only problem we have is that nobody envisioned hardware where the
>> dma engine indicates completion slightly too soon.  I suspect there's a
>> fifo or such somewhere, and the interrupt is triggered when the last
>> byte has been placed in the fifo rather than when it has been removed
>> which would have been more correct.
>
> As I (tried to) explain here:
> https://marc.info/?l=dmaengine&m=148007808418242&w=2
>
> A *read* MBUS agent raises its IRQ when it is safe for the memory
> to be overwritten (i.e. every byte has been pushed into the pipe).
>
> A *write* MBUS agent raises its IRQ when it is safe for another
> agent to read any one of the transferred bytes.
>
> The issue comes from the fact that, for a memory-to-device transfer,
> the system will receive the read agent's IRQ, but most devices
> (NFC, SATA) don't have an IRQ line to signal that their part of the
> operation is complete.

SATA does, actually.  Nevertheless, it's an unusual design.

-- 
M?ns Rullg?rd

^ permalink raw reply

* [PATCH 5/8] efi: Get the secure boot status [ver #5]
From: Lukas Wunner @ 2016-12-08 12:42 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <6009.1481184981@warthog.procyon.org.uk>

On Thu, Dec 08, 2016 at 08:16:21AM +0000, David Howells wrote:
> +/*
> + * Determine whether we're in secure boot mode.
> + */
> +enum efi_secureboot_mode efi_get_secureboot(efi_system_table_t *sys_table_arg)
> +{
> +	u8 secboot, setupmode;
> +	unsigned long size;
> +	efi_status_t status;
> +
> +	size = sizeof(secboot);
> +	status = get_efi_var(efi_SecureBoot_name, &efi_variable_guid,
> +			     NULL, &size, &secboot);
> +	if (status != EFI_SUCCESS)
> +		goto out_efi_err;
> +
> +	size = sizeof(setupmode);
> +	status = get_efi_var(efi_SetupMode_name, &efi_variable_guid,
> +			     NULL, &size, &setupmode);
> +	if (status != EFI_SUCCESS)
> +		goto out_efi_err;
> +
> +	if (secboot == 0 || setupmode == 1)
> +		return efi_secureboot_mode_disabled;
> +
> +	pr_efi(sys_table_arg, "UEFI Secure Boot is enabled.\n");
> +	return efi_secureboot_mode_enabled;
> +
> +out_efi_err:
> +	pr_efi_err(sys_table_arg, "Could not determine UEFI Secure Boot status.\n");
> +	if (status == EFI_NOT_FOUND)
> +		return efi_secureboot_mode_disabled;
> +	return efi_secureboot_mode_unknown;
> +}

In the out_efi_err path, the if-statement needs to come before the
pr_efi_err() call.  Otherwise it would be a change of behaviour for
ARM to what we have now.

Also, minor nit, I'd expect Matt to ask for a newline between the
if-statement and the following statements, so:

out_efi_err:
	if (status == EFI_NOT_FOUND)
		return efi_secureboot_mode_disabled;

	pr_efi_err(sys_table_arg, "Could not determine UEFI Secure Boot status.\n");
	return efi_secureboot_mode_unknown;

The error message doesn't say what the consequence is of the
failure to determine the status, but IIUC this differs between
x86 and ARM, is that correct?  (If I remember the discussion
correctly, x86 defaults to disabled, ARM to enabled.)

Thanks,

Lukas

^ permalink raw reply

* Tearing down DMA transfer setup after DMA client has finished
From: Mason @ 2016-12-08 12:41 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <yw1xshpyzk6p.fsf@unicorn.mansr.com>

On 08/12/2016 13:20, M?ns Rullg?rd wrote:

> The only problem we have is that nobody envisioned hardware where the
> dma engine indicates completion slightly too soon.  I suspect there's a
> fifo or such somewhere, and the interrupt is triggered when the last
> byte has been placed in the fifo rather than when it has been removed
> which would have been more correct.

As I (tried to) explain here:
https://marc.info/?l=dmaengine&m=148007808418242&w=2

A *read* MBUS agent raises its IRQ when it is safe for the memory
to be overwritten (i.e. every byte has been pushed into the pipe).

A *write* MBUS agent raises its IRQ when it is safe for another
agent to read any one of the transferred bytes.

The issue comes from the fact that, for a memory-to-device transfer,
the system will receive the read agent's IRQ, but most devices
(NFC, SATA) don't have an IRQ line to signal that their part of the
operation is complete.

As I explained, if one sets up a memory-to-memory DMA copy, then
the system will actually receive *two* IRQs.

Regards.

^ permalink raw reply

* [PATCH v6 1/2] ARM: davinci: Export two clocks function
From: Sekhar Nori @ 2016-12-08 12:39 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <4e313609-0acc-4e06-85e6-ba36b7916b33@ti.com>

On Thursday 08 December 2016 06:02 PM, Sekhar Nori wrote:
> On Wednesday 07 December 2016 10:39 PM, Alexandre Bailon wrote:
>> Rename and export __clk_enable() and __clk_disable() in order
>> to use them from usb-da8xx.c. This file implements the usb20 phy clock
>> that must be able to enable or disable usb20 clock.
>> To prevent a recurssive call to clk_enable() that would cause a recursive
>> locking issue, we must use __clk_enable() and __clk_disable().
>> Rename these methods in davinci_clk_enable() and davinci_clk_disable(),
>> and export them.
>>
>> Signed-off-by: Alexandre Bailon <abailon@baylibre.com>
>> Suggested-by: David Lechner <david@lechnology.com>
>> ---
>>  arch/arm/mach-davinci/clock.c | 14 ++++++++------
>>  arch/arm/mach-davinci/clock.h |  2 ++
>>  2 files changed, 10 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/arm/mach-davinci/clock.c b/arch/arm/mach-davinci/clock.c
>> index df42c93..0f967c3 100644
>> --- a/arch/arm/mach-davinci/clock.c
>> +++ b/arch/arm/mach-davinci/clock.c
>> @@ -31,10 +31,10 @@ static LIST_HEAD(clocks);
>>  static DEFINE_MUTEX(clocks_mutex);
>>  static DEFINE_SPINLOCK(clockfw_lock);
>>  
>> -static void __clk_enable(struct clk *clk)
>> +void davinci_clk_enable(struct clk *clk)
>>  {
>>  	if (clk->parent)
>> -		__clk_enable(clk->parent);
>> +		davinci_clk_enable(clk->parent);
>>  	if (clk->usecount++ == 0) {
>>  		if (clk->flags & CLK_PSC)
>>  			davinci_psc_config(clk->domain, clk->gpsc, clk->lpsc,
>> @@ -43,8 +43,9 @@ static void __clk_enable(struct clk *clk)
>>  			clk->clk_enable(clk);
>>  	}
>>  }
>> +EXPORT_SYMBOL(davinci_clk_enable);
> 
> We don't want to export these as we dont want drivers to use this API.
> This is to be used within mach-davinci only.

Also, subject line is pretty vague. What about:

ARM: davinci: provide lock-less versions of clk_{enable|disable}

In the description too, please talk about on the main difference between
davinci_clk_{enable|disable}() (lockless) vs clk_enable() (locked). Use
USB case only as an example. Using them in usb-da8xx.c might be the
current motivation but this addresses similar needs in future too.

Thanks,
Sekhar

^ permalink raw reply

* [PATCH v6 1/2] ARM: davinci: Export two clocks function
From: Sekhar Nori @ 2016-12-08 12:32 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481130567-27829-1-git-send-email-abailon@baylibre.com>

On Wednesday 07 December 2016 10:39 PM, Alexandre Bailon wrote:
> Rename and export __clk_enable() and __clk_disable() in order
> to use them from usb-da8xx.c. This file implements the usb20 phy clock
> that must be able to enable or disable usb20 clock.
> To prevent a recurssive call to clk_enable() that would cause a recursive
> locking issue, we must use __clk_enable() and __clk_disable().
> Rename these methods in davinci_clk_enable() and davinci_clk_disable(),
> and export them.
> 
> Signed-off-by: Alexandre Bailon <abailon@baylibre.com>
> Suggested-by: David Lechner <david@lechnology.com>
> ---
>  arch/arm/mach-davinci/clock.c | 14 ++++++++------
>  arch/arm/mach-davinci/clock.h |  2 ++
>  2 files changed, 10 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm/mach-davinci/clock.c b/arch/arm/mach-davinci/clock.c
> index df42c93..0f967c3 100644
> --- a/arch/arm/mach-davinci/clock.c
> +++ b/arch/arm/mach-davinci/clock.c
> @@ -31,10 +31,10 @@ static LIST_HEAD(clocks);
>  static DEFINE_MUTEX(clocks_mutex);
>  static DEFINE_SPINLOCK(clockfw_lock);
>  
> -static void __clk_enable(struct clk *clk)
> +void davinci_clk_enable(struct clk *clk)
>  {
>  	if (clk->parent)
> -		__clk_enable(clk->parent);
> +		davinci_clk_enable(clk->parent);
>  	if (clk->usecount++ == 0) {
>  		if (clk->flags & CLK_PSC)
>  			davinci_psc_config(clk->domain, clk->gpsc, clk->lpsc,
> @@ -43,8 +43,9 @@ static void __clk_enable(struct clk *clk)
>  			clk->clk_enable(clk);
>  	}
>  }
> +EXPORT_SYMBOL(davinci_clk_enable);

We don't want to export these as we dont want drivers to use this API.
This is to be used within mach-davinci only.

Thanks,
Sekhar

^ permalink raw reply

* Tearing down DMA transfer setup after DMA client has finished
From: Geert Uytterhoeven @ 2016-12-08 12:31 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <yw1xshpyzk6p.fsf@unicorn.mansr.com>

On Thu, Dec 8, 2016 at 1:20 PM, M?ns Rullg?rd <mans@mansr.com> wrote:
> Geert Uytterhoeven <geert@linux-m68k.org> writes:
>> On Thu, Dec 8, 2016 at 12:44 PM, M?ns Rullg?rd <mans@mansr.com> wrote:
>>> Vinod Koul <vinod.koul@intel.com> writes:
>>>> On Wed, Dec 07, 2016 at 04:45:58PM +0000, M?ns Rullg?rd wrote:
>>>>> Vinod Koul <vinod.koul@intel.com> writes:
>>>>> > On Tue, Dec 06, 2016 at 01:14:20PM +0000, M?ns Rullg?rd wrote:
>>>>> >> That's not going to work very well.  Device drivers typically request
>>>>> >> dma channels in their probe functions or when the device is opened.
>>>>> >> This means that reserving one of the few channels there will inevitably
>>>>> >> make some other device fail to operate.
>>>>> >
>>>>> > No that doesnt make sense at all, you should get a channel only when you
>>>>> > want to use it and not in probe!
>>>>>
>>>>> Tell that to just about every single driver ever written.
>>>>
>>>> Not really, few do yes which is wrong but not _all_ do that.
>>>
>>> Every driver I ever looked at does.  Name one you consider "correct."
>>
>> I'm far from claiming that drivers/tty/serial/sh-sci.c is perfect, but
>> it does request DMA channels at open time, not at probe time.
>
> In the part quoted above, I said most drivers request dma channels in
> their probe or open functions.  For the purposes of this discussion,
> that distinction is irrelevant.  In either case, the channel is held
> indefinitely.  If this wasn't the correct way to use the dmaengine,
> there would be no need for the virt-dma helpers which are specifically
> designed for cases the one currently at hand.

Sorry, I mainly read Vinod's "not in probe", and missed your "or when the
device is opened".

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert at linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* [PATCH 8/8] efi: Add EFI_SECURE_BOOT bit [ver #6]
From: David Howells @ 2016-12-08 12:31 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <148120020832.5854.5448601415491330495.stgit@warthog.procyon.org.uk>

From: Josh Boyer <jwboyer@fedoraproject.org>

UEFI machines can be booted in Secure Boot mode.  Add a EFI_SECURE_BOOT bit
that can be passed to efi_enabled() to find out whether secure boot is
enabled.

This will be used by the SysRq+x handler, registered by the x86 arch, to find
out whether secure boot mode is enabled so that it can be disabled.

Signed-off-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: David Howells <dhowells@redhat.com>
---

 arch/x86/kernel/setup.c |   15 +++++++++++++++
 include/linux/efi.h     |    1 +
 2 files changed, 16 insertions(+)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9c337b0e8ba7..d8972ec6257d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1152,6 +1152,21 @@ void __init setup_arch(char **cmdline_p)
 	/* Allocate bigger log buffer */
 	setup_log_buf(1);
 
+	if (IS_ENABLED(CONFIG_EFI)) {
+		switch (boot_params.secure_boot) {
+		case efi_secureboot_mode_disabled:
+			pr_info("Secure boot disabled\n");
+			break;
+		case efi_secureboot_mode_enabled:
+			set_bit(EFI_SECURE_BOOT, &efi.flags);
+			pr_info("Secure boot enabled\n");
+			break;
+		default:
+			pr_info("Secure boot could not be determined\n");
+			break;
+		}
+	}
+
 	reserve_initrd();
 
 	acpi_table_upgrade();
diff --git a/include/linux/efi.h b/include/linux/efi.h
index c894ed5bfa1c..e1893f5002c3 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1070,6 +1070,7 @@ extern int __init efi_setup_pcdp_console(char *);
 #define EFI_ARCH_1		7	/* First arch-specific bit */
 #define EFI_DBG			8	/* Print additional debug info at runtime */
 #define EFI_NX_PE_DATA		9	/* Can runtime data regions be mapped non-executable? */
+#define EFI_SECURE_BOOT		10	/* Are we in Secure Boot mode? */
 
 #ifdef CONFIG_EFI
 /*

^ permalink raw reply related

* [PATCH 7/8] efi: Handle secure boot from UEFI-2.6 [ver #6]
From: David Howells @ 2016-12-08 12:31 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <148120020832.5854.5448601415491330495.stgit@warthog.procyon.org.uk>

UEFI-2.6 adds a new variable, DeployedMode.  If it exists, this must be 1
if we're to engage lockdown mode.

Reported-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---

 drivers/firmware/efi/libstub/secureboot.c |   16 +++++++++++++++-
 include/linux/efi.h                       |    4 ++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c
index 39c91e091f6a..d653f76b9725 100644
--- a/drivers/firmware/efi/libstub/secureboot.c
+++ b/drivers/firmware/efi/libstub/secureboot.c
@@ -22,6 +22,9 @@ static const efi_char16_t const efi_SecureBoot_name[] = {
 static const efi_char16_t const efi_SetupMode_name[] = {
 	'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0
 };
+static const efi_char16_t const efi_DeployedMode_name[] = {
+	'D', 'e', 'p', 'l', 'o', 'y', 'e', 'd', 'M', 'o', 'd', 'e', 0
+};
 
 /* SHIM variables */
 static const efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID;
@@ -40,7 +43,7 @@ static efi_char16_t const shim_MokSBState_name[] = {
 enum efi_secureboot_mode efi_get_secureboot(efi_system_table_t *sys_table_arg)
 {
 	u32 attr;
-	u8 secboot, setupmode, moksbstate;
+	u8 secboot, setupmode, deployedmode, moksbstate;
 	unsigned long size;
 	efi_status_t status;
 
@@ -59,6 +62,17 @@ enum efi_secureboot_mode efi_get_secureboot(efi_system_table_t *sys_table_arg)
 	if (secboot == 0 || setupmode == 1)
 		return efi_secureboot_mode_disabled;
 
+	/* UEFI-2.6 requires DeployedMode to be 1. */
+	if (sys_table_arg->hdr.revision >= EFI_2_60_SYSTEM_TABLE_REVISION) {
+		size = sizeof(deployedmode);
+		status = get_efi_var(efi_DeployedMode_name, &efi_variable_guid,
+				     NULL, &size, &deployedmode);
+		if (status != EFI_SUCCESS)
+			goto out_efi_err;
+		if (deployedmode == 0)
+			return efi_secureboot_mode_disabled;
+	}
+
 	/* See if a user has put shim into insecure mode.  If so, and if the
 	 * variable doesn't have the runtime attribute set, we might as well
 	 * honor that.
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 92e23f03045e..c894ed5bfa1c 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -645,6 +645,10 @@ typedef struct {
 
 #define EFI_SYSTEM_TABLE_SIGNATURE ((u64)0x5453595320494249ULL)
 
+#define EFI_2_60_SYSTEM_TABLE_REVISION  ((2 << 16) | (60))
+#define EFI_2_50_SYSTEM_TABLE_REVISION  ((2 << 16) | (50))
+#define EFI_2_40_SYSTEM_TABLE_REVISION  ((2 << 16) | (40))
+#define EFI_2_31_SYSTEM_TABLE_REVISION  ((2 << 16) | (31))
 #define EFI_2_30_SYSTEM_TABLE_REVISION  ((2 << 16) | (30))
 #define EFI_2_20_SYSTEM_TABLE_REVISION  ((2 << 16) | (20))
 #define EFI_2_10_SYSTEM_TABLE_REVISION  ((2 << 16) | (10))

^ permalink raw reply related

* [PATCH 6/8] efi: Disable secure boot if shim is in insecure mode [ver #6]
From: David Howells @ 2016-12-08 12:30 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <148120020832.5854.5448601415491330495.stgit@warthog.procyon.org.uk>

From: Josh Boyer <jwboyer@fedoraproject.org>

A user can manually tell the shim boot loader to disable validation of
images it loads.  When a user does this, it creates a UEFI variable called
MokSBState that does not have the runtime attribute set.  Given that the
user explicitly disabled validation, we can honor that and not enable
secure boot mode if that variable is set.

Signed-off-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: David Howells <dhowells@redhat.com>
---

 drivers/firmware/efi/libstub/secureboot.c |   24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c
index 62d6904da800..39c91e091f6a 100644
--- a/drivers/firmware/efi/libstub/secureboot.c
+++ b/drivers/firmware/efi/libstub/secureboot.c
@@ -23,6 +23,12 @@ static const efi_char16_t const efi_SetupMode_name[] = {
 	'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0
 };
 
+/* SHIM variables */
+static const efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID;
+static efi_char16_t const shim_MokSBState_name[] = {
+	'M', 'o', 'k', 'S', 'B', 'S', 't', 'a', 't', 'e', 0
+};
+
 #define get_efi_var(name, vendor, ...) \
 	efi_call_runtime(get_variable, \
 			 (efi_char16_t *)(name), (efi_guid_t *)(vendor), \
@@ -33,7 +39,8 @@ static const efi_char16_t const efi_SetupMode_name[] = {
  */
 enum efi_secureboot_mode efi_get_secureboot(efi_system_table_t *sys_table_arg)
 {
-	u8 secboot, setupmode;
+	u32 attr;
+	u8 secboot, setupmode, moksbstate;
 	unsigned long size;
 	efi_status_t status;
 
@@ -52,6 +59,21 @@ enum efi_secureboot_mode efi_get_secureboot(efi_system_table_t *sys_table_arg)
 	if (secboot == 0 || setupmode == 1)
 		return efi_secureboot_mode_disabled;
 
+	/* See if a user has put shim into insecure mode.  If so, and if the
+	 * variable doesn't have the runtime attribute set, we might as well
+	 * honor that.
+	 */
+	size = sizeof(moksbstate);
+	status = get_efi_var(shim_MokSBState_name, &shim_guid,
+			     &attr, &size, &moksbstate);
+
+	/* If it fails, we don't care why.  Default to secure */
+	if (status != EFI_SUCCESS)
+		goto secure_boot_enabled;
+	if (!(attr & EFI_VARIABLE_RUNTIME_ACCESS) && moksbstate == 1)
+		return efi_secureboot_mode_disabled;
+
+secure_boot_enabled:
 	pr_efi(sys_table_arg, "UEFI Secure Boot is enabled.\n");
 	return efi_secureboot_mode_enabled;
 

^ permalink raw reply related

* [PATCH 5/8] efi: Get the secure boot status [ver #6]
From: David Howells @ 2016-12-08 12:30 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <148120020832.5854.5448601415491330495.stgit@warthog.procyon.org.uk>

Get the firmware's secure-boot status in the kernel boot wrapper and stash
it somewhere that the main kernel image can find.

The efi_get_secureboot() function is extracted from the arm stub and (a)
generalised so that it can be called from x86 and (b) made to use
efi_call_runtime() so that it can be run in mixed-mode.

Suggested-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: David Howells <dhowells@redhat.com>
---

 Documentation/x86/zero-page.txt           |    2 +
 arch/x86/boot/compressed/eboot.c          |    2 +
 arch/x86/boot/compressed/head_32.S        |    1 
 arch/x86/boot/compressed/head_64.S        |    1 
 arch/x86/include/asm/bootparam_utils.h    |    5 +-
 arch/x86/include/uapi/asm/bootparam.h     |    3 +
 arch/x86/kernel/asm-offsets.c             |    1 
 drivers/firmware/efi/libstub/Makefile     |    2 -
 drivers/firmware/efi/libstub/arm-stub.c   |   63 +++--------------------------
 drivers/firmware/efi/libstub/secureboot.c |   63 +++++++++++++++++++++++++++++
 include/linux/efi.h                       |    8 ++++
 11 files changed, 90 insertions(+), 61 deletions(-)
 create mode 100644 drivers/firmware/efi/libstub/secureboot.c

diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt
index 95a4d34af3fd..b8527c6b7646 100644
--- a/Documentation/x86/zero-page.txt
+++ b/Documentation/x86/zero-page.txt
@@ -31,6 +31,8 @@ Offset	Proto	Name		Meaning
 1E9/001	ALL	eddbuf_entries	Number of entries in eddbuf (below)
 1EA/001	ALL	edd_mbr_sig_buf_entries	Number of entries in edd_mbr_sig_buffer
 				(below)
+1EB/001	ALL     kbd_status      Numlock is enabled
+1EC/001	ALL     secure_boot	Secure boot is enabled in the firmware
 1EF/001	ALL	sentinel	Used to detect broken bootloaders
 290/040	ALL	edd_mbr_sig_buffer EDD MBR signatures
 2D0/A00	ALL	e820_map	E820 memory map table
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index c8c32ebcdfdb..5b151c262ac2 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1158,6 +1158,8 @@ struct boot_params *efi_main(struct efi_config *c,
 	else
 		setup_boot_services32(efi_early);
 
+	boot_params->secure_boot = efi_get_secureboot(sys_table);
+
 	setup_graphics(boot_params);
 
 	setup_efi_pci(boot_params);
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index d85b9625e836..c635f7e32f5c 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -61,6 +61,7 @@
 
 	__HEAD
 ENTRY(startup_32)
+	movb	$0, BP_secure_boot(%esi)
 #ifdef CONFIG_EFI_STUB
 	jmp	preferred_addr
 
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index beab8322f72a..ccd2c7461b7f 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -244,6 +244,7 @@ ENTRY(startup_64)
 	 * that maps our entire kernel(text+data+bss+brk), zero page
 	 * and command line.
 	 */
+	movb	$0, BP_secure_boot(%rsi)
 #ifdef CONFIG_EFI_STUB
 	/*
 	 * The entry point for the PE/COFF executable is efi_pe_entry, so
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
index 4a8cb8d7cbd5..7e16d53ff6a3 100644
--- a/arch/x86/include/asm/bootparam_utils.h
+++ b/arch/x86/include/asm/bootparam_utils.h
@@ -38,9 +38,10 @@ static void sanitize_boot_params(struct boot_params *boot_params)
 		memset(&boot_params->ext_ramdisk_image, 0,
 		       (char *)&boot_params->efi_info -
 			(char *)&boot_params->ext_ramdisk_image);
-		memset(&boot_params->kbd_status, 0,
+		boot_params->kbd_status = 0;
+		memset(&boot_params->_pad5, 0,
 		       (char *)&boot_params->hdr -
-		       (char *)&boot_params->kbd_status);
+		       (char *)&boot_params->_pad5);
 		memset(&boot_params->_pad7[0], 0,
 		       (char *)&boot_params->edd_mbr_sig_buffer[0] -
 			(char *)&boot_params->_pad7[0]);
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index b10bf319ed20..5138dacf8bb8 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -135,7 +135,8 @@ struct boot_params {
 	__u8  eddbuf_entries;				/* 0x1e9 */
 	__u8  edd_mbr_sig_buf_entries;			/* 0x1ea */
 	__u8  kbd_status;				/* 0x1eb */
-	__u8  _pad5[3];					/* 0x1ec */
+	__u8  secure_boot;				/* 0x1ec */
+	__u8  _pad5[2];					/* 0x1ed */
 	/*
 	 * The sentinel is set to a nonzero value (0xff) in header.S.
 	 *
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index c62e015b126c..de827d6ac8c2 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -81,6 +81,7 @@ void common(void) {
 
 	BLANK();
 	OFFSET(BP_scratch, boot_params, scratch);
+	OFFSET(BP_secure_boot, boot_params, secure_boot);
 	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
 	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
 	OFFSET(BP_version, boot_params, hdr.version);
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 6621b13c370f..9af966863612 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -28,7 +28,7 @@ OBJECT_FILES_NON_STANDARD	:= y
 # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
 KCOV_INSTRUMENT			:= n
 
-lib-y				:= efi-stub-helper.o gop.o
+lib-y				:= efi-stub-helper.o gop.o secureboot.o
 
 # include the stub's generic dependencies from lib/ when building for ARM/arm64
 arm-deps := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c fdt_empty_tree.c fdt_sw.c sort.c
diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c
index b4f7d78f9e8b..9984d0442442 100644
--- a/drivers/firmware/efi/libstub/arm-stub.c
+++ b/drivers/firmware/efi/libstub/arm-stub.c
@@ -20,52 +20,6 @@
 
 bool __nokaslr;
 
-static int efi_get_secureboot(efi_system_table_t *sys_table_arg)
-{
-	static efi_char16_t const sb_var_name[] = {
-		'S', 'e', 'c', 'u', 'r', 'e', 'B', 'o', 'o', 't', 0 };
-	static efi_char16_t const sm_var_name[] = {
-		'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0 };
-
-	efi_guid_t var_guid = EFI_GLOBAL_VARIABLE_GUID;
-	efi_get_variable_t *f_getvar = sys_table_arg->runtime->get_variable;
-	u8 val;
-	unsigned long size = sizeof(val);
-	efi_status_t status;
-
-	status = f_getvar((efi_char16_t *)sb_var_name, (efi_guid_t *)&var_guid,
-			  NULL, &size, &val);
-
-	if (status != EFI_SUCCESS)
-		goto out_efi_err;
-
-	if (val == 0)
-		return 0;
-
-	status = f_getvar((efi_char16_t *)sm_var_name, (efi_guid_t *)&var_guid,
-			  NULL, &size, &val);
-
-	if (status != EFI_SUCCESS)
-		goto out_efi_err;
-
-	if (val == 1)
-		return 0;
-
-	return 1;
-
-out_efi_err:
-	switch (status) {
-	case EFI_NOT_FOUND:
-		return 0;
-	case EFI_DEVICE_ERROR:
-		return -EIO;
-	case EFI_SECURITY_VIOLATION:
-		return -EACCES;
-	default:
-		return -EINVAL;
-	}
-}
-
 efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg,
 			     void *__image, void **__fh)
 {
@@ -226,7 +180,7 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
 	efi_guid_t loaded_image_proto = LOADED_IMAGE_PROTOCOL_GUID;
 	unsigned long reserve_addr = 0;
 	unsigned long reserve_size = 0;
-	int secure_boot = 0;
+	enum efi_secureboot_mode secure_boot;
 	struct screen_info *si;
 
 	/* Check if we were booted by the EFI firmware */
@@ -296,19 +250,14 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
 		pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n");
 
 	secure_boot = efi_get_secureboot(sys_table);
-	if (secure_boot > 0)
-		pr_efi(sys_table, "UEFI Secure Boot is enabled.\n");
-
-	if (secure_boot < 0) {
-		pr_efi_err(sys_table,
-			"could not determine UEFI Secure Boot status.\n");
-	}
 
 	/*
-	 * Unauthenticated device tree data is a security hazard, so
-	 * ignore 'dtb=' unless UEFI Secure Boot is disabled.
+	 * Unauthenticated device tree data is a security hazard, so ignore
+	 * 'dtb=' unless UEFI Secure Boot is disabled.  We assume that secure
+	 * boot is enabled if we can't determine its state.
 	 */
-	if (secure_boot != 0 && strstr(cmdline_ptr, "dtb=")) {
+	if (secure_boot != efi_secureboot_mode_disabled &&
+	    strstr(cmdline_ptr, "dtb=")) {
 		pr_efi(sys_table, "Ignoring DTB from command line.\n");
 	} else {
 		status = handle_cmdline_files(sys_table, image, cmdline_ptr,
diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c
new file mode 100644
index 000000000000..62d6904da800
--- /dev/null
+++ b/drivers/firmware/efi/libstub/secureboot.c
@@ -0,0 +1,63 @@
+/*
+ * Secure boot handling.
+ *
+ * Copyright (C) 2013,2014 Linaro Limited
+ *     Roy Franz <roy.franz at linaro.org
+ * Copyright (C) 2013 Red Hat, Inc.
+ *     Mark Salter <msalter@redhat.com>
+ *
+ * This file is part of the Linux kernel, and is made available under the
+ * terms of the GNU General Public License version 2.
+ *
+ */
+
+#include <linux/efi.h>
+#include <asm/efi.h>
+
+/* BIOS variables */
+static const efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
+static const efi_char16_t const efi_SecureBoot_name[] = {
+	'S', 'e', 'c', 'u', 'r', 'e', 'B', 'o', 'o', 't', 0
+};
+static const efi_char16_t const efi_SetupMode_name[] = {
+	'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0
+};
+
+#define get_efi_var(name, vendor, ...) \
+	efi_call_runtime(get_variable, \
+			 (efi_char16_t *)(name), (efi_guid_t *)(vendor), \
+			 __VA_ARGS__);
+
+/*
+ * Determine whether we're in secure boot mode.
+ */
+enum efi_secureboot_mode efi_get_secureboot(efi_system_table_t *sys_table_arg)
+{
+	u8 secboot, setupmode;
+	unsigned long size;
+	efi_status_t status;
+
+	size = sizeof(secboot);
+	status = get_efi_var(efi_SecureBoot_name, &efi_variable_guid,
+			     NULL, &size, &secboot);
+	if (status != EFI_SUCCESS)
+		goto out_efi_err;
+
+	size = sizeof(setupmode);
+	status = get_efi_var(efi_SetupMode_name, &efi_variable_guid,
+			     NULL, &size, &setupmode);
+	if (status != EFI_SUCCESS)
+		goto out_efi_err;
+
+	if (secboot == 0 || setupmode == 1)
+		return efi_secureboot_mode_disabled;
+
+	pr_efi(sys_table_arg, "UEFI Secure Boot is enabled.\n");
+	return efi_secureboot_mode_enabled;
+
+out_efi_err:
+	pr_efi_err(sys_table_arg, "Could not determine UEFI Secure Boot status.\n");
+	if (status == EFI_NOT_FOUND)
+		return efi_secureboot_mode_disabled;
+	return efi_secureboot_mode_unknown;
+}
diff --git a/include/linux/efi.h b/include/linux/efi.h
index c7904556d7a8..92e23f03045e 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1477,6 +1477,14 @@ efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg,
 bool efi_runtime_disabled(void);
 extern void efi_call_virt_check_flags(unsigned long flags, const char *call);
 
+enum efi_secureboot_mode {
+	efi_secureboot_mode_unset,
+	efi_secureboot_mode_unknown,
+	efi_secureboot_mode_disabled,
+	efi_secureboot_mode_enabled,
+};
+enum efi_secureboot_mode efi_get_secureboot(efi_system_table_t *sys_table);
+
 /*
  * Arch code can implement the following three template macros, avoiding
  * reptition for the void/non-void return cases of {__,}efi_call_virt():

^ permalink raw reply related

* [PATCH 4/8] efi: Add SHIM and image security database GUID definitions [ver #6]
From: David Howells @ 2016-12-08 12:30 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <148120020832.5854.5448601415491330495.stgit@warthog.procyon.org.uk>

Add the definitions for shim and image security database, both of which
are used widely in various Linux distros.

Signed-off-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---

 include/linux/efi.h |    3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 93a82de167eb..c7904556d7a8 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -610,6 +610,9 @@ void efi_native_runtime_setup(void);
 #define EFI_CONSOLE_OUT_DEVICE_GUID		EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4,  0x9a, 0x46, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
 #define APPLE_PROPERTIES_PROTOCOL_GUID		EFI_GUID(0x91bd12fe, 0xf6c3, 0x44fb,  0xa5, 0xb7, 0x51, 0x22, 0xab, 0x30, 0x3a, 0xe0)
 
+#define EFI_IMAGE_SECURITY_DATABASE_GUID	EFI_GUID(0xd719b2cb, 0x3d3a, 0x4596, 0xa3, 0xbc, 0xda, 0xd0, 0x0e, 0x67, 0x65, 0x6f)
+#define EFI_SHIM_LOCK_GUID			EFI_GUID(0x605dab50, 0xe046, 0x4300, 0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23)
+
 /*
  * This GUID is used to pass to the kernel proper the struct screen_info
  * structure that was populated by the stub based on the GOP protocol instance

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox