Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v5] arm64: fpsimd: improve stacking logic in non-interruptible context
From: Ard Biesheuvel @ 2016-12-12 17:56 UTC (permalink / raw)
  To: linux-arm-kernel

Currently, we allow kernel mode NEON in softirq or hardirq context by
stacking and unstacking a slice of the NEON register file for each call
to kernel_neon_begin() and kernel_neon_end(), respectively.

Given that
a) a CPU typically spends most of its time in userland, during which time
   no kernel mode NEON in process context is in progress,
b) a CPU spends most of its time in the kernel doing other things than
   kernel mode NEON when it gets interrupted to perform kernel mode NEON
   in softirq context

the stacking and subsequent unstacking is only necessary if we are
interrupting a thread while it is performing kernel mode NEON in process
context, which means that in all other cases, we can simply preserve the
userland FPSIMD state once, and only restore it upon return to userland,
even if we are being invoked from softirq or hardirq context.

So instead of checking whether we are running in interrupt context, keep
track of the level of nested kernel mode NEON calls in progress, and only
perform the eager stack/unstack if the level exceeds 1.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/kernel/fpsimd.c | 64 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 394c61db5566..c19363775436 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -220,45 +220,73 @@ void fpsimd_flush_task_state(struct task_struct *t)
 
 #ifdef CONFIG_KERNEL_MODE_NEON
 
-static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate);
-static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate);
+/*
+ * Although unlikely, it is possible for three kernel mode NEON contexts to
+ * be live at the same time: process context, softirq context and hardirq
+ * context. So while the userland context is stashed in the thread's fpsimd
+ * state structure, we need two additional levels of storage.
+ */
+static DEFINE_PER_CPU(struct fpsimd_partial_state, nested_fpsimdstate[2]);
+static DEFINE_PER_CPU(int, kernel_neon_nesting_level);
 
 /*
  * Kernel-side NEON support functions
  */
 void kernel_neon_begin_partial(u32 num_regs)
 {
-	if (in_interrupt()) {
-		struct fpsimd_partial_state *s = this_cpu_ptr(
-			in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
+	struct fpsimd_partial_state *s;
+	int level;
 
-		BUG_ON(num_regs > 32);
-		fpsimd_save_partial_state(s, roundup(num_regs, 2));
-	} else {
+	preempt_disable();
+
+	if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) {
 		/*
 		 * Save the userland FPSIMD state if we have one and if we
 		 * haven't done so already. Clear fpsimd_last_state to indicate
 		 * that there is no longer userland FPSIMD state in the
 		 * registers.
 		 */
-		preempt_disable();
-		if (current->mm &&
-		    !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE))
-			fpsimd_save_state(&current->thread.fpsimd_state);
+		if (current->mm) {
+			unsigned long flags;
+
+			local_irq_save(flags);
+			if (!test_and_set_thread_flag(TIF_FOREIGN_FPSTATE))
+				fpsimd_save_state(&current->thread.fpsimd_state);
+			local_irq_restore(flags);
+		} else {
+			set_thread_flag(TIF_FOREIGN_FPSTATE);
+		}
 		this_cpu_write(fpsimd_last_state, NULL);
 	}
+
+	level = this_cpu_inc_return(kernel_neon_nesting_level);
+	BUG_ON(level > 3);
+
+	if (level > 1) {
+		s = this_cpu_ptr(nested_fpsimdstate);
+
+		WARN_ON_ONCE(num_regs > 32);
+		num_regs = min(roundup(num_regs, 2), 32U);
+
+		fpsimd_save_partial_state(&s[level - 2], num_regs);
+	}
 }
 EXPORT_SYMBOL(kernel_neon_begin_partial);
 
 void kernel_neon_end(void)
 {
-	if (in_interrupt()) {
-		struct fpsimd_partial_state *s = this_cpu_ptr(
-			in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
-		fpsimd_load_partial_state(s);
-	} else {
-		preempt_enable();
+	struct fpsimd_partial_state *s;
+	int level;
+
+	level = this_cpu_read(kernel_neon_nesting_level);
+	BUG_ON(level < 1);
+
+	if (level > 1) {
+		s = this_cpu_ptr(nested_fpsimdstate);
+		fpsimd_load_partial_state(&s[level - 2]);
 	}
+	this_cpu_dec(kernel_neon_nesting_level);
+	preempt_enable();
 }
 EXPORT_SYMBOL(kernel_neon_end);
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH v4] arm64: fpsimd: improve stacking logic in non-interruptible context
From: Ard Biesheuvel @ 2016-12-12 17:55 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161212103512.GE1574@e103592.cambridge.arm.com>

On 12 December 2016 at 10:35, Dave Martin <Dave.Martin@arm.com> wrote:
> On Fri, Dec 09, 2016 at 08:57:20PM +0000, Ard Biesheuvel wrote:
>> On 9 December 2016 at 19:29, Dave Martin <Dave.Martin@arm.com> wrote:
>> > On Fri, Dec 09, 2016 at 06:21:55PM +0000, Catalin Marinas wrote:
>> >> On Fri, Dec 09, 2016 at 04:46:32PM +0000, Ard Biesheuvel wrote:
>> >> >  void kernel_neon_begin_partial(u32 num_regs)
>> >> >  {
>> >> > -   if (in_interrupt()) {
>> >> > -           struct fpsimd_partial_state *s = this_cpu_ptr(
>> >> > -                   in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
>> >> > +   struct fpsimd_partial_state *s;
>> >> > +   int level;
>> >> > +
>> >> > +   preempt_disable();
>> >> > +
>> >> > +   level = this_cpu_inc_return(kernel_neon_nesting_level);
>> >> > +   BUG_ON(level > 3);
>> >> > +
>> >> > +   if (level > 1) {
>> >> > +           s = this_cpu_ptr(nested_fpsimdstate);
>> >> >
>> >> > -           BUG_ON(num_regs > 32);
>> >> > -           fpsimd_save_partial_state(s, roundup(num_regs, 2));
>> >> > +           WARN_ON_ONCE(num_regs > 32);
>> >> > +           num_regs = min(roundup(num_regs, 2), 32U);
>> >> > +
>> >> > +           fpsimd_save_partial_state(&s[level - 2], num_regs);
>> >> >     } else {
>> >> >             /*
>> >> >              * Save the userland FPSIMD state if we have one and if we
>> >> > @@ -241,7 +256,6 @@ void kernel_neon_begin_partial(u32 num_regs)
>> >> >              * that there is no longer userland FPSIMD state in the
>> >> >              * registers.
>> >> >              */
>> >> > -           preempt_disable();
>> >> >             if (current->mm &&
>> >> >                 !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE))
>> >> >                     fpsimd_save_state(&current->thread.fpsimd_state);
>> >>
>> >> I wonder whether we could actually do this saving and flag/level setting
>> >> in reverse to simplify the races. Something like your previous patch but
>> >> only set TIF_FOREIGN_FPSTATE after saving:
>> >>
>> >>       level = this_cpu_read(kernel_neon_nesting_level);
>> >>       if (level > 0) {
>> >>               ...
>> >>               fpsimd_save_partial_state();
>> >>       } else {
>> >>               if (!test_thread_flag(TIF_FOREIGN_FPSTATE))
>> >>                       fpsimd_save_state();
>> >>               set_thread_flag(TIF_FOREIGN_FPSTATE);
>> >>       }
>> >>       this_cpu_inc(kernel_neon_nesting_level);
>> >>
>> >> There is a risk of extra saving if we get an interrupt after
>> >> test_thread_flag() and before set_thread_flag() but I don't think this
>> >> would corrupt any state, just writing things twice.
>> >
>> > I would worry that we can save two states over the same buffer and then
>> > restore an uninitialised buffer in this case unless we are careful.
>> > Because the level-dependent code is now misbracketed by the inc/dec,
>> > a preempting call races with the outer call and use the same value.
>> >
>> > I guess we could do
>> >
>> > if (!test_thread_flag(TIF_FOREIGN_FPSTATE))
>> >         fpsimd_save_state();
>> > clear_thread_flag(TIF_FOREIGN_FPSTATE);
>> >
>> > at the start unconditionally, before the _inc_return().
>> >
>> > The task state may then get saved in the middle of being saved, but
>> > as you say it shouldn't have changed in the meantime.
>>
>> It /will/ have changed in the meantime: when the interrupted context
>> is resumed, it will happily proceed with saving the state where it
>> left off, but now the register file contains whatever was left after
>> the interrupt handler is done with the NEON.
>
> Hmmm, true.  The NEON regs will have been restored by kernel_neon_end()
> in the inner context, but the extra SVE bits won't have been.
>

Even worse: both the interrupter and the interruptee think they are
preserving the userland context, so once the interrupter is done, it
will not restore the context as it found it. The interruptee will then
proceed and write whatever is left in those registers into the saved
state.

>>
>> > The nested
>> > save code may then do a partial save of the same state on top of that
>> > which could get restored at the inner kernel_neon_end() call.
>> >
>>
>> I'm afraid the only way to deal with this correctly is to treat the
>> whole sequence as a critical section, which means execute it with
>> interrupts disabled.
>
> Or we make the KERNEL_MODE_NEON code SVE-aware, which is where I started
> off.  In that case, we do SVE (partial) save/restore whenever
> kernel_mode_neon() is called with live SVE state.  The change here is
> that would we consider that there is always live SVE state until the
> fpsimd_save_state() actually finishes at the outer level.  We may want
> to delay setting of TIF_FOREIGN_FPSTATE for that purpose.
>
> This means you do take an additional latency hit if you want to use NEON
> in an interrupting context and there happens to be live SVE state.  It's
> a consequence of the architecture though -- I don't think there's any
> way to get around it.  We can still scale the cost by implementing
> sve_save_partial_state() or something equivalent.
>
> You original inc()+save() ... restore()+dec() seems sound enough if
> viewed this way.  Unless I'm missing something?
>

I think having a small critical section is not so bad. Let me send out
a v5 so we can discuss ...

^ permalink raw reply

* [PATCH] watchdog: bcm2835_wdt: set WDOG_HW_RUNNING bit when appropriate
From: Eric Anholt @ 2016-12-12 17:46 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481536123-9279-1-git-send-email-rasmus.villemoes@prevas.dk>

Rasmus Villemoes <rasmus.villemoes@prevas.dk> writes:

> A bootloader may start the watchdog device before handing control to
> the kernel - in that case, we should tell the kernel about it so the
> watchdog framework can keep it alive until userspace opens
> /dev/watchdog0.

I don't believe our current bootloaders (the closed firmware or u-boot)
set up the watchdog, but this seems reasonable since they might want to
later.

Acked-by: Eric Anholt <eric@anholt.net>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 832 bytes
Desc: not available
URL: <http://lists.infradead.org/pipermail/linux-arm-kernel/attachments/20161212/673b1918/attachment.sig>

^ permalink raw reply

* [PATCH] crypto: arm64/aes: reimplement bit-sliced ARM/NEON implementation for arm64
From: Ard Biesheuvel @ 2016-12-12 17:45 UTC (permalink / raw)
  To: linux-arm-kernel

This is a reimplementation of the NEON version of the bit-sliced AES
algorithm. This code is heavily based on Andy Polyakov's OpenSSL version
for ARM, which is also available in the kernel. This is an alternative for
the existing NEON implementation for arm64 authored by me, which suffers
from poor performance due to its reliance on the pathologically slow four
register variant of the tbl/tbx NEON instruction.

This version is about ~30% (*) faster than the generic C code, but only in
cases where the input can be 8x interleaved (this is a fundamental property
of bit slicing). For this reason, only the chaining modes ECB, XTS and CTR
are implemented. (The significance of ECB is that it could potentially be
used by other chaining modes)

* Measured on Cortex-A57. Note that this is still an order of magnitude
  slower than the implementations that use the dedicated AES instructions
  introduced in ARMv8, but those are part of an optional extension, and so
  it is good to have a fallback.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/Kconfig           |   6 +
 arch/arm64/crypto/Makefile          |   3 +
 arch/arm64/crypto/aes-neonbs-core.S | 905 ++++++++++++++++++++++++++++++++++++
 arch/arm64/crypto/aes-neonbs-glue.c | 300 ++++++++++++
 4 files changed, 1214 insertions(+)
 create mode 100644 arch/arm64/crypto/aes-neonbs-core.S
 create mode 100644 arch/arm64/crypto/aes-neonbs-glue.c

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 450a85df041a..cd0e7a6146b7 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -72,4 +72,10 @@ config CRYPTO_CRC32_ARM64
 	depends on ARM64
 	select CRYPTO_HASH
 
+config CRYPTO_AES_NEON_BS
+	tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_AES
+
 endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index aa8888d7b744..11d20714ec48 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -41,6 +41,9 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
 sha512-arm64-y := sha512-glue.o sha512-core.o
 
+obj-$(CONFIG_CRYPTO_AES_NEON_BS) += aes-neon-bs.o
+aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
+
 AFLAGS_aes-ce.o		:= -DINTERLEAVE=4
 AFLAGS_aes-neon.o	:= -DINTERLEAVE=4
 
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S
new file mode 100644
index 000000000000..d027c276cc75
--- /dev/null
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -0,0 +1,905 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * The algorithm implemented here is described in detail by the paper
+ * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
+ * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
+ *
+ * This implementation is based primarily on the OpenSSL implementation
+ * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+
+	rounds		.req	x11
+	bskey		.req	x12
+
+	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+	eor		\b2, \b2, \b1
+	eor		\b5, \b5, \b6
+	eor		\b3, \b3, \b0
+	eor		\b6, \b6, \b2
+	eor		\b5, \b5, \b0
+	eor		\b6, \b6, \b3
+	eor		\b3, \b3, \b7
+	eor		\b7, \b7, \b5
+	eor		\b3, \b3, \b4
+	eor		\b4, \b4, \b5
+	eor		\b2, \b2, \b7
+	eor		\b3, \b3, \b1
+	eor		\b1, \b1, \b5
+	.endm
+
+	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+	eor		\b0, \b0, \b6
+	eor		\b1, \b1, \b4
+	eor		\b4, \b4, \b6
+	eor		\b2, \b2, \b0
+	eor		\b6, \b6, \b1
+	eor		\b1, \b1, \b5
+	eor		\b5, \b5, \b3
+	eor		\b3, \b3, \b7
+	eor		\b7, \b7, \b5
+	eor		\b2, \b2, \b5
+	eor		\b4, \b4, \b7
+	.endm
+
+	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
+	eor		\b1, \b1, \b7
+	eor		\b4, \b4, \b7
+	eor		\b7, \b7, \b5
+	eor		\b1, \b1, \b3
+	eor		\b2, \b2, \b5
+	eor		\b3, \b3, \b7
+	eor		\b6, \b6, \b1
+	eor		\b2, \b2, \b0
+	eor		\b5, \b5, \b3
+	eor		\b4, \b4, \b6
+	eor		\b0, \b0, \b6
+	eor		\b1, \b1, \b4
+	.endm
+
+	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
+	eor		\b1, \b1, \b5
+	eor		\b2, \b2, \b7
+	eor		\b3, \b3, \b1
+	eor		\b4, \b4, \b5
+	eor		\b7, \b7, \b5
+	eor		\b3, \b3, \b4
+	eor 		\b5, \b5, \b0
+	eor		\b3, \b3, \b7
+	eor		\b6, \b6, \b2
+	eor		\b2, \b2, \b1
+	eor		\b6, \b6, \b3
+	eor		\b3, \b3, \b0
+	eor		\b5, \b5, \b6
+	.endm
+
+	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
+	eor 		\t0, \y0, \y1
+	and		\t0, \t0, \x0
+	eor		\x0, \x0, \x1
+	and		\t1, \x1, \y0
+	and		\x0, \x0, \y1
+	eor		\x1, \t1, \t0
+	eor		\x0, \x0, \t1
+	.endm
+
+	.macro		mul_gf4_n, x0, x1, y0, y1, t0
+	eor		\t0, \y0, \y1
+	and		\t0, \t0, \x0
+	eor		\x0, \x0, \x1
+	and		\x1, \x1, \y0
+	and		\x0, \x0, \y1
+	eor		\x1, \x1, \x0
+	eor		\x0, \x0, \t0
+	.endm
+
+	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
+	eor		\t0, \y0, \y1
+	eor 		\t1, \y2, \y3
+	and		\t0, \t0, \x0
+	and		\t1, \t1, \x2
+	eor		\x0, \x0, \x1
+	eor		\x2, \x2, \x3
+	and		\x1, \x1, \y0
+	and		\x3, \x3, \y2
+	and		\x0, \x0, \y1
+	and		\x2, \x2, \y3
+	eor		\x1, \x1, \x0
+	eor		\x2, \x2, \x3
+	eor		\x0, \x0, \t0
+	eor		\x3, \x3, \t1
+	.endm
+
+	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
+				    y0, y1, y2, y3, t0, t1, t2, t3
+	eor		\t0, \x0, \x2
+	eor		\t1, \x1, \x3
+	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
+	eor		\y0, \y0, \y2
+	eor		\y1, \y1, \y3
+	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
+	eor		\x0, \x0, \t0
+	eor		\x2, \x2, \t0
+	eor		\x1, \x1, \t1
+	eor		\x3, \x3, \t1
+	eor		\t0, \x4, \x6
+	eor		\t1, \x5, \x7
+	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
+	eor		\y0, \y0, \y2
+	eor		\y1, \y1, \y3
+	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
+	eor		\x4, \x4, \t0
+	eor		\x6, \x6, \t0
+	eor		\x5, \x5, \t1
+	eor		\x7, \x7, \t1
+	.endm
+
+	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
+				   t0, t1, t2, t3, s0, s1, s2, s3
+	eor		\t3, \x4, \x6
+	eor		\t2, \x5, \x7
+	eor		\t1, \x1, \x3
+	eor		\s1, \x7, \x6
+	mov		\t0, \t2
+	eor		\s0, \x0, \x2
+	orr		\t2, \t2, \t1
+	eor		\s3, \t3, \t0
+	and		\s2, \t3, \s0
+	orr		\t3, \t3, \s0
+	eor		\s0, \s0, \t1
+	and		\t0, \t0, \t1
+	eor		\t1, \x3, \x2
+	and		\s3, \s3, \s0
+	and		\s1, \s1, \t1
+	eor		\t1, \x4, \x5
+	eor		\s0, \x1, \x0
+	eor		\t3, \t3, \s1
+	eor		\t2, \t2, \s1
+	and		\s1, \t1, \s0
+	orr		\t1, \t1, \s0
+	eor		\t3, \t3, \s3
+	eor		\t0, \t0, \s1
+	eor		\t2, \t2, \s2
+	eor		\t1, \t1, \s3
+	eor		\t0, \t0, \s2
+	and		\s0, \x7, \x3
+	eor		\t1, \t1, \s2
+	and		\s1, \x6, \x2
+	and		\s2, \x5, \x1
+	orr		\s3, \x4, \x0
+	eor		\t3, \t3, \s0
+	eor		\t1, \t1, \s2
+	eor		\t0, \t0, \s3
+	eor		\t2, \t2, \s1
+	and		\s2, \t3, \t1
+	mov		\s0, \t0
+	eor		\s1, \t2, \s2
+	eor		\s3, \t0, \s2
+	eor		\s2, \t0, \s2
+	bsl		\s1, \t1, \t0
+	bsl		\s3, \t3, \t2
+	eor		\t3, \t3, \t2
+	bsl		\s0, \s1, \s2
+	bsl		\t0, \s2, \s1
+	and		\s2, \s0, \s3
+	eor		\t1, \t1, \t0
+	eor		\s2, \s2, \t3
+	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+	.endm
+
+	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+			      t0, t1, t2, t3, s0, s1, s2, s3
+	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
+			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
+	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
+			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
+			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
+			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
+	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
+			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
+	.endm
+
+	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+				  t0, t1, t2, t3, s0, s1, s2, s3
+	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
+			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
+	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
+			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
+			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
+			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
+	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
+			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
+	.endm
+
+	.macro		enc_next_rk
+	ldp		q16, q17, [bskey], #32
+	ldp		q18, q19, [bskey], #32
+	ldp		q20, q21, [bskey], #32
+	ldp		q22, q23, [bskey], #32
+	.endm
+
+	.macro		dec_next_rk
+	ldp		q16, q17, [bskey, #-128]!
+	ldp		q18, q19, [bskey, #32]
+	ldp		q20, q21, [bskey, #64]
+	ldp		q22, q23, [bskey, #96]
+	.endm
+
+	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
+	eor		\x0\().16b, \x0\().16b, v16.16b
+	eor		\x1\().16b, \x1\().16b, v17.16b
+	eor		\x2\().16b, \x2\().16b, v18.16b
+	eor		\x3\().16b, \x3\().16b, v19.16b
+	eor		\x4\().16b, \x4\().16b, v20.16b
+	eor		\x5\().16b, \x5\().16b, v21.16b
+	eor		\x6\().16b, \x6\().16b, v22.16b
+	eor		\x7\().16b, \x7\().16b, v23.16b
+	.endm
+
+	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
+	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
+	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
+	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
+	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
+	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
+	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
+	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
+	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
+	.endm
+
+	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+				  t0, t1, t2, t3, t4, t5, t6, t7, inv
+	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
+	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
+	eor		\x0\().16b, \x0\().16b, \t0\().16b
+	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
+	eor		\x1\().16b, \x1\().16b, \t1\().16b
+	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
+	eor		\x2\().16b, \x2\().16b, \t2\().16b
+	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
+	eor		\x3\().16b, \x3\().16b, \t3\().16b
+	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
+	eor		\x4\().16b, \x4\().16b, \t4\().16b
+	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
+	eor		\x5\().16b, \x5\().16b, \t5\().16b
+	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
+	eor		\x6\().16b, \x6\().16b, \t6\().16b
+	eor		\t1\().16b, \t1\().16b, \x0\().16b
+	eor		\x7\().16b, \x7\().16b, \t7\().16b
+	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
+	eor		\t2\().16b, \t2\().16b, \x1\().16b
+	eor		\t0\().16b, \t0\().16b, \x7\().16b
+	eor		\t1\().16b, \t1\().16b, \x7\().16b
+	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
+	eor		\t5\().16b, \t5\().16b, \x4\().16b
+	eor		\x0\().16b, \x0\().16b, \t0\().16b
+	eor		\t6\().16b, \t6\().16b, \x5\().16b
+	eor		\x1\().16b, \x1\().16b, \t1\().16b
+	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
+	eor		\t4\().16b, \t4\().16b, \x3\().16b
+	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
+	eor		\t7\().16b, \t7\().16b, \x6\().16b
+	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
+	eor		\t3\().16b, \t3\().16b, \x2\().16b
+	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
+	eor		\t4\().16b, \t4\().16b, \x7\().16b
+	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
+	eor		\t3\().16b, \t3\().16b, \x7\().16b
+	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
+	eor		\x7\().16b, \t1\().16b, \t5\().16b
+	.ifb		\inv
+	eor		\x2\().16b, \t0\().16b, \t4\().16b
+	eor		\x4\().16b, \x4\().16b, \t3\().16b
+	eor		\x5\().16b, \x5\().16b, \t7\().16b
+	eor		\x3\().16b, \x3\().16b, \t6\().16b
+	eor		\x6\().16b, \x6\().16b, \t2\().16b
+	.else
+	eor		\t3\().16b, \t3\().16b, \x4\().16b
+	eor		\x5\().16b, \x5\().16b, \t7\().16b
+	eor		\x2\().16b, \x3\().16b, \t6\().16b
+	eor		\x3\().16b, \t0\().16b, \t4\().16b
+	eor		\x4\().16b, \x6\().16b, \t2\().16b
+	mov		\x6\().16b, \t3\().16b
+	.endif
+	.endm
+
+	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+				      t0, t1, t2, t3, t4, t5, t6, t7
+	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
+	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
+	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
+	eor		\t0\().16b, \t0\().16b, \x0\().16b
+	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
+	eor		\t6\().16b, \t6\().16b, \x6\().16b
+	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
+	eor		\t7\().16b, \t7\().16b, \x7\().16b
+	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
+	eor		\t1\().16b, \t1\().16b, \x1\().16b
+	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
+	eor		\t2\().16b, \t2\().16b, \x2\().16b
+	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
+	eor		\t3\().16b, \t3\().16b, \x3\().16b
+	eor		\t4\().16b, \t4\().16b, \x4\().16b
+	eor		\t5\().16b, \t5\().16b, \x5\().16b
+	eor		\x0\().16b, \x0\().16b, \t6\().16b
+	eor		\x1\().16b, \x1\().16b, \t6\().16b
+	eor		\x2\().16b, \x2\().16b, \t0\().16b
+	eor		\x4\().16b, \x4\().16b, \t2\().16b
+	eor		\x3\().16b, \x3\().16b, \t1\().16b
+	eor		\x1\().16b, \x1\().16b, \t7\().16b
+	eor		\x2\().16b, \x2\().16b, \t7\().16b
+	eor		\x4\().16b, \x4\().16b, \t6\().16b
+	eor		\x5\().16b, \x5\().16b, \t3\().16b
+	eor		\x3\().16b, \x3\().16b, \t6\().16b
+	eor		\x6\().16b, \x6\().16b, \t4\().16b
+	eor		\x4\().16b, \x4\().16b, \t7\().16b
+	eor		\x5\().16b, \x5\().16b, \t7\().16b
+	eor		\x7\().16b, \x7\().16b, \t5\().16b
+	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
+	.endm
+
+	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
+	ushr		\t0\().2d, \b0\().2d, #\n
+	ushr		\t1\().2d, \b1\().2d, #\n
+	eor		\t0\().16b, \t0\().16b, \a0\().16b
+	eor		\t1\().16b, \t1\().16b, \a1\().16b
+	and		\t0\().16b, \t0\().16b, \mask\().16b
+	and		\t1\().16b, \t1\().16b, \mask\().16b
+	eor		\a0\().16b, \a0\().16b, \t0\().16b
+	shl		\t0\().2d, \t0\().2d, #\n
+	eor		\a1\().16b, \a1\().16b, \t1\().16b
+	shl		\t1\().2d, \t1\().2d, #\n
+	eor		\b0\().16b, \b0\().16b, \t0\().16b
+	eor		\b1\().16b, \b1\().16b, \t1\().16b
+	.endm
+
+	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
+	movi		\t0\().16b, #0x55
+	movi		\t1\().16b, #0x33
+	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
+	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
+	movi		\t0\().16b, #0x0f
+	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
+	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
+	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
+	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
+	.endm
+
+
+	.align		6
+M0:	.octa		0x0004080c0105090d02060a0e03070b0f
+
+M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
+SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
+SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f
+
+M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
+ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
+ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f
+
+	/*
+	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
+	 */
+ENTRY(aesbs_convert_key)
+	ld1		{v7.4s}, [x1], #16		// load round 0 key
+	ld1		{v17.4s}, [x1], #16		// load round 1 key
+
+	movi		v8.16b,  #0x01			// bit masks
+	movi		v9.16b,  #0x02
+	movi		v10.16b, #0x04
+	movi		v11.16b, #0x08
+	movi		v12.16b, #0x10
+	movi		v13.16b, #0x20
+	movi		v14.16b, #0x40
+	movi		v15.16b, #0x80
+	ldr		q16, M0
+
+	sub		x2, x2, #1
+	str		q7, [x0], #16		// save round 0 key
+
+.Lkey_loop:
+	tbl		v7.16b ,{v17.16b}, v16.16b
+	ld1		{v17.4s}, [x1], #16		// load next round key
+
+	cmtst		v0.16b, v7.16b, v8.16b
+	cmtst		v1.16b, v7.16b, v9.16b
+	cmtst		v2.16b, v7.16b, v10.16b
+	cmtst		v3.16b, v7.16b, v11.16b
+	cmtst		v4.16b, v7.16b, v12.16b
+	cmtst		v5.16b, v7.16b, v13.16b
+	cmtst		v6.16b, v7.16b, v14.16b
+	cmtst		v7.16b, v7.16b, v15.16b
+	not		v0.16b, v0.16b
+	not		v1.16b, v1.16b
+	not		v5.16b, v5.16b
+	not		v6.16b, v6.16b
+
+	subs		x2, x2, #1
+	stp		q2, q3, [x0, #32]
+	stp		q4, q5, [x0, #64]
+	stp		q6, q7, [x0, #96]
+	stp		q0, q1, [x0], #128
+	b.ne		.Lkey_loop
+
+	movi		v7.16b, #0x63			// compose .L63
+	eor		v17.16b, v17.16b, v7.16b
+	str		q17, [x0]
+	ret
+ENDPROC(aesbs_convert_key)
+
+	.align		4
+aesbs_encrypt8:
+	ldr		q9, [bskey], #16		// round 0 key
+	ldr		q8, M0SR
+	ldr		q24, SR
+
+	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
+	eor		v11.16b, v1.16b, v9.16b
+	tbl		v0.16b, {v10.16b}, v8.16b
+	eor		v12.16b, v2.16b, v9.16b
+	tbl		v1.16b, {v11.16b}, v8.16b
+	eor		v13.16b, v3.16b, v9.16b
+	tbl		v2.16b, {v12.16b}, v8.16b
+	eor		v14.16b, v4.16b, v9.16b
+	tbl		v3.16b, {v13.16b}, v8.16b
+	eor		v15.16b, v5.16b, v9.16b
+	tbl		v4.16b, {v14.16b}, v8.16b
+	eor		v10.16b, v6.16b, v9.16b
+	tbl		v5.16b, {v15.16b}, v8.16b
+	eor		v11.16b, v7.16b, v9.16b
+	tbl		v6.16b, {v10.16b}, v8.16b
+	tbl		v7.16b, {v11.16b}, v8.16b
+
+	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+
+	sub		rounds, rounds, #1
+	b		.Lenc_sbox
+
+.Lenc_loop:
+	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
+.Lenc_sbox:
+	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
+								v13, v14, v15
+	subs		rounds, rounds, #1
+	b.cc		.Lenc_done
+
+	enc_next_rk
+
+	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
+								v13, v14, v15
+
+	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7
+
+	b.ne		.Lenc_loop
+	ldr		q24, SRM0
+	b		.Lenc_loop
+
+.Lenc_done:
+	ldr		q12, [bskey]			// last round key
+
+	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
+
+	eor		v0.16b, v0.16b, v12.16b
+	eor		v1.16b, v1.16b, v12.16b
+	eor		v4.16b, v4.16b, v12.16b
+	eor		v6.16b, v6.16b, v12.16b
+	eor		v3.16b, v3.16b, v12.16b
+	eor		v7.16b, v7.16b, v12.16b
+	eor		v2.16b, v2.16b, v12.16b
+	eor		v5.16b, v5.16b, v12.16b
+	ret
+ENDPROC(aesbs_encrypt8)
+
+	.align		4
+aesbs_decrypt8:
+	lsl		x9, rounds, #7
+	add		bskey, bskey, x9
+
+	ldr		q9, [bskey, #-112]!		// round 0 key
+	ldr		q8, M0ISR
+	ldr		q24, ISR
+
+	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
+	eor		v11.16b, v1.16b, v9.16b
+	tbl		v0.16b, {v10.16b}, v8.16b
+	eor		v12.16b, v2.16b, v9.16b
+	tbl		v1.16b, {v11.16b}, v8.16b
+	eor		v13.16b, v3.16b, v9.16b
+	tbl		v2.16b, {v12.16b}, v8.16b
+	eor		v14.16b, v4.16b, v9.16b
+	tbl		v3.16b, {v13.16b}, v8.16b
+	eor		v15.16b, v5.16b, v9.16b
+	tbl		v4.16b, {v14.16b}, v8.16b
+	eor		v10.16b, v6.16b, v9.16b
+	tbl		v5.16b, {v15.16b}, v8.16b
+	eor		v11.16b, v7.16b, v9.16b
+	tbl		v6.16b, {v10.16b}, v8.16b
+	tbl		v7.16b, {v11.16b}, v8.16b
+
+	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+
+	sub		rounds, rounds, #1
+	b		.Ldec_sbox
+
+.Ldec_loop:
+	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
+.Ldec_sbox:
+	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
+								v13, v14, v15
+	subs		rounds, rounds, #1
+	b.cc		.Ldec_done
+
+	dec_next_rk
+
+	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5
+
+	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
+								v13, v14, v15
+
+	b.ne		.Ldec_loop
+	ldr		q24, ISRM0
+	b		.Ldec_loop
+.Ldec_done:
+	ldr		q12, [bskey, #-16]		// last round key
+
+	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
+
+	eor		v0.16b, v0.16b, v12.16b
+	eor		v1.16b, v1.16b, v12.16b
+	eor		v6.16b, v6.16b, v12.16b
+	eor		v4.16b, v4.16b, v12.16b
+	eor		v2.16b, v2.16b, v12.16b
+	eor		v7.16b, v7.16b, v12.16b
+	eor		v3.16b, v3.16b, v12.16b
+	eor		v5.16b, v5.16b, v12.16b
+	ret
+ENDPROC(aesbs_decrypt8)
+
+	/*
+	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks)
+	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks)
+	 */
+	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+99:	mov		x5, #1
+	lsl		x5, x5, x4
+	subs		w4, w4, #8
+	csel		x4, x4, xzr, pl
+	csel		x5, x5, xzr, mi
+
+	ld1		{v0.16b}, [x1], #16
+	tbnz		x5, #1, 0f
+	ld1		{v1.16b}, [x1], #16
+	tbnz		x5, #2, 0f
+	ld1		{v2.16b}, [x1], #16
+	tbnz		x5, #3, 0f
+	ld1		{v3.16b}, [x1], #16
+	tbnz		x5, #4, 0f
+	ld1		{v4.16b}, [x1], #16
+	tbnz		x5, #5, 0f
+	ld1		{v5.16b}, [x1], #16
+	tbnz		x5, #6, 0f
+	ld1		{v6.16b}, [x1], #16
+	tbnz		x5, #7, 0f
+	ld1		{v7.16b}, [x1], #16
+
+0:	mov		bskey, x2
+	mov		rounds, x3
+	bl		\do8
+
+	st1		{\o0\().16b}, [x0], #16
+	tbnz		x5, #1, 1f
+	st1		{\o1\().16b}, [x0], #16
+	tbnz		x5, #2, 1f
+	st1		{\o2\().16b}, [x0], #16
+	tbnz		x5, #3, 1f
+	st1		{\o3\().16b}, [x0], #16
+	tbnz		x5, #4, 1f
+	st1		{\o4\().16b}, [x0], #16
+	tbnz		x5, #5, 1f
+	st1		{\o5\().16b}, [x0], #16
+	tbnz		x5, #6, 1f
+	st1		{\o6\().16b}, [x0], #16
+	tbnz		x5, #7, 1f
+	st1		{\o7\().16b}, [x0], #16
+
+	cbnz		x4, 99b
+
+1:	ldp		x29, x30, [sp], #16
+	ret
+	.endm
+
+	.align		4
+ENTRY(aesbs_ecb_encrypt)
+	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
+ENDPROC(aesbs_ecb_encrypt)
+
+	.align		4
+ENTRY(aesbs_ecb_decrypt)
+	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
+ENDPROC(aesbs_ecb_decrypt)
+
+	.macro		next_tweak, out, in, const, tmp
+	sshr		\tmp\().2d,  \in\().2d,   #63
+	and		\tmp\().16b, \tmp\().16b, \const\().16b
+	add		\out\().2d,  \in\().2d,   \in\().2d
+	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
+	eor		\out\().16b, \out\().16b, \tmp\().16b
+	.endm
+
+	.align		4
+.Lxts_mul_x:
+CPU_LE(	.quad		1, 0x87		)
+CPU_BE(	.quad		0x87, 1		)
+
+	/*
+	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks, u8 iv[])
+	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks, u8 iv[])
+	 */
+__xts_crypt8:
+	mov		x6, #1
+	lsl		x6, x6, x4
+	subs		w4, w4, #8
+	csel		x4, x4, xzr, pl
+	csel		x6, x6, xzr, mi
+
+	ld1		{v0.16b}, [x1], #16
+	next_tweak	v26, v25, v30, v31
+	eor		v0.16b, v0.16b, v25.16b
+	tbnz		x6, #1, 0f
+
+	ld1		{v1.16b}, [x1], #16
+	next_tweak	v27, v26, v30, v31
+	eor		v1.16b, v1.16b, v26.16b
+	tbnz		x6, #2, 0f
+
+	ld1		{v2.16b}, [x1], #16
+	next_tweak	v28, v27, v30, v31
+	eor		v2.16b, v2.16b, v27.16b
+	tbnz		x6, #3, 0f
+
+	ld1		{v3.16b}, [x1], #16
+	next_tweak	v29, v28, v30, v31
+	eor		v3.16b, v3.16b, v28.16b
+	tbnz		x6, #4, 0f
+
+	ld1		{v4.16b}, [x1], #16
+	str		q29, [sp, #16]
+	eor		v4.16b, v4.16b, v29.16b
+	next_tweak	v29, v29, v30, v31
+	tbnz		x6, #5, 0f
+
+	ld1		{v5.16b}, [x1], #16
+	str		q29, [sp, #32]
+	eor		v5.16b, v5.16b, v29.16b
+	next_tweak	v29, v29, v30, v31
+	tbnz		x6, #6, 0f
+
+	ld1		{v6.16b}, [x1], #16
+	str		q29, [sp, #48]
+	eor		v6.16b, v6.16b, v29.16b
+	next_tweak	v29, v29, v30, v31
+	tbnz		x6, #7, 0f
+
+	ld1		{v7.16b}, [x1], #16
+	str		q29, [sp, #64]
+	eor		v7.16b, v7.16b, v29.16b
+	next_tweak	v29, v29, v30, v31
+
+0:	mov		bskey, x2
+	mov		rounds, x3
+	br		x7
+ENDPROC(__xts_crypt8)
+
+	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+	stp		x29, x30, [sp, #-80]!
+	mov		x29, sp
+
+	ldr		q30, .Lxts_mul_x
+	ld1		{v25.16b}, [x5]
+
+99:	adr		x7, \do8
+	bl		__xts_crypt8
+
+	ldp		q16, q17, [sp, #16]
+	ldp		q18, q19, [sp, #48]
+
+	eor		\o0\().16b, \o0\().16b, v25.16b
+	eor		\o1\().16b, \o1\().16b, v26.16b
+	eor		\o2\().16b, \o2\().16b, v27.16b
+	eor		\o3\().16b, \o3\().16b, v28.16b
+
+	st1		{\o0\().16b}, [x0], #16
+	mov		v25.16b, v26.16b
+	tbnz		x6, #1, 1f
+	st1		{\o1\().16b}, [x0], #16
+	mov		v25.16b, v27.16b
+	tbnz		x6, #2, 1f
+	st1		{\o2\().16b}, [x0], #16
+	mov		v25.16b, v28.16b
+	tbnz		x6, #3, 1f
+	st1		{\o3\().16b}, [x0], #16
+	mov		v25.16b, v29.16b
+	tbnz		x6, #4, 1f
+
+	eor		\o4\().16b, \o4\().16b, v16.16b
+	eor		\o5\().16b, \o5\().16b, v17.16b
+	eor		\o6\().16b, \o6\().16b, v18.16b
+	eor		\o7\().16b, \o7\().16b, v19.16b
+
+	st1		{\o4\().16b}, [x0], #16
+	tbnz		x6, #5, 1f
+	st1		{\o5\().16b}, [x0], #16
+	tbnz		x6, #6, 1f
+	st1		{\o6\().16b}, [x0], #16
+	tbnz		x6, #7, 1f
+	st1		{\o7\().16b}, [x0], #16
+
+	cbnz		x4, 99b
+
+1:	st1		{v25.16b}, [x5]
+	ldp		x29, x30, [sp], #80
+	ret
+	.endm
+
+ENTRY(aesbs_xts_encrypt)
+	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
+ENDPROC(aesbs_xts_encrypt)
+
+ENTRY(aesbs_xts_decrypt)
+	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
+ENDPROC(aesbs_xts_decrypt)
+
+	.macro		next_ctr, v
+	mov		\v\().d[1], x8
+	mov		\v\().d[0], x7
+	adds		x8, x8, #1
+	adc		x7, x7, xzr
+	rev64		\v\().16b, \v\().16b
+	.endm
+
+	/*
+	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+	 *		     int rounds, int blocks, u8 iv[], bool final)
+	 */
+ENTRY(aesbs_ctr_encrypt)
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	add		x4, x4, x6		// do one extra block if final
+
+	ldp		x7, x8, [x5]
+	ld1		{v0.16b}, [x5]
+CPU_LE(	rev		x7, x7		)
+CPU_LE(	rev		x8, x8		)
+	adds		x8, x8, #1
+	adc		x7, x7, xzr
+
+99:	mov		x9, #1
+	lsl		x9, x9, x4
+	subs		w4, w4, #8
+	csel		x4, x4, xzr, pl
+	csel		x9, x9, xzr, le
+
+	tbnz		x9, #1, 0f
+
+	next_ctr	v1
+	tbnz		x9, #2, 0f
+
+	next_ctr	v2
+	tbnz		x9, #3, 0f
+
+	next_ctr	v3
+	tbnz		x9, #4, 0f
+
+	next_ctr	v4
+	tbnz		x9, #5, 0f
+
+	next_ctr	v5
+	tbnz		x9, #6, 0f
+
+	next_ctr	v6
+	tbnz		x9, #7, 0f
+
+	next_ctr	v7
+
+0:	mov		bskey, x2
+	mov		rounds, x3
+	bl		aesbs_encrypt8
+
+	lsr		x9, x9, x6		// disregard the final block
+	tbnz		x9, #0, 0f
+
+	ld1		{v8.16b}, [x1], #16
+	eor		v0.16b, v0.16b, v8.16b
+	st1		{v0.16b}, [x0], #16
+	tbnz		x9, #1, 1f
+
+	ld1		{v9.16b}, [x1], #16
+	eor		v1.16b, v1.16b, v9.16b
+	st1		{v1.16b}, [x0], #16
+	tbnz		x9, #2, 2f
+
+	ld1		{v10.16b}, [x1], #16
+	eor		v4.16b, v4.16b, v10.16b
+	st1		{v4.16b}, [x0], #16
+	tbnz		x9, #3, 3f
+
+	ld1		{v11.16b}, [x1], #16
+	eor		v6.16b, v6.16b, v11.16b
+	st1		{v6.16b}, [x0], #16
+	tbnz		x9, #4, 4f
+
+	ld1		{v12.16b}, [x1], #16
+	eor		v3.16b, v3.16b, v12.16b
+	st1		{v3.16b}, [x0], #16
+	tbnz		x9, #5, 5f
+
+	ld1		{v13.16b}, [x1], #16
+	eor		v7.16b, v7.16b, v13.16b
+	st1		{v7.16b}, [x0], #16
+	tbnz		x9, #6, 6f
+
+	ld1		{v14.16b}, [x1], #16
+	eor		v2.16b, v2.16b, v14.16b
+	st1		{v2.16b}, [x0], #16
+	tbnz		x9, #7, 7f
+
+	ld1		{v15.16b}, [x1], #16
+	eor		v5.16b, v5.16b, v15.16b
+	st1		{v5.16b}, [x0], #16
+
+	next_ctr	v0
+	cbnz		x4, 99b
+
+0:	st1		{v0.16b}, [x5]
+8:	ldp		x29, x30, [sp], #16
+	ret
+
+	/*
+	 * If we are handling the tail of the input (x6 == 1), return the
+	 * final keystream block back to the caller via the IV buffer.
+	 */
+1:	cbz		x6, 8b
+	st1		{v1.16b}, [x5]
+	b		8b
+2:	cbz		x6, 8b
+	st1		{v4.16b}, [x5]
+	b		8b
+3:	cbz		x6, 8b
+	st1		{v6.16b}, [x5]
+	b		8b
+4:	cbz		x6, 8b
+	st1		{v3.16b}, [x5]
+	b		8b
+5:	cbz		x6, 8b
+	st1		{v7.16b}, [x5]
+	b		8b
+6:	cbz		x6, 8b
+	st1		{v2.16b}, [x5]
+	b		8b
+7:	cbz		x6, 8b
+	st1		{v5.16b}, [x5]
+	b		8b
+ENDPROC(aesbs_ctr_encrypt)
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
new file mode 100644
index 000000000000..57982172563c
--- /dev/null
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -0,0 +1,300 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/xts.h>
+#include <linux/module.h>
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks);
+asmlinkage void aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks);
+
+asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+
+asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[], bool final);
+
+asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
+
+struct aesbs_key {
+	u8			key[13 * (8 * AES_BLOCK_SIZE) + 32];
+};
+
+struct aesbs_ctx {
+	struct aesbs_key	bskey;
+	int			rounds;
+};
+
+struct aesbs_xts_ctx {
+	struct aesbs_key	bskey;
+	struct crypto_cipher	*tweak_tfm;
+	int			rounds;
+};
+
+static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			unsigned int key_len)
+{
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = crypto_aes_expand_key(&rk, in_key, key_len);
+	if (err)
+		return err;
+
+	ctx->rounds = 6 + key_len / 4;
+
+	kernel_neon_begin();
+	aesbs_convert_key(ctx->bskey.key, rk.key_enc, ctx->rounds);
+	kernel_neon_end();
+
+	return 0;
+}
+
+static int xts_init(struct crypto_skcipher *tfm)
+{
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	ctx->tweak_tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(ctx->tweak_tfm))
+		return PTR_ERR(ctx->tweak_tfm);
+
+	return 0;
+}
+
+static void xts_exit(struct crypto_skcipher *tfm)
+{
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	crypto_free_cipher(ctx->tweak_tfm);
+}
+
+static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = xts_verify_key(tfm, in_key, key_len);
+	if (err)
+		return err;
+
+	err = crypto_cipher_setkey(ctx->tweak_tfm, in_key + key_len / 2,
+				   key_len / 2);
+	if (err)
+		return err;
+
+	err = crypto_aes_expand_key(&rk, in_key, key_len / 2);
+	if (err)
+		return err;
+
+	ctx->rounds = 6 + key_len / 8;
+
+	kernel_neon_begin();
+	aesbs_convert_key(ctx->bskey.key, rk.key_enc, ctx->rounds);
+	kernel_neon_end();
+
+	return 0;
+}
+
+static int __ecb_crypt(struct skcipher_request *req,
+		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks))
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.chunksize / AES_BLOCK_SIZE);
+
+		fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->bskey.key,
+		   ctx->rounds, blocks);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+	return __ecb_crypt(req, aesbs_ecb_encrypt);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+	return __ecb_crypt(req, aesbs_ecb_decrypt);
+}
+
+static int __xts_crypt(struct skcipher_request *req,
+		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]))
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	crypto_cipher_encrypt_one(ctx->tweak_tfm, walk.iv, walk.iv);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.chunksize / AES_BLOCK_SIZE);
+
+		fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->bskey.key,
+		   ctx->rounds, blocks, walk.iv);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+	return __xts_crypt(req, aesbs_xts_encrypt);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+	return __xts_crypt(req, aesbs_xts_decrypt);
+}
+
+static int ctr_encrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes > 0) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+		bool final = (walk.total % AES_BLOCK_SIZE) != 0;
+
+		if (walk.nbytes < walk.total) {
+			blocks = round_down(blocks,
+					    walk.chunksize / AES_BLOCK_SIZE);
+			final = false;
+		}
+
+		aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				  ctx->bskey.key, ctx->rounds, blocks, walk.iv,
+				  final);
+
+		if (final) {
+			u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+			u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+
+			if (dst != src)
+				memcpy(dst, src, walk.total % AES_BLOCK_SIZE);
+			crypto_xor(dst, walk.iv, walk.total % AES_BLOCK_SIZE);
+
+			err = skcipher_walk_done(&walk, 0);
+			break;
+		}
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static struct skcipher_alg aes_algs[] = { {
+	.base.cra_name		= "ecb(aes)",
+	.base.cra_driver_name	= "ecb-aes-neonbs",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.chunksize		= 8 * AES_BLOCK_SIZE,
+	.setkey			= aesbs_setkey,
+	.encrypt		= ecb_encrypt,
+	.decrypt		= ecb_decrypt,
+}, {
+	.base.cra_name		= "xts(aes)",
+	.base.cra_driver_name	= "xts-aes-neonbs",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_xts_ctx),
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= 2 * AES_MIN_KEY_SIZE,
+	.max_keysize		= 2 * AES_MAX_KEY_SIZE,
+	.chunksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_xts_setkey,
+	.encrypt		= xts_encrypt,
+	.decrypt		= xts_decrypt,
+	.init			= xts_init,
+	.exit			= xts_exit,
+}, {
+	.base.cra_name		= "ctr(aes)",
+	.base.cra_driver_name	= "ctr-aes-neonbs",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.chunksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_setkey,
+	.encrypt		= ctr_encrypt,
+	.decrypt		= ctr_encrypt,
+} };
+
+static int __init aes_init(void)
+{
+	return crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+static void aes_exit(void)
+{
+	crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+module_init(aes_init);
+module_exit(aes_exit);
-- 
2.7.4

^ permalink raw reply related

* [PATCH] ARM: dts: vexpress: Support GICC_DIR operations
From: Marc Zyngier @ 2016-12-12 17:35 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161210201351.25894-1-christoffer.dall@linaro.org>

[+Sudeep]

On 10/12/16 20:13, Christoffer Dall wrote:
> The GICv2 CPU interface registers span across 8K, not 4K as indicated in
> the DT.  Only the GICC_DIR register is located after the initial 4K
> boundary, leaving a functional system but without support for separately
> EOI'ing and deactivating interrupts.
> 
> After this change the system support split priority drop and interrupt
> deactivation.
> 
> Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
> ---
>  arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts b/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts
> index 0205c97..2e0cf39 100644
> --- a/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts
> +++ b/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts
> @@ -126,7 +126,7 @@
>  		#address-cells = <0>;
>  		interrupt-controller;
>  		reg = <0 0x2c001000 0 0x1000>,
> -		      <0 0x2c002000 0 0x1000>,
> +		      <0 0x2c002000 0 0x2000>,
>  		      <0 0x2c004000 0 0x2000>,
>  		      <0 0x2c006000 0 0x2000>;
>  		interrupts = <1 9 0xf04>;
> 

Acked-by: Marc Zyngier <marc.zyngier@arm.com>

	M.
-- 
Jazz is not dead. It just smells funny...

^ permalink raw reply

* [PATCH 4/4] dt-bindings: input: Specify the interrupt number of TPS65217 power button
From: Rob Herring @ 2016-12-12 17:27 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161209062833.5768-5-woogyom.kim@gmail.com>

On Fri, Dec 09, 2016 at 03:28:33PM +0900, Milo Kim wrote:
> Specify the power button interrupt number which is from the datasheet.
> 
> Signed-off-by: Milo Kim <woogyom.kim@gmail.com>
> ---
>  Documentation/devicetree/bindings/input/tps65218-pwrbutton.txt | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)

Acked-by: Rob Herring <robh@kernel.org>

^ permalink raw reply

* [PATCH 3/4] dt-bindings: power/supply: Update TPS65217 properties
From: Rob Herring @ 2016-12-12 17:26 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161209062833.5768-4-woogyom.kim@gmail.com>

On Fri, Dec 09, 2016 at 03:28:32PM +0900, Milo Kim wrote:
> Add interrupt specifiers for USB and AC charger input. Interrupt numbers
> are from the datasheet.
> Fix wrong property for compatible string.
> 
> Signed-off-by: Milo Kim <woogyom.kim@gmail.com>
> ---
>  .../devicetree/bindings/power/supply/tps65217_charger.txt          | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)

Acked-by: Rob Herring <robh@kernel.org>

^ permalink raw reply

* [PATCH 2/4] dt-bindings: mfd: Remove TPS65217 interrupts
From: Rob Herring @ 2016-12-12 17:25 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161209062833.5768-3-woogyom.kim@gmail.com>

On Fri, Dec 09, 2016 at 03:28:31PM +0900, Milo Kim wrote:
> Interrupt numbers are from the datasheet, so no need to keep them in
> the ABI. Use the number in the DT file.

I don't see the purpose of ripping this out. The headers have always 
been for convienence, not whether the values come from the datasheet or 
not.

> Signed-off-by: Milo Kim <woogyom.kim@gmail.com>
> ---
>  arch/arm/boot/dts/am335x-bone-common.dtsi |  8 +++-----
>  include/dt-bindings/mfd/tps65217.h        | 26 --------------------------
>  2 files changed, 3 insertions(+), 31 deletions(-)
>  delete mode 100644 include/dt-bindings/mfd/tps65217.h

^ permalink raw reply

* [PATCH] clk: bcm: Fix 'maybe-uninitialized' warning in bcm2835_clock_choose_div_and_prate()
From: Eric Anholt @ 2016-12-12 17:24 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481529653-28133-1-git-send-email-boris.brezillon@free-electrons.com>

Boris Brezillon <boris.brezillon@free-electrons.com> writes:

> best_rate is reported as potentially uninitialized by gcc.
>
> Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
> Fixes: 155e8b3b0ee3 ("clk: bcm: Support rate change propagation on bcm2835 clocks")
> Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>

Reviewed-by: Eric Anholt <eric@anholt.net>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 832 bytes
Desc: not available
URL: <http://lists.infradead.org/pipermail/linux-arm-kernel/attachments/20161212/65cadde3/attachment.sig>

^ permalink raw reply

* [PATCH] dt-bindings: Document the hi3660 reset bindings
From: Rob Herring @ 2016-12-12 17:20 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481249504-7942-1-git-send-email-zhangfei.gao@linaro.org>

On Fri, Dec 09, 2016 at 10:11:44AM +0800, Zhangfei Gao wrote:
> Add DT bindings documentation for hi3660 SoC reset controller.
> 
> Signed-off-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> ---
>  .../bindings/reset/hisilicon,hi3660-reset.txt      | 43 ++++++++++++++++++++++
>  1 file changed, 43 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/reset/hisilicon,hi3660-reset.txt

Acked-by: Rob Herring <robh@kernel.org>

^ permalink raw reply

* [PATCH 1/2] dt-bindings: zx296718-clk: add compatible for audio clock controller
From: Rob Herring @ 2016-12-12 17:10 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481189157-8995-1-git-send-email-shawnguo@kernel.org>

On Thu, Dec 08, 2016 at 05:25:56PM +0800, Shawn Guo wrote:
> From: Shawn Guo <shawn.guo@linaro.org>
> 
> It adds the compatible string for zx296718 audio clock controller.
> 
> Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
> ---
>  Documentation/devicetree/bindings/clock/zx296718-clk.txt | 3 +++
>  1 file changed, 3 insertions(+)

Acked-by: Rob Herring <robh@kernel.org>

^ permalink raw reply

* [PATCH 5/5] Documentation: fsl-quadspi: Add fsl, ls1012a-qspi compatible string
From: Rob Herring @ 2016-12-12 17:09 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481188984-43683-6-git-send-email-yao.yuan@freescale.com>

On Thu, Dec 08, 2016 at 05:23:04PM +0800, Yuan Yao wrote:
> From: Yuan Yao <yao.yuan@nxp.com>

Same problem in this subject too.

> 
> new compatible string: "fsl,ls1012a-qspi".
> 
> Signed-off-by: Yuan Yao <yao.yuan@nxp.com>
> ---
>  Documentation/devicetree/bindings/mtd/fsl-quadspi.txt | 1 +
>  1 file changed, 1 insertion(+)

Acked-by: Rob Herring <robh@kernel.org>

^ permalink raw reply

* [PATCH 3/5] Documentation: dt: mtd: add chip support for "jedec, spi-nor"
From: Rob Herring @ 2016-12-12 17:09 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481188984-43683-4-git-send-email-yao.yuan@freescale.com>

On Thu, Dec 08, 2016 at 05:23:02PM +0800, Yuan Yao wrote:
> From: Yuan Yao <yao.yuan@nxp.com>

The compatible string is wrong in the subject.

> 
> "sst25wf040b" and "en25s64" are also chip compatible with SPI NOR flash.
> 
> Signed-off-by: Yuan Yao <yao.yuan@nxp.com>
> ---
>  Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt | 2 ++
>  1 file changed, 2 insertions(+)

Otherwise,

Acked-by: Rob Herring <robh@kernel.org>

^ permalink raw reply

* [PATCH 2/5] Documentation: fsl: dspi: Add fsl, ls1012a-dspi compatible string
From: Rob Herring @ 2016-12-12 17:08 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481188984-43683-3-git-send-email-yao.yuan@freescale.com>

On Thu, Dec 08, 2016 at 05:23:01PM +0800, Yuan Yao wrote:
> From: Yuan Yao <yao.yuan@nxp.com>
> 
> new compatible string: "fsl,ls1012a-dspi".
> 
> Signed-off-by: Yuan Yao <yao.yuan@nxp.com>
> ---
>  Documentation/devicetree/bindings/spi/spi-fsl-dspi.txt | 1 +
>  1 file changed, 1 insertion(+)

Acked-by: Rob Herring <robh@kernel.org>

^ permalink raw reply

* [PATCH] efi/libstub: arm*: Pass latest memory map to the kernel
From: Jeffrey Hugo @ 2016-12-12 17:00 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <584E6F50.3020901@arm.com>

On 12/12/2016 2:35 AM, James Morse wrote:
> Hi Ard,
>
> On 09/12/16 18:24, Ard Biesheuvel wrote:
>> As reported by James, the current libstub code involving the annotated
>> memory map only works somewhat correctly by accident, due to the fact
>> that a pool allocation happens to be reused immediately, retaining its
>> former contents.
>>
>> Instead of juggling memory maps, which makes the code more complex than
>> it needs to be, simply put a placholder value into the FDT, and only
>> write the actual value after ExitBootServices() has been called.
>
>> diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c
>> index a6a93116a8f0..5d39dff77f17 100644
>> --- a/drivers/firmware/efi/libstub/fdt.c
>> +++ b/drivers/firmware/efi/libstub/fdt.c
>> @@ -101,7 +101,7 @@ efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt,
>>  	if (status)
>>  		goto fdt_set_fail;
>>
>> -	fdt_val64 = cpu_to_fdt64((u64)(unsigned long)memory_map);
>> +	fdt_val64 = U64_MAX; /* placeholder */
>>  	status = fdt_setprop(fdt, node, "linux,uefi-mmap-start",
>>  			     &fdt_val64,  sizeof(fdt_val64));
>>  	if (status)
>> @@ -148,6 +148,24 @@ efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt,
>>  	return EFI_LOAD_ERROR;
>>  }
>>
>> +static efi_status_t update_fdt_memmap(void *fdt, u64 memmap)
>> +{
>> +	int node = fdt_path_offset(fdt, "/chosen");
>> +	efi_status_t status;
>> +
>> +	if (node < 0)
>> +		return EFI_LOAD_ERROR;
>> +
>> +	memmap = cpu_to_fdt64(memmap);
>> +	status = fdt_setprop_inplace(fdt, node, "linux,uefi-mmap-start",
>> +				     &memmap, sizeof(memmap));
>> +
>> +	if (status)
>> +		return EFI_LOAD_ERROR;
>> +
>> +	return EFI_SUCCESS;
>> +}
>
> v4.9.0 with this patch doesn't boot on my Seattle (with known buggy UEFI FW)
> [0]. It looks like the memory map is truncated (and missing a runtime region,
> compare with [1]). Should 'linux,uefi-mmap-size' be updated too? (Otherwise its
> the size when we retrieved the runtime mapping, but before we allocated the FDT)
>

Overall this fails for me as well.  It appears to work, until I trigger 
the race condition I fixed, then OOM killer gets triggered the instant 
rootfs starts to initialize.  Since I see James has a number of 
comments, I did not investigate further to determine why the patch is 
not working on my system.

-- 
Jeffrey Hugo
Qualcomm Datacenter Technologies as an affiliate of Qualcomm 
Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.

^ permalink raw reply

* [PATCH 1/1] arm/module: maximum utilization of module area.
From: Ard Biesheuvel @ 2016-12-12 16:57 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <alpine.LFD.2.20.1612121026250.1657@knanqh.ubzr>

On 12 December 2016 at 15:28, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:
> On Mon, 12 Dec 2016, Vaneet Narang wrote:
>
>> Hi,
>>
>> >A PC24 relocation has a range of +/-32MB.  This means that where-ever
>> >the module is placed, it must be capable of reaching any function
>> >within the kernel text, which may itself be quite large (eg, 8MB, or
>> >possibly larger).  The module area exists to allow modules to be
>> >located in an area where PC24 relocations are able to reach all of the
>> >kernel text on sensibly configured kernels, thereby allowing for
>> >optimal performance.
>> >
>> >If you wish to load large modules, then enable ARM_MODULE_PLTS, which
>> >will use the less efficient PLT method (which is basically an indirect
>> >function call) for relocations that PC24 can't handle, and will allow
>> >the module to be loaded into the vmalloc area.
>> >
>> >Growing the module area so that smaller modules also get penalised by
>> >the PLT indirection is not sane.
>>
>> This is exactly what i am saying. These changes are useful to accomdate
>> 22MB modules without enabling ARM_MODULE_PLTS.
>
> I think you need to figure out why you need such a huge module in the
> first place.  That is very uncommon indeed.
>

Also, note that the module PLT code was recently optimized, to remove
some pathological behavior which severely affected load times of large
modules.

Can you quantify the performance hit you are taking when using module
PLTs? And the actual increase in memory footprint?

^ permalink raw reply

* [PATCH v5 2/4] drm: bridge: add support for TI ths8135
From: Laurent Pinchart @ 2016-12-12 16:49 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161212164547.jw5ejbums6nwwtw2@rob-hp-laptop>

Hello,

On Monday 12 Dec 2016 10:45:47 Rob Herring wrote:
> On Wed, Dec 07, 2016 at 11:42:43AM +0100, Bartosz Golaszewski wrote:
> > THS8135 is a configurable video DAC. Add DT bindings for this chip and
> > use the dumb-vga-dac driver for now as no configuration is required to
> > make it work.
> > 
> > Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
> > ---
> > 
> >  .../bindings/display/bridge/ti,ths8135.txt         | 52 +++++++++++++++++
> >  drivers/gpu/drm/bridge/dumb-vga-dac.c              |  1 +
> >  2 files changed, 53 insertions(+)
> >  create mode 100644
> >  Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt
> Acked-by: Rob Herring <robh@kernel.org>
> 
> But one nit below:
> > diff --git
> > a/Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt
> > b/Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt new
> > file mode 100644
> > index 0000000..23cd8ee
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt
> > @@ -0,0 +1,52 @@
> > +THS8135 Video DAC
> > +-----------------
> > +
> > +This is the binding for Texas Instruments THS8135 Video DAC bridge.
> > +
> > +Required properties:
> > +
> > +- compatible: Must be "ti,ths8135"
> > +
> > +Required nodes:
> > +
> > +This device has two video ports. Their connections are modelled using the
> > OF +graph bindings specified in
> > Documentation/devicetree/bindings/graph.txt. +
> > +- Video port 0 for RGB input
> > +- Video port 1 for VGA output
> > +
> > +Example
> > +-------
> > +
> > +vga-bridge {
> > +	compatible = "ti,ths8135";
> > +	#address-cells = <1>;
> > +	#size-cells = <0>;
> > +
> > +	ports {
> > +		#address-cells = <1>;
> > +		#size-cells = <0>;
> > +
> > +		port at 0 {
> > +			#address-cells = <1>;
> > +			#size-cells = <0>;
> > +			reg = <0>;
> > +
> > +			vga_bridge_in: endpoint at 0 {
> > +				reg = <0>;
> 
> You don't need reg here.

In which case the endpoint node should be named endpoint, not endpoint at 0. You 
could then also remove the #address-cells and #size-cells properties from the 
port at 0 node. Same for port at 1 below.

> > +				remote-endpoint = <&lcdc_out_vga>;
> > +			};
> > +		};
> > +
> > +		port at 1 {
> > +			#address-cells = <1>;
> > +			#size-cells = <0>;
> > +			reg = <1>;
> > +
> > +			vga_bridge_out: endpoint at 0 {
> > +				reg = <0>;
> > +				remote-endpoint = <&vga_con_in>;
> > +			};
> > +		};
> > +	};
> > +};
> > diff --git a/drivers/gpu/drm/bridge/dumb-vga-dac.c
> > b/drivers/gpu/drm/bridge/dumb-vga-dac.c index afec232..498fa75 100644
> > --- a/drivers/gpu/drm/bridge/dumb-vga-dac.c
> > +++ b/drivers/gpu/drm/bridge/dumb-vga-dac.c
> > @@ -204,6 +204,7 @@ static int dumb_vga_remove(struct platform_device
> > *pdev)
> >  
> >  static const struct of_device_id dumb_vga_match[] = {
> >  	{ .compatible = "dumb-vga-dac" },
> > +	{ .compatible = "ti,ths8135" },
> >  	{},
> >  };
> >  MODULE_DEVICE_TABLE(of, dumb_vga_match);

-- 
Regards,

Laurent Pinchart

^ permalink raw reply

* [PATCH] ARM: at91/dt: sama5d2: add ssc0 definition
From: Alexandre Belloni @ 2016-12-12 16:47 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <CAJP5LRNHDXQtZDRcQcNuS76uvUndKVjKqWC2KXTM+DOMrro5kg@mail.gmail.com>

On 05/12/2016 at 12:28:34 +0200, Alex Gershgorin wrote :
> 

> From 3a5a8e78ccccab2858c5944000884ab3c49eba5a Mon Sep 17 00:00:00 2001
> From: Alex <Alex.Gershgorin@qcore.com>
> Date: Sun, 4 Dec 2016 16:03:56 +0200
> Subject: [PATCH] ARM: at91/dt: sama5d2: add ssc0 definition
> 
> The sama5d2 SoC has Synchronous Serial Controller which provides
> synchronous communication link with external devices.
> It's generally used in audio and telecom applications such as
> I2S, Short Frame Sync, Long Frame Sync.
> 
> Signed-off-by: Alex Gershgorin <alex.gershgorin@qcore.com>
> ---
>  arch/arm/boot/dts/sama5d2.dtsi | 16 ++++++++++++++++
>  1 file changed, 16 insertions(+)
> 
Applied, thanks.

-- 
Alexandre Belloni, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply

* [PATCH v5 2/4] drm: bridge: add support for TI ths8135
From: Rob Herring @ 2016-12-12 16:45 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481107365-24839-3-git-send-email-bgolaszewski@baylibre.com>

On Wed, Dec 07, 2016 at 11:42:43AM +0100, Bartosz Golaszewski wrote:
> THS8135 is a configurable video DAC. Add DT bindings for this chip and
> use the dumb-vga-dac driver for now as no configuration is required to
> make it work.
> 
> Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
> ---
>  .../bindings/display/bridge/ti,ths8135.txt         | 52 ++++++++++++++++++++++
>  drivers/gpu/drm/bridge/dumb-vga-dac.c              |  1 +
>  2 files changed, 53 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt

Acked-by: Rob Herring <robh@kernel.org>

But one nit below:

> 
> diff --git a/Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt b/Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt
> new file mode 100644
> index 0000000..23cd8ee
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt
> @@ -0,0 +1,52 @@
> +THS8135 Video DAC
> +-----------------
> +
> +This is the binding for Texas Instruments THS8135 Video DAC bridge.
> +
> +Required properties:
> +
> +- compatible: Must be "ti,ths8135"
> +
> +Required nodes:
> +
> +This device has two video ports. Their connections are modelled using the OF
> +graph bindings specified in Documentation/devicetree/bindings/graph.txt.
> +
> +- Video port 0 for RGB input
> +- Video port 1 for VGA output
> +
> +Example
> +-------
> +
> +vga-bridge {
> +	compatible = "ti,ths8135";
> +	#address-cells = <1>;
> +	#size-cells = <0>;
> +
> +	ports {
> +		#address-cells = <1>;
> +		#size-cells = <0>;
> +
> +		port at 0 {
> +			#address-cells = <1>;
> +			#size-cells = <0>;
> +			reg = <0>;
> +
> +			vga_bridge_in: endpoint at 0 {
> +				reg = <0>;

You don't need reg here.

> +				remote-endpoint = <&lcdc_out_vga>;
> +			};
> +		};
> +
> +		port at 1 {
> +			#address-cells = <1>;
> +			#size-cells = <0>;
> +			reg = <1>;
> +
> +			vga_bridge_out: endpoint at 0 {
> +				reg = <0>;
> +				remote-endpoint = <&vga_con_in>;
> +			};
> +		};
> +	};
> +};
> diff --git a/drivers/gpu/drm/bridge/dumb-vga-dac.c b/drivers/gpu/drm/bridge/dumb-vga-dac.c
> index afec232..498fa75 100644
> --- a/drivers/gpu/drm/bridge/dumb-vga-dac.c
> +++ b/drivers/gpu/drm/bridge/dumb-vga-dac.c
> @@ -204,6 +204,7 @@ static int dumb_vga_remove(struct platform_device *pdev)
>  
>  static const struct of_device_id dumb_vga_match[] = {
>  	{ .compatible = "dumb-vga-dac" },
> +	{ .compatible = "ti,ths8135" },
>  	{},
>  };
>  MODULE_DEVICE_TABLE(of, dumb_vga_match);
> -- 
> 2.9.3
> 

^ permalink raw reply

* [PATCH 2/2] FPGA: Add TS-7300 FPGA manager
From: Alan Tull @ 2016-12-12 16:44 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <39bc0569-81e1-8c35-0280-4ba1824b2710@gmail.com>

On Mon, 12 Dec 2016, Florian Fainelli wrote:

> On 12/12/2016 08:01 AM, Alan Tull wrote:
> > On Sun, 11 Dec 2016, Florian Fainelli wrote:
> > 
> >> Add support for loading bitstreams on the Altera Cyclone II FPGA
> >> populated on the TS-7300 board. This is done through the configuration
> >> and data registers offered through a memory interface between the EP93xx
> >> SoC and the FPGA.
> >>
> >> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> > 
> > Hi Florain,
> > 
> > Thanks for submitting!
> > 
> > How specific is this to the tx7300 board?
> > 
> > I'm unclear about the programming method here.  Are these registers
> > exposed by the EP93xx?  Is it possible that another cpu could access
> > these two registers to configure the cyclone ii?  Is this passive
> > serial?
> 
> So here is my understanding, from glancing at the TS-7300 board manual:
> 
> - there is an on-board CPLD which does a variety of services and I/O for
> the EP9302 SoC, one of these services is the configuration of the
> on-board FPGA
> 
> - the programming interface here is some kind of abstraction around a
> Cyclone II FPGA, and is by no means standard, nor directly exposed to
> the CPU
> 
> - unless you go through the CPLD, there is no other way that you could
> configure the FPGA
> 
> Does that help answer your questions?

Yes it does.  Maybe a brief comment explaining that similar to what
you just said.

> >> +static int ts73xx_fpga_write_init(struct fpga_manager *mgr, u32 flags,
> >> +				  const char *buf, size_t count)
> >> +{
> >> +	struct ts73xx_fpga_priv *priv = mgr->priv;
> >> +
> >> +	/* Reset the FPGA */
> >> +	writeb(0, priv->io_base + TS73XX_FPGA_CONFIG_REG);
> >> +	udelay(30);
> >> +	writeb(0x2, priv->io_base + TS73XX_FPGA_CONFIG_REG);
> >> +	udelay(80);
> > 
> > Could these udelay values be macros?
> 
> The bit definitions could be defined, but the delays, why would that be
> useful?

If it is helpful for someone reading the code to know what the delays
are, if some future generation of the board/cpld uses this same
driver.  So when this driver is broken for the next generation
board/cpld, people trying to fix this know what the delay is there for
and can have a better chance at adjusting the right delay.

> 
> > 
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static inline int ts73xx_fpga_can_write(struct ts73xx_fpga_priv *priv)
> >> +{
> >> +	unsigned int timeout = 1000;
> > 
> > Another macro?
> 
> The delay is just an arbitrary good timeout.
> -- 
> Florian
> 

^ permalink raw reply

* [PATCH v3 11/12] arm64: dts: marvell: add sdhci support for Armada 7K/8K
From: Gregory CLEMENT @ 2016-12-12 16:37 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161209200729.GO14217@n2100.armlinux.org.uk>

Hi Russell King,
 
 On ven., d?c. 09 2016, Russell King - ARM Linux <linux@armlinux.org.uk> wrote:

> On Fri, Dec 09, 2016 at 11:30:07AM +0100, Gregory CLEMENT wrote:
>> Also enable it on the Armada 7040 DB board
>> 
>> Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
>
> Hi,
>
> Can this also be added to the cp110 on the 7k/8k SoCs as well, or does
> it rely on other unmerged support?

I did not add this one because until now I had not the hardware setup
available to test it.

But at least I can add the ressources in the device tree and now I can
test it partially.

Gregory

>
> Thanks.
>
> -- 
> RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
> according to speedtest.net.

-- 
Gregory Clement, Free Electrons
Kernel, drivers, real-time and embedded Linux
development, consulting, training and support.
http://free-electrons.com

^ permalink raw reply

* [PATCH v2 2/2] ASoC: atmel: tse850: rely on the ssc to register as a cpu dai by itself
From: Rob Herring @ 2016-12-12 16:34 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481052157-23400-3-git-send-email-peda@axentia.se>

On Tue, Dec 06, 2016 at 08:22:37PM +0100, Peter Rosin wrote:
> This breaks devicetree compatibility, but in this case that is ok. All
> affected units are either on my desk, or running an even older version
> of the driver that is not compatible with the upstreamed version anyway
> (and when these other units are eventually updated, they will get a
> fresh dtb as well, so that is not a significant problem either).

Perfect.

> All of that is of course assuming that noone else has managed to build
> something that can use this driver, but that seems extremely improbable.
> 
> Signed-off-by: Peter Rosin <peda@axentia.se>
> ---
>  .../bindings/sound/axentia,tse850-pcm5142.txt      | 11 ++++++++---

Acked-by: Rob Herring <robh@kernel.org>

>  sound/soc/atmel/tse850-pcm5142.c                   | 23 +++-------------------
>  2 files changed, 11 insertions(+), 23 deletions(-)

^ permalink raw reply

* [PATCH 2/2] FPGA: Add TS-7300 FPGA manager
From: Florian Fainelli @ 2016-12-12 16:27 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <alpine.DEB.2.10.1612120951520.3310@atull-730U3E-740U3E>

On 12/12/2016 08:01 AM, Alan Tull wrote:
> On Sun, 11 Dec 2016, Florian Fainelli wrote:
> 
>> Add support for loading bitstreams on the Altera Cyclone II FPGA
>> populated on the TS-7300 board. This is done through the configuration
>> and data registers offered through a memory interface between the EP93xx
>> SoC and the FPGA.
>>
>> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> 
> Hi Florain,
> 
> Thanks for submitting!
> 
> How specific is this to the tx7300 board?
> 
> I'm unclear about the programming method here.  Are these registers
> exposed by the EP93xx?  Is it possible that another cpu could access
> these two registers to configure the cyclone ii?  Is this passive
> serial?

So here is my understanding, from glancing at the TS-7300 board manual:

- there is an on-board CPLD which does a variety of services and I/O for
the EP9302 SoC, one of these services is the configuration of the
on-board FPGA

- the programming interface here is some kind of abstraction around a
Cyclone II FPGA, and is by no means standard, nor directly exposed to
the CPU

- unless you go through the CPLD, there is no other way that you could
configure the FPGA

Does that help answer your questions?

> 
> Please cc linux-fpga at vger.kernel.org for the next version.
> 
> Other comments below...

OK, I will fix those,

> 
>> ---
>>  drivers/fpga/Kconfig       |   7 ++
>>  drivers/fpga/Makefile      |   1 +
>>  drivers/fpga/ts73xx-fpga.c | 165 +++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 173 insertions(+)
>>  create mode 100644 drivers/fpga/ts73xx-fpga.c
>>
>> diff --git a/drivers/fpga/Kconfig b/drivers/fpga/Kconfig
>> index cd84934774cc..109625707ef0 100644
>> --- a/drivers/fpga/Kconfig
>> +++ b/drivers/fpga/Kconfig
>> @@ -26,6 +26,13 @@ config FPGA_MGR_ZYNQ_FPGA
>>  	help
>>  	  FPGA manager driver support for Xilinx Zynq FPGAs.
>>  
>> +config FPGA_MGR_TS73XX
>> +	tristate "Technologic Systems TS-73xx SBC FPGA Manager"
>> +	depends on ARCH_EP93XX && MACH_TS72XX
>> +	help
>> +	  FPGA manager driver support for the Altera Cyclone II FPGA
>> +	  present on the TS-73xx SBC boards.
>> +
>>  endif # FPGA
>>  
>>  endmenu
>> diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
>> index 8d83fc6b1613..5d51265cc1b4 100644
>> --- a/drivers/fpga/Makefile
>> +++ b/drivers/fpga/Makefile
>> @@ -8,3 +8,4 @@ obj-$(CONFIG_FPGA)			+= fpga-mgr.o
>>  # FPGA Manager Drivers
>>  obj-$(CONFIG_FPGA_MGR_SOCFPGA)		+= socfpga.o
>>  obj-$(CONFIG_FPGA_MGR_ZYNQ_FPGA)	+= zynq-fpga.o
>> +obj-$(CONFIG_FPGA_MGR_TS73XX)		+= ts73xx-fpga.o
>> diff --git a/drivers/fpga/ts73xx-fpga.c b/drivers/fpga/ts73xx-fpga.c
>> new file mode 100644
>> index 000000000000..2b3d5d668dfc
>> --- /dev/null
>> +++ b/drivers/fpga/ts73xx-fpga.c
>> @@ -0,0 +1,165 @@
>> +/*
>> + * Technologic Systems TS-73xx SBC FPGA loader
>> + *
>> + * Copyright (C) 2016 Florian Fainelli <f.fainelli@gmail.com>
>> + *
>> + * FPGA Manager Driver for the on-board Altera Cyclone II FPGA found on
>> + * TS-7300, heavily based on load_fpga.c in their vendor tree.
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; version 2 of the License.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> + * GNU General Public License for more details.
>> + */
>> +
>> +#include <linux/delay.h>
>> +#include <linux/io.h>
>> +#include <linux/module.h>
>> +#include <linux/platform_device.h>
>> +#include <linux/string.h>
>> +#include <linux/bitrev.h>
>> +#include <linux/fpga/fpga-mgr.h>
>> +
>> +#define TS73XX_FPGA_DATA_REG	0
>> +#define TS73XX_FPGA_CONFIG_REG	1
>> +
>> +struct ts73xx_fpga_priv {
>> +	void __iomem	*io_base;
>> +	struct device	*dev;
>> +};
>> +
>> +static enum fpga_mgr_states ts73xx_fpga_state(struct fpga_manager *mgr)
>> +{
>> +	return FPGA_MGR_STATE_UNKNOWN;
>> +}
>> +
>> +static int ts73xx_fpga_write_init(struct fpga_manager *mgr, u32 flags,
>> +				  const char *buf, size_t count)
>> +{
>> +	struct ts73xx_fpga_priv *priv = mgr->priv;
>> +
>> +	/* Reset the FPGA */
>> +	writeb(0, priv->io_base + TS73XX_FPGA_CONFIG_REG);
>> +	udelay(30);
>> +	writeb(0x2, priv->io_base + TS73XX_FPGA_CONFIG_REG);
>> +	udelay(80);
> 
> Could these udelay values be macros?

The bit definitions could be defined, but the delays, why would that be
useful?

> 
>> +
>> +	return 0;
>> +}
>> +
>> +static inline int ts73xx_fpga_can_write(struct ts73xx_fpga_priv *priv)
>> +{
>> +	unsigned int timeout = 1000;
> 
> Another macro?

The delay is just an arbitrary good timeout.
-- 
Florian

^ permalink raw reply

* [PATCH v6 5/5] ARM: configs: stm32: Add I2C support for STM32 defconfig
From: M'boumba Cedric Madianga @ 2016-12-12 16:15 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481559342-6106-1-git-send-email-cedric.madianga@gmail.com>

Signed-off-by: M'boumba Cedric Madianga <cedric.madianga@gmail.com>
---
 arch/arm/configs/stm32_defconfig | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm/configs/stm32_defconfig b/arch/arm/configs/stm32_defconfig
index e7b56d4..9494eaf 100644
--- a/arch/arm/configs/stm32_defconfig
+++ b/arch/arm/configs/stm32_defconfig
@@ -52,6 +52,9 @@ CONFIG_SERIAL_NONSTANDARD=y
 CONFIG_SERIAL_STM32=y
 CONFIG_SERIAL_STM32_CONSOLE=y
 # CONFIG_HW_RANDOM is not set
+CONFIG_I2C=y
+CONFIG_I2C_CHARDEV=y
+CONFIG_I2C_STM32F4=y
 # CONFIG_HWMON is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_NEW_LEDS=y
-- 
1.9.1

^ permalink raw reply related

* [PATCH v6 4/5] ARM: dts: stm32: Add I2C1 support for STM32429 eval board
From: M'boumba Cedric Madianga @ 2016-12-12 16:15 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1481559342-6106-1-git-send-email-cedric.madianga@gmail.com>

Signed-off-by: M'boumba Cedric Madianga <cedric.madianga@gmail.com>
---
 arch/arm/boot/dts/stm32429i-eval.dts | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/arm/boot/dts/stm32429i-eval.dts b/arch/arm/boot/dts/stm32429i-eval.dts
index afb90bc..74e0045 100644
--- a/arch/arm/boot/dts/stm32429i-eval.dts
+++ b/arch/arm/boot/dts/stm32429i-eval.dts
@@ -141,3 +141,9 @@
 	pinctrl-names = "default";
 	status = "okay";
 };
+
+&i2c1 {
+	pinctrl-0 = <&i2c1_pins_b>;
+	pinctrl-names = "default";
+	status = "okay";
+};
-- 
1.9.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox