Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 4/6] crypto: arm/crct10dif - port x86 SSE implementation to ARM
From: Ard Biesheuvel @ 2016-12-05 18:42 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480963348-24203-1-git-send-email-ard.biesheuvel@linaro.org>

This is a transliteration of the Intel algorithm implemented
using SSE and PCLMULQDQ instructions that resides in the file
arch/x86/crypto/crct10dif-pcl-asm_64.S, but simplified to only
operate on buffers that are 16 byte aligned (but of any size)

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm/crypto/Kconfig             |   5 +
 arch/arm/crypto/Makefile            |   2 +
 arch/arm/crypto/crct10dif-ce-core.S | 427 ++++++++++++++++++++
 arch/arm/crypto/crct10dif-ce-glue.c | 101 +++++
 4 files changed, 535 insertions(+)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 27ed1b1cd1d7..fce801fa52a1 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -120,4 +120,9 @@ config CRYPTO_GHASH_ARM_CE
 	  that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
 	  that is part of the ARMv8 Crypto Extensions
 
+config CRYPTO_CRCT10DIF_ARM_CE
+	tristate "CRCT10DIF digest algorithm using PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC_T10DIF
+	select CRYPTO_HASH
+
 endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index fc5150702b64..fc77265014b7 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -13,6 +13,7 @@ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
+ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o
 
 ifneq ($(ce-obj-y)$(ce-obj-m),)
 ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y)
@@ -36,6 +37,7 @@ sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
+crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
 
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
new file mode 100644
index 000000000000..ce45ba0c0687
--- /dev/null
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -0,0 +1,427 @@
+//
+// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+//     Erdinc Ozturk <erdinc.ozturk@intel.com>
+//     Vinodh Gopal <vinodh.gopal@intel.com>
+//     James Guilford <james.guilford@intel.com>
+//     Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses.  You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the
+//   distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//       Function API:
+//       UINT16 crc_t10dif_pcl(
+//               UINT16 init_crc, //initial CRC value, 16 bits
+//               const unsigned char *buf, //buffer pointer to calculate CRC on
+//               UINT64 len //buffer length in bytes (64-bit data)
+//       );
+//
+//       Reference paper titled "Fast CRC Computation for Generic
+//	Polynomials Using PCLMULQDQ Instruction"
+//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
+//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define CPU_LE(code...)
+#else
+#define CPU_LE(code...)		code
+#endif
+
+	.text
+	.fpu		crypto-neon-fp-armv8
+
+	arg1_low32	.req	r0
+	arg2		.req	r1
+	arg3		.req	r2
+
+	qzr		.req	q13
+
+	q0l		.req	d0
+	q0h		.req	d1
+	q1l		.req	d2
+	q1h		.req	d3
+	q2l		.req	d4
+	q2h		.req	d5
+	q3l		.req	d6
+	q3h		.req	d7
+	q4l		.req	d8
+	q4h		.req	d9
+	q5l		.req	d10
+	q5h		.req	d11
+	q6l		.req	d12
+	q6h		.req	d13
+	q7l		.req	d14
+	q7h		.req	d15
+
+ENTRY(crc_t10dif_pmull)
+	vmov.i8		qzr, #0			// init zero register
+
+	// adjust the 16-bit initial_crc value, scale it to 32 bits
+	lsl		arg1_low32, arg1_low32, #16
+
+	// check if smaller than 256
+	cmp		arg3, #256
+
+	// for sizes less than 128, we can't fold 64B at a time...
+	blt		_less_than_128
+
+	// load the initial crc value
+	// crc value does not need to be byte-reflected, but it needs
+	// to be moved to the high part of the register.
+	// because data will be byte-reflected and will align with
+	// initial crc at correct place.
+	vmov		s0, arg1_low32		// initial crc
+	vext.8		q10, qzr, q0, #4
+
+	// receive the initial 64B data, xor the initial crc value
+	vld1.64		{q0-q1}, [arg2, :128]!
+	vld1.64		{q2-q3}, [arg2, :128]!
+	vld1.64		{q4-q5}, [arg2, :128]!
+	vld1.64		{q6-q7}, [arg2, :128]!
+CPU_LE(	vrev64.8	q0, q0			)
+CPU_LE(	vrev64.8	q1, q1			)
+CPU_LE(	vrev64.8	q2, q2			)
+CPU_LE(	vrev64.8	q3, q3			)
+CPU_LE(	vrev64.8	q4, q4			)
+CPU_LE(	vrev64.8	q5, q5			)
+CPU_LE(	vrev64.8	q6, q6			)
+CPU_LE(	vrev64.8	q7, q7			)
+
+	vswp		d0, d1
+	vswp		d2, d3
+	vswp		d4, d5
+	vswp		d6, d7
+	vswp		d8, d9
+	vswp		d10, d11
+	vswp		d12, d13
+	vswp		d14, d15
+
+	// XOR the initial_crc value
+	veor.8		q0, q0, q10
+
+	adr		ip, rk3
+	vld1.64		{q10}, [ip, :128]	// xmm10 has rk3 and rk4
+
+	//
+	// we subtract 256 instead of 128 to save one instruction from the loop
+	//
+	sub		arg3, arg3, #256
+
+	// at this section of the code, there is 64*x+y (0<=y<64) bytes of
+	// buffer. The _fold_64_B_loop will fold 64B at a time
+	// until we have 64+y Bytes of buffer
+
+
+	// fold 64B at a time. This section of the code folds 4 vector
+	// registers in parallel
+_fold_64_B_loop:
+
+	.macro		fold64, reg1, reg2
+	vld1.64		{q11-q12}, [arg2, :128]!
+
+	vmull.p64	q8, \reg1\()h, d21
+	vmull.p64	\reg1, \reg1\()l, d20
+	vmull.p64	q9, \reg2\()h, d21
+	vmull.p64	\reg2, \reg2\()l, d20
+
+CPU_LE(	vrev64.8	q11, q11		)
+CPU_LE(	vrev64.8	q12, q12		)
+	vswp		d22, d23
+	vswp		d24, d25
+
+	veor.8		\reg1, \reg1, q8
+	veor.8		\reg2, \reg2, q9
+	veor.8		\reg1, \reg1, q11
+	veor.8		\reg2, \reg2, q12
+	.endm
+
+	fold64		q0, q1
+	fold64		q2, q3
+	fold64		q4, q5
+	fold64		q6, q7
+
+	subs		arg3, arg3, #128
+
+	// check if there is another 64B in the buffer to be able to fold
+	bge		_fold_64_B_loop
+
+	// at this point, the buffer pointer is pointing at the last y Bytes
+	// of the buffer the 64B of folded data is in 4 of the vector
+	// registers: v0, v1, v2, v3
+
+	// fold the 8 vector registers to 1 vector register with different
+	// constants
+
+	adr		ip, rk9
+	vld1.64		{q10}, [ip, :128]!
+
+	.macro		fold16, reg, rk
+	vmull.p64	q8, \reg\()l, d20
+	vmull.p64	\reg, \reg\()h, d21
+	.ifnb		\rk
+	vld1.64		{q10}, [ip, :128]!
+	.endif
+	veor.8		q7, q7, q8
+	veor.8		q7, q7, \reg
+	.endm
+
+	fold16		q0, rk11
+	fold16		q1, rk13
+	fold16		q2, rk15
+	fold16		q3, rk17
+	fold16		q4, rk19
+	fold16		q5, rk1
+	fold16		q6
+
+	// instead of 64, we add 48 to the loop counter to save 1 instruction
+	// from the loop instead of a cmp instruction, we use the negative
+	// flag with the jl instruction
+	adds		arg3, arg3, #(128-16)
+	blt		_final_reduction_for_128
+
+	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
+	// and the rest is in memory. We can fold 16 bytes@a time if y>=16
+	// continue folding 16B at a time
+
+_16B_reduction_loop:
+	vmull.p64	q8, d14, d20
+	vmull.p64	q7, d15, d21
+	veor.8		q7, q7, q8
+
+	vld1.64		{q0}, [arg2, :128]!
+CPU_LE(	vrev64.8	q0, q0		)
+	vswp		d0, d1
+	veor.8		q7, q7, q0
+	subs		arg3, arg3, #16
+
+	// instead of a cmp instruction, we utilize the flags with the
+	// jge instruction equivalent of: cmp arg3, 16-16
+	// check if there is any more 16B in the buffer to be able to fold
+	bge		_16B_reduction_loop
+
+	// now we have 16+z bytes left to reduce, where 0<= z < 16.
+	// first, we reduce the data in the xmm7 register
+
+_final_reduction_for_128:
+	// check if any more data to fold. If not, compute the CRC of
+	// the final 128 bits
+	adds		arg3, arg3, #16
+	beq		_128_done
+
+	// here we are getting data that is less than 16 bytes.
+	// since we know that there was data before the pointer, we can
+	// offset the input pointer before the actual point, to receive
+	// exactly 16 bytes. after that the registers need to be adjusted.
+_get_last_two_regs:
+	add		arg2, arg2, arg3
+	sub		arg2, arg2, #16
+	vld1.64		{q1}, [arg2]
+CPU_LE(	vrev64.8	q1, q1			)
+	vswp		d2, d3
+
+	// get rid of the extra data that was loaded before
+	// load the shift constant
+	adr		ip, tbl_shf_table + 16
+	sub		ip, ip, arg3
+	vld1.8		{q0}, [ip]
+
+	// shift v2 to the left by arg3 bytes
+	vtbl.8		d4, {d14-d15}, d0
+	vtbl.8		d5, {d14-d15}, d1
+
+	// shift v7 to the right by 16-arg3 bytes
+	vmov.i8		q9, #0x80
+	veor.8		q0, q0, q9
+	vtbl.8		d18, {d14-d15}, d0
+	vtbl.8		d19, {d14-d15}, d1
+
+	// blend
+	vshr.s8		q0, q0, #7		// convert to 8-bit mask
+	vbsl.8		q0, q2, q1
+
+	// fold 16 Bytes
+	vmull.p64	q8, d18, d20
+	vmull.p64	q7, d19, d21
+	veor.8		q7, q7, q8
+	veor.8		q7, q7, q0
+
+_128_done:
+	// compute crc of a 128-bit value
+	vldr		d20, rk5
+	vldr		d21, rk6		// rk5 and rk6 in xmm10
+
+	// 64b fold
+	vext.8		q0, qzr, q7, #8
+	vmull.p64	q7, d15, d20
+	veor.8		q7, q7, q0
+
+	// 32b fold
+	vext.8		q0, q7, qzr, #12
+	vmov		s31, s3
+	vmull.p64	q0, d0, d21
+	veor.8		q7, q0, q7
+
+	// barrett reduction
+_barrett:
+	vldr		d20, rk7
+	vldr		d21, rk8
+
+	vmull.p64	q0, d15, d20
+	vext.8		q0, qzr, q0, #12
+	vmull.p64	q0, d1, d21
+	vext.8		q0, qzr, q0, #12
+	veor.8		q7, q7, q0
+	vmov		r0, s29
+
+_cleanup:
+	// scale the result back to 16 bits
+	lsr		r0, r0, #16
+	bx		lr
+
+_less_than_128:
+	teq		arg3, #0
+	beq		_cleanup
+
+	vmov.i8		q0, #0
+	vmov		s3, arg1_low32		// get the initial crc value
+
+	vld1.64		{q7}, [arg2, :128]!
+CPU_LE(	vrev64.8	q7, q7		)
+	vswp		d14, d15
+	veor.8		q7, q7, q0
+
+	cmp		arg3, #16
+	beq		_128_done		// exactly 16 left
+	blt		_less_than_16_left
+
+	// now if there is, load the constants
+	vldr		d20, rk1
+	vldr		d21, rk2		// rk1 and rk2 in xmm10
+
+	// check if there is enough buffer to be able to fold 16B at a time
+	subs		arg3, arg3, #32
+	addlt		arg3, arg3, #16
+	blt		_get_last_two_regs
+	b		_16B_reduction_loop
+
+_less_than_16_left:
+	// shl r9, 4
+	adr		ip, tbl_shf_table + 16
+	sub		ip, ip, arg3
+	vld1.8		{q0}, [ip]
+	vmov.i8		q9, #0x80
+	veor.8		q0, q0, q9
+	vtbl.8		d18, {d14-d15}, d0
+	vtbl.8		d15, {d14-d15}, d1
+	vmov		d14, d18
+	b		_128_done
+ENDPROC(crc_t10dif_pmull)
+
+// precomputed constants
+// these constants are precomputed from the poly:
+// 0x8bb70000 (0x8bb7 scaled to 32 bits)
+	.align		4
+// Q = 0x18BB70000
+// rk1 = 2^(32*3) mod Q << 32
+// rk2 = 2^(32*5) mod Q << 32
+// rk3 = 2^(32*15) mod Q << 32
+// rk4 = 2^(32*17) mod Q << 32
+// rk5 = 2^(32*3) mod Q << 32
+// rk6 = 2^(32*2) mod Q << 32
+// rk7 = floor(2^64/Q)
+// rk8 = Q
+
+rk3:	.quad		0x9d9d000000000000
+rk4:	.quad		0x7cf5000000000000
+rk5:	.quad		0x2d56000000000000
+rk6:	.quad		0x1368000000000000
+rk7:	.quad		0x00000001f65a57f8
+rk8:	.quad		0x000000018bb70000
+rk9:	.quad		0xceae000000000000
+rk10:	.quad		0xbfd6000000000000
+rk11:	.quad		0x1e16000000000000
+rk12:	.quad		0x713c000000000000
+rk13:	.quad		0xf7f9000000000000
+rk14:	.quad		0x80a6000000000000
+rk15:	.quad		0x044c000000000000
+rk16:	.quad		0xe658000000000000
+rk17:	.quad		0xad18000000000000
+rk18:	.quad		0xa497000000000000
+rk19:	.quad		0x6ee3000000000000
+rk20:	.quad		0xe7b5000000000000
+rk1:	.quad		0x2d56000000000000
+rk2:	.quad		0x06df000000000000
+
+tbl_shf_table:
+// use these values for shift constants for the tbl/tbx instruction
+// different alignments result in values as shown:
+//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
+//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
+//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
+//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
+//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
+//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
+//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
+//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
+//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
+//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
+//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
+//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
+//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
+//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
+//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
+
+	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
+	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
new file mode 100644
index 000000000000..d428355cf38d
--- /dev/null
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -0,0 +1,101 @@
+/*
+ * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc-t10dif.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
+
+asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u32 len);
+
+static int crct10dif_init(struct shash_desc *desc)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*crc = 0;
+	return 0;
+}
+
+static int crct10dif_update(struct shash_desc *desc, const u8 *data,
+			    unsigned int length)
+{
+	u16 *crc = shash_desc_ctx(desc);
+	unsigned int l;
+
+	if (!may_use_simd()) {
+		*crc = crc_t10dif_generic(*crc, data, length);
+	} else {
+		if (unlikely((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
+			l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
+				  ((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
+
+			*crc = crc_t10dif_generic(*crc, data, l);
+
+			length -= l;
+			data += l;
+		}
+		if (length > 0) {
+			kernel_neon_begin();
+			*crc = crc_t10dif_pmull(*crc, data, length);
+			kernel_neon_end();
+		}
+	}
+	return 0;
+}
+
+static int crct10dif_final(struct shash_desc *desc, u8 *out)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*(u16 *)out = *crc;
+	return 0;
+}
+
+static struct shash_alg crc_t10dif_alg = {
+	.digestsize		= CRC_T10DIF_DIGEST_SIZE,
+	.init			= crct10dif_init,
+	.update			= crct10dif_update,
+	.final			= crct10dif_final,
+	.descsize		= CRC_T10DIF_DIGEST_SIZE,
+
+	.base.cra_name		= "crct10dif",
+	.base.cra_driver_name	= "crct10dif-arm-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= CRC_T10DIF_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+};
+
+static int __init crc_t10dif_mod_init(void)
+{
+	if (!(elf_hwcap2 & HWCAP2_PMULL))
+		return -ENODEV;
+
+	return crypto_register_shash(&crc_t10dif_alg);
+}
+
+static void __exit crc_t10dif_mod_exit(void)
+{
+	crypto_unregister_shash(&crc_t10dif_alg);
+}
+
+module_init(crc_t10dif_mod_init);
+module_exit(crc_t10dif_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crct10dif");
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 5/6] crypto: arm64/crc32 - accelerated support based on x86 SSE implementation
From: Ard Biesheuvel @ 2016-12-05 18:42 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480963348-24203-1-git-send-email-ard.biesheuvel@linaro.org>

This is a combination of the the Intel algorithm implemented using SSE
and PCLMULQDQ instructions from arch/x86/crypto/crc32-pclmul_asm.S, and
the new CRC32 extensions introduced for both 32-bit and 64-bit ARM in
version 8 of the architecture. Two versions of the above combo are
provided, one for CRC32 and one for CRC32C.

The PMULL/NEON algorithm is faster, but operates on blocks of at least
64 bytes, and on multiples of 16 bytes only. For the remaining input,
or for all input on systems that lack the PMULL 64x64->128 instructions,
the CRC32 instructions will be used.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/Kconfig         |   6 +
 arch/arm64/crypto/Makefile        |   3 +
 arch/arm64/crypto/crc32-ce-core.S | 266 ++++++++++++++++++++
 arch/arm64/crypto/crc32-ce-glue.c | 212 ++++++++++++++++
 4 files changed, 487 insertions(+)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index d773c0659202..21835deb1ab9 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -28,6 +28,11 @@ config CRYPTO_CRCT10DIF_ARM64_CE
 	depends on KERNEL_MODE_NEON && CRC_T10DIF
 	select CRYPTO_HASH
 
+config CRYPTO_CRC32_ARM64_CE
+	tristate "CRC32 and CRC32C digest algorithms using PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC32
+	select CRYPTO_HASH
+
 config CRYPTO_AES_ARM64_CE
 	tristate "AES core cipher using ARMv8 Crypto Extensions"
 	depends on ARM64 && KERNEL_MODE_NEON
@@ -58,4 +63,5 @@ config CRYPTO_CRC32_ARM64
 	tristate "CRC32 and CRC32C using optional ARMv8 instructions"
 	depends on ARM64
 	select CRYPTO_HASH
+
 endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 36fd3eb4201b..144387805a46 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -20,6 +20,9 @@ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
 crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
 
+obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
+crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
+
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
 CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
 
diff --git a/arch/arm64/crypto/crc32-ce-core.S b/arch/arm64/crypto/crc32-ce-core.S
new file mode 100644
index 000000000000..18f5a8442276
--- /dev/null
+++ b/arch/arm64/crypto/crc32-ce-core.S
@@ -0,0 +1,266 @@
+/*
+ * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.align		6
+	.cpu		generic+crypto+crc
+
+.Lcrc32_constants:
+	/*
+	 * [x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
+	 * #define CONSTANT_R1  0x154442bd4LL
+	 *
+	 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
+	 * #define CONSTANT_R2  0x1c6e41596LL
+	 */
+	.octa		0x00000001c6e415960000000154442bd4
+
+	/*
+	 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
+	 * #define CONSTANT_R3  0x1751997d0LL
+	 *
+	 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
+	 * #define CONSTANT_R4  0x0ccaa009eLL
+	 */
+	.octa		0x00000000ccaa009e00000001751997d0
+
+	/*
+	 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
+	 * #define CONSTANT_R5  0x163cd6124LL
+	 */
+	.quad		0x0000000163cd6124
+	.quad		0x00000000FFFFFFFF
+
+	/*
+	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+	 *
+	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
+	 *                                                      = 0x1F7011641LL
+	 * #define CONSTANT_RU  0x1F7011641LL
+	 */
+	.octa		0x00000001F701164100000001DB710641
+
+.Lcrc32c_constants:
+	.octa		0x000000009e4addf800000000740eef02
+	.octa		0x000000014cd00bd600000000f20c0dfe
+	.quad		0x00000000dd45aab8
+	.quad		0x00000000FFFFFFFF
+	.octa		0x00000000dea713f10000000105ec76f0
+
+	vCONSTANT	.req	v0
+	dCONSTANT	.req	d0
+	qCONSTANT	.req	q0
+
+	BUF		.req	x0
+	LEN		.req	x1
+	CRC		.req	x2
+
+	vzr		.req	v9
+
+	/**
+	 * Calculate crc32
+	 * BUF - buffer
+	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
+	 * CRC - initial crc32
+	 * return %eax crc32
+	 * uint crc32_pmull_le(unsigned char const *buffer,
+	 *                     size_t len, uint crc32)
+	 */
+ENTRY(crc32_pmull_le)
+	adr		x3, .Lcrc32_constants
+	b		0f
+
+ENTRY(crc32c_pmull_le)
+	adr		x3, .Lcrc32c_constants
+
+0:	bic		LEN, LEN, #15
+	ld1		{v1.16b-v4.16b}, [BUF], #0x40
+	movi		vzr.16b, #0
+	fmov		dCONSTANT, CRC
+	eor		v1.16b, v1.16b, vCONSTANT.16b
+	sub		LEN, LEN, #0x40
+	cmp		LEN, #0x40
+	b.lt		less_64
+
+	ldr		qCONSTANT, [x3]
+
+loop_64:		/* 64 bytes Full cache line folding */
+	sub		LEN, LEN, #0x40
+
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull2		v6.1q, v2.2d, vCONSTANT.2d
+	pmull2		v7.1q, v3.2d, vCONSTANT.2d
+	pmull2		v8.1q, v4.2d, vCONSTANT.2d
+
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	pmull		v2.1q, v2.1d, vCONSTANT.1d
+	pmull		v3.1q, v3.1d, vCONSTANT.1d
+	pmull		v4.1q, v4.1d, vCONSTANT.1d
+
+	eor		v1.16b, v1.16b, v5.16b
+	ld1		{v5.16b}, [BUF], #0x10
+	eor		v2.16b, v2.16b, v6.16b
+	ld1		{v6.16b}, [BUF], #0x10
+	eor		v3.16b, v3.16b, v7.16b
+	ld1		{v7.16b}, [BUF], #0x10
+	eor		v4.16b, v4.16b, v8.16b
+	ld1		{v8.16b}, [BUF], #0x10
+
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v2.16b, v2.16b, v6.16b
+	eor		v3.16b, v3.16b, v7.16b
+	eor		v4.16b, v4.16b, v8.16b
+
+	cmp		LEN, #0x40
+	b.ge		loop_64
+
+less_64:		/* Folding cache line into 128bit */
+	ldr		qCONSTANT, [x3, #16]
+
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v1.16b, v1.16b, v2.16b
+
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v1.16b, v1.16b, v3.16b
+
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v1.16b, v1.16b, v4.16b
+
+	cbz		LEN, fold_64
+
+loop_16:		/* Folding rest buffer into 128bit */
+	subs		LEN, LEN, #0x10
+
+	ld1		{v2.16b}, [BUF], #0x10
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v1.16b, v1.16b, v2.16b
+
+	b.ne		loop_16
+
+fold_64:
+	/* perform the last 64 bit fold, also adds 32 zeroes
+	 * to the input stream */
+	ext		v2.16b, v1.16b, v1.16b, #8
+	pmull2		v2.1q, v2.2d, vCONSTANT.2d
+	ext		v1.16b, v1.16b, vzr.16b, #8
+	eor		v1.16b, v1.16b, v2.16b
+
+	/* final 32-bit fold */
+	ldr		dCONSTANT, [x3, #32]
+	ldr		d3, [x3, #40]
+
+	ext		v2.16b, v1.16b, vzr.16b, #4
+	and		v1.16b, v1.16b, v3.16b
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v2.16b
+
+	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+	ldr		qCONSTANT, [x3, #48]
+
+	and		v2.16b, v1.16b, v3.16b
+	ext		v2.16b, vzr.16b, v2.16b, #8
+	pmull2		v2.1q, v2.2d, vCONSTANT.2d
+	and		v2.16b, v2.16b, v3.16b
+	pmull		v2.1q, v2.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v2.16b
+	mov		w0, v1.s[1]
+
+	ret
+ENDPROC(crc32_pmull_le)
+ENDPROC(crc32c_pmull_le)
+
+	.macro		__crc32, c
+0:	subs		x2, x2, #16
+	b.mi		8f
+	ldp		x3, x4, [x1], #16
+CPU_BE(	rev		x3, x3		)
+CPU_BE(	rev		x4, x4		)
+	crc32\c\()x	w0, w0, x3
+	crc32\c\()x	w0, w0, x4
+	b.ne		0b
+	ret
+
+8:	tbz		x2, #3, 4f
+	ldr		x3, [x1], #8
+CPU_BE(	rev		x3, x3		)
+	crc32\c\()x	w0, w0, x3
+4:	tbz		x2, #2, 2f
+	ldr		w3, [x1], #4
+CPU_BE(	rev		w3, w3		)
+	crc32\c\()w	w0, w0, w3
+2:	tbz		x2, #1, 1f
+	ldrh		w3, [x1], #2
+CPU_BE(	rev16		w3, w3		)
+	crc32\c\()h	w0, w0, w3
+1:	tbz		x2, #0, 0f
+	ldrb		w3, [x1]
+	crc32\c\()b	w0, w0, w3
+0:	ret
+	.endm
+
+	.align		5
+ENTRY(crc32_armv8_le)
+	__crc32
+ENDPROC(crc32_armv8_le)
+
+	.align		5
+ENTRY(crc32c_armv8_le)
+	__crc32		c
+ENDPROC(crc32c_armv8_le)
diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
new file mode 100644
index 000000000000..8594127d5e01
--- /dev/null
+++ b/arch/arm64/crypto/crc32-ce-glue.c
@@ -0,0 +1,212 @@
+/*
+ * Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/crc32.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/unaligned.h>
+
+#define PMULL_MIN_LEN		64L	/* minimum size of buffer
+					 * for crc32_pmull_le_16 */
+#define SCALE_F			16L	/* size of NEON register */
+
+asmlinkage u32 crc32_pmull_le(const u8 buf[], u64 len, u32 init_crc);
+asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], size_t len);
+
+asmlinkage u32 crc32c_pmull_le(const u8 buf[], u64 len, u32 init_crc);
+asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], size_t len);
+
+static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], size_t len);
+static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], size_t len);
+
+static int crc32_pmull_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = 0;
+	return 0;
+}
+
+static int crc32c_pmull_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = ~0;
+	return 0;
+}
+
+static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key,
+			      unsigned int keylen)
+{
+	u32 *mctx = crypto_shash_ctx(hash);
+
+	if (keylen != sizeof(u32)) {
+		crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	*mctx = le32_to_cpup((__le32 *)key);
+	return 0;
+}
+
+static int crc32_pmull_init(struct shash_desc *desc)
+{
+	u32 *mctx = crypto_shash_ctx(desc->tfm);
+	u32 *crc = shash_desc_ctx(desc);
+
+	*crc = *mctx;
+	return 0;
+}
+
+static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+	unsigned int l;
+
+	if ((u64)data % SCALE_F) {
+		l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
+
+		*crc = fallback_crc32(*crc, data, l);
+
+		data += l;
+		length -= l;
+	}
+
+	if (length >= PMULL_MIN_LEN) {
+		l = round_down(length, SCALE_F);
+
+		kernel_neon_begin_partial(10);
+		*crc = crc32_pmull_le(data, l, *crc);
+		kernel_neon_end();
+
+		data += l;
+		length -= l;
+	}
+
+	if (length > 0)
+		*crc = fallback_crc32(*crc, data, length);
+
+	return 0;
+}
+
+static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+	unsigned int l;
+
+	if ((u64)data % SCALE_F) {
+		l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
+
+		*crc = fallback_crc32c(*crc, data, l);
+
+		data += l;
+		length -= l;
+	}
+
+	if (length >= PMULL_MIN_LEN) {
+		l = round_down(length, SCALE_F);
+
+		kernel_neon_begin_partial(10);
+		*crc = crc32c_pmull_le(data, l, *crc);
+		kernel_neon_end();
+
+		data += l;
+		length -= l;
+	}
+
+	if (length > 0) {
+		*crc = fallback_crc32c(*crc, data, length);
+	}
+
+	return 0;
+}
+
+static int crc32_pmull_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	put_unaligned_le32(*crc, out);
+	return 0;
+}
+
+static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	put_unaligned_le32(~*crc, out);
+	return 0;
+}
+
+static struct shash_alg crc32_pmull_algs[] = { {
+	.setkey			= crc32_pmull_setkey,
+	.init			= crc32_pmull_init,
+	.update			= crc32_pmull_update,
+	.final			= crc32_pmull_final,
+	.descsize		= sizeof(u32),
+	.digestsize		= sizeof(u32),
+
+	.base.cra_ctxsize	= sizeof(u32),
+	.base.cra_init		= crc32_pmull_cra_init,
+	.base.cra_name		= "crc32",
+	.base.cra_driver_name	= "crc32-arm64-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.setkey			= crc32_pmull_setkey,
+	.init			= crc32_pmull_init,
+	.update			= crc32c_pmull_update,
+	.final			= crc32c_pmull_final,
+	.descsize		= sizeof(u32),
+	.digestsize		= sizeof(u32),
+
+	.base.cra_ctxsize	= sizeof(u32),
+	.base.cra_init		= crc32c_pmull_cra_init,
+	.base.cra_name		= "crc32c",
+	.base.cra_driver_name	= "crc32c-arm64-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_module	= THIS_MODULE,
+} };
+
+static int __init crc32_pmull_mod_init(void)
+{
+	if (elf_hwcap & HWCAP_CRC32) {
+		fallback_crc32 = crc32_armv8_le;
+		fallback_crc32c = crc32c_armv8_le;
+	} else {
+		fallback_crc32 = crc32_le;
+		fallback_crc32c = __crc32c_le;
+	}
+
+	return crypto_register_shashes(crc32_pmull_algs,
+				       ARRAY_SIZE(crc32_pmull_algs));
+}
+
+static void __exit crc32_pmull_mod_exit(void)
+{
+	crypto_unregister_shashes(crc32_pmull_algs,
+				  ARRAY_SIZE(crc32_pmull_algs));
+}
+
+module_cpu_feature_match(PMULL, crc32_pmull_mod_init);
+module_exit(crc32_pmull_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 6/6] crypto: arm/crc32 - accelerated support based on x86 SSE implementation
From: Ard Biesheuvel @ 2016-12-05 18:42 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480963348-24203-1-git-send-email-ard.biesheuvel@linaro.org>

This is a combination of the the Intel algorithm implemented using SSE
and PCLMULQDQ instructions from arch/x86/crypto/crc32-pclmul_asm.S, and
the new CRC32 extensions introduced for both 32-bit and 64-bit ARM in
version 8 of the architecture. Two versions of the above combo are
provided, one for CRC32 and one for CRC32C.

The PMULL/NEON algorithm is faster, but operates on blocks of at least
64 bytes, and on multiples of 16 bytes only. For the remaining input,
or for all input on systems that lack the PMULL 64x64->128 instructions,
the CRC32 instructions will be used.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm/crypto/Kconfig         |   5 +
 arch/arm/crypto/Makefile        |   2 +
 arch/arm/crypto/crc32-ce-core.S | 306 ++++++++++++++++++++
 arch/arm/crypto/crc32-ce-glue.c | 242 ++++++++++++++++
 4 files changed, 555 insertions(+)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index fce801fa52a1..de7bb20815bf 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -125,4 +125,9 @@ config CRYPTO_CRCT10DIF_ARM_CE
 	depends on KERNEL_MODE_NEON && CRC_T10DIF
 	select CRYPTO_HASH
 
+config CRYPTO_CRC32_ARM_CE
+	tristate "CRC32(C) digest algorithm using CRC and/or PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC32
+	select CRYPTO_HASH
+
 endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index fc77265014b7..b578a1820ab1 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -14,6 +14,7 @@ ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o
+ce-obj-$(CONFIG_CRYPTO_CRC32_ARM_CE) += crc32-arm-ce.o
 
 ifneq ($(ce-obj-y)$(ce-obj-m),)
 ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y)
@@ -38,6 +39,7 @@ sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
 crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
+crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
 
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/crc32-ce-core.S b/arch/arm/crypto/crc32-ce-core.S
new file mode 100644
index 000000000000..e63d400dc5c1
--- /dev/null
+++ b/arch/arm/crypto/crc32-ce-core.S
@@ -0,0 +1,306 @@
+/*
+ * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.align		6
+	.arch		armv8-a
+	.arch_extension	crc
+	.fpu		crypto-neon-fp-armv8
+
+.Lcrc32_constants:
+	/*
+	 * [x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
+	 * #define CONSTANT_R1  0x154442bd4LL
+	 *
+	 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
+	 * #define CONSTANT_R2  0x1c6e41596LL
+	 */
+	.quad		0x0000000154442bd4
+	.quad		0x00000001c6e41596
+
+	/*
+	 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
+	 * #define CONSTANT_R3  0x1751997d0LL
+	 *
+	 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
+	 * #define CONSTANT_R4  0x0ccaa009eLL
+	 */
+	.quad		0x00000001751997d0
+	.quad		0x00000000ccaa009e
+
+	/*
+	 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
+	 * #define CONSTANT_R5  0x163cd6124LL
+	 */
+	.quad		0x0000000163cd6124
+	.quad		0x00000000FFFFFFFF
+
+	/*
+	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+	 *
+	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
+	 *                                                      = 0x1F7011641LL
+	 * #define CONSTANT_RU  0x1F7011641LL
+	 */
+	.quad		0x00000001DB710641
+	.quad		0x00000001F7011641
+
+.Lcrc32c_constants:
+	.quad		0x00000000740eef02
+	.quad		0x000000009e4addf8
+	.quad		0x00000000f20c0dfe
+	.quad		0x000000014cd00bd6
+	.quad		0x00000000dd45aab8
+	.quad		0x00000000FFFFFFFF
+	.quad		0x0000000105ec76f0
+	.quad		0x00000000dea713f1
+
+	dCONSTANTl	.req	d0
+	dCONSTANTh	.req	d1
+	qCONSTANT	.req	q0
+
+	BUF		.req	r0
+	LEN		.req	r1
+	CRC		.req	r2
+
+	qzr		.req	q9
+
+	/**
+	 * Calculate crc32
+	 * BUF - buffer
+	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
+	 * CRC - initial crc32
+	 * return %eax crc32
+	 * uint crc32_pmull_le(unsigned char const *buffer,
+	 *                     size_t len, uint crc32)
+	 */
+ENTRY(crc32_pmull_le)
+	adr		r3, .Lcrc32_constants
+	b		0f
+
+ENTRY(crc32c_pmull_le)
+	adr		r3, .Lcrc32c_constants
+
+0:	bic		LEN, LEN, #15
+	vld1.8		{q1-q2}, [BUF, :128]!
+	vld1.8		{q3-q4}, [BUF, :128]!
+	vmov.i8		qzr, #0
+	vmov.i8		qCONSTANT, #0
+	vmov		dCONSTANTl[0], CRC
+	veor.8		d2, d2, dCONSTANTl
+	sub		LEN, LEN, #0x40
+	cmp		LEN, #0x40
+	blt		less_64
+
+	vld1.64		{qCONSTANT}, [r3]
+
+loop_64:		/* 64 bytes Full cache line folding */
+	sub		LEN, LEN, #0x40
+
+	vmull.p64	q5, d3, dCONSTANTh
+	vmull.p64	q6, d5, dCONSTANTh
+	vmull.p64	q7, d7, dCONSTANTh
+	vmull.p64	q8, d9, dCONSTANTh
+
+	vmull.p64	q1, d2, dCONSTANTl
+	vmull.p64	q2, d4, dCONSTANTl
+	vmull.p64	q3, d6, dCONSTANTl
+	vmull.p64	q4, d8, dCONSTANTl
+
+	veor.8		q1, q1, q5
+	vld1.8		{q5}, [BUF, :128]!
+	veor.8		q2, q2, q6
+	vld1.8		{q6}, [BUF, :128]!
+	veor.8		q3, q3, q7
+	vld1.8		{q7}, [BUF, :128]!
+	veor.8		q4, q4, q8
+	vld1.8		{q8}, [BUF, :128]!
+
+	veor.8		q1, q1, q5
+	veor.8		q2, q2, q6
+	veor.8		q3, q3, q7
+	veor.8		q4, q4, q8
+
+	cmp		LEN, #0x40
+	bge		loop_64
+
+less_64:		/* Folding cache line into 128bit */
+	vldr		dCONSTANTl, [r3, #16]
+	vldr		dCONSTANTh, [r3, #24]
+
+	vmull.p64	q5, d3, dCONSTANTh
+	vmull.p64	q1, d2, dCONSTANTl
+	veor.8		q1, q1, q5
+	veor.8		q1, q1, q2
+
+	vmull.p64	q5, d3, dCONSTANTh
+	vmull.p64	q1, d2, dCONSTANTl
+	veor.8		q1, q1, q5
+	veor.8		q1, q1, q3
+
+	vmull.p64	q5, d3, dCONSTANTh
+	vmull.p64	q1, d2, dCONSTANTl
+	veor.8		q1, q1, q5
+	veor.8		q1, q1, q4
+
+	teq		LEN, #0
+	beq		fold_64
+
+loop_16:		/* Folding rest buffer into 128bit */
+	subs		LEN, LEN, #0x10
+
+	vld1.8		{q2}, [BUF, :128]!
+	vmull.p64	q5, d3, dCONSTANTh
+	vmull.p64	q1, d2, dCONSTANTl
+	veor.8		q1, q1, q5
+	veor.8		q1, q1, q2
+
+	bne		loop_16
+
+fold_64:
+	/* perform the last 64 bit fold, also adds 32 zeroes
+	 * to the input stream */
+	vmull.p64	q2, d2, dCONSTANTh
+	vext.8		q1, q1, qzr, #8
+	veor.8		q1, q1, q2
+
+	/* final 32-bit fold */
+	vldr		dCONSTANTl, [r3, #32]
+	vldr		d6, [r3, #40]
+	vmov.i8		d7, #0
+
+	vext.8		q2, q1, qzr, #4
+	vand.8		d2, d2, d6
+	vmull.p64	q1, d2, dCONSTANTl
+	veor.8		q1, q1, q2
+
+	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+	vldr		dCONSTANTl, [r3, #48]
+	vldr		dCONSTANTh, [r3, #56]
+
+	vand.8		q2, q1, q3
+	vext.8		q2, qzr, q2, #8
+	vmull.p64	q2, d5, dCONSTANTh
+	vand.8		q2, q2, q3
+	vmull.p64	q2, d4, dCONSTANTl
+	veor.8		q1, q1, q2
+	vmov		r0, s5
+
+	bx		lr
+ENDPROC(crc32_pmull_le)
+ENDPROC(crc32c_pmull_le)
+
+	.macro		__crc32, c
+	subs		ip, r2, #8
+	bmi		.Ltail\c
+
+	tst		r1, #3
+	bne		.Lunaligned\c
+
+	teq		ip, #0
+.Laligned8\c:
+	ldrd		r2, r3, [r1], #8
+ARM_BE8(rev		r2, r2		)
+ARM_BE8(rev		r3, r3		)
+	crc32\c\()w	r0, r0, r2
+	crc32\c\()w	r0, r0, r3
+	bxeq		lr
+	subs		ip, ip, #8
+	bpl		.Laligned8\c
+
+.Ltail\c:
+	tst		ip, #4
+	beq		2f
+	ldr		r3, [r1], #4
+ARM_BE8(rev		r3, r3		)
+	crc32\c\()w	r0, r0, r3
+
+2:	tst		ip, #2
+	beq		1f
+	ldrh		r3, [r1], #2
+ARM_BE8(rev16		r3, r3		)
+	crc32\c\()h	r0, r0, r3
+
+1:	tst		ip, #1
+	bxeq		lr
+	ldrb		r3, [r1]
+	crc32\c\()b	r0, r0, r3
+	bx		lr
+
+.Lunaligned\c:
+	tst		r1, #1
+	beq		2f
+	ldrb		r3, [r1], #1
+	subs		r2, r2, #1
+	crc32\c\()b	r0, r0, r3
+
+	tst		r1, #2
+	beq		0f
+2:	ldrh		r3, [r1], #2
+	subs		r2, r2, #2
+ARM_BE8(rev16		r3, r3		)
+	crc32\c\()h	r0, r0, r3
+
+0:	subs		ip, r2, #8
+	bpl		.Laligned8\c
+	b		.Ltail\c
+	.endm
+
+	.align		5
+ENTRY(crc32_armv8_le)
+	__crc32
+ENDPROC(crc32_armv8_le)
+
+	.align		5
+ENTRY(crc32c_armv8_le)
+	__crc32		c
+ENDPROC(crc32c_armv8_le)
diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c
new file mode 100644
index 000000000000..e1566bec1016
--- /dev/null
+++ b/arch/arm/crypto/crc32-ce-glue.c
@@ -0,0 +1,242 @@
+/*
+ * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc32.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+
+#define PMULL_MIN_LEN		64L	/* minimum size of buffer
+					 * for crc32_pmull_le_16 */
+#define SCALE_F			16L	/* size of NEON register */
+
+asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc);
+asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len);
+
+asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc);
+asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len);
+
+static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], u32 len);
+static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], u32 len);
+
+static int crc32_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = 0;
+	return 0;
+}
+
+static int crc32c_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = ~0;
+	return 0;
+}
+
+static int crc32_setkey(struct crypto_shash *hash, const u8 *key,
+			unsigned int keylen)
+{
+	u32 *mctx = crypto_shash_ctx(hash);
+
+	if (keylen != sizeof(u32)) {
+		crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	*mctx = le32_to_cpup((__le32 *)key);
+	return 0;
+}
+
+static int crc32_init(struct shash_desc *desc)
+{
+	u32 *mctx = crypto_shash_ctx(desc->tfm);
+	u32 *crc = shash_desc_ctx(desc);
+
+	*crc = *mctx;
+	return 0;
+}
+
+static int crc32_update(struct shash_desc *desc, const u8 *data,
+			unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	*crc = crc32_armv8_le(*crc, data, length);
+	return 0;
+}
+
+static int crc32c_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	*crc = crc32c_armv8_le(*crc, data, length);
+	return 0;
+}
+
+static int crc32_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	put_unaligned_le32(*crc, out);
+	return 0;
+}
+
+static int crc32c_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	put_unaligned_le32(~*crc, out);
+	return 0;
+}
+
+static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+	unsigned int l;
+
+	if (may_use_simd()) {
+		if ((u32)data % SCALE_F) {
+			l = min_t(u32, length, SCALE_F - ((u32)data % SCALE_F));
+
+			*crc = fallback_crc32(*crc, data, l);
+
+			data += l;
+			length -= l;
+		}
+
+		if (length >= PMULL_MIN_LEN) {
+			l = round_down(length, SCALE_F);
+
+			kernel_neon_begin();
+			*crc = crc32_pmull_le(data, l, *crc);
+			kernel_neon_end();
+
+			data += l;
+			length -= l;
+		}
+	}
+
+	if (length > 0)
+		*crc = fallback_crc32(*crc, data, length);
+
+	return 0;
+}
+
+static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+	unsigned int l;
+
+	if (may_use_simd()) {
+		if ((u32)data % SCALE_F) {
+			l = min_t(u32, length, SCALE_F - ((u32)data % SCALE_F));
+
+			*crc = fallback_crc32c(*crc, data, l);
+
+			data += l;
+			length -= l;
+		}
+
+		if (length >= PMULL_MIN_LEN) {
+			l = round_down(length, SCALE_F);
+
+			kernel_neon_begin();
+			*crc = crc32c_pmull_le(data, l, *crc);
+			kernel_neon_end();
+
+			data += l;
+			length -= l;
+		}
+	}
+
+	if (length > 0)
+		*crc = fallback_crc32c(*crc, data, length);
+
+	return 0;
+}
+
+static struct shash_alg crc32_pmull_algs[] = { {
+	.setkey			= crc32_setkey,
+	.init			= crc32_init,
+	.update			= crc32_update,
+	.final			= crc32_final,
+	.descsize		= sizeof(u32),
+	.digestsize		= sizeof(u32),
+
+	.base.cra_ctxsize	= sizeof(u32),
+	.base.cra_init		= crc32_cra_init,
+	.base.cra_name		= "crc32",
+	.base.cra_driver_name	= "crc32-arm-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.setkey			= crc32_setkey,
+	.init			= crc32_init,
+	.update			= crc32c_update,
+	.final			= crc32c_final,
+	.descsize		= sizeof(u32),
+	.digestsize		= sizeof(u32),
+
+	.base.cra_ctxsize	= sizeof(u32),
+	.base.cra_init		= crc32c_cra_init,
+	.base.cra_name		= "crc32c",
+	.base.cra_driver_name	= "crc32c-arm-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_module	= THIS_MODULE,
+} };
+
+static int __init crc32_pmull_mod_init(void)
+{
+	if (elf_hwcap2 & HWCAP2_PMULL) {
+		crc32_pmull_algs[0].update = crc32_pmull_update;
+		crc32_pmull_algs[1].update = crc32c_pmull_update;
+
+		if (elf_hwcap2 & HWCAP2_CRC32) {
+			fallback_crc32 = crc32_armv8_le;
+			fallback_crc32c = crc32c_armv8_le;
+		} else {
+			fallback_crc32 = crc32_le;
+			fallback_crc32c = __crc32c_le;
+		}
+	} else if (!(elf_hwcap2 & HWCAP2_CRC32)) {
+		return -ENODEV;
+	}
+
+	return crypto_register_shashes(crc32_pmull_algs,
+				       ARRAY_SIZE(crc32_pmull_algs));
+}
+
+static void __exit crc32_pmull_mod_exit(void)
+{
+	crypto_unregister_shashes(crc32_pmull_algs,
+				  ARRAY_SIZE(crc32_pmull_algs));
+}
+
+module_init(crc32_pmull_mod_init);
+module_exit(crc32_pmull_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crc32");
+MODULE_ALIAS_CRYPTO("crc32c");
-- 
2.7.4

^ permalink raw reply related

* [PATCH] ARM: dts: imx6sx-udoo-neo: Pass the 'phy-reset-duration' property
From: Fabio Estevam @ 2016-12-05 19:28 UTC (permalink / raw)
  To: linux-arm-kernel

From: Fabio Estevam <fabio.estevam@nxp.com>

imx6sx-udoo-neo has a KSZ8091 Ethernet PHY, which requires the reset
signal to be low for at least 10ms.

Pass the 'phy-reset-duration' property to reflect such requirement.

Signed-off-by: Fabio Estevam <fabio.estevam@nxp.com>
---
 arch/arm/boot/dts/imx6sx-udoo-neo.dtsi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/boot/dts/imx6sx-udoo-neo.dtsi b/arch/arm/boot/dts/imx6sx-udoo-neo.dtsi
index 2b65d26..d1ce9af 100644
--- a/arch/arm/boot/dts/imx6sx-udoo-neo.dtsi
+++ b/arch/arm/boot/dts/imx6sx-udoo-neo.dtsi
@@ -86,6 +86,7 @@
 	pinctrl-names = "default";
 	pinctrl-0 = <&pinctrl_enet1>;
 	phy-mode = "rmii";
+	phy-reset-duration = <10>;
 	phy-reset-gpios = <&gpio2 1 GPIO_ACTIVE_LOW>;
 };
 
-- 
2.7.4

^ permalink raw reply related

* ILP32 for ARM64: testing with glibc testsuite
From: Steve Ellcey @ 2016-12-05 19:33 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <mvmeg1mn0yp.fsf@hawking.suse.de>

On Mon, 2016-12-05 at 11:07 +0100, Andreas Schwab wrote:
> On Dez 05 2016, "Zhangjian (Bamvor)" <bamvor.zhangjian@huawei.com>
> wrote:
> 
> > 
> > Is there some progresses on it? We could collabrate to fix those
> > issues.
> All the elf/nptl/rt fails should be fixed by the recent binutils
> fixes.
> 
> Andreas.

I am using binutils ToT and Yury's latest patch (https://sourceware.org
/ml/binutils/2016-12/msg00039.html) and I am still seeing some nptl and
rt failures in the glibc testsuite, specifically:

FAIL: nptl/tst-cancel26
FAIL: nptl/tst-cancel27
FAIL: nptl/tst-stack4
FAIL: rt/tst-mqueue1
FAIL: rt/tst-mqueue2
FAIL: rt/tst-mqueue4
FAIL: rt/tst-mqueue7

Steve Ellcey
sellcey at caviumnetworks.com

^ permalink raw reply

* [PATCH] ARM: dts: imx7d: fix LCDIF clock assignment
From: Stefan Agner @ 2016-12-05 20:29 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161205070609.a6t7hzh3m3l2t37s@pengutronix.de>

On 2016-12-04 23:06, Uwe Kleine-K?nig wrote:
> Hello Stefan,
> 
> On Sun, Dec 04, 2016 at 05:26:58PM -0800, Stefan Agner wrote:
>> Since this fixes a kernel freeze, is there a chance to get this still in
>> 4.9?
> 
> a Fixes:-Line would be nice then.

Good point.

Fixes: e8ed73f691bd ("ARM: dts: imx7d: add lcdif support")

--
Stefan

^ permalink raw reply

* [PATCH v3] PCI/ACPI: xgene: Add ECAM quirk for X-Gene PCIe controller
From: Bjorn Helgaas @ 2016-12-05 21:20 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <CADaLNDk-cMW8k+bXqH3+yYCye_kHf6RXUYQ=eCCZ4FF2ZTjgkQ@mail.gmail.com>

On Fri, Dec 02, 2016 at 11:06:30PM -0800, Duc Dang wrote:
> On Fri, Dec 2, 2016 at 3:39 PM, Bjorn Helgaas <helgaas@kernel.org> wrote:

> > diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c
> > index 8a177a1..a16fc8e 100644
> > --- a/arch/arm64/kernel/pci.c
> > +++ b/arch/arm64/kernel/pci.c
> > @@ -114,6 +114,19 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
> >         return 0;
> >  }
> >
> > +static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
> > +{
> > +       struct resource_entry *entry, *tmp;
> > +       int status;
> > +
> > +       status = acpi_pci_probe_root_resources(ci);
> > +       resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
> > +               if (!(entry->res->flags & IORESOURCE_WINDOW))
> > +                       resource_list_destroy_entry(entry);
> > +       }
> > +       return status;
> > +}
> > +
> >  /*
> >   * Lookup the bus range for the domain in MCFG, and set up config space
> >   * mapping.
> > @@ -190,6 +203,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
> >         }
> >
> >         root_ops->release_info = pci_acpi_generic_release_info;
> > +       root_ops->prepare_resources = pci_acpi_root_prepare_resources;
> >         root_ops->pci_ops = &ri->cfg->ops->pci_ops;
> >         bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg);
> >         if (!bus)
> 
> I tried your patch above with my X-Gene ECAM v4 patch on Mustang, here
> is the kernel boot log and output of 'cat /proc/iomem'. The PCIe core
> does not print the MMIO space as a window (which is expected per your
> patch above).

Thanks!

> ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-ff])
> acpi PNP0A08:00: _OSC: OS supports [ExtendedConfig ASPM ClockPM Segments MSI]
> acpi PNP0A08:00: _OSC: OS now controls [PCIeHotplug PME AER PCIeCapability]
> acpi PNP0A08:00: MCFG quirk: ECAM at [mem 0xe0d0000000-0xe0dfffffff] for [bus 00-ff] with xgene_v1_pcie_ecam_ops
> acpi PNP0A08:00: [Firmware Bug]: ECAM area [mem 0xe0d0000000-0xe0dfffffff] not reserved in ACPI namespace
> acpi PNP0A08:00: ECAM at [mem 0xe0d0000000-0xe0dfffffff] for [bus 00-ff]
> Remapped I/O 0x000000e010000000 to [io  0x0000-0xffff window]
> PCI host bridge to bus 0000:00
> pci_bus 0000:00: root bus resource [io  0x0000-0xffff window] (bus address [0x10000000-0x1000ffff])
> pci_bus 0000:00: root bus resource [mem 0xe040000000-0xe07fffffff window] (bus address [0x40000000-0x7fffffff])
> pci_bus 0000:00: root bus resource [mem 0xf000000000-0xffffffffff window]
> pci_bus 0000:00: root bus resource [bus 00-ff]

Yup, no bridge register space here; that's good.  I assume the bridge
registers are at [mem 0x1f2b0000-0x1f2bffff] as shown in /proc/iomem
below.

> [root@(none) ~]# cat /proc/io mem
> ...
> 19000000-19007fff : 808622B7:00
> 1900c100-190fffff : 808622B7:00
>   1900c100-190fffff : 808622B7:00
> 19800000-19807fff : 808622B7:01
> 1980c100-198fffff : 808622B7:01
>   1980c100-198fffff : 808622B7:01
> ...
> 1f280000-1f28ffff : 808622B7:00
> 1f290000-1f29ffff : 808622B7:01

I'm curious what these "808622B7" devices are.  Per ACPI 6.0, sec
6.1.5, that looks like a PCI vendor ID, which I guess is a valid ACPI
ID.  But these resources don't seem to have any connection with PCI
(they're not in any of the host bridge apertures).

> 1f2b0000-1f2bffff : PNP0A08:00

Looks like the bridge register space; good.

> e040000000-e07fffffff : PCI Bus 0000:00
>   e040000000-e0401fffff : PCI Bus 0000:01
>     e040000000-e0400fffff : 0000:01:00.0
>       e040000000-e0400fffff : mlx4_core
>     e040100000-e0401fffff : 0000:01:00.0

> e0d0000000-e0dfffffff : PCI ECAM

This region should be described in either a PNP0C02 device or (if we
decide we can allow "consumer" descriptors) the PNP0A08 device.  I
assume you'll fix that in a future firmware release.

But I think this reservation from pci_ecam_create() is good enough for
now.

> f000000000-ffffffffff : PCI Bus 0000:00
>   f000000000-f001ffffff : PCI Bus 0000:01
>     f000000000-f001ffffff : 0000:01:00.0
>       f000000000-f001ffffff : mlx4_core

^ permalink raw reply

* [PATCH v3] PCI/ACPI: xgene: Add ECAM quirk for X-Gene PCIe controller
From: Bjorn Helgaas @ 2016-12-05 21:21 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <62dd5462-4d68-b89f-2b45-0d58bbd96bcd@redhat.com>

On Fri, Dec 02, 2016 at 07:33:46PM -0500, Jon Masters wrote:
> On 12/02/2016 06:39 PM, Bjorn Helgaas wrote:
> > On Thu, Dec 01, 2016 at 11:08:23PM -0500, Jon Masters wrote:
> 
> >> Let's see if I summarized this correctly...
> >>
> >> 1. The MMIO registers for the host bridge itself need to be described
> >>    somewhere, especially if we need to find those in a quirk and poke
> >>    them. Since those registers are very much part of the bridge device,
> >>    it makes sense for them to be in the _CRS for PNP0A08/PNP0A03.
> >>
> >> 2. The address space covering these registers MUST be described as a
> >>    ResourceConsumer in order to avoid accidentally exposing them as
> >>    available for use by downstream devices on the PCI bus.
> >>
> >> 3. The ACPI specification allows for resources of the type "Memory32Fixed".
> >>    This is a macro that doesn't have the notion of a producer or consumer.
> >>    HOWEVER various interpretations seem to be that this could/should
> >>    default to being interpreted as a consumed region.
> > 
> > I agree; I think that per spec, Memory24, Memory32, Memory32Fixed, IO,
> > and FixedIO should all be for consumed resources, not for bridge
> > windows, since they don't have the notion of producer.
> 
> Ok. If we ultimately codify this somewhere as the general Linux kernel
> consensus (Rafael?) then we can also go and get the various ARM server
> specs updated to reflect this in (for e.g.) reference firmware builds.
> 
> > I'm pretty sure there's x86 firmware in the field that uses these for
> > windows, so I think we have to accept that usage, at least on x86.
> 
> Ok. I was pondering how to even go about finding that out, but even if
> I scheduled a job across RH's infra to look, that would be a drop in
> the bucket of possible machines that might be out there doing this.

Hmmm, when researching this, I thought I came across a change
specifically for a machine that used Memory32Fixed this way, but I
can't find it now.

The only thing I did find was some old experiments with Windows that
showed it interpreting a Memory32Fixed region as a window and putting
PCI devices in it: https://bugzilla.kernel.org/show_bug.cgi?id=15817
But that was a synthetic example with qemu, not a real machine in the
field.

> > Even without this patch, I don't think it's a show-stopper to have
> > Linux mistakenly thinking this region is routed to PCI, because the
> > driver does reserve it and the PCI core will never try to use it.
> 
> Ok. So are you happy with pulling in Duc's v4 patch and retaining
> status quo on the bridge resources for 4.10?

Yes, I think it looks good.  I'll finish packaging things up and
repost the current series.

Bjorn

^ permalink raw reply

* [PATCH v3] PCI/ACPI: xgene: Add ECAM quirk for X-Gene PCIe controller
From: Duc Dang @ 2016-12-05 21:40 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161205212012.GA22455@bhelgaas-glaptop.roam.corp.google.com>

On Mon, Dec 5, 2016 at 1:20 PM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> On Fri, Dec 02, 2016 at 11:06:30PM -0800, Duc Dang wrote:
>> On Fri, Dec 2, 2016 at 3:39 PM, Bjorn Helgaas <helgaas@kernel.org> wrote:
>
>> > diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c
>> > index 8a177a1..a16fc8e 100644
>> > --- a/arch/arm64/kernel/pci.c
>> > +++ b/arch/arm64/kernel/pci.c
>> > @@ -114,6 +114,19 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
>> >         return 0;
>> >  }
>> >
>> > +static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
>> > +{
>> > +       struct resource_entry *entry, *tmp;
>> > +       int status;
>> > +
>> > +       status = acpi_pci_probe_root_resources(ci);
>> > +       resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
>> > +               if (!(entry->res->flags & IORESOURCE_WINDOW))
>> > +                       resource_list_destroy_entry(entry);
>> > +       }
>> > +       return status;
>> > +}
>> > +
>> >  /*
>> >   * Lookup the bus range for the domain in MCFG, and set up config space
>> >   * mapping.
>> > @@ -190,6 +203,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
>> >         }
>> >
>> >         root_ops->release_info = pci_acpi_generic_release_info;
>> > +       root_ops->prepare_resources = pci_acpi_root_prepare_resources;
>> >         root_ops->pci_ops = &ri->cfg->ops->pci_ops;
>> >         bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg);
>> >         if (!bus)
>>
>> I tried your patch above with my X-Gene ECAM v4 patch on Mustang, here
>> is the kernel boot log and output of 'cat /proc/iomem'. The PCIe core
>> does not print the MMIO space as a window (which is expected per your
>> patch above).
>
> Thanks!
>
>> ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-ff])
>> acpi PNP0A08:00: _OSC: OS supports [ExtendedConfig ASPM ClockPM Segments MSI]
>> acpi PNP0A08:00: _OSC: OS now controls [PCIeHotplug PME AER PCIeCapability]
>> acpi PNP0A08:00: MCFG quirk: ECAM at [mem 0xe0d0000000-0xe0dfffffff] for [bus 00-ff] with xgene_v1_pcie_ecam_ops
>> acpi PNP0A08:00: [Firmware Bug]: ECAM area [mem 0xe0d0000000-0xe0dfffffff] not reserved in ACPI namespace
>> acpi PNP0A08:00: ECAM at [mem 0xe0d0000000-0xe0dfffffff] for [bus 00-ff]
>> Remapped I/O 0x000000e010000000 to [io  0x0000-0xffff window]
>> PCI host bridge to bus 0000:00
>> pci_bus 0000:00: root bus resource [io  0x0000-0xffff window] (bus address [0x10000000-0x1000ffff])
>> pci_bus 0000:00: root bus resource [mem 0xe040000000-0xe07fffffff window] (bus address [0x40000000-0x7fffffff])
>> pci_bus 0000:00: root bus resource [mem 0xf000000000-0xffffffffff window]
>> pci_bus 0000:00: root bus resource [bus 00-ff]
>
> Yup, no bridge register space here; that's good.  I assume the bridge
> registers are at [mem 0x1f2b0000-0x1f2bffff] as shown in /proc/iomem
> below.

Yes,  the bridge registers are at [mem 0x1f2b0000-0x1f2bffff].

>
>> [root@(none) ~]# cat /proc/io mem
>> ...
>> 19000000-19007fff : 808622B7:00
>> 1900c100-190fffff : 808622B7:00
>>   1900c100-190fffff : 808622B7:00
>> 19800000-19807fff : 808622B7:01
>> 1980c100-198fffff : 808622B7:01
>>   1980c100-198fffff : 808622B7:01
>> ...
>> 1f280000-1f28ffff : 808622B7:00
>> 1f290000-1f29ffff : 808622B7:01
>
> I'm curious what these "808622B7" devices are.  Per ACPI 6.0, sec
> 6.1.5, that looks like a PCI vendor ID, which I guess is a valid ACPI
> ID.  But these resources don't seem to have any connection with PCI
> (they're not in any of the host bridge apertures).

These are DesignWare USB 3.0 controllers (DWC3). The ACPI ID is
defined in drivers/usb/dwc3/core.c.
>
>> 1f2b0000-1f2bffff : PNP0A08:00
>
> Looks like the bridge register space; good.

Yes, it is.

>
>> e040000000-e07fffffff : PCI Bus 0000:00
>>   e040000000-e0401fffff : PCI Bus 0000:01
>>     e040000000-e0400fffff : 0000:01:00.0
>>       e040000000-e0400fffff : mlx4_core
>>     e040100000-e0401fffff : 0000:01:00.0
>
>> e0d0000000-e0dfffffff : PCI ECAM
>
> This region should be described in either a PNP0C02 device or (if we
> decide we can allow "consumer" descriptors) the PNP0A08 device.  I
> assume you'll fix that in a future firmware release.

Yes, future firmware will have PNP0C02 node that describes this ECAM
space (or a new resource in PNP0A08 if we use 'consumer' descriptor).

>
> But I think this reservation from pci_ecam_create() is good enough for
> now.
>
>> f000000000-ffffffffff : PCI Bus 0000:00
>>   f000000000-f001ffffff : PCI Bus 0000:01
>>     f000000000-f001ffffff : 0000:01:00.0
>>       f000000000-f001ffffff : mlx4_core
Regards,
Duc Dang.

^ permalink raw reply

* [PATCH v3] PCI/ACPI: xgene: Add ECAM quirk for X-Gene PCIe controller
From: Bjorn Helgaas @ 2016-12-05 21:53 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <CADaLNDm6fPQ6ekQa85fveAamJtF3+HGeOvprmgGJ4gLnMhF2_w@mail.gmail.com>

On Thu, Dec 01, 2016 at 06:52:23PM -0800, Duc Dang wrote:
> On Thu, Dec 1, 2016 at 10:33 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:

> I made similar changes in v4 patch. The ECAM quirk will be built when
> ACPI and PCI_QUIRKS are enabled.
> 
> When building for DT only, the ECAM quirk won't be compiled.

Perfect.

> >>  #define XGENE_PCIE_IP_VER_UNKN               0
> >>  #define XGENE_PCIE_IP_VER_1          1
> >> +#define XGENE_PCIE_IP_VER_2          2
> >
> > This isn't used anywhere, which makes me wonder whether it's worth
> > keeping it.
> 
> V2 controller will use this XGENE_PCIE_IP_VER_2 (port->version =
> XGENE_PCIE_IP_VER_2). This will be used to indicate that the
> controller is V2, and to enable configuration request retry status
> feature (by not disable it like V1 controller).

OK, I see.  You don't actually need XGENE_PCIE_IP_VER_2, you just need
port->version to be something other than XGENE_PCIE_IP_VER_1.  So this
is fine as it is.

> >>  static void __iomem *xgene_pcie_get_cfg_base(struct pci_bus *bus)
> >>  {
> >> -     struct xgene_pcie_port *port = bus->sysdata;
> >> +     struct pci_config_window *cfg;
> >> +     struct xgene_pcie_port *port;
> >> +
> >> +     if (acpi_disabled)
> >> +             port = bus->sysdata;
> >> +     else {
> >> +             cfg = bus->sysdata;
> >> +             port = cfg->priv;
> >> +     }
> >
> > I would really, really like to figure out a way to get rid of these
> > "if (acpi_disabled)" checks sprinkled through here.  Is there any way
> > we can set up bus->sysdata to be the same, regardless of whether we're
> > using this as a platform driver or an ACPI quirk?
> 
> Right now, I created a inline function to extract xgene_pcie_port from
> pci_bus. In order to get rid of acpi_disabled, I will need to make
> sysdata in DT case also point to pci_config_window structure, which
> means I will need to convert and test the DT driver to use ecam ops.
> It is a separate patch itself. So I think I should do it at later time
> (after this ECAM quirk patch). I hope you are ok with this.

OK.  I did the simple-minded version of leaving the DT ops the same
but making sysdata point to a dummy pci_config_window.  Your proposal
of using ECAM for DT would be much better.

It's interesting that you actually already use the same accessors
except that DT uses the 32-bit pci_generic_config_write32() and ACPI
uses the regular pci_generic_config_write(). I guess that means the
hardware actually *does* support sub-32 bit writes?

> I need to define the function (xgene_get_csr_resource()) inside
> pci-xgene.c to duplicate the code of acpi_get_rc_addr. The reason is
> X-Gene firmware does not have a dedicate PNP0C02 node to declare the
> resource, and if I use acpi_get_rc_resources() with "PNP0A08", I got
> error due to acpi_bus_get_device() returns error.

Looks good.

> > All these init functions are almost identical.  Can we factor this out
> > by having wrappers that do nothing more than pass in the table and
> > version, and put the kzalloc and ioremap in a shared back-end?
> 
> I refactor-ed these .init functions. And as a result, there are only 2
> ecam ops left: xgene_v1_pcie_ecam_ops and xgene_v2_pcie_ecam_ops.

Looks good.

Bjorn

^ permalink raw reply

* [PATCH v3] PCI/ACPI: xgene: Add ECAM quirk for X-Gene PCIe controller
From: Duc Dang @ 2016-12-05 22:09 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161205215318.GB22455@bhelgaas-glaptop.roam.corp.google.com>

On Mon, Dec 5, 2016 at 1:53 PM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> On Thu, Dec 01, 2016 at 06:52:23PM -0800, Duc Dang wrote:
>> On Thu, Dec 1, 2016 at 10:33 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:
>
>> I made similar changes in v4 patch. The ECAM quirk will be built when
>> ACPI and PCI_QUIRKS are enabled.
>>
>> When building for DT only, the ECAM quirk won't be compiled.
>
> Perfect.
>
>> >>  #define XGENE_PCIE_IP_VER_UNKN               0
>> >>  #define XGENE_PCIE_IP_VER_1          1
>> >> +#define XGENE_PCIE_IP_VER_2          2
>> >
>> > This isn't used anywhere, which makes me wonder whether it's worth
>> > keeping it.
>>
>> V2 controller will use this XGENE_PCIE_IP_VER_2 (port->version =
>> XGENE_PCIE_IP_VER_2). This will be used to indicate that the
>> controller is V2, and to enable configuration request retry status
>> feature (by not disable it like V1 controller).
>
> OK, I see.  You don't actually need XGENE_PCIE_IP_VER_2, you just need
> port->version to be something other than XGENE_PCIE_IP_VER_1.  So this
> is fine as it is.
>
>> >>  static void __iomem *xgene_pcie_get_cfg_base(struct pci_bus *bus)
>> >>  {
>> >> -     struct xgene_pcie_port *port = bus->sysdata;
>> >> +     struct pci_config_window *cfg;
>> >> +     struct xgene_pcie_port *port;
>> >> +
>> >> +     if (acpi_disabled)
>> >> +             port = bus->sysdata;
>> >> +     else {
>> >> +             cfg = bus->sysdata;
>> >> +             port = cfg->priv;
>> >> +     }
>> >
>> > I would really, really like to figure out a way to get rid of these
>> > "if (acpi_disabled)" checks sprinkled through here.  Is there any way
>> > we can set up bus->sysdata to be the same, regardless of whether we're
>> > using this as a platform driver or an ACPI quirk?
>>
>> Right now, I created a inline function to extract xgene_pcie_port from
>> pci_bus. In order to get rid of acpi_disabled, I will need to make
>> sysdata in DT case also point to pci_config_window structure, which
>> means I will need to convert and test the DT driver to use ecam ops.
>> It is a separate patch itself. So I think I should do it at later time
>> (after this ECAM quirk patch). I hope you are ok with this.
>
> OK.  I did the simple-minded version of leaving the DT ops the same
> but making sysdata point to a dummy pci_config_window.  Your proposal
> of using ECAM for DT would be much better.
>
> It's interesting that you actually already use the same accessors
> except that DT uses the 32-bit pci_generic_config_write32() and ACPI
> uses the regular pci_generic_config_write(). I guess that means the
> hardware actually *does* support sub-32 bit writes?

Yes, the hardware does support sub-32 bit writes (and reads). This is
another item in my TODO list for DT (which does not seem quite urgent
now): switch to use pci_generic_config_write for DT. But, well, I will
need to do that for read as well (for both ACPI and DT).

>
>> I need to define the function (xgene_get_csr_resource()) inside
>> pci-xgene.c to duplicate the code of acpi_get_rc_addr. The reason is
>> X-Gene firmware does not have a dedicate PNP0C02 node to declare the
>> resource, and if I use acpi_get_rc_resources() with "PNP0A08", I got
>> error due to acpi_bus_get_device() returns error.
>
> Looks good.
>
>> > All these init functions are almost identical.  Can we factor this out
>> > by having wrappers that do nothing more than pass in the table and
>> > version, and put the kzalloc and ioremap in a shared back-end?
>>
>> I refactor-ed these .init functions. And as a result, there are only 2
>> ecam ops left: xgene_v1_pcie_ecam_ops and xgene_v2_pcie_ecam_ops.
>
> Looks good.
>
> Bjorn
Regards,
Duc Dang.

^ permalink raw reply

* [PATCH] ACPI/IORT: Make dma masks set-up IORT specific
From: Rafael J. Wysocki @ 2016-12-05 22:18 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161205122619.25045-1-lorenzo.pieralisi@arm.com>

On Mon, Dec 5, 2016 at 1:26 PM, Lorenzo Pieralisi
<lorenzo.pieralisi@arm.com> wrote:
> The introduction of acpi_dma_configure() allows to configure DMA
> and related IOMMU for any device that is DMA capable. To achieve
> that goal it ensures DMA masks are set-up to sane default values
> before proceeding with IOMMU and DMA ops configuration.
>
> On x86/ia64 systems, through acpi_bind_one(), acpi_dma_configure() is
> called for every device that has an ACPI companion, in that every device
> is considered DMA capable on x86/ia64 systems (ie acpi_get_dma_attr() API),
> which has the side effect of initializing dma masks also for
> pseudo-devices (eg CPUs and memory nodes) and potentially for devices
> whose dma masks were not set-up before the acpi_dma_configure() API was
> introduced, which may have noxious side effects.
>
> Therefore, in preparation for IORT firmware specific DMA masks set-up,
> wrap the default DMA masks set-up in acpi_dma_configure() inside an IORT
> specific wrapper that reverts to a NOP on x86/ia64 systems, restoring the
> default expected behaviour on x86/ia64 systems and keeping DMA default
> masks set-up on IORT based (ie ARM) arch configurations.
>
> Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Hanjun Guo <hanjun.guo@linaro.org>
> Cc: Bjorn Helgaas <bhelgaas@google.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Tomasz Nowicki <tn@semihalf.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
> Cc: Sricharan R <sricharan@codeaurora.org>

Acked -by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

> ---
> Joerg,
>
> pending Rafael's ACK on it, given the 4.10 release timing and that the
> series is queued via the IOMMU tree please consider applying this patch to
> your arm/smmu branch for 4.10, it is not fixing a bug but it is modifying
> the x86/ia64 code path; I prefer preventing any issue related to default
> dma masks on x86/ia64 so I hope it can get merged along with the rest of
> the ACPI IORT SMMU series.
>
> Thanks a lot and apologies,
> Lorenzo
>
>  drivers/acpi/arm64/iort.c | 22 ++++++++++++++++++++++
>  drivers/acpi/scan.c       | 14 +-------------
>  include/linux/acpi_iort.h |  2 ++
>  3 files changed, 25 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
> index 47bace8..e0d2e6e 100644
> --- a/drivers/acpi/arm64/iort.c
> +++ b/drivers/acpi/arm64/iort.c
> @@ -547,6 +547,28 @@ static const struct iommu_ops *iort_iommu_xlate(struct device *dev,
>  }
>
>  /**
> + * iort_set_dma_mask - Set-up dma mask for a device.
> + *
> + * @dev: device to configure
> + */
> +void iort_set_dma_mask(struct device *dev)
> +{
> +       /*
> +        * Set default coherent_dma_mask to 32 bit.  Drivers are expected to
> +        * setup the correct supported mask.
> +        */
> +       if (!dev->coherent_dma_mask)
> +               dev->coherent_dma_mask = DMA_BIT_MASK(32);
> +
> +       /*
> +        * Set it to coherent_dma_mask by default if the architecture
> +        * code has not set it.
> +        */
> +       if (!dev->dma_mask)
> +               dev->dma_mask = &dev->coherent_dma_mask;
> +}
> +
> +/**
>   * iort_iommu_configure - Set-up IOMMU configuration for a device.
>   *
>   * @dev: device to configure
> diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
> index 80698d3..93b00cf 100644
> --- a/drivers/acpi/scan.c
> +++ b/drivers/acpi/scan.c
> @@ -1380,19 +1380,7 @@ void acpi_dma_configure(struct device *dev, enum dev_dma_attr attr)
>  {
>         const struct iommu_ops *iommu;
>
> -       /*
> -        * Set default coherent_dma_mask to 32 bit.  Drivers are expected to
> -        * setup the correct supported mask.
> -        */
> -       if (!dev->coherent_dma_mask)
> -               dev->coherent_dma_mask = DMA_BIT_MASK(32);
> -
> -       /*
> -        * Set it to coherent_dma_mask by default if the architecture
> -        * code has not set it.
> -        */
> -       if (!dev->dma_mask)
> -               dev->dma_mask = &dev->coherent_dma_mask;
> +       iort_set_dma_mask(dev);
>
>         iommu = iort_iommu_configure(dev);
>
> diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
> index dcb2b60..77e0809 100644
> --- a/include/linux/acpi_iort.h
> +++ b/include/linux/acpi_iort.h
> @@ -35,6 +35,7 @@ bool iort_node_match(u8 type);
>  u32 iort_msi_map_rid(struct device *dev, u32 req_id);
>  struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id);
>  /* IOMMU interface */
> +void iort_set_dma_mask(struct device *dev);
>  const struct iommu_ops *iort_iommu_configure(struct device *dev);
>  #else
>  static inline void acpi_iort_init(void) { }
> @@ -45,6 +46,7 @@ static inline struct irq_domain *iort_get_device_domain(struct device *dev,
>                                                         u32 req_id)
>  { return NULL; }
>  /* IOMMU interface */
> +static inline void iort_set_dma_mask(struct device *dev) { }
>  static inline
>  const struct iommu_ops *iort_iommu_configure(struct device *dev)
>  { return NULL; }
> --
> 2.10.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo at vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v4 1/9] dt-bindings: clarify compatible property for rockchip timers
From: Rob Herring @ 2016-12-05 22:22 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480436092-10728-2-git-send-email-al.kochet@gmail.com>

On Tue, Nov 29, 2016 at 07:14:44PM +0300, Alexander Kochetkov wrote:
> Make all properties description in form '"rockchip,<chip>-timer",
> "rockchip,rk3288-timer"' for all chips found in linux kernel.
> 
> Suggested-by: Heiko St?bner <heiko@sntech.de>
> Signed-off-by: Alexander Kochetkov <al.kochet@gmail.com>
> ---
>  .../bindings/timer/rockchip,rk-timer.txt           |   12 +++++++++---
>  1 file changed, 9 insertions(+), 3 deletions(-)

Acked-by: Rob Herring <robh@kernel.org>

^ permalink raw reply

* [PATCH v4 4/4] [media] dt-bindings: add TI VPIF documentation
From: Rob Herring @ 2016-12-05 22:27 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161129235712.29846-5-khilman@baylibre.com>

On Tue, Nov 29, 2016 at 03:57:12PM -0800, Kevin Hilman wrote:
> Signed-off-by: Kevin Hilman <khilman@baylibre.com>
> ---
>  .../devicetree/bindings/media/ti,da850-vpif.txt    | 67 ++++++++++++++++++++++
>  1 file changed, 67 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/media/ti,da850-vpif.txt

Acked-by: Rob Herring <robh@kernel.org>

^ permalink raw reply

* [RFC PATCH 00/29] arm64: Scalable Vector Extension core support
From: Torvald Riegel @ 2016-12-05 22:42 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161130120654.GJ1574@e103592.cambridge.arm.com>

On Wed, 2016-11-30 at 12:06 +0000, Dave Martin wrote:
> So, my key goal is to support _per-process_ vector length control.
> 
> From the kernel perspective, it is easiest to achieve this by providing
> per-thread control since that is the unit that context switching acts
> on.
> 
> How useful it really is to have threads with different VLs in the same
> process is an open question.  It's theoretically useful for runtime
> environments, which may want to dispatch code optimised for different
> VLs

What would be the primary use case(s)?  Vectorization of short vectors
(eg, if having an array of structs or sth like that)?

> -- changing the VL on-the-fly within a single thread is not
> something I want to encourage, due to overhead and ABI issues, but
> switching between threads of different VLs would be more manageable.

So if on-the-fly switching is probably not useful, that would mean we
need special threads for the use cases.  Is that a realistic assumption
for the use cases?  Or do you primarily want to keep it possible to do
this, regardless of whether there are real use cases now?
I suppose allowing for a per-thread setting of VL could also be added as
a feature in the future without breaking existing code.

> For setcontext/setjmp, we don't save/restore any SVE state due to the
> caller-save status of SVE, and I would not consider it necessary to
> save/restore VL itself because of the no-change-on-the-fly policy for
> this.

Thus, you would basically consider VL changes or per-thread VL as in the
realm of compilation internals?  So, the specific size for a particular
piece of code would not be part of an ABI?

> I'm not familiar with resumable functions/executors -- are these in
> the C++ standards yet (not that that would cause me to be familiar
> with them... ;)  Any implementation of coroutines (i.e.,
> cooperative switching) is likely to fall under the "setcontext"
> argument above.

These are not part of the C++ standard yet, but will appear in TSes.
There are various features for which implementations would be assumed to
use one OS thread for several tasks, coroutines, etc.  Some of them
switch between these tasks or coroutines while these are running,
whereas the ones that will be in C++17 only run more than parallel task
on the same OS thread but one after the other (like in a thread pool).

However, if we are careful not to expose VL or make promises about it,
this may just end up being a detail similar to, say, register
allocation, which isn't exposed beyond the internals of a particular
compiler either.
Exposing it as a feature the user can set without messing with the
implementation would introduce additional thread-specific state, as
Florian said.  This might not be a show-stopper by itself, but the more
thread-specific state we have the more an implementation has to take
care of or switch, and the higher the runtime costs are.  C++17 already
makes weaker promises for TLS for parallel tasks, so that
implementations don't have to run TLS constructors or destructors just
because a small parallel task was executed.

^ permalink raw reply

* [PATCH 1/3] Add DT bindings documentation for NS2 USB DRD phy
From: Rob Herring @ 2016-12-05 23:09 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480485338-23451-2-git-send-email-raviteja.garimella@broadcom.com>

On Wed, Nov 30, 2016 at 11:25:36AM +0530, Raviteja Garimella wrote:
> This patch adds documentation for NS2 DRD Phy driver DT bindings
> 
> Signed-off-by: Raviteja Garimella <raviteja.garimella@broadcom.com>
> ---
>  .../devicetree/bindings/phy/brcm,ns2-drd-phy.txt   | 40 ++++++++++++++++++++++
>  1 file changed, 40 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/phy/brcm,ns2-drd-phy.txt
> 
> diff --git a/Documentation/devicetree/bindings/phy/brcm,ns2-drd-phy.txt b/Documentation/devicetree/bindings/phy/brcm,ns2-drd-phy.txt
> new file mode 100644
> index 0000000..5857f99
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/phy/brcm,ns2-drd-phy.txt
> @@ -0,0 +1,40 @@
> +BROADCOM NORTHSTAR2 USB2 (DUAL ROLE DEVICE) PHY
> +
> +Required properties:
> + - compatible: brcm,ns2-drd-phy
> + - reg: offset and length of the NS2 PHY related registers.
> + - reg-names
> +   The below registers must be provided.
> +   icfg - for DRD ICFG configurations
> +   rst-ctrl - for DRD IDM reset
> +   crmu-ctrl - for CRMU core vdd, PHY and PHY PLL reset
> +   usb2-strap - for port over current polarity reversal
> + - #phy-cells: Must be 0. No args required.
> + - vbus-gpios: vbus gpio binding
> + - id-gpios: id gpio binding
> +
> +Refer to phy/phy-bindings.txt for the generic PHY binding properties
> +
> +Example:
> +	gpio_g: gpio at 660a0000 {

You don't really need to show gpio node for the example. Otherwise,

Acked-by: Rob Herring <robh@kernel.org>

Rob

^ permalink raw reply

* [PATCH 0/2] arm64: dts: NS2: XMC support and Nitro memreserve
From: Jon Mason @ 2016-12-05 23:12 UTC (permalink / raw)
  To: linux-arm-kernel

Add support for the NS2 XMC formfactor via a new DTS file.  Also, set
aside memory for Nitro firmware in the NS2 DTSI file.

Jon Mason (2):
  arm64: dts: NS2: reserve memory for Nitro firmware
  arm64: dts: NS2: add support for XMC form factor

 arch/arm64/boot/dts/broadcom/Makefile    |   2 +-
 arch/arm64/boot/dts/broadcom/ns2-xmc.dts | 191 +++++++++++++++++++++++++++++++
 arch/arm64/boot/dts/broadcom/ns2.dtsi    |   2 +
 3 files changed, 194 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/boot/dts/broadcom/ns2-xmc.dts

-- 
2.7.4

^ permalink raw reply

* [PATCH 1/2] arm64: dts: NS2: reserve memory for Nitro firmware
From: Jon Mason @ 2016-12-05 23:12 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480979542-26871-1-git-send-email-jon.mason@broadcom.com>

Nitro firmware is loaded into memory by the bootloader at a specific
location.  Set this memory range aside to prevent the kernel from using
it.

Signed-off-by: Jon Mason <jon.mason@broadcom.com>
---
 arch/arm64/boot/dts/broadcom/ns2.dtsi | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/boot/dts/broadcom/ns2.dtsi b/arch/arm64/boot/dts/broadcom/ns2.dtsi
index 96ed47b..9f9e203 100644
--- a/arch/arm64/boot/dts/broadcom/ns2.dtsi
+++ b/arch/arm64/boot/dts/broadcom/ns2.dtsi
@@ -30,6 +30,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+/memreserve/ 0x81000000 0x00200000;
+
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/clock/bcm-ns2.h>
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 2/2] arm64: dts: NS2: add support for XMC form factor
From: Jon Mason @ 2016-12-05 23:12 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480979542-26871-1-git-send-email-jon.mason@broadcom.com>

The BCM958712DxXMC board is a smaller form factor typically used as
controller boards for switches.  This smaller board has less devices
pinned out, so only a few need be populated in the device tree.

Signed-off-by: Jon Mason <jon.mason@broadcom.com>
---
 arch/arm64/boot/dts/broadcom/Makefile    |   2 +-
 arch/arm64/boot/dts/broadcom/ns2-xmc.dts | 191 +++++++++++++++++++++++++++++++
 2 files changed, 192 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/boot/dts/broadcom/ns2-xmc.dts

diff --git a/arch/arm64/boot/dts/broadcom/Makefile b/arch/arm64/boot/dts/broadcom/Makefile
index 05faf2a..f1caece 100644
--- a/arch/arm64/boot/dts/broadcom/Makefile
+++ b/arch/arm64/boot/dts/broadcom/Makefile
@@ -1,5 +1,5 @@
 dtb-$(CONFIG_ARCH_BCM2835) += bcm2837-rpi-3-b.dtb
-dtb-$(CONFIG_ARCH_BCM_IPROC) += ns2-svk.dtb
+dtb-$(CONFIG_ARCH_BCM_IPROC) += ns2-svk.dtb ns2-xmc.dtb
 dtb-$(CONFIG_ARCH_VULCAN) += vulcan-eval.dtb
 
 always		:= $(dtb-y)
diff --git a/arch/arm64/boot/dts/broadcom/ns2-xmc.dts b/arch/arm64/boot/dts/broadcom/ns2-xmc.dts
new file mode 100644
index 0000000..99a2723
--- /dev/null
+++ b/arch/arm64/boot/dts/broadcom/ns2-xmc.dts
@@ -0,0 +1,191 @@
+/*
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Broadcom.  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Broadcom Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/dts-v1/;
+
+#include "ns2.dtsi"
+
+/ {
+	model = "Broadcom NS2 XMC";
+	compatible = "brcm,ns2-xmc", "brcm,ns2";
+
+	aliases {
+		serial0 = &uart3;
+	};
+
+	chosen {
+		stdout-path = "serial0:115200n8";
+		bootargs = "earlycon=uart8250,mmio32,0x66130000";
+	};
+
+	memory {
+		device_type = "memory";
+		reg = <0x000000000 0x80000000 0x00000001 0x00000000>;
+	};
+};
+
+&enet {
+	status = "ok";
+};
+
+&i2c0 {
+	status = "ok";
+};
+
+&i2c1 {
+	status = "ok";
+};
+
+&mdio_mux_iproc {
+	mdio at 10 {
+		gphy0: eth-phy at 10 {
+			reg = <0x10>;
+		};
+	};
+};
+
+&nand {
+	nandcs at 0 {
+		compatible = "brcm,nandcs";
+		reg = <0>;
+		nand-ecc-mode = "hw";
+		nand-ecc-strength = <8>;
+		nand-ecc-step-size = <512>;
+		nand-bus-width = <16>;
+		brcm,nand-oob-sector-size = <16>;
+		#address-cells = <1>;
+		#size-cells = <1>;
+
+		partition at 0 {
+			label = "nboot";
+			reg = <0x00000000 0x00280000>; /*  2.5MB */
+			read-only;
+		};
+
+		partition at 280000 {
+			label = "nenv";
+			reg = <0x00280000 0x00040000>; /* 0.25MB */
+			read-only;
+		};
+
+		partition at 2c0000 {
+			label = "ndtb";
+			reg = <0x002c0000 0x00040000>; /* 0.25MB */
+			read-only;
+		};
+
+		partition at 300000 {
+			label = "nsystem";
+			reg = <0x00300000 0x03d00000>; /*   61MB */
+			read-only;
+		};
+
+		partition at 4000000 {
+			label = "nrootfs";
+			reg = <0x04000000 0x06400000>; /*  100MB */
+		};
+
+		partition at 0a400000{
+			label = "ncustfs";
+			reg = <0x0a400000 0x35c00000>; /*  860MB */
+		};
+	};
+};
+
+&pci_phy0 {
+	status = "ok";
+};
+
+&pcie0 {
+	status = "ok";
+};
+
+&pcie8 {
+	status = "ok";
+};
+
+&sata_phy0 {
+	status = "ok";
+};
+
+&sata_phy1 {
+	status = "ok";
+};
+
+&sata {
+	status = "ok";
+};
+
+&qspi {
+	flash: m25p80 at 0 {
+		#address-cells = <1>;
+		#size-cells = <1>;
+		compatible = "m25p80";
+		spi-max-frequency = <62500000>;
+		m25p,default-addr-width = <3>;
+		reg = <0x0 0x0>;
+
+		partition at 0 {
+			label = "bl0";
+			reg = <0x00000000 0x00080000>; /*  512KB */
+		};
+
+		partition at 80000 {
+			label = "fip";
+			reg = <0x00080000 0x00150000>; /* 1344KB */
+		};
+
+		partition at 1e0000 {
+			label = "env";
+			reg = <0x001e0000 0x00010000>;/*    64KB */
+		};
+
+		partition at 1f0000 {
+			label = "dtb";
+			reg = <0x001f0000 0x00010000>; /*   64KB */
+		};
+
+		partition at 200000 {
+			label = "kernel";
+			reg = <0x00200000 0x00e00000>; /*   14MB */
+		};
+
+		partition at 1000000 {
+			label = "rootfs";
+			reg = <0x01000000 0x01000000>; /*   16MB */
+		};
+	};
+};
+
+&uart3 {
+	status = "ok";
+};
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 3/5] dt-bindings: spi: Add documentation for the Armada 3700 SPI Controller
From: Rob Herring @ 2016-12-05 23:27 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161130094351.2748-4-romain.perier@free-electrons.com>

On Wed, Nov 30, 2016 at 10:43:49AM +0100, Romain Perier wrote:
> This adds the devicetree bindings documentation for the SPI controller
> present in the Marvell Armada 3700 SoCs.
> 
> Signed-off-by: Romain Perier <romain.perier@free-electrons.com>
> ---
>  .../devicetree/bindings/spi/spi-armada-3700.txt    | 25 ++++++++++++++++++++++
>  1 file changed, 25 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/spi/spi-armada-3700.txt

Acked-by: Rob Herring <robh@kernel.org>

^ permalink raw reply

* [PATCH v3] PCI/ACPI: xgene: Add ECAM quirk for X-Gene PCIe controller
From: Jon Masters @ 2016-12-05 23:31 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161205212012.GA22455@bhelgaas-glaptop.roam.corp.google.com>

On 12/05/2016 04:20 PM, Bjorn Helgaas wrote:
> On Fri, Dec 02, 2016 at 11:06:30PM -0800, Duc Dang wrote:
>> On Fri, Dec 2, 2016 at 3:39 PM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> 
>>> diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c
>>> index 8a177a1..a16fc8e 100644
>>> --- a/arch/arm64/kernel/pci.c
>>> +++ b/arch/arm64/kernel/pci.c
>>> @@ -114,6 +114,19 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
>>>         return 0;
>>>  }
>>>
>>> +static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
>>> +{
>>> +       struct resource_entry *entry, *tmp;
>>> +       int status;
>>> +
>>> +       status = acpi_pci_probe_root_resources(ci);
>>> +       resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
>>> +               if (!(entry->res->flags & IORESOURCE_WINDOW))
>>> +                       resource_list_destroy_entry(entry);
>>> +       }
>>> +       return status;
>>> +}
>>> +
>>>  /*
>>>   * Lookup the bus range for the domain in MCFG, and set up config space
>>>   * mapping.
>>> @@ -190,6 +203,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
>>>         }
>>>
>>>         root_ops->release_info = pci_acpi_generic_release_info;
>>> +       root_ops->prepare_resources = pci_acpi_root_prepare_resources;
>>>         root_ops->pci_ops = &ri->cfg->ops->pci_ops;
>>>         bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg);
>>>         if (!bus)
>>
>> I tried your patch above with my X-Gene ECAM v4 patch on Mustang, here
>> is the kernel boot log and output of 'cat /proc/iomem'. The PCIe core
>> does not print the MMIO space as a window (which is expected per your
>> patch above).
> 
> Thanks!

...and just for the record, here it is on HPE ProLiant m400 (Moonshot),
with the same result that the region is no longer claimed as PCI space
(it - 1f500000 - is now showing as being owned by PNP0A08:00):

# cat /proc/iomem 
10520000-10523fff : APMC0D18:00
  10520000-10523fff : APMC0D18:00
10524000-10527fff : APMC0D17:00
10540000-1054a0ff : APMC0D01:00
  10546000-10546fff : APMC0D50:00
  1054a000-1054a00f : APMC0D12:03
    1054a000-1054a00f : APMC0D12:02
      1054a000-1054a00f : APMC0D12:01
        1054a000-1054a00f : APMC0D12:00
17000000-17000fff : APMC0D01:00
17001000-17001fff : APMC0D01:00
  17001000-170013ff : APMC0D15:00
    17001000-170013ff : APMC0D15:00
1701c000-1701cfff : APMC0D14:00
1a800000-1a800fff : APMC0D0D:00
  1a800000-1a800fff : APMC0D0D:00
1c000200-1c0002ff : APMC0D06:00
1c021000-1c0210ff : APMC0D08:00
  1c021000-1c02101f : serial
1c024000-1c024fff : APMC0D07:00
1f230000-1f230fff : APMC0D0D:00
  1f230000-1f230fff : APMC0D0D:00
1f23d000-1f23dfff : APMC0D0D:00
  1f23d000-1f23dfff : APMC0D0D:00
1f23e000-1f23efff : APMC0D0D:00
  1f23e000-1f23efff : APMC0D0D:00
1f2a0000-1f31ffff : APMC0D06:00
1f500000-1f50ffff : PNP0A08:00
78800000-78800fff : APMC0D13:00
  78800000-78800fff : APMC0D12:03
    78800000-78800fff : APMC0D12:02
      78800000-78800fff : APMC0D12:01
        78800000-78800fff : APMC0D12:00
          78800000-78800fff : APMC0D11:00
          78800000-78800fff : APMC0D10:03
          78800000-78800fff : APMC0D10:02
          78800000-78800fff : APMC0D10:01
          78800000-78800fff : APMC0D10:00
79000000-798fffff : APMC0D0E:00
7c000000-7c1fffff : APMC0D12:00
7c200000-7c3fffff : APMC0D12:01
7c400000-7c5fffff : APMC0D12:02
7c600000-7c7fffff : APMC0D12:03
7e000000-7e000fff : APMC0D13:00
7e200000-7e200fff : APMC0D10:03
  7e200000-7e200fff : APMC0D10:02
    7e200000-7e200fff : APMC0D10:01
      7e200000-7e200fff : APMC0D10:00
7e600000-7e600fff : APMC0D11:00
7e700000-7e700fff : APMC0D10:03
  7e700000-7e700fff : APMC0D10:02
    7e700000-7e700fff : APMC0D10:01
      7e700000-7e700fff : APMC0D10:00
7e720000-7e720fff : APMC0D10:03
  7e720000-7e720fff : APMC0D10:02
    7e720000-7e720fff : APMC0D10:01
      7e720000-7e720fff : APMC0D10:00
7e800000-7e800fff : APMC0D10:00
7e840000-7e840fff : APMC0D10:01
7e880000-7e880fff : APMC0D10:02
7e8c0000-7e8c0fff : APMC0D10:03
7e930000-7e930fff : APMC0D13:00
4000000000-4001ffffff : System RAM
  4000080000-4000c3ffff : Kernel code
  4000db0000-400165ffff : Kernel data
40023a0000-4ff733ffff : System RAM
4ff7340000-4ff77cffff : reserved
4ff77d0000-4ff79cffff : System RAM
4ff79d0000-4ff7e7ffff : reserved
4ff7e80000-4ff7e8ffff : System RAM
4ff7e90000-4ff7efffff : reserved
4ff7f10000-4ff800ffff : reserved
4ff8010000-4fffffffff : System RAM
a020000000-a03fffffff : PCI Bus 0000:00
  a020000000-a0201fffff : PCI Bus 0000:01
    a020000000-a0200fffff : 0000:01:00.0
      a020000000-a0200fffff : mlx4_core
    a020100000-a0201fffff : 0000:01:00.0
a060000000-a07fffffff : PCI Bus 0000:00
a0d0000000-a0dfffffff : PCI ECAM
a110000000-a14fffffff : PCI Bus 0000:00
  a110000000-a121ffffff : PCI Bus 0000:01
    a110000000-a111ffffff : 0000:01:00.0
      a110000000-a111ffffff : mlx4_core
    a112000000-a121ffffff : 0000:01:00.0

Tested-by: Jon Masters <jcm@redhat.com>

-- 
Computer Architect | Sent from my Fedora powered laptop

^ permalink raw reply

* [resend v2: PATCH 1/2] dt-bindings: Document the hi3660 reset bindings
From: Rob Herring @ 2016-12-05 23:40 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480687813.2460.19.camel@pengutronix.de>

On Fri, Dec 02, 2016 at 03:10:13PM +0100, Philipp Zabel wrote:
> Am Freitag, den 02.12.2016, 13:32 +0100 schrieb Arnd Bergmann:
> > On Friday, December 2, 2016 8:21:33 AM CET zhangfei wrote:
> > > Hi, Arnd
> > > 
> > > On 2016?12?01? 20:05, Arnd Bergmann wrote:
> > > > On Thursday, December 1, 2016 8:48:40 AM CET Zhangfei Gao wrote:
> > > >> +               hisi,reset-bits = <0x20 0x8             /* 0: i2c0 */
> > > >> +                                  0x20 0x10            /* 1: i2c1 */
> > > >> +                                  0x20 0x20            /* 2: i2c2 */
> > > >> +                                  0x20 0x8000000>;     /* 3: i2c6 */
> > > >> +       };
> > > >> +
> > > >> +Specifying reset lines connected to IP modules
> > > >> +==============================================
> > > >> +example:
> > > >> +
> > > >> +        i2c0: i2c at ..... {
> > > >> +                ...
> > > >> +               resets = <&iomcu_rst 0>;
> > > >> +                ...
> > > >> +        };
> > > > I don't really like this approach, since now the information is
> > > > in two places. Why not put the data into the reset specifier
> > > > directly when it is used?
> 
> From my point of view, with the binding above, all reset controller
> register/bit layout information is in a single place and can be easily
> compared to a list in the reference manual, whereas with your suggestion
> the description of the reset controller register layout is spread
> throughout one or even several dtsi files.

Which can be solved by tools.

> Also, since no two reset controllers are exactly the same, we get a
> proliferation of different slightly phandle argument meanings.

phandle args are supposed to be specific to the phandle it points to. 
Otherwise, we'd never need more than 1 cell and everything could be a 
lookup table.

> 
> > > Any example, still not understand.
> > > They are consumer and provider.
> > 
> > I mean in the i2c node, have
> > 
> > 	i2c0: i2c at ..... {
> > 		...
> > 		resets = <&iomcu_rst 0x20 0x8>;
> > 		...
> > 	}
> 
> There already are a few drivers that use this, and I fear people having
> to change their bindings because new flags are needed that have not been
> previously thought of.
> 

Drivers that use what?

Rob

^ permalink raw reply

* [PATCH 07/12] usb: sunxi: Uses the resource-managed extcon API when registering extcon notifier
From: Chanwoo Choi @ 2016-12-06  0:21 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161205163240.GA5783@uda0271908>

On 2016? 12? 06? 01:32, Bin Liu wrote:
> On Wed, Nov 30, 2016 at 09:45:03AM +0100, Maxime Ripard wrote:
>> On Wed, Nov 30, 2016 at 02:57:35PM +0900, Chanwoo Choi wrote:
>>> This patch just uses the resource-managed extcon API when registering
>>> the extcon notifier.
>>>
>>> Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
>>
>> Acked-by: Maxime Ripard <maxime.ripard@free-electrons.com>
> 
> It would be ideal if the subject was "usb: musb: sunxi: ...".
> 
> Acked-by: Bin Liu <b-liu@ti.com>
> 

Thanks for the review. I'll change the subject.

-- 
Best Regards,
Chanwoo Choi

^ permalink raw reply

* [PATCH] dt: bindings: zx: Add header for PM domains specifiers
From: Baoyou Xie @ 2016-12-06  0:21 UTC (permalink / raw)
  To: linux-arm-kernel

This patch adds header with values used for ZTE 2967
SoC's power domain driver.

Signed-off-by: Baoyou Xie <baoyou.xie@linaro.org>
---
 include/dt-bindings/arm/zte_pm_domains.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 include/dt-bindings/arm/zte_pm_domains.h

diff --git a/include/dt-bindings/arm/zte_pm_domains.h b/include/dt-bindings/arm/zte_pm_domains.h
new file mode 100644
index 0000000..1485e8d
--- /dev/null
+++ b/include/dt-bindings/arm/zte_pm_domains.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2015 Linaro Ltd.
+ *
+ * Author: Baoyou Xie <baoyou.xie@linaro.org>
+ * License terms: GNU General Public License (GPL) version 2
+ */
+#ifndef _DT_BINDINGS_ARM_ZTE_PM_DOMAINS_H
+#define _DT_BINDINGS_ARM_ZTE_PM_DOMAINS_H
+
+#define DM_ZX296718_SAPPU	0
+#define DM_ZX296718_VDE		1  /*g1v6*/
+#define DM_ZX296718_VCE		2  /*h1v6*/
+#define DM_ZX296718_HDE		3  /*g2v2*/
+#define DM_ZX296718_VIU		4
+#define DM_ZX296718_USB20	5
+#define DM_ZX296718_USB21	6
+#define DM_ZX296718_USB30	7
+#define DM_ZX296718_HSIC	8
+#define DM_ZX296718_GMAC	9
+#define DM_ZX296718_TS		10
+#define DM_ZX296718_VOU		11
+
+#endif /* _DT_BINDINGS_ARM_ZTE_PM_DOMAINS_H */
-- 
2.7.4

^ permalink raw reply related

* [PATCH 01/12] phy: rcar-gen3-usb2: Replace the deprecated extcon API
From: Chanwoo Choi @ 2016-12-06  0:25 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480485460-2663-2-git-send-email-cw00.choi@samsung.com>

Hi Kishon,

Could you review and pick the patch1/2 for phy driver?

Best Regards,
Chanwoo Choi

On 2016? 11? 30? 14:57, Chanwoo Choi wrote:
> This patch replaces the deprecated extcon API as following:
> - extcon_set_cable_state_() -> extcon_set_state_sync()
> 
> Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
> ---
>  drivers/phy/phy-rcar-gen3-usb2.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/phy/phy-rcar-gen3-usb2.c b/drivers/phy/phy-rcar-gen3-usb2.c
> index bd2430d7339c..7f8081f157f4 100644
> --- a/drivers/phy/phy-rcar-gen3-usb2.c
> +++ b/drivers/phy/phy-rcar-gen3-usb2.c
> @@ -93,11 +93,11 @@ static void rcar_gen3_phy_usb2_work(struct work_struct *work)
>  						 work);
>  
>  	if (ch->extcon_host) {
> -		extcon_set_cable_state_(ch->extcon, EXTCON_USB_HOST, true);
> -		extcon_set_cable_state_(ch->extcon, EXTCON_USB, false);
> +		extcon_set_state_sync(ch->extcon, EXTCON_USB_HOST, true);
> +		extcon_set_state_sync(ch->extcon, EXTCON_USB, false);
>  	} else {
> -		extcon_set_cable_state_(ch->extcon, EXTCON_USB_HOST, false);
> -		extcon_set_cable_state_(ch->extcon, EXTCON_USB, true);
> +		extcon_set_state_sync(ch->extcon, EXTCON_USB_HOST, false);
> +		extcon_set_state_sync(ch->extcon, EXTCON_USB, true);
>  	}
>  }
>  
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox