* [PATCH v3 4/6] crypto: arm/crct10dif - port x86 SSE implementation to ARM
From: Ard Biesheuvel @ 2016-12-05 18:42 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1480963348-24203-1-git-send-email-ard.biesheuvel@linaro.org>
This is a transliteration of the Intel algorithm implemented
using SSE and PCLMULQDQ instructions that resides in the file
arch/x86/crypto/crct10dif-pcl-asm_64.S, but simplified to only
operate on buffers that are 16-byte aligned (but of any size).
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
arch/arm/crypto/Kconfig | 5 +
arch/arm/crypto/Makefile | 2 +
arch/arm/crypto/crct10dif-ce-core.S | 427 ++++++++++++++++++++
arch/arm/crypto/crct10dif-ce-glue.c | 101 +++++
4 files changed, 535 insertions(+)
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 27ed1b1cd1d7..fce801fa52a1 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -120,4 +120,9 @@ config CRYPTO_GHASH_ARM_CE
that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
that is part of the ARMv8 Crypto Extensions
+config CRYPTO_CRCT10DIF_ARM_CE
+ tristate "CRCT10DIF digest algorithm using PMULL instructions"
+ depends on KERNEL_MODE_NEON && CRC_T10DIF
+ select CRYPTO_HASH
+
endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index fc5150702b64..fc77265014b7 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -13,6 +13,7 @@ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
+ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o
ifneq ($(ce-obj-y)$(ce-obj-m),)
ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y)
@@ -36,6 +37,7 @@ sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
+crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
quiet_cmd_perl = PERL $@
cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
new file mode 100644
index 000000000000..ce45ba0c0687
--- /dev/null
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -0,0 +1,427 @@
+//
+// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+// Erdinc Ozturk <erdinc.ozturk@intel.com>
+// Vinodh Gopal <vinodh.gopal@intel.com>
+// James Guilford <james.guilford@intel.com>
+// Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses. You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the
+// distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Function API:
+// UINT16 crc_t10dif_pcl(
+// UINT16 init_crc, //initial CRC value, 16 bits
+// const unsigned char *buf, //buffer pointer to calculate CRC on
+// UINT64 len //buffer length in bytes (64-bit data)
+// );
+//
+// Reference paper titled "Fast CRC Computation for Generic
+// Polynomials Using PCLMULQDQ Instruction"
+// URL: http://www.intel.com/content/dam/www/public/us/en/documents
+// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define CPU_LE(code...)
+#else
+#define CPU_LE(code...) code
+#endif
+
+ .text
+ .fpu crypto-neon-fp-armv8
+
+ arg1_low32 .req r0
+ arg2 .req r1
+ arg3 .req r2
+
+ qzr .req q13
+
+ q0l .req d0
+ q0h .req d1
+ q1l .req d2
+ q1h .req d3
+ q2l .req d4
+ q2h .req d5
+ q3l .req d6
+ q3h .req d7
+ q4l .req d8
+ q4h .req d9
+ q5l .req d10
+ q5h .req d11
+ q6l .req d12
+ q6h .req d13
+ q7l .req d14
+ q7h .req d15
+
+ENTRY(crc_t10dif_pmull)
+ vmov.i8 qzr, #0 // init zero register
+
+ // adjust the 16-bit initial_crc value, scale it to 32 bits
+ lsl arg1_low32, arg1_low32, #16
+
+ // check if smaller than 256
+ cmp arg3, #256
+
+ // for sizes less than 256, we can't fold 128B at a time...
+ blt _less_than_128
+
+ // load the initial crc value
+ // crc value does not need to be byte-reflected, but it needs
+ // to be moved to the high part of the register.
+ // because data will be byte-reflected and will align with
+ // initial crc at correct place.
+ vmov s0, arg1_low32 // initial crc
+ vext.8 q10, qzr, q0, #4
+
+ // receive the initial 128B data, xor the initial crc value
+ vld1.64 {q0-q1}, [arg2, :128]!
+ vld1.64 {q2-q3}, [arg2, :128]!
+ vld1.64 {q4-q5}, [arg2, :128]!
+ vld1.64 {q6-q7}, [arg2, :128]!
+CPU_LE( vrev64.8 q0, q0 )
+CPU_LE( vrev64.8 q1, q1 )
+CPU_LE( vrev64.8 q2, q2 )
+CPU_LE( vrev64.8 q3, q3 )
+CPU_LE( vrev64.8 q4, q4 )
+CPU_LE( vrev64.8 q5, q5 )
+CPU_LE( vrev64.8 q6, q6 )
+CPU_LE( vrev64.8 q7, q7 )
+
+ vswp d0, d1
+ vswp d2, d3
+ vswp d4, d5
+ vswp d6, d7
+ vswp d8, d9
+ vswp d10, d11
+ vswp d12, d13
+ vswp d14, d15
+
+ // XOR the initial_crc value
+ veor.8 q0, q0, q10
+
+ adr ip, rk3
+ vld1.64 {q10}, [ip, :128] // xmm10 has rk3 and rk4
+
+ //
+ // we subtract 256 instead of 128 to save one instruction from the loop
+ //
+ sub arg3, arg3, #256
+
+ // at this section of the code, there is 128*x+y (0<=y<128) bytes of
+ // buffer. The _fold_64_B_loop will fold 128B at a time
+ // until we have 128+y Bytes of buffer
+
+
+ // fold 128B at a time. This section of the code folds 8 vector
+ // registers in parallel
+_fold_64_B_loop:
+
+ .macro fold64, reg1, reg2
+ vld1.64 {q11-q12}, [arg2, :128]!
+
+ vmull.p64 q8, \reg1\()h, d21
+ vmull.p64 \reg1, \reg1\()l, d20
+ vmull.p64 q9, \reg2\()h, d21
+ vmull.p64 \reg2, \reg2\()l, d20
+
+CPU_LE( vrev64.8 q11, q11 )
+CPU_LE( vrev64.8 q12, q12 )
+ vswp d22, d23
+ vswp d24, d25
+
+ veor.8 \reg1, \reg1, q8
+ veor.8 \reg2, \reg2, q9
+ veor.8 \reg1, \reg1, q11
+ veor.8 \reg2, \reg2, q12
+ .endm
+
+ fold64 q0, q1
+ fold64 q2, q3
+ fold64 q4, q5
+ fold64 q6, q7
+
+ subs arg3, arg3, #128
+
+ // check if there is another 64B in the buffer to be able to fold
+ bge _fold_64_B_loop
+
+ // at this point, the buffer pointer is pointing at the last y Bytes
+ // of the buffer; the 128B of folded data is in 8 of the vector
+ // registers: q0-q7
+
+ // fold the 8 vector registers to 1 vector register with different
+ // constants
+
+ adr ip, rk9
+ vld1.64 {q10}, [ip, :128]!
+
+ .macro fold16, reg, rk
+ vmull.p64 q8, \reg\()l, d20
+ vmull.p64 \reg, \reg\()h, d21
+ .ifnb \rk
+ vld1.64 {q10}, [ip, :128]!
+ .endif
+ veor.8 q7, q7, q8
+ veor.8 q7, q7, \reg
+ .endm
+
+ fold16 q0, rk11
+ fold16 q1, rk13
+ fold16 q2, rk15
+ fold16 q3, rk17
+ fold16 q4, rk19
+ fold16 q5, rk1
+ fold16 q6
+
+ // instead of 128, we add 112 to the loop counter to save 1 instruction
+ // from the loop; instead of a cmp instruction, we use the flags set
+ // by adds with the blt instruction
+ adds arg3, arg3, #(128-16)
+ blt _final_reduction_for_128
+
+ // now we have 16+y bytes left to reduce. 16 Bytes is in register q7
+ // and the rest is in memory. We can fold 16 bytes at a time if y>=16
+ // continue folding 16B at a time
+
+_16B_reduction_loop:
+ vmull.p64 q8, d14, d20
+ vmull.p64 q7, d15, d21
+ veor.8 q7, q7, q8
+
+ vld1.64 {q0}, [arg2, :128]!
+CPU_LE( vrev64.8 q0, q0 )
+ vswp d0, d1
+ veor.8 q7, q7, q0
+ subs arg3, arg3, #16
+
+ // instead of a cmp instruction, we utilize the flags with the
+ // jge instruction equivalent of: cmp arg3, 16-16
+ // check if there is any more 16B in the buffer to be able to fold
+ bge _16B_reduction_loop
+
+ // now we have 16+z bytes left to reduce, where 0<= z < 16.
+ // first, we reduce the data in the xmm7 register
+
+_final_reduction_for_128:
+ // check if any more data to fold. If not, compute the CRC of
+ // the final 128 bits
+ adds arg3, arg3, #16
+ beq _128_done
+
+ // here we are getting data that is less than 16 bytes.
+ // since we know that there was data before the pointer, we can
+ // offset the input pointer before the actual point, to receive
+ // exactly 16 bytes. after that the registers need to be adjusted.
+_get_last_two_regs:
+ add arg2, arg2, arg3
+ sub arg2, arg2, #16
+ vld1.64 {q1}, [arg2]
+CPU_LE( vrev64.8 q1, q1 )
+ vswp d2, d3
+
+ // get rid of the extra data that was loaded before
+ // load the shift constant
+ adr ip, tbl_shf_table + 16
+ sub ip, ip, arg3
+ vld1.8 {q0}, [ip]
+
+ // shift v2 to the left by arg3 bytes
+ vtbl.8 d4, {d14-d15}, d0
+ vtbl.8 d5, {d14-d15}, d1
+
+ // shift v7 to the right by 16-arg3 bytes
+ vmov.i8 q9, #0x80
+ veor.8 q0, q0, q9
+ vtbl.8 d18, {d14-d15}, d0
+ vtbl.8 d19, {d14-d15}, d1
+
+ // blend
+ vshr.s8 q0, q0, #7 // convert to 8-bit mask
+ vbsl.8 q0, q2, q1
+
+ // fold 16 Bytes
+ vmull.p64 q8, d18, d20
+ vmull.p64 q7, d19, d21
+ veor.8 q7, q7, q8
+ veor.8 q7, q7, q0
+
+_128_done:
+ // compute crc of a 128-bit value
+ vldr d20, rk5
+ vldr d21, rk6 // rk5 and rk6 in xmm10
+
+ // 64b fold
+ vext.8 q0, qzr, q7, #8
+ vmull.p64 q7, d15, d20
+ veor.8 q7, q7, q0
+
+ // 32b fold
+ vext.8 q0, q7, qzr, #12
+ vmov s31, s3
+ vmull.p64 q0, d0, d21
+ veor.8 q7, q0, q7
+
+ // barrett reduction
+_barrett:
+ vldr d20, rk7
+ vldr d21, rk8
+
+ vmull.p64 q0, d15, d20
+ vext.8 q0, qzr, q0, #12
+ vmull.p64 q0, d1, d21
+ vext.8 q0, qzr, q0, #12
+ veor.8 q7, q7, q0
+ vmov r0, s29
+
+_cleanup:
+ // scale the result back to 16 bits
+ lsr r0, r0, #16
+ bx lr
+
+_less_than_128:
+ teq arg3, #0
+ beq _cleanup
+
+ vmov.i8 q0, #0
+ vmov s3, arg1_low32 // get the initial crc value
+
+ vld1.64 {q7}, [arg2, :128]!
+CPU_LE( vrev64.8 q7, q7 )
+ vswp d14, d15
+ veor.8 q7, q7, q0
+
+ cmp arg3, #16
+ beq _128_done // exactly 16 left
+ blt _less_than_16_left
+
+ // now if there is, load the constants
+ vldr d20, rk1
+ vldr d21, rk2 // rk1 and rk2 in xmm10
+
+ // check if there is enough buffer to be able to fold 16B at a time
+ subs arg3, arg3, #32
+ addlt arg3, arg3, #16
+ blt _get_last_two_regs
+ b _16B_reduction_loop
+
+_less_than_16_left:
+ // shl r9, 4
+ adr ip, tbl_shf_table + 16
+ sub ip, ip, arg3
+ vld1.8 {q0}, [ip]
+ vmov.i8 q9, #0x80
+ veor.8 q0, q0, q9
+ vtbl.8 d18, {d14-d15}, d0
+ vtbl.8 d15, {d14-d15}, d1
+ vmov d14, d18
+ b _128_done
+ENDPROC(crc_t10dif_pmull)
+
+// precomputed constants
+// these constants are precomputed from the poly:
+// 0x8bb70000 (0x8bb7 scaled to 32 bits)
+ .align 4
+// Q = 0x18BB70000
+// rk1 = 2^(32*3) mod Q << 32
+// rk2 = 2^(32*5) mod Q << 32
+// rk3 = 2^(32*15) mod Q << 32
+// rk4 = 2^(32*17) mod Q << 32
+// rk5 = 2^(32*3) mod Q << 32
+// rk6 = 2^(32*2) mod Q << 32
+// rk7 = floor(2^64/Q)
+// rk8 = Q
+
+rk3: .quad 0x9d9d000000000000
+rk4: .quad 0x7cf5000000000000
+rk5: .quad 0x2d56000000000000
+rk6: .quad 0x1368000000000000
+rk7: .quad 0x00000001f65a57f8
+rk8: .quad 0x000000018bb70000
+rk9: .quad 0xceae000000000000
+rk10: .quad 0xbfd6000000000000
+rk11: .quad 0x1e16000000000000
+rk12: .quad 0x713c000000000000
+rk13: .quad 0xf7f9000000000000
+rk14: .quad 0x80a6000000000000
+rk15: .quad 0x044c000000000000
+rk16: .quad 0xe658000000000000
+rk17: .quad 0xad18000000000000
+rk18: .quad 0xa497000000000000
+rk19: .quad 0x6ee3000000000000
+rk20: .quad 0xe7b5000000000000
+rk1: .quad 0x2d56000000000000
+rk2: .quad 0x06df000000000000
+
+tbl_shf_table:
+// use these values for shift constants for the tbl/tbx instruction
+// different alignments result in values as shown:
+// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
+// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
+// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
+// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
+// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
+// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
+// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
+// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
+// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
+// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
+// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
+// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
+// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
+// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
+// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
+
+ .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+ .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
new file mode 100644
index 000000000000..d428355cf38d
--- /dev/null
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -0,0 +1,101 @@
+/*
+ * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc-t10dif.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
+
+asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u32 len);
+
+static int crct10dif_init(struct shash_desc *desc)
+{
+	/* the descriptor context is just the running CRC; start it at zero */
+	*(u16 *)shash_desc_ctx(desc) = 0;
+
+	return 0;
+}
+
+static int crct10dif_update(struct shash_desc *desc, const u8 *data,
+			    unsigned int length)
+{
+	u16 *crc = shash_desc_ctx(desc);
+	unsigned int l = (u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE;
+
+	if (l)	/* l becomes the size of the misaligned head */
+		l = CRC_T10DIF_PMULL_CHUNK_SIZE - l;
+
+	/*
+	 * crc_t10dif_pmull() needs a 16 byte aligned pointer and always loads
+	 * whole 16 byte chunks, so it would read beyond the end of an input
+	 * shorter than that: hand short input to the generic code entirely.
+	 */
+	if (length < l + CRC_T10DIF_PMULL_CHUNK_SIZE || !may_use_simd()) {
+		*crc = crc_t10dif_generic(*crc, data, length);
+		return 0;
+	}
+	if (unlikely(l))
+		*crc = crc_t10dif_generic(*crc, data, l);
+
+	kernel_neon_begin();
+	*crc = crc_t10dif_pmull(*crc, data + l, length - l);
+	kernel_neon_end();
+	return 0;
+}
+
+static int crct10dif_final(struct shash_desc *desc, u8 *out)
+{
+	/* the digest is the accumulated CRC, written as a native u16 */
+	u16 *digest = (u16 *)out;
+
+	*digest = *(u16 *)shash_desc_ctx(desc);
+	return 0;
+}
+
+static struct shash_alg crc_t10dif_alg = { /* shash ops for the PMULL based CRC-T10DIF */
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .init = crct10dif_init,
+ .update = crct10dif_update,
+ .final = crct10dif_final,
+ .descsize = CRC_T10DIF_DIGEST_SIZE, /* desc ctx is accessed as one u16 CRC */
+ /* generic algorithm name, plus a driver name specific to this module */
+ .base.cra_name = "crct10dif",
+ .base.cra_driver_name = "crct10dif-arm-ce",
+ .base.cra_priority = 200, /* NOTE(review): presumably outranks the generic driver - confirm */
+ .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+};
+
+static int __init crc_t10dif_mod_init(void)
+{
+	/* only register when the CPU advertises the PMULL hwcap */
+	if (elf_hwcap2 & HWCAP2_PMULL)
+		return crypto_register_shash(&crc_t10dif_alg);
+	return -ENODEV;
+}
+
+static void __exit crc_t10dif_mod_exit(void)
+{
+ crypto_unregister_shash(&crc_t10dif_alg); /* undo the registration done by crc_t10dif_mod_init() */
+}
+
+module_init(crc_t10dif_mod_init);
+module_exit(crc_t10dif_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crct10dif");
--
2.7.4
^ permalink raw reply related
* [PATCH v3 5/6] crypto: arm64/crc32 - accelerated support based on x86 SSE implementation
From: Ard Biesheuvel @ 2016-12-05 18:42 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1480963348-24203-1-git-send-email-ard.biesheuvel@linaro.org>
This is a combination of the Intel algorithm implemented using SSE
and PCLMULQDQ instructions from arch/x86/crypto/crc32-pclmul_asm.S, and
the new CRC32 extensions introduced for both 32-bit and 64-bit ARM in
version 8 of the architecture. Two versions of the above combo are
provided, one for CRC32 and one for CRC32C.
The PMULL/NEON algorithm is faster, but operates on blocks of at least
64 bytes, and on multiples of 16 bytes only. For the remaining input,
or for all input on systems that lack the PMULL 64x64->128 instructions,
the CRC32 instructions will be used.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
arch/arm64/crypto/Kconfig | 6 +
arch/arm64/crypto/Makefile | 3 +
arch/arm64/crypto/crc32-ce-core.S | 266 ++++++++++++++++++++
arch/arm64/crypto/crc32-ce-glue.c | 212 ++++++++++++++++
4 files changed, 487 insertions(+)
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index d773c0659202..21835deb1ab9 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -28,6 +28,11 @@ config CRYPTO_CRCT10DIF_ARM64_CE
depends on KERNEL_MODE_NEON && CRC_T10DIF
select CRYPTO_HASH
+config CRYPTO_CRC32_ARM64_CE
+ tristate "CRC32 and CRC32C digest algorithms using PMULL instructions"
+ depends on KERNEL_MODE_NEON && CRC32
+ select CRYPTO_HASH
+
config CRYPTO_AES_ARM64_CE
tristate "AES core cipher using ARMv8 Crypto Extensions"
depends on ARM64 && KERNEL_MODE_NEON
@@ -58,4 +63,5 @@ config CRYPTO_CRC32_ARM64
tristate "CRC32 and CRC32C using optional ARMv8 instructions"
depends on ARM64
select CRYPTO_HASH
+
endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 36fd3eb4201b..144387805a46 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -20,6 +20,9 @@ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
+obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
+crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
+
obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
diff --git a/arch/arm64/crypto/crc32-ce-core.S b/arch/arm64/crypto/crc32-ce-core.S
new file mode 100644
index 000000000000..18f5a8442276
--- /dev/null
+++ b/arch/arm64/crypto/crc32-ce-core.S
@@ -0,0 +1,266 @@
+/*
+ * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ * Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .align 6
+ .cpu generic+crypto+crc
+
+.Lcrc32_constants:
+ /*
+ * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
+ * #define CONSTANT_R1 0x154442bd4LL
+ *
+ * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
+ * #define CONSTANT_R2 0x1c6e41596LL
+ */
+ .octa 0x00000001c6e415960000000154442bd4
+
+ /*
+ * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
+ * #define CONSTANT_R3 0x1751997d0LL
+ *
+ * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
+ * #define CONSTANT_R4 0x0ccaa009eLL
+ */
+ .octa 0x00000000ccaa009e00000001751997d0
+
+ /*
+ * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
+ * #define CONSTANT_R5 0x163cd6124LL
+ */
+ .quad 0x0000000163cd6124
+ .quad 0x00000000FFFFFFFF
+
+ /*
+ * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+ *
+ * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
+ * = 0x1F7011641LL
+ * #define CONSTANT_RU 0x1F7011641LL
+ */
+ .octa 0x00000001F701164100000001DB710641
+
+.Lcrc32c_constants:
+ .octa 0x000000009e4addf800000000740eef02
+ .octa 0x000000014cd00bd600000000f20c0dfe
+ .quad 0x00000000dd45aab8
+ .quad 0x00000000FFFFFFFF
+ .octa 0x00000000dea713f10000000105ec76f0
+
+ vCONSTANT .req v0
+ dCONSTANT .req d0
+ qCONSTANT .req q0
+
+ BUF .req x0
+ LEN .req x1
+ CRC .req x2
+
+ vzr .req v9
+
+ /**
+ * Calculate crc32
+ * BUF - buffer
+ * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
+ * CRC - initial crc32
+ * return w0 crc32
+ * uint crc32_pmull_le(unsigned char const *buffer,
+ * size_t len, uint crc32)
+ */
+ENTRY(crc32_pmull_le)
+ adr x3, .Lcrc32_constants
+ b 0f
+
+ENTRY(crc32c_pmull_le)
+ adr x3, .Lcrc32c_constants
+
+0: bic LEN, LEN, #15
+ ld1 {v1.16b-v4.16b}, [BUF], #0x40
+ movi vzr.16b, #0
+ fmov dCONSTANT, CRC
+ eor v1.16b, v1.16b, vCONSTANT.16b
+ sub LEN, LEN, #0x40
+ cmp LEN, #0x40
+ b.lt less_64
+
+ ldr qCONSTANT, [x3]
+
+loop_64: /* 64 bytes Full cache line folding */
+ sub LEN, LEN, #0x40
+
+ pmull2 v5.1q, v1.2d, vCONSTANT.2d
+ pmull2 v6.1q, v2.2d, vCONSTANT.2d
+ pmull2 v7.1q, v3.2d, vCONSTANT.2d
+ pmull2 v8.1q, v4.2d, vCONSTANT.2d
+
+ pmull v1.1q, v1.1d, vCONSTANT.1d
+ pmull v2.1q, v2.1d, vCONSTANT.1d
+ pmull v3.1q, v3.1d, vCONSTANT.1d
+ pmull v4.1q, v4.1d, vCONSTANT.1d
+
+ eor v1.16b, v1.16b, v5.16b
+ ld1 {v5.16b}, [BUF], #0x10
+ eor v2.16b, v2.16b, v6.16b
+ ld1 {v6.16b}, [BUF], #0x10
+ eor v3.16b, v3.16b, v7.16b
+ ld1 {v7.16b}, [BUF], #0x10
+ eor v4.16b, v4.16b, v8.16b
+ ld1 {v8.16b}, [BUF], #0x10
+
+ eor v1.16b, v1.16b, v5.16b
+ eor v2.16b, v2.16b, v6.16b
+ eor v3.16b, v3.16b, v7.16b
+ eor v4.16b, v4.16b, v8.16b
+
+ cmp LEN, #0x40
+ b.ge loop_64
+
+less_64: /* Folding cache line into 128bit */
+ ldr qCONSTANT, [x3, #16]
+
+ pmull2 v5.1q, v1.2d, vCONSTANT.2d
+ pmull v1.1q, v1.1d, vCONSTANT.1d
+ eor v1.16b, v1.16b, v5.16b
+ eor v1.16b, v1.16b, v2.16b
+
+ pmull2 v5.1q, v1.2d, vCONSTANT.2d
+ pmull v1.1q, v1.1d, vCONSTANT.1d
+ eor v1.16b, v1.16b, v5.16b
+ eor v1.16b, v1.16b, v3.16b
+
+ pmull2 v5.1q, v1.2d, vCONSTANT.2d
+ pmull v1.1q, v1.1d, vCONSTANT.1d
+ eor v1.16b, v1.16b, v5.16b
+ eor v1.16b, v1.16b, v4.16b
+
+ cbz LEN, fold_64
+
+loop_16: /* Folding rest buffer into 128bit */
+ subs LEN, LEN, #0x10
+
+ ld1 {v2.16b}, [BUF], #0x10
+ pmull2 v5.1q, v1.2d, vCONSTANT.2d
+ pmull v1.1q, v1.1d, vCONSTANT.1d
+ eor v1.16b, v1.16b, v5.16b
+ eor v1.16b, v1.16b, v2.16b
+
+ b.ne loop_16
+
+fold_64:
+ /* perform the last 64 bit fold, also adds 32 zeroes
+ * to the input stream */
+ ext v2.16b, v1.16b, v1.16b, #8
+ pmull2 v2.1q, v2.2d, vCONSTANT.2d
+ ext v1.16b, v1.16b, vzr.16b, #8
+ eor v1.16b, v1.16b, v2.16b
+
+ /* final 32-bit fold */
+ ldr dCONSTANT, [x3, #32]
+ ldr d3, [x3, #40]
+
+ ext v2.16b, v1.16b, vzr.16b, #4
+ and v1.16b, v1.16b, v3.16b
+ pmull v1.1q, v1.1d, vCONSTANT.1d
+ eor v1.16b, v1.16b, v2.16b
+
+ /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+ ldr qCONSTANT, [x3, #48]
+
+ and v2.16b, v1.16b, v3.16b
+ ext v2.16b, vzr.16b, v2.16b, #8
+ pmull2 v2.1q, v2.2d, vCONSTANT.2d
+ and v2.16b, v2.16b, v3.16b
+ pmull v2.1q, v2.1d, vCONSTANT.1d
+ eor v1.16b, v1.16b, v2.16b
+ mov w0, v1.s[1]
+
+ ret
+ENDPROC(crc32_pmull_le)
+ENDPROC(crc32c_pmull_le)
+
+ .macro __crc32, c /* w0 = crc, x1 = buf, x2 = len; \c picks the crc32c opcodes */
+0: subs x2, x2, #16 /* consume 16 bytes per pass; flags are used below */
+ b.mi 8f /* fewer than 16 bytes left: finish in the tail code */
+ ldp x3, x4, [x1], #16
+CPU_BE( rev x3, x3 )
+CPU_BE( rev x4, x4 )
+ crc32\c\()x w0, w0, x3
+ crc32\c\()x w0, w0, x4
+ b.ne 0b /* still the flags from subs: loop until len reaches 0 */
+ ret
+
+8: tbz x2, #3, 4f /* x2 is negative here, but bits 3:0 still equal len % 16 */
+ ldr x3, [x1], #8
+CPU_BE( rev x3, x3 )
+ crc32\c\()x w0, w0, x3
+4: tbz x2, #2, 2f /* 4 byte piece of the tail, if any */
+ ldr w3, [x1], #4
+CPU_BE( rev w3, w3 )
+ crc32\c\()w w0, w0, w3
+2: tbz x2, #1, 1f /* 2 byte piece of the tail, if any */
+ ldrh w3, [x1], #2
+CPU_BE( rev16 w3, w3 )
+ crc32\c\()h w0, w0, w3
+1: tbz x2, #0, 0f /* final odd byte, if any */
+ ldrb w3, [x1]
+ crc32\c\()b w0, w0, w3
+0: ret
+ .endm
+
+ .align 5
+ENTRY(crc32_armv8_le)
+ __crc32
+ENDPROC(crc32_armv8_le)
+
+ .align 5
+ENTRY(crc32c_armv8_le)
+ __crc32 c
+ENDPROC(crc32c_armv8_le)
diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
new file mode 100644
index 000000000000..8594127d5e01
--- /dev/null
+++ b/arch/arm64/crypto/crc32-ce-glue.c
@@ -0,0 +1,212 @@
+/*
+ * Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/crc32.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/unaligned.h>
+
+#define PMULL_MIN_LEN 64L /* minimum size of buffer
+ * for crc32_pmull_le and crc32c_pmull_le */
+#define SCALE_F 16L /* size of NEON register in bytes */
+
+asmlinkage u32 crc32_pmull_le(const u8 buf[], u64 len, u32 init_crc);
+asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], size_t len);
+
+asmlinkage u32 crc32c_pmull_le(const u8 buf[], u64 len, u32 init_crc);
+asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], size_t len);
+
+static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], size_t len);
+static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], size_t len);
+
+static int crc32_pmull_cra_init(struct crypto_tfm *tfm)
+{
+	/* the tfm context holds the seed ("key"); for crc32 it defaults to 0 */
+	*(u32 *)crypto_tfm_ctx(tfm) = 0;
+
+	return 0;
+}
+
+static int crc32c_pmull_cra_init(struct crypto_tfm *tfm)
+{
+	/* the tfm context holds the seed ("key"); for crc32c it is all ones */
+	*(u32 *)crypto_tfm_ctx(tfm) = ~0;
+
+	return 0;
+}
+
+static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key,
+			      unsigned int keylen)
+{
+	/* the "key" is a 32-bit little-endian seed kept in the tfm context */
+	if (keylen == sizeof(u32)) {
+		*(u32 *)crypto_shash_ctx(hash) = le32_to_cpup((__le32 *)key);
+		return 0;
+	}
+
+	crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	return -EINVAL;
+}
+
+static int crc32_pmull_init(struct shash_desc *desc)
+{
+	/* every digest starts from the seed kept in the transform context */
+	u32 *state = shash_desc_ctx(desc);
+
+	*state = *(u32 *)crypto_shash_ctx(desc->tfm);
+	return 0;
+}
+
+static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int length)
+{
+	unsigned int misalign = (u64)data % SCALE_F;
+	u32 *state = shash_desc_ctx(desc);
+	unsigned int chunk;
+
+	/* head: the fallback code takes us to the next 16 byte boundary */
+	if (misalign) {
+		chunk = min_t(u32, length, SCALE_F - misalign);
+		*state = fallback_crc32(*state, data, chunk);
+		length -= chunk;
+		data += chunk;
+	}
+
+	/* body: whole 16 byte blocks, if at least PMULL_MIN_LEN bytes remain */
+	if (length >= PMULL_MIN_LEN) {
+		chunk = round_down(length, SCALE_F);
+
+		kernel_neon_begin_partial(10);
+		*state = crc32_pmull_le(data, chunk, *state);
+		kernel_neon_end();
+
+		length -= chunk;
+		data += chunk;
+	}
+
+	if (length)	/* tail: leftovers go through the fallback code */
+		*state = fallback_crc32(*state, data, length);
+	return 0;
+}
+
+static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int length)
+{
+	unsigned int misalign = (u64)data % SCALE_F;
+	u32 *state = shash_desc_ctx(desc);
+	unsigned int chunk;
+
+	/* head: the fallback code takes us to the next 16 byte boundary */
+	if (misalign) {
+		chunk = min_t(u32, length, SCALE_F - misalign);
+		*state = fallback_crc32c(*state, data, chunk);
+
+		length -= chunk;
+		data += chunk;
+	}
+
+	/* body: whole 16 byte blocks, if at least PMULL_MIN_LEN bytes remain */
+	if (length >= PMULL_MIN_LEN) {
+		chunk = round_down(length, SCALE_F);
+
+		kernel_neon_begin_partial(10);
+		*state = crc32c_pmull_le(data, chunk, *state);
+		kernel_neon_end();
+
+		length -= chunk;
+		data += chunk;
+	}
+
+	if (length)	/* tail: leftovers go through the fallback code */
+		*state = fallback_crc32c(*state, data, length);
+	return 0;
+}
+
+static int crc32_pmull_final(struct shash_desc *desc, u8 *out)
+{
+	/* the digest is the running CRC as is, stored little-endian */
+	put_unaligned_le32(*(u32 *)shash_desc_ctx(desc), out);
+
+	return 0;
+}
+
+static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
+{
+	/* the digest is the inverted running CRC, stored little-endian */
+	put_unaligned_le32(~*(u32 *)shash_desc_ctx(desc), out);
+
+	return 0;
+}
+
+static struct shash_alg crc32_pmull_algs[] = { { /* [0]: crc32 */
+ .setkey = crc32_pmull_setkey,
+ .init = crc32_pmull_init,
+ .update = crc32_pmull_update,
+ .final = crc32_pmull_final,
+ .descsize = sizeof(u32), /* desc ctx: the running CRC */
+ .digestsize = sizeof(u32),
+ /* tfm ctx: the u32 seed written by cra_init and setkey */
+ .base.cra_ctxsize = sizeof(u32),
+ .base.cra_init = crc32_pmull_cra_init,
+ .base.cra_name = "crc32",
+ .base.cra_driver_name = "crc32-arm64-ce",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_module = THIS_MODULE,
+}, { /* [1]: crc32c - shares setkey and init, differs in cra_init, update and final */
+ .setkey = crc32_pmull_setkey,
+ .init = crc32_pmull_init,
+ .update = crc32c_pmull_update,
+ .final = crc32c_pmull_final,
+ .descsize = sizeof(u32), /* desc ctx: the running CRC */
+ .digestsize = sizeof(u32),
+ /* tfm ctx: the u32 seed written by cra_init and setkey */
+ .base.cra_ctxsize = sizeof(u32),
+ .base.cra_init = crc32c_pmull_cra_init,
+ .base.cra_name = "crc32c",
+ .base.cra_driver_name = "crc32c-arm64-ce",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_module = THIS_MODULE,
+} };
+
+static int __init crc32_pmull_mod_init(void)
+{
+	/* default fallbacks: the crc32 library routines */
+	fallback_crc32 = crc32_le;
+	fallback_crc32c = __crc32c_le;
+	if (elf_hwcap & HWCAP_CRC32) {
+		fallback_crc32 = crc32_armv8_le;
+		fallback_crc32c = crc32c_armv8_le;
+	}
+
+	return crypto_register_shashes(crc32_pmull_algs,
+				       ARRAY_SIZE(crc32_pmull_algs));
+}
+
+static void __exit crc32_pmull_mod_exit(void)
+{
+ crypto_unregister_shashes(crc32_pmull_algs,
+ ARRAY_SIZE(crc32_pmull_algs)); /* removes every entry of crc32_pmull_algs */
+}
+
+module_cpu_feature_match(PMULL, crc32_pmull_mod_init);
+module_exit(crc32_pmull_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
--
2.7.4
^ permalink raw reply related
* [PATCH v3 6/6] crypto: arm/crc32 - accelerated support based on x86 SSE implementation
From: Ard Biesheuvel @ 2016-12-05 18:42 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1480963348-24203-1-git-send-email-ard.biesheuvel@linaro.org>
This is a combination of the Intel algorithm implemented using SSE
and PCLMULQDQ instructions from arch/x86/crypto/crc32-pclmul_asm.S, and
the new CRC32 extensions introduced for both 32-bit and 64-bit ARM in
version 8 of the architecture. Two versions of the above combo are
provided, one for CRC32 and one for CRC32C.
The PMULL/NEON algorithm is faster, but operates on blocks of at least
64 bytes, and on multiples of 16 bytes only. For the remaining input,
or for all input on systems that lack the PMULL 64x64->128 instructions,
the CRC32 instructions will be used.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
arch/arm/crypto/Kconfig | 5 +
arch/arm/crypto/Makefile | 2 +
arch/arm/crypto/crc32-ce-core.S | 306 ++++++++++++++++++++
arch/arm/crypto/crc32-ce-glue.c | 242 ++++++++++++++++
4 files changed, 555 insertions(+)
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index fce801fa52a1..de7bb20815bf 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -125,4 +125,9 @@ config CRYPTO_CRCT10DIF_ARM_CE
depends on KERNEL_MODE_NEON && CRC_T10DIF
select CRYPTO_HASH
+config CRYPTO_CRC32_ARM_CE
+ tristate "CRC32(C) digest algorithm using CRC and/or PMULL instructions"
+ depends on KERNEL_MODE_NEON && CRC32
+ select CRYPTO_HASH
+
endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index fc77265014b7..b578a1820ab1 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -14,6 +14,7 @@ ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o
+ce-obj-$(CONFIG_CRYPTO_CRC32_ARM_CE) += crc32-arm-ce.o
ifneq ($(ce-obj-y)$(ce-obj-m),)
ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y)
@@ -38,6 +39,7 @@ sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
+crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
quiet_cmd_perl = PERL $@
cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/crc32-ce-core.S b/arch/arm/crypto/crc32-ce-core.S
new file mode 100644
index 000000000000..e63d400dc5c1
--- /dev/null
+++ b/arch/arm/crypto/crc32-ce-core.S
@@ -0,0 +1,306 @@
+/*
+ * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ * Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .align 6
+ .arch armv8-a
+ .arch_extension crc
+ .fpu crypto-neon-fp-armv8
+
+.Lcrc32_constants:
+ /*
+ * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
+ * #define CONSTANT_R1 0x154442bd4LL
+ *
+ * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
+ * #define CONSTANT_R2 0x1c6e41596LL
+ */
+ .quad 0x0000000154442bd4
+ .quad 0x00000001c6e41596
+
+ /*
+ * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
+ * #define CONSTANT_R3 0x1751997d0LL
+ *
+ * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
+ * #define CONSTANT_R4 0x0ccaa009eLL
+ */
+ .quad 0x00000001751997d0
+ .quad 0x00000000ccaa009e
+
+ /*
+ * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
+ * #define CONSTANT_R5 0x163cd6124LL
+ */
+ .quad 0x0000000163cd6124
+ .quad 0x00000000FFFFFFFF
+
+ /*
+ * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+ *
+ * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
+ * = 0x1F7011641LL
+ * #define CONSTANT_RU 0x1F7011641LL
+ */
+ .quad 0x00000001DB710641
+ .quad 0x00000001F7011641
+
+.Lcrc32c_constants: /* consumed through r3 by the same code as .Lcrc32_constants */
+ .quad 0x00000000740eef02 /* offset 0: dCONSTANTl in loop_64 */
+ .quad 0x000000009e4addf8 /* offset 8: dCONSTANTh in loop_64 */
+ .quad 0x00000000f20c0dfe /* offset 16: dCONSTANTl from less_64 onwards */
+ .quad 0x000000014cd00bd6 /* offset 24: dCONSTANTh from less_64 through fold_64 */
+ .quad 0x00000000dd45aab8 /* offset 32: multiplier of the final 32-bit fold */
+ .quad 0x00000000FFFFFFFF /* offset 40: low-32-bit mask loaded into d6 */
+ .quad 0x0000000105ec76f0 /* offset 48: dCONSTANTl of the Barrett reduction */
+ .quad 0x00000000dea713f1 /* offset 56: dCONSTANTh of the Barrett reduction */
+
+ dCONSTANTl .req d0
+ dCONSTANTh .req d1
+ qCONSTANT .req q0
+
+ BUF .req r0
+ LEN .req r1
+ CRC .req r2
+
+ qzr .req q9
+
+ /**
+ * Calculate crc32 (crc32_pmull_le) or crc32c (crc32c_pmull_le)
+ * BUF (r0) - buffer, loaded with :128 alignment hints
+ * LEN (r1) - sizeof buffer (rounded down to a multiple of 16 bytes), LEN should be > 63
+ * CRC (r2) - initial crc32
+ * return r0 crc32
+ * uint crc32_pmull_le(unsigned char const *buffer,
+ * size_t len, uint crc32)
+ */
+ENTRY(crc32_pmull_le)
+ adr r3, .Lcrc32_constants /* r3 = constant table for CRC32 */
+ b 0f
+
+ENTRY(crc32c_pmull_le)
+ adr r3, .Lcrc32c_constants /* r3 = constant table for CRC32C */
+
+0: bic LEN, LEN, #15 /* round LEN down to a multiple of 16 */
+ vld1.8 {q1-q2}, [BUF, :128]! /* q1-q4 = first 64 bytes of input */
+ vld1.8 {q3-q4}, [BUF, :128]!
+ vmov.i8 qzr, #0
+ vmov.i8 qCONSTANT, #0
+ vmov dCONSTANTl[0], CRC
+ veor.8 d2, d2, dCONSTANTl /* xor the initial CRC into the low 32 bits of q1 */
+ sub LEN, LEN, #0x40 /* account for the 64 bytes loaded above */
+ cmp LEN, #0x40
+ blt less_64
+
+ vld1.64 {qCONSTANT}, [r3] /* d0/d1 = table entries at offsets 0 and 8 */
+
+loop_64: /* 64 bytes Full cache line folding */
+ sub LEN, LEN, #0x40
+
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q6, d5, dCONSTANTh
+ vmull.p64 q7, d7, dCONSTANTh
+ vmull.p64 q8, d9, dCONSTANTh
+
+ vmull.p64 q1, d2, dCONSTANTl
+ vmull.p64 q2, d4, dCONSTANTl
+ vmull.p64 q3, d6, dCONSTANTl
+ vmull.p64 q4, d8, dCONSTANTl
+
+ veor.8 q1, q1, q5
+ vld1.8 {q5}, [BUF, :128]!
+ veor.8 q2, q2, q6
+ vld1.8 {q6}, [BUF, :128]!
+ veor.8 q3, q3, q7
+ vld1.8 {q7}, [BUF, :128]!
+ veor.8 q4, q4, q8
+ vld1.8 {q8}, [BUF, :128]!
+
+ veor.8 q1, q1, q5
+ veor.8 q2, q2, q6
+ veor.8 q3, q3, q7
+ veor.8 q4, q4, q8
+
+ cmp LEN, #0x40
+ bge loop_64
+
+less_64: /* Folding cache line into 128bit */
+ vldr dCONSTANTl, [r3, #16] /* switch to the table entries at offsets 16 and 24 */
+ vldr dCONSTANTh, [r3, #24]
+
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q5
+ veor.8 q1, q1, q2
+
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q5
+ veor.8 q1, q1, q3
+
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q5
+ veor.8 q1, q1, q4
+
+ teq LEN, #0 /* no 16-byte blocks left: skip loop_16 */
+ beq fold_64
+
+loop_16: /* Folding rest buffer into 128bit */
+ subs LEN, LEN, #0x10 /* sets Z for the bne below when this is the last block */
+
+ vld1.8 {q2}, [BUF, :128]!
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q5
+ veor.8 q1, q1, q2
+
+ bne loop_16
+
+fold_64:
+ /* perform the last 64 bit fold, also adds 32 zeroes
+ * to the input stream */
+ vmull.p64 q2, d2, dCONSTANTh
+ vext.8 q1, q1, qzr, #8
+ veor.8 q1, q1, q2
+
+ /* final 32-bit fold */
+ vldr dCONSTANTl, [r3, #32]
+ vldr d6, [r3, #40] /* d6 = 0x00000000FFFFFFFF mask from the table */
+ vmov.i8 d7, #0 /* q3 = d7:d6 = that mask with a zero upper half */
+
+ vext.8 q2, q1, qzr, #4
+ vand.8 d2, d2, d6
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q2
+
+ /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+ vldr dCONSTANTl, [r3, #48]
+ vldr dCONSTANTh, [r3, #56]
+
+ vand.8 q2, q1, q3
+ vext.8 q2, qzr, q2, #8
+ vmull.p64 q2, d5, dCONSTANTh
+ vand.8 q2, q2, q3
+ vmull.p64 q2, d4, dCONSTANTl
+ veor.8 q1, q1, q2
+ vmov r0, s5 /* return bits 63:32 of q1 (s5 is the upper half of d2) */
+
+ bx lr
+ENDPROC(crc32_pmull_le)
+ENDPROC(crc32c_pmull_le)
+
+ .macro __crc32, c /* \c is empty for the crc32 forms, "c" for the crc32c forms; r0 = crc, r1 = buffer, r2 = length */
+ subs ip, r2, #8 /* ip = length - 8 */
+ bmi .Ltail\c /* fewer than 8 bytes: go straight to the tail */
+
+ tst r1, #3 /* buffer not 4-byte aligned? */
+ bne .Lunaligned\c
+
+ teq ip, #0 /* Z set when exactly 8 bytes remain */
+.Laligned8\c:
+ ldrd r2, r3, [r1], #8
+ARM_BE8(rev r2, r2 )
+ARM_BE8(rev r3, r3 )
+ crc32\c\()w r0, r0, r2
+ crc32\c\()w r0, r0, r3
+ bxeq lr /* flags still come from the teq/subs before the loads: done if those were the last 8 bytes */
+ subs ip, ip, #8
+ bpl .Laligned8\c
+
+.Ltail\c:
+ tst ip, #4 /* ip is (remaining - 8), so its low 3 bits equal those of the remaining count */
+ beq 2f
+ ldr r3, [r1], #4
+ARM_BE8(rev r3, r3 )
+ crc32\c\()w r0, r0, r3
+
+2: tst ip, #2
+ beq 1f
+ ldrh r3, [r1], #2
+ARM_BE8(rev16 r3, r3 )
+ crc32\c\()h r0, r0, r3
+
+1: tst ip, #1
+ bxeq lr
+ ldrb r3, [r1]
+ crc32\c\()b r0, r0, r3
+ bx lr
+
+.Lunaligned\c:
+ tst r1, #1 /* odd address: consume one byte first */
+ beq 2f
+ ldrb r3, [r1], #1
+ subs r2, r2, #1
+ crc32\c\()b r0, r0, r3
+
+ tst r1, #2 /* still not 4-byte aligned: consume a halfword */
+ beq 0f
+2: ldrh r3, [r1], #2
+ subs r2, r2, #2
+ARM_BE8(rev16 r3, r3 )
+ crc32\c\()h r0, r0, r3
+
+0: subs ip, r2, #8 /* recompute ip from the reduced length */
+ bpl .Laligned8\c
+ b .Ltail\c
+ .endm
+
+ .align 5
+ENTRY(crc32_armv8_le) /* C side: u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len) */
+ __crc32
+ENDPROC(crc32_armv8_le)
+
+ .align 5
+ENTRY(crc32c_armv8_le) /* C side: u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len) */
+ __crc32 c
+ENDPROC(crc32c_armv8_le)
diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c
new file mode 100644
index 000000000000..e1566bec1016
--- /dev/null
+++ b/arch/arm/crypto/crc32-ce-glue.c
@@ -0,0 +1,242 @@
+/*
+ * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc32.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+
+#define PMULL_MIN_LEN 64L /* minimum size of buffer
+ * for crc32_pmull_le/crc32c_pmull_le */
+#define SCALE_F 16L /* size of NEON register */
+
+asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc);
+asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len);
+
+asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc);
+asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len);
+
+static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], u32 len);
+static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], u32 len);
+
+static int crc32_cra_init(struct crypto_tfm *tfm) /* .cra_init for "crc32": default seed is 0 */
+{
+ u32 *key = crypto_tfm_ctx(tfm); /* the tfm context holds a single u32 seed (cra_ctxsize) */
+
+ *key = 0;
+ return 0;
+}
+
+static int crc32c_cra_init(struct crypto_tfm *tfm) /* .cra_init for "crc32c": default seed is all ones */
+{
+ u32 *key = crypto_tfm_ctx(tfm); /* the tfm context holds a single u32 seed (cra_ctxsize) */
+
+ *key = ~0;
+ return 0;
+}
+
+static int crc32_setkey(struct crypto_shash *hash, const u8 *key,
+ unsigned int keylen) /* store a 4-byte little-endian seed in the tfm context */
+{
+ u32 *mctx = crypto_shash_ctx(hash);
+
+ if (keylen != sizeof(u32)) { /* only a 4-byte seed is accepted */
+ crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ *mctx = get_unaligned_le32(key); /* key is a byte pointer: do not assume 4-byte alignment */
+ return 0;
+}
+
+static int crc32_init(struct shash_desc *desc) /* .init: start the running CRC from the tfm seed */
+{
+ u32 *mctx = crypto_shash_ctx(desc->tfm);
+ u32 *crc = shash_desc_ctx(desc);
+
+ *crc = *mctx;
+ return 0;
+}
+
+static int crc32_update(struct shash_desc *desc, const u8 *data,
+ unsigned int length) /* default "crc32" .update: whole buffer through crc32_armv8_le */
+{
+ u32 *crc = shash_desc_ctx(desc);
+
+ *crc = crc32_armv8_le(*crc, data, length);
+ return 0;
+}
+
+static int crc32c_update(struct shash_desc *desc, const u8 *data,
+ unsigned int length) /* default "crc32c" .update: whole buffer through crc32c_armv8_le */
+{
+ u32 *crc = shash_desc_ctx(desc);
+
+ *crc = crc32c_armv8_le(*crc, data, length);
+ return 0;
+}
+
+static int crc32_final(struct shash_desc *desc, u8 *out) /* .final for "crc32": emit the running CRC unchanged */
+{
+ u32 *crc = shash_desc_ctx(desc);
+
+ put_unaligned_le32(*crc, out); /* little-endian store, no alignment assumed for out */
+ return 0;
+}
+
+static int crc32c_final(struct shash_desc *desc, u8 *out) /* .final for "crc32c": emit the bitwise inverse of the running CRC */
+{
+ u32 *crc = shash_desc_ctx(desc);
+
+ put_unaligned_le32(~*crc, out); /* little-endian store, no alignment assumed for out */
+ return 0;
+}
+
+static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
+ unsigned int length) /* "crc32" .update installed at init when PMULL is present */
+{
+ u32 *crc = shash_desc_ctx(desc);
+ unsigned int l;
+
+ if (may_use_simd()) { /* the NEON path is only attempted when SIMD may be used here */
+ if ((u32)data % SCALE_F) { /* head: bytes up to the next 16-byte boundary go to the fallback */
+ l = min_t(u32, length, SCALE_F - ((u32)data % SCALE_F));
+
+ *crc = fallback_crc32(*crc, data, l);
+
+ data += l;
+ length -= l;
+ }
+
+ if (length >= PMULL_MIN_LEN) { /* bulk: at least 64 bytes, rounded down to whole 16-byte blocks */
+ l = round_down(length, SCALE_F);
+
+ kernel_neon_begin();
+ *crc = crc32_pmull_le(data, l, *crc);
+ kernel_neon_end();
+
+ data += l;
+ length -= l;
+ }
+ }
+
+ if (length > 0) /* whatever the PMULL path did not consume */
+ *crc = fallback_crc32(*crc, data, length);
+
+ return 0;
+}
+
+static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
+ unsigned int length) /* "crc32c" .update installed at init when PMULL is present */
+{
+ u32 *crc = shash_desc_ctx(desc);
+ unsigned int l;
+
+ if (may_use_simd()) { /* the NEON path is only attempted when SIMD may be used here */
+ if ((u32)data % SCALE_F) { /* head: bytes up to the next 16-byte boundary go to the fallback */
+ l = min_t(u32, length, SCALE_F - ((u32)data % SCALE_F));
+
+ *crc = fallback_crc32c(*crc, data, l);
+
+ data += l;
+ length -= l;
+ }
+
+ if (length >= PMULL_MIN_LEN) { /* bulk: at least 64 bytes, rounded down to whole 16-byte blocks */
+ l = round_down(length, SCALE_F);
+
+ kernel_neon_begin();
+ *crc = crc32c_pmull_le(data, l, *crc);
+ kernel_neon_end();
+
+ data += l;
+ length -= l;
+ }
+ }
+
+ if (length > 0) /* whatever the PMULL path did not consume */
+ *crc = fallback_crc32c(*crc, data, length);
+
+ return 0;
+}
+
+static struct shash_alg crc32_pmull_algs[] = { { /* [0] = "crc32", [1] = "crc32c" */
+ .setkey = crc32_setkey,
+ .init = crc32_init,
+ .update = crc32_update, /* replaced by crc32_pmull_update in crc32_pmull_mod_init when PMULL is present */
+ .final = crc32_final,
+ .descsize = sizeof(u32),
+ .digestsize = sizeof(u32),
+
+ .base.cra_ctxsize = sizeof(u32),
+ .base.cra_init = crc32_cra_init,
+ .base.cra_name = "crc32",
+ .base.cra_driver_name = "crc32-arm-ce",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_module = THIS_MODULE,
+}, {
+ .setkey = crc32_setkey, /* shared with "crc32": same 4-byte seed handling */
+ .init = crc32_init,
+ .update = crc32c_update, /* replaced by crc32c_pmull_update in crc32_pmull_mod_init when PMULL is present */
+ .final = crc32c_final,
+ .descsize = sizeof(u32),
+ .digestsize = sizeof(u32),
+
+ .base.cra_ctxsize = sizeof(u32),
+ .base.cra_init = crc32c_cra_init,
+ .base.cra_name = "crc32c",
+ .base.cra_driver_name = "crc32c-arm-ce",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_module = THIS_MODULE,
+} };
+
+static int __init crc32_pmull_mod_init(void) /* module init: choose update routines from hwcaps, then register */
+{
+ if (elf_hwcap2 & HWCAP2_PMULL) { /* PMULL present: switch both algs to the PMULL update routines */
+ crc32_pmull_algs[0].update = crc32_pmull_update;
+ crc32_pmull_algs[1].update = crc32c_pmull_update;
+
+ if (elf_hwcap2 & HWCAP2_CRC32) { /* fallback_* are only called from the PMULL update routines */
+ fallback_crc32 = crc32_armv8_le;
+ fallback_crc32c = crc32c_armv8_le;
+ } else {
+ fallback_crc32 = crc32_le;
+ fallback_crc32c = __crc32c_le;
+ }
+ } else if (!(elf_hwcap2 & HWCAP2_CRC32)) { /* neither PMULL nor CRC32: refuse to load */
+ return -ENODEV;
+ }
+
+ return crypto_register_shashes(crc32_pmull_algs,
+ ARRAY_SIZE(crc32_pmull_algs));
+}
+
+static void __exit crc32_pmull_mod_exit(void) /* module exit: unregister every entry of crc32_pmull_algs */
+{
+ crypto_unregister_shashes(crc32_pmull_algs,
+ ARRAY_SIZE(crc32_pmull_algs));
+}
+
+module_init(crc32_pmull_mod_init);
+module_exit(crc32_pmull_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crc32");
+MODULE_ALIAS_CRYPTO("crc32c");
--
2.7.4
^ permalink raw reply related
* [PATCH] ARM: dts: imx6sx-udoo-neo: Pass the 'phy-reset-duration' property
From: Fabio Estevam @ 2016-12-05 19:28 UTC (permalink / raw)
To: linux-arm-kernel
From: Fabio Estevam <fabio.estevam@nxp.com>
imx6sx-udoo-neo has a KSZ8091 Ethernet PHY, which requires the reset
signal to be low for at least 10ms.
Pass the 'phy-reset-duration' property to reflect such requirement.
Signed-off-by: Fabio Estevam <fabio.estevam@nxp.com>
---
arch/arm/boot/dts/imx6sx-udoo-neo.dtsi | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm/boot/dts/imx6sx-udoo-neo.dtsi b/arch/arm/boot/dts/imx6sx-udoo-neo.dtsi
index 2b65d26..d1ce9af 100644
--- a/arch/arm/boot/dts/imx6sx-udoo-neo.dtsi
+++ b/arch/arm/boot/dts/imx6sx-udoo-neo.dtsi
@@ -86,6 +86,7 @@
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_enet1>;
phy-mode = "rmii";
+ phy-reset-duration = <10>;
phy-reset-gpios = <&gpio2 1 GPIO_ACTIVE_LOW>;
};
--
2.7.4
^ permalink raw reply related
* ILP32 for ARM64: testing with glibc testsuite
From: Steve Ellcey @ 2016-12-05 19:33 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <mvmeg1mn0yp.fsf@hawking.suse.de>
On Mon, 2016-12-05 at 11:07 +0100, Andreas Schwab wrote:
> On Dez 05 2016, "Zhangjian (Bamvor)" <bamvor.zhangjian@huawei.com>
> wrote:
>
> >
> > Is there any progress on it? We could collaborate to fix those
> > issues.
> All the elf/nptl/rt fails should be fixed by the recent binutils
> fixes.
>
> Andreas.
I am using binutils ToT and Yury's latest patch (https://sourceware.org
/ml/binutils/2016-12/msg00039.html) and I am still seeing some nptl and
rt failures in the glibc testsuite, specifically:
FAIL: nptl/tst-cancel26
FAIL: nptl/tst-cancel27
FAIL: nptl/tst-stack4
FAIL: rt/tst-mqueue1
FAIL: rt/tst-mqueue2
FAIL: rt/tst-mqueue4
FAIL: rt/tst-mqueue7
Steve Ellcey
sellcey at caviumnetworks.com
^ permalink raw reply
* [PATCH] ARM: dts: imx7d: fix LCDIF clock assignment
From: Stefan Agner @ 2016-12-05 20:29 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20161205070609.a6t7hzh3m3l2t37s@pengutronix.de>
On 2016-12-04 23:06, Uwe Kleine-König wrote:
> Hello Stefan,
>
> On Sun, Dec 04, 2016 at 05:26:58PM -0800, Stefan Agner wrote:
>> Since this fixes a kernel freeze, is there a chance to get this still in
>> 4.9?
>
> a Fixes:-Line would be nice then.
Good point.
Fixes: e8ed73f691bd ("ARM: dts: imx7d: add lcdif support")
--
Stefan
^ permalink raw reply
* [PATCH v3] PCI/ACPI: xgene: Add ECAM quirk for X-Gene PCIe controller
From: Bjorn Helgaas @ 2016-12-05 21:20 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <CADaLNDk-cMW8k+bXqH3+yYCye_kHf6RXUYQ=eCCZ4FF2ZTjgkQ@mail.gmail.com>
On Fri, Dec 02, 2016 at 11:06:30PM -0800, Duc Dang wrote:
> On Fri, Dec 2, 2016 at 3:39 PM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> > diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c
> > index 8a177a1..a16fc8e 100644
> > --- a/arch/arm64/kernel/pci.c
> > +++ b/arch/arm64/kernel/pci.c
> > @@ -114,6 +114,19 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
> > return 0;
> > }
> >
> > +static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
> > +{
> > + struct resource_entry *entry, *tmp;
> > + int status;
> > +
> > + status = acpi_pci_probe_root_resources(ci);
> > + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
> > + if (!(entry->res->flags & IORESOURCE_WINDOW))
> > + resource_list_destroy_entry(entry);
> > + }
> > + return status;
> > +}
> > +
> > /*
> > * Lookup the bus range for the domain in MCFG, and set up config space
> > * mapping.
> > @@ -190,6 +203,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
> > }
> >
> > root_ops->release_info = pci_acpi_generic_release_info;
> > + root_ops->prepare_resources = pci_acpi_root_prepare_resources;
> > root_ops->pci_ops = &ri->cfg->ops->pci_ops;
> > bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg);
> > if (!bus)
>
> I tried your patch above with my X-Gene ECAM v4 patch on Mustang, here
> is the kernel boot log and output of 'cat /proc/iomem'. The PCIe core
> does not print the MMIO space as a window (which is expected per your
> patch above).
Thanks!
> ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-ff])
> acpi PNP0A08:00: _OSC: OS supports [ExtendedConfig ASPM ClockPM Segments MSI]
> acpi PNP0A08:00: _OSC: OS now controls [PCIeHotplug PME AER PCIeCapability]
> acpi PNP0A08:00: MCFG quirk: ECAM at [mem 0xe0d0000000-0xe0dfffffff] for [bus 00-ff] with xgene_v1_pcie_ecam_ops
> acpi PNP0A08:00: [Firmware Bug]: ECAM area [mem 0xe0d0000000-0xe0dfffffff] not reserved in ACPI namespace
> acpi PNP0A08:00: ECAM at [mem 0xe0d0000000-0xe0dfffffff] for [bus 00-ff]
> Remapped I/O 0x000000e010000000 to [io 0x0000-0xffff window]
> PCI host bridge to bus 0000:00
> pci_bus 0000:00: root bus resource [io 0x0000-0xffff window] (bus address [0x10000000-0x1000ffff])
> pci_bus 0000:00: root bus resource [mem 0xe040000000-0xe07fffffff window] (bus address [0x40000000-0x7fffffff])
> pci_bus 0000:00: root bus resource [mem 0xf000000000-0xffffffffff window]
> pci_bus 0000:00: root bus resource [bus 00-ff]
Yup, no bridge register space here; that's good. I assume the bridge
registers are at [mem 0x1f2b0000-0x1f2bffff] as shown in /proc/iomem
below.
> [root@(none) ~]# cat /proc/iomem
> ...
> 19000000-19007fff : 808622B7:00
> 1900c100-190fffff : 808622B7:00
> 1900c100-190fffff : 808622B7:00
> 19800000-19807fff : 808622B7:01
> 1980c100-198fffff : 808622B7:01
> 1980c100-198fffff : 808622B7:01
> ...
> 1f280000-1f28ffff : 808622B7:00
> 1f290000-1f29ffff : 808622B7:01
I'm curious what these "808622B7" devices are. Per ACPI 6.0, sec
6.1.5, that looks like a PCI vendor ID, which I guess is a valid ACPI
ID. But these resources don't seem to have any connection with PCI
(they're not in any of the host bridge apertures).
> 1f2b0000-1f2bffff : PNP0A08:00
Looks like the bridge register space; good.
> e040000000-e07fffffff : PCI Bus 0000:00
> e040000000-e0401fffff : PCI Bus 0000:01
> e040000000-e0400fffff : 0000:01:00.0
> e040000000-e0400fffff : mlx4_core
> e040100000-e0401fffff : 0000:01:00.0
> e0d0000000-e0dfffffff : PCI ECAM
This region should be described in either a PNP0C02 device or (if we
decide we can allow "consumer" descriptors) the PNP0A08 device. I
assume you'll fix that in a future firmware release.
But I think this reservation from pci_ecam_create() is good enough for
now.
> f000000000-ffffffffff : PCI Bus 0000:00
> f000000000-f001ffffff : PCI Bus 0000:01
> f000000000-f001ffffff : 0000:01:00.0
> f000000000-f001ffffff : mlx4_core
^ permalink raw reply
* [PATCH v3] PCI/ACPI: xgene: Add ECAM quirk for X-Gene PCIe controller
From: Bjorn Helgaas @ 2016-12-05 21:21 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <62dd5462-4d68-b89f-2b45-0d58bbd96bcd@redhat.com>
On Fri, Dec 02, 2016 at 07:33:46PM -0500, Jon Masters wrote:
> On 12/02/2016 06:39 PM, Bjorn Helgaas wrote:
> > On Thu, Dec 01, 2016 at 11:08:23PM -0500, Jon Masters wrote:
>
> >> Let's see if I summarized this correctly...
> >>
> >> 1. The MMIO registers for the host bridge itself need to be described
> >> somewhere, especially if we need to find those in a quirk and poke
> >> them. Since those registers are very much part of the bridge device,
> >> it makes sense for them to be in the _CRS for PNP0A08/PNP0A03.
> >>
> >> 2. The address space covering these registers MUST be described as a
> >> ResourceConsumer in order to avoid accidentally exposing them as
> >> available for use by downstream devices on the PCI bus.
> >>
> >> 3. The ACPI specification allows for resources of the type "Memory32Fixed".
> >> This is a macro that doesn't have the notion of a producer or consumer.
> >> HOWEVER various interpretations seem to be that this could/should
> >> default to being interpreted as a consumed region.
> >
> > I agree; I think that per spec, Memory24, Memory32, Memory32Fixed, IO,
> > and FixedIO should all be for consumed resources, not for bridge
> > windows, since they don't have the notion of producer.
>
> Ok. If we ultimately codify this somewhere as the general Linux kernel
> consensus (Rafael?) then we can also go and get the various ARM server
> specs updated to reflect this in (for e.g.) reference firmware builds.
>
> > I'm pretty sure there's x86 firmware in the field that uses these for
> > windows, so I think we have to accept that usage, at least on x86.
>
> Ok. I was pondering how to even go about finding that out, but even if
> I scheduled a job across RH's infra to look, that would be a drop in
> the bucket of possible machines that might be out there doing this.
Hmmm, when researching this, I thought I came across a change
specifically for a machine that used Memory32Fixed this way, but I
can't find it now.
The only thing I did find was some old experiments with Windows that
showed it interpreting a Memory32Fixed region as a window and putting
PCI devices in it: https://bugzilla.kernel.org/show_bug.cgi?id=15817
But that was a synthetic example with qemu, not a real machine in the
field.
> > Even without this patch, I don't think it's a show-stopper to have
> > Linux mistakenly thinking this region is routed to PCI, because the
> > driver does reserve it and the PCI core will never try to use it.
>
> Ok. So are you happy with pulling in Duc's v4 patch and retaining
> status quo on the bridge resources for 4.10?
Yes, I think it looks good. I'll finish packaging things up and
repost the current series.
Bjorn
^ permalink raw reply
* [PATCH v3] PCI/ACPI: xgene: Add ECAM quirk for X-Gene PCIe controller
From: Duc Dang @ 2016-12-05 21:40 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20161205212012.GA22455@bhelgaas-glaptop.roam.corp.google.com>
On Mon, Dec 5, 2016 at 1:20 PM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> On Fri, Dec 02, 2016 at 11:06:30PM -0800, Duc Dang wrote:
>> On Fri, Dec 2, 2016 at 3:39 PM, Bjorn Helgaas <helgaas@kernel.org> wrote:
>
>> > diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c
>> > index 8a177a1..a16fc8e 100644
>> > --- a/arch/arm64/kernel/pci.c
>> > +++ b/arch/arm64/kernel/pci.c
>> > @@ -114,6 +114,19 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
>> > return 0;
>> > }
>> >
>> > +static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
>> > +{
>> > + struct resource_entry *entry, *tmp;
>> > + int status;
>> > +
>> > + status = acpi_pci_probe_root_resources(ci);
>> > + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
>> > + if (!(entry->res->flags & IORESOURCE_WINDOW))
>> > + resource_list_destroy_entry(entry);
>> > + }
>> > + return status;
>> > +}
>> > +
>> > /*
>> > * Lookup the bus range for the domain in MCFG, and set up config space
>> > * mapping.
>> > @@ -190,6 +203,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
>> > }
>> >
>> > root_ops->release_info = pci_acpi_generic_release_info;
>> > + root_ops->prepare_resources = pci_acpi_root_prepare_resources;
>> > root_ops->pci_ops = &ri->cfg->ops->pci_ops;
>> > bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg);
>> > if (!bus)
>>
>> I tried your patch above with my X-Gene ECAM v4 patch on Mustang, here
>> is the kernel boot log and output of 'cat /proc/iomem'. The PCIe core
>> does not print the MMIO space as a window (which is expected per your
>> patch above).
>
> Thanks!
>
>> ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-ff])
>> acpi PNP0A08:00: _OSC: OS supports [ExtendedConfig ASPM ClockPM Segments MSI]
>> acpi PNP0A08:00: _OSC: OS now controls [PCIeHotplug PME AER PCIeCapability]
>> acpi PNP0A08:00: MCFG quirk: ECAM at [mem 0xe0d0000000-0xe0dfffffff] for [bus 00-ff] with xgene_v1_pcie_ecam_ops
>> acpi PNP0A08:00: [Firmware Bug]: ECAM area [mem 0xe0d0000000-0xe0dfffffff] not reserved in ACPI namespace
>> acpi PNP0A08:00: ECAM at [mem 0xe0d0000000-0xe0dfffffff] for [bus 00-ff]
>> Remapped I/O 0x000000e010000000 to [io 0x0000-0xffff window]
>> PCI host bridge to bus 0000:00
>> pci_bus 0000:00: root bus resource [io 0x0000-0xffff window] (bus address [0x10000000-0x1000ffff])
>> pci_bus 0000:00: root bus resource [mem 0xe040000000-0xe07fffffff window] (bus address [0x40000000-0x7fffffff])
>> pci_bus 0000:00: root bus resource [mem 0xf000000000-0xffffffffff window]
>> pci_bus 0000:00: root bus resource [bus 00-ff]
>
> Yup, no bridge register space here; that's good. I assume the bridge
> registers are at [mem 0x1f2b0000-0x1f2bffff] as shown in /proc/iomem
> below.
Yes, the bridge registers are at [mem 0x1f2b0000-0x1f2bffff].
>
>> [root@(none) ~]# cat /proc/iomem
>> ...
>> 19000000-19007fff : 808622B7:00
>> 1900c100-190fffff : 808622B7:00
>> 1900c100-190fffff : 808622B7:00
>> 19800000-19807fff : 808622B7:01
>> 1980c100-198fffff : 808622B7:01
>> 1980c100-198fffff : 808622B7:01
>> ...
>> 1f280000-1f28ffff : 808622B7:00
>> 1f290000-1f29ffff : 808622B7:01
>
> I'm curious what these "808622B7" devices are. Per ACPI 6.0, sec
> 6.1.5, that looks like a PCI vendor ID, which I guess is a valid ACPI
> ID. But these resources don't seem to have any connection with PCI
> (they're not in any of the host bridge apertures).
These are DesignWare USB 3.0 controllers (DWC3). The ACPI ID is
defined in drivers/usb/dwc3/core.c.
>
>> 1f2b0000-1f2bffff : PNP0A08:00
>
> Looks like the bridge register space; good.
Yes, it is.
>
>> e040000000-e07fffffff : PCI Bus 0000:00
>> e040000000-e0401fffff : PCI Bus 0000:01
>> e040000000-e0400fffff : 0000:01:00.0
>> e040000000-e0400fffff : mlx4_core
>> e040100000-e0401fffff : 0000:01:00.0
>
>> e0d0000000-e0dfffffff : PCI ECAM
>
> This region should be described in either a PNP0C02 device or (if we
> decide we can allow "consumer" descriptors) the PNP0A08 device. I
> assume you'll fix that in a future firmware release.
Yes, future firmware will have PNP0C02 node that describes this ECAM
space (or a new resource in PNP0A08 if we use 'consumer' descriptor).
>
> But I think this reservation from pci_ecam_create() is good enough for
> now.
>
>> f000000000-ffffffffff : PCI Bus 0000:00
>> f000000000-f001ffffff : PCI Bus 0000:01
>> f000000000-f001ffffff : 0000:01:00.0
>> f000000000-f001ffffff : mlx4_core
Regards,
Duc Dang.
^ permalink raw reply
* [PATCH v3] PCI/ACPI: xgene: Add ECAM quirk for X-Gene PCIe controller
From: Bjorn Helgaas @ 2016-12-05 21:53 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <CADaLNDm6fPQ6ekQa85fveAamJtF3+HGeOvprmgGJ4gLnMhF2_w@mail.gmail.com>
On Thu, Dec 01, 2016 at 06:52:23PM -0800, Duc Dang wrote:
> On Thu, Dec 1, 2016 at 10:33 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> I made similar changes in v4 patch. The ECAM quirk will be built when
> ACPI and PCI_QUIRKS are enabled.
>
> When building for DT only, the ECAM quirk won't be compiled.
Perfect.
> >> #define XGENE_PCIE_IP_VER_UNKN 0
> >> #define XGENE_PCIE_IP_VER_1 1
> >> +#define XGENE_PCIE_IP_VER_2 2
> >
> > This isn't used anywhere, which makes me wonder whether it's worth
> > keeping it.
>
> V2 controller will use this XGENE_PCIE_IP_VER_2 (port->version =
> XGENE_PCIE_IP_VER_2). This will be used to indicate that the
> controller is V2, and to enable configuration request retry status
> feature (by not disable it like V1 controller).
OK, I see. You don't actually need XGENE_PCIE_IP_VER_2, you just need
port->version to be something other than XGENE_PCIE_IP_VER_1. So this
is fine as it is.
> >> static void __iomem *xgene_pcie_get_cfg_base(struct pci_bus *bus)
> >> {
> >> - struct xgene_pcie_port *port = bus->sysdata;
> >> + struct pci_config_window *cfg;
> >> + struct xgene_pcie_port *port;
> >> +
> >> + if (acpi_disabled)
> >> + port = bus->sysdata;
> >> + else {
> >> + cfg = bus->sysdata;
> >> + port = cfg->priv;
> >> + }
> >
> > I would really, really like to figure out a way to get rid of these
> > "if (acpi_disabled)" checks sprinkled through here. Is there any way
> > we can set up bus->sysdata to be the same, regardless of whether we're
> > using this as a platform driver or an ACPI quirk?
>
> Right now, I created a inline function to extract xgene_pcie_port from
> pci_bus. In order to get rid of acpi_disabled, I will need to make
> sysdata in DT case also point to pci_config_window structure, which
> means I will need to convert and test the DT driver to use ecam ops.
> It is a separate patch itself. So I think I should do it at later time
> (after this ECAM quirk patch). I hope you are ok with this.
OK. I did the simple-minded version of leaving the DT ops the same
but making sysdata point to a dummy pci_config_window. Your proposal
of using ECAM for DT would be much better.
It's interesting that you actually already use the same accessors
except that DT uses the 32-bit pci_generic_config_write32() and ACPI
uses the regular pci_generic_config_write(). I guess that means the
hardware actually *does* support sub-32 bit writes?
> I need to define the function (xgene_get_csr_resource()) inside
> pci-xgene.c to duplicate the code of acpi_get_rc_addr. The reason is
> X-Gene firmware does not have a dedicate PNP0C02 node to declare the
> resource, and if I use acpi_get_rc_resources() with "PNP0A08", I got
> error due to acpi_bus_get_device() returns error.
Looks good.
> > All these init functions are almost identical. Can we factor this out
> > by having wrappers that do nothing more than pass in the table and
> > version, and put the kzalloc and ioremap in a shared back-end?
>
> I refactor-ed these .init functions. And as a result, there are only 2
> ecam ops left: xgene_v1_pcie_ecam_ops and xgene_v2_pcie_ecam_ops.
Looks good.
Bjorn
^ permalink raw reply
* [PATCH v3] PCI/ACPI: xgene: Add ECAM quirk for X-Gene PCIe controller
From: Duc Dang @ 2016-12-05 22:09 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20161205215318.GB22455@bhelgaas-glaptop.roam.corp.google.com>
On Mon, Dec 5, 2016 at 1:53 PM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> On Thu, Dec 01, 2016 at 06:52:23PM -0800, Duc Dang wrote:
>> On Thu, Dec 1, 2016 at 10:33 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:
>
>> I made similar changes in v4 patch. The ECAM quirk will be built when
>> ACPI and PCI_QUIRKS are enabled.
>>
>> When building for DT only, the ECAM quirk won't be compiled.
>
> Perfect.
>
>> >> #define XGENE_PCIE_IP_VER_UNKN 0
>> >> #define XGENE_PCIE_IP_VER_1 1
>> >> +#define XGENE_PCIE_IP_VER_2 2
>> >
>> > This isn't used anywhere, which makes me wonder whether it's worth
>> > keeping it.
>>
>> V2 controller will use this XGENE_PCIE_IP_VER_2 (port->version =
>> XGENE_PCIE_IP_VER_2). This will be used to indicate that the
>> controller is V2, and to enable configuration request retry status
>> feature (by not disable it like V1 controller).
>
> OK, I see. You don't actually need XGENE_PCIE_IP_VER_2, you just need
> port->version to be something other than XGENE_PCIE_IP_VER_1. So this
> is fine as it is.
>
>> >> static void __iomem *xgene_pcie_get_cfg_base(struct pci_bus *bus)
>> >> {
>> >> - struct xgene_pcie_port *port = bus->sysdata;
>> >> + struct pci_config_window *cfg;
>> >> + struct xgene_pcie_port *port;
>> >> +
>> >> + if (acpi_disabled)
>> >> + port = bus->sysdata;
>> >> + else {
>> >> + cfg = bus->sysdata;
>> >> + port = cfg->priv;
>> >> + }
>> >
>> > I would really, really like to figure out a way to get rid of these
>> > "if (acpi_disabled)" checks sprinkled through here. Is there any way
>> > we can set up bus->sysdata to be the same, regardless of whether we're
>> > using this as a platform driver or an ACPI quirk?
>>
>> Right now, I created a inline function to extract xgene_pcie_port from
>> pci_bus. In order to get rid of acpi_disabled, I will need to make
>> sysdata in DT case also point to pci_config_window structure, which
>> means I will need to convert and test the DT driver to use ecam ops.
>> It is a separate patch itself. So I think I should do it at later time
>> (after this ECAM quirk patch). I hope you are ok with this.
>
> OK. I did the simple-minded version of leaving the DT ops the same
> but making sysdata point to a dummy pci_config_window. Your proposal
> of using ECAM for DT would be much better.
>
> It's interesting that you actually already use the same accessors
> except that DT uses the 32-bit pci_generic_config_write32() and ACPI
> uses the regular pci_generic_config_write(). I guess that means the
> hardware actually *does* support sub-32 bit writes?
Yes, the hardware does support sub-32 bit writes (and reads). This is
another item in my TODO list for DT (which does not seem quite urgent
now): switch to use pci_generic_config_write for DT. But, well, I will
need to do that for read as well (for both ACPI and DT).
>
>> I need to define the function (xgene_get_csr_resource()) inside
>> pci-xgene.c to duplicate the code of acpi_get_rc_addr. The reason is
>> X-Gene firmware does not have a dedicated PNP0C02 node to declare the
>> resource, and if I use acpi_get_rc_resources() with "PNP0A08", I got
>> error due to acpi_bus_get_device() returns error.
>
> Looks good.
>
>> > All these init functions are almost identical. Can we factor this out
>> > by having wrappers that do nothing more than pass in the table and
>> > version, and put the kzalloc and ioremap in a shared back-end?
>>
>> I refactor-ed these .init functions. And as a result, there are only 2
>> ecam ops left: xgene_v1_pcie_ecam_ops and xgene_v2_pcie_ecam_ops.
>
> Looks good.
>
> Bjorn
Regards,
Duc Dang.
^ permalink raw reply
* [PATCH] ACPI/IORT: Make dma masks set-up IORT specific
From: Rafael J. Wysocki @ 2016-12-05 22:18 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20161205122619.25045-1-lorenzo.pieralisi@arm.com>
On Mon, Dec 5, 2016 at 1:26 PM, Lorenzo Pieralisi
<lorenzo.pieralisi@arm.com> wrote:
> The introduction of acpi_dma_configure() allows to configure DMA
> and related IOMMU for any device that is DMA capable. To achieve
> that goal it ensures DMA masks are set-up to sane default values
> before proceeding with IOMMU and DMA ops configuration.
>
> On x86/ia64 systems, through acpi_bind_one(), acpi_dma_configure() is
> called for every device that has an ACPI companion, in that every device
> is considered DMA capable on x86/ia64 systems (ie acpi_get_dma_attr() API),
> which has the side effect of initializing dma masks also for
> pseudo-devices (eg CPUs and memory nodes) and potentially for devices
> whose dma masks were not set-up before the acpi_dma_configure() API was
> introduced, which may have noxious side effects.
>
> Therefore, in preparation for IORT firmware specific DMA masks set-up,
> wrap the default DMA masks set-up in acpi_dma_configure() inside an IORT
> specific wrapper that reverts to a NOP on x86/ia64 systems, restoring the
> default expected behaviour on x86/ia64 systems and keeping DMA default
> masks set-up on IORT based (ie ARM) arch configurations.
>
> Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Hanjun Guo <hanjun.guo@linaro.org>
> Cc: Bjorn Helgaas <bhelgaas@google.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Tomasz Nowicki <tn@semihalf.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
> Cc: Sricharan R <sricharan@codeaurora.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> ---
> Joerg,
>
> pending Rafael's ACK on it, given the 4.10 release timing and that the
> series is queued via the IOMMU tree please consider applying this patch to
> your arm/smmu branch for 4.10, it is not fixing a bug but it is modifying
> the x86/ia64 code path; I prefer preventing any issue related to default
> dma masks on x86/ia64 so I hope it can get merged along with the rest of
> the ACPI IORT SMMU series.
>
> Thanks a lot and apologies,
> Lorenzo
>
> drivers/acpi/arm64/iort.c | 22 ++++++++++++++++++++++
> drivers/acpi/scan.c | 14 +-------------
> include/linux/acpi_iort.h | 2 ++
> 3 files changed, 25 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
> index 47bace8..e0d2e6e 100644
> --- a/drivers/acpi/arm64/iort.c
> +++ b/drivers/acpi/arm64/iort.c
> @@ -547,6 +547,28 @@ static const struct iommu_ops *iort_iommu_xlate(struct device *dev,
> }
>
> /**
> + * iort_set_dma_mask - Set-up dma mask for a device.
> + *
> + * @dev: device to configure
> + */
> +void iort_set_dma_mask(struct device *dev)
> +{
> + /*
> + * Set default coherent_dma_mask to 32 bit. Drivers are expected to
> + * setup the correct supported mask.
> + */
> + if (!dev->coherent_dma_mask)
> + dev->coherent_dma_mask = DMA_BIT_MASK(32);
> +
> + /*
> + * Set it to coherent_dma_mask by default if the architecture
> + * code has not set it.
> + */
> + if (!dev->dma_mask)
> + dev->dma_mask = &dev->coherent_dma_mask;
> +}
> +
> +/**
> * iort_iommu_configure - Set-up IOMMU configuration for a device.
> *
> * @dev: device to configure
> diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
> index 80698d3..93b00cf 100644
> --- a/drivers/acpi/scan.c
> +++ b/drivers/acpi/scan.c
> @@ -1380,19 +1380,7 @@ void acpi_dma_configure(struct device *dev, enum dev_dma_attr attr)
> {
> const struct iommu_ops *iommu;
>
> - /*
> - * Set default coherent_dma_mask to 32 bit. Drivers are expected to
> - * setup the correct supported mask.
> - */
> - if (!dev->coherent_dma_mask)
> - dev->coherent_dma_mask = DMA_BIT_MASK(32);
> -
> - /*
> - * Set it to coherent_dma_mask by default if the architecture
> - * code has not set it.
> - */
> - if (!dev->dma_mask)
> - dev->dma_mask = &dev->coherent_dma_mask;
> + iort_set_dma_mask(dev);
>
> iommu = iort_iommu_configure(dev);
>
> diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
> index dcb2b60..77e0809 100644
> --- a/include/linux/acpi_iort.h
> +++ b/include/linux/acpi_iort.h
> @@ -35,6 +35,7 @@ bool iort_node_match(u8 type);
> u32 iort_msi_map_rid(struct device *dev, u32 req_id);
> struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id);
> /* IOMMU interface */
> +void iort_set_dma_mask(struct device *dev);
> const struct iommu_ops *iort_iommu_configure(struct device *dev);
> #else
> static inline void acpi_iort_init(void) { }
> @@ -45,6 +46,7 @@ static inline struct irq_domain *iort_get_device_domain(struct device *dev,
> u32 req_id)
> { return NULL; }
> /* IOMMU interface */
> +static inline void iort_set_dma_mask(struct device *dev) { }
> static inline
> const struct iommu_ops *iort_iommu_configure(struct device *dev)
> { return NULL; }
> --
> 2.10.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo at vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* [PATCH v4 1/9] dt-bindings: clarify compatible property for rockchip timers
From: Rob Herring @ 2016-12-05 22:22 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1480436092-10728-2-git-send-email-al.kochet@gmail.com>
On Tue, Nov 29, 2016 at 07:14:44PM +0300, Alexander Kochetkov wrote:
> Make all properties description in form '"rockchip,<chip>-timer",
> "rockchip,rk3288-timer"' for all chips found in linux kernel.
>
> Suggested-by: Heiko Stübner <heiko@sntech.de>
> Signed-off-by: Alexander Kochetkov <al.kochet@gmail.com>
> ---
> .../bindings/timer/rockchip,rk-timer.txt | 12 +++++++++---
> 1 file changed, 9 insertions(+), 3 deletions(-)
Acked-by: Rob Herring <robh@kernel.org>
^ permalink raw reply
* [PATCH v4 4/4] [media] dt-bindings: add TI VPIF documentation
From: Rob Herring @ 2016-12-05 22:27 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20161129235712.29846-5-khilman@baylibre.com>
On Tue, Nov 29, 2016 at 03:57:12PM -0800, Kevin Hilman wrote:
> Signed-off-by: Kevin Hilman <khilman@baylibre.com>
> ---
> .../devicetree/bindings/media/ti,da850-vpif.txt | 67 ++++++++++++++++++++++
> 1 file changed, 67 insertions(+)
> create mode 100644 Documentation/devicetree/bindings/media/ti,da850-vpif.txt
Acked-by: Rob Herring <robh@kernel.org>
^ permalink raw reply
* [RFC PATCH 00/29] arm64: Scalable Vector Extension core support
From: Torvald Riegel @ 2016-12-05 22:42 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20161130120654.GJ1574@e103592.cambridge.arm.com>
On Wed, 2016-11-30 at 12:06 +0000, Dave Martin wrote:
> So, my key goal is to support _per-process_ vector length control.
>
> From the kernel perspective, it is easiest to achieve this by providing
> per-thread control since that is the unit that context switching acts
> on.
>
> How useful it really is to have threads with different VLs in the same
> process is an open question. It's theoretically useful for runtime
> environments, which may want to dispatch code optimised for different
> VLs
What would be the primary use case(s)? Vectorization of short vectors
(eg, if having an array of structs or sth like that)?
> -- changing the VL on-the-fly within a single thread is not
> something I want to encourage, due to overhead and ABI issues, but
> switching between threads of different VLs would be more manageable.
So if on-the-fly switching is probably not useful, that would mean we
need special threads for the use cases. Is that a realistic assumption
for the use cases? Or do you primarily want to keep it possible to do
this, regardless of whether there are real use cases now?
I suppose allowing for a per-thread setting of VL could also be added as
a feature in the future without breaking existing code.
> For setcontext/setjmp, we don't save/restore any SVE state due to the
> caller-save status of SVE, and I would not consider it necessary to
> save/restore VL itself because of the no-change-on-the-fly policy for
> this.
Thus, you would basically consider VL changes or per-thread VL as in the
realm of compilation internals? So, the specific size for a particular
piece of code would not be part of an ABI?
> I'm not familiar with resumable functions/executors -- are these in
> the C++ standards yet (not that that would cause me to be familiar
> with them... ;) Any implementation of coroutines (i.e.,
> cooperative switching) is likely to fall under the "setcontext"
> argument above.
These are not part of the C++ standard yet, but will appear in TSes.
There are various features for which implementations would be assumed to
use one OS thread for several tasks, coroutines, etc. Some of them
switch between these tasks or coroutines while these are running,
whereas the ones that will be in C++17 only run more than one parallel task
on the same OS thread but one after the other (like in a thread pool).
However, if we are careful not to expose VL or make promises about it,
this may just end up being a detail similar to, say, register
allocation, which isn't exposed beyond the internals of a particular
compiler either.
Exposing it as a feature the user can set without messing with the
implementation would introduce additional thread-specific state, as
Florian said. This might not be a show-stopper by itself, but the more
thread-specific state we have the more an implementation has to take
care of or switch, and the higher the runtime costs are. C++17 already
makes weaker promises for TLS for parallel tasks, so that
implementations don't have to run TLS constructors or destructors just
because a small parallel task was executed.
^ permalink raw reply
* [PATCH 1/3] Add DT bindings documentation for NS2 USB DRD phy
From: Rob Herring @ 2016-12-05 23:09 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1480485338-23451-2-git-send-email-raviteja.garimella@broadcom.com>
On Wed, Nov 30, 2016 at 11:25:36AM +0530, Raviteja Garimella wrote:
> This patch adds documentation for NS2 DRD Phy driver DT bindings
>
> Signed-off-by: Raviteja Garimella <raviteja.garimella@broadcom.com>
> ---
> .../devicetree/bindings/phy/brcm,ns2-drd-phy.txt | 40 ++++++++++++++++++++++
> 1 file changed, 40 insertions(+)
> create mode 100644 Documentation/devicetree/bindings/phy/brcm,ns2-drd-phy.txt
>
> diff --git a/Documentation/devicetree/bindings/phy/brcm,ns2-drd-phy.txt b/Documentation/devicetree/bindings/phy/brcm,ns2-drd-phy.txt
> new file mode 100644
> index 0000000..5857f99
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/phy/brcm,ns2-drd-phy.txt
> @@ -0,0 +1,40 @@
> +BROADCOM NORTHSTAR2 USB2 (DUAL ROLE DEVICE) PHY
> +
> +Required properties:
> + - compatible: brcm,ns2-drd-phy
> + - reg: offset and length of the NS2 PHY related registers.
> + - reg-names
> + The below registers must be provided.
> + icfg - for DRD ICFG configurations
> + rst-ctrl - for DRD IDM reset
> + crmu-ctrl - for CRMU core vdd, PHY and PHY PLL reset
> + usb2-strap - for port over current polarity reversal
> + - #phy-cells: Must be 0. No args required.
> + - vbus-gpios: vbus gpio binding
> + - id-gpios: id gpio binding
> +
> +Refer to phy/phy-bindings.txt for the generic PHY binding properties
> +
> +Example:
> + gpio_g: gpio at 660a0000 {
You don't really need to show gpio node for the example. Otherwise,
Acked-by: Rob Herring <robh@kernel.org>
Rob
^ permalink raw reply
* [PATCH 0/2] arm64: dts: NS2: XMC support and Nitro memreserve
From: Jon Mason @ 2016-12-05 23:12 UTC (permalink / raw)
To: linux-arm-kernel
Add support for the NS2 XMC formfactor via a new DTS file. Also, set
aside memory for Nitro firmware in the NS2 DTSI file.
Jon Mason (2):
arm64: dts: NS2: reserve memory for Nitro firmware
arm64: dts: NS2: add support for XMC form factor
arch/arm64/boot/dts/broadcom/Makefile | 2 +-
arch/arm64/boot/dts/broadcom/ns2-xmc.dts | 191 +++++++++++++++++++++++++++++++
arch/arm64/boot/dts/broadcom/ns2.dtsi | 2 +
3 files changed, 194 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/boot/dts/broadcom/ns2-xmc.dts
--
2.7.4
^ permalink raw reply
* [PATCH 1/2] arm64: dts: NS2: reserve memory for Nitro firmware
From: Jon Mason @ 2016-12-05 23:12 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1480979542-26871-1-git-send-email-jon.mason@broadcom.com>
Nitro firmware is loaded into memory by the bootloader at a specific
location. Set this memory range aside to prevent the kernel from using
it.
Signed-off-by: Jon Mason <jon.mason@broadcom.com>
---
arch/arm64/boot/dts/broadcom/ns2.dtsi | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/arm64/boot/dts/broadcom/ns2.dtsi b/arch/arm64/boot/dts/broadcom/ns2.dtsi
index 96ed47b..9f9e203 100644
--- a/arch/arm64/boot/dts/broadcom/ns2.dtsi
+++ b/arch/arm64/boot/dts/broadcom/ns2.dtsi
@@ -30,6 +30,8 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+/memreserve/ 0x81000000 0x00200000;
+
#include <dt-bindings/interrupt-controller/arm-gic.h>
#include <dt-bindings/clock/bcm-ns2.h>
--
2.7.4
^ permalink raw reply related
* [PATCH 2/2] arm64: dts: NS2: add support for XMC form factor
From: Jon Mason @ 2016-12-05 23:12 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1480979542-26871-1-git-send-email-jon.mason@broadcom.com>
The BCM958712DxXMC board is a smaller form factor typically used as
controller boards for switches. This smaller board has fewer devices
pinned out, so only a few need be populated in the device tree.
Signed-off-by: Jon Mason <jon.mason@broadcom.com>
---
arch/arm64/boot/dts/broadcom/Makefile | 2 +-
arch/arm64/boot/dts/broadcom/ns2-xmc.dts | 191 +++++++++++++++++++++++++++++++
2 files changed, 192 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/boot/dts/broadcom/ns2-xmc.dts
diff --git a/arch/arm64/boot/dts/broadcom/Makefile b/arch/arm64/boot/dts/broadcom/Makefile
index 05faf2a..f1caece 100644
--- a/arch/arm64/boot/dts/broadcom/Makefile
+++ b/arch/arm64/boot/dts/broadcom/Makefile
@@ -1,5 +1,5 @@
dtb-$(CONFIG_ARCH_BCM2835) += bcm2837-rpi-3-b.dtb
-dtb-$(CONFIG_ARCH_BCM_IPROC) += ns2-svk.dtb
+dtb-$(CONFIG_ARCH_BCM_IPROC) += ns2-svk.dtb ns2-xmc.dtb
dtb-$(CONFIG_ARCH_VULCAN) += vulcan-eval.dtb
always := $(dtb-y)
diff --git a/arch/arm64/boot/dts/broadcom/ns2-xmc.dts b/arch/arm64/boot/dts/broadcom/ns2-xmc.dts
new file mode 100644
index 0000000..99a2723
--- /dev/null
+++ b/arch/arm64/boot/dts/broadcom/ns2-xmc.dts
@@ -0,0 +1,191 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Broadcom. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Broadcom Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/dts-v1/;
+
+#include "ns2.dtsi"
+
+/ {
+ model = "Broadcom NS2 XMC";
+ compatible = "brcm,ns2-xmc", "brcm,ns2";
+
+ aliases {
+ serial0 = &uart3;
+ };
+
+ chosen {
+ stdout-path = "serial0:115200n8";
+ bootargs = "earlycon=uart8250,mmio32,0x66130000";
+ };
+
+ memory {
+ device_type = "memory";
+ reg = <0x000000000 0x80000000 0x00000001 0x00000000>;
+ };
+};
+
+&enet {
+ status = "ok";
+};
+
+&i2c0 {
+ status = "ok";
+};
+
+&i2c1 {
+ status = "ok";
+};
+
+&mdio_mux_iproc {
+ mdio at 10 {
+ gphy0: eth-phy at 10 {
+ reg = <0x10>;
+ };
+ };
+};
+
+&nand {
+ nandcs at 0 {
+ compatible = "brcm,nandcs";
+ reg = <0>;
+ nand-ecc-mode = "hw";
+ nand-ecc-strength = <8>;
+ nand-ecc-step-size = <512>;
+ nand-bus-width = <16>;
+ brcm,nand-oob-sector-size = <16>;
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ partition at 0 {
+ label = "nboot";
+ reg = <0x00000000 0x00280000>; /* 2.5MB */
+ read-only;
+ };
+
+ partition at 280000 {
+ label = "nenv";
+ reg = <0x00280000 0x00040000>; /* 0.25MB */
+ read-only;
+ };
+
+ partition at 2c0000 {
+ label = "ndtb";
+ reg = <0x002c0000 0x00040000>; /* 0.25MB */
+ read-only;
+ };
+
+ partition at 300000 {
+ label = "nsystem";
+ reg = <0x00300000 0x03d00000>; /* 61MB */
+ read-only;
+ };
+
+ partition at 4000000 {
+ label = "nrootfs";
+ reg = <0x04000000 0x06400000>; /* 100MB */
+ };
+
+ partition at 0a400000{
+ label = "ncustfs";
+ reg = <0x0a400000 0x35c00000>; /* 860MB */
+ };
+ };
+};
+
+&pci_phy0 {
+ status = "ok";
+};
+
+&pcie0 {
+ status = "ok";
+};
+
+&pcie8 {
+ status = "ok";
+};
+
+&sata_phy0 {
+ status = "ok";
+};
+
+&sata_phy1 {
+ status = "ok";
+};
+
+&sata {
+ status = "ok";
+};
+
+&qspi {
+ flash: m25p80 at 0 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "m25p80";
+ spi-max-frequency = <62500000>;
+ m25p,default-addr-width = <3>;
+ reg = <0x0 0x0>;
+
+ partition at 0 {
+ label = "bl0";
+ reg = <0x00000000 0x00080000>; /* 512KB */
+ };
+
+ partition at 80000 {
+ label = "fip";
+ reg = <0x00080000 0x00150000>; /* 1344KB */
+ };
+
+ partition at 1e0000 {
+ label = "env";
+ reg = <0x001e0000 0x00010000>;/* 64KB */
+ };
+
+ partition at 1f0000 {
+ label = "dtb";
+ reg = <0x001f0000 0x00010000>; /* 64KB */
+ };
+
+ partition at 200000 {
+ label = "kernel";
+ reg = <0x00200000 0x00e00000>; /* 14MB */
+ };
+
+ partition at 1000000 {
+ label = "rootfs";
+ reg = <0x01000000 0x01000000>; /* 16MB */
+ };
+ };
+};
+
+&uart3 {
+ status = "ok";
+};
--
2.7.4
^ permalink raw reply related
* [PATCH v2 3/5] dt-bindings: spi: Add documentation for the Armada 3700 SPI Controller
From: Rob Herring @ 2016-12-05 23:27 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20161130094351.2748-4-romain.perier@free-electrons.com>
On Wed, Nov 30, 2016 at 10:43:49AM +0100, Romain Perier wrote:
> This adds the devicetree bindings documentation for the SPI controller
> present in the Marvell Armada 3700 SoCs.
>
> Signed-off-by: Romain Perier <romain.perier@free-electrons.com>
> ---
> .../devicetree/bindings/spi/spi-armada-3700.txt | 25 ++++++++++++++++++++++
> 1 file changed, 25 insertions(+)
> create mode 100644 Documentation/devicetree/bindings/spi/spi-armada-3700.txt
Acked-by: Rob Herring <robh@kernel.org>
^ permalink raw reply
* [PATCH v3] PCI/ACPI: xgene: Add ECAM quirk for X-Gene PCIe controller
From: Jon Masters @ 2016-12-05 23:31 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20161205212012.GA22455@bhelgaas-glaptop.roam.corp.google.com>
On 12/05/2016 04:20 PM, Bjorn Helgaas wrote:
> On Fri, Dec 02, 2016 at 11:06:30PM -0800, Duc Dang wrote:
>> On Fri, Dec 2, 2016 at 3:39 PM, Bjorn Helgaas <helgaas@kernel.org> wrote:
>
>>> diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c
>>> index 8a177a1..a16fc8e 100644
>>> --- a/arch/arm64/kernel/pci.c
>>> +++ b/arch/arm64/kernel/pci.c
>>> @@ -114,6 +114,19 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
>>> return 0;
>>> }
>>>
>>> +static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
>>> +{
>>> + struct resource_entry *entry, *tmp;
>>> + int status;
>>> +
>>> + status = acpi_pci_probe_root_resources(ci);
>>> + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
>>> + if (!(entry->res->flags & IORESOURCE_WINDOW))
>>> + resource_list_destroy_entry(entry);
>>> + }
>>> + return status;
>>> +}
>>> +
>>> /*
>>> * Lookup the bus range for the domain in MCFG, and set up config space
>>> * mapping.
>>> @@ -190,6 +203,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
>>> }
>>>
>>> root_ops->release_info = pci_acpi_generic_release_info;
>>> + root_ops->prepare_resources = pci_acpi_root_prepare_resources;
>>> root_ops->pci_ops = &ri->cfg->ops->pci_ops;
>>> bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg);
>>> if (!bus)
>>
>> I tried your patch above with my X-Gene ECAM v4 patch on Mustang, here
>> is the kernel boot log and output of 'cat /proc/iomem'. The PCIe core
>> does not print the MMIO space as a window (which is expected per your
>> patch above).
>
> Thanks!
...and just for the record, here it is on HPE ProLiant m400 (Moonshot),
with the same result that the region is no longer claimed as PCI space
(it - 1f500000 - is now showing as being owned by PNP0A08:00):
# cat /proc/iomem
10520000-10523fff : APMC0D18:00
10520000-10523fff : APMC0D18:00
10524000-10527fff : APMC0D17:00
10540000-1054a0ff : APMC0D01:00
10546000-10546fff : APMC0D50:00
1054a000-1054a00f : APMC0D12:03
1054a000-1054a00f : APMC0D12:02
1054a000-1054a00f : APMC0D12:01
1054a000-1054a00f : APMC0D12:00
17000000-17000fff : APMC0D01:00
17001000-17001fff : APMC0D01:00
17001000-170013ff : APMC0D15:00
17001000-170013ff : APMC0D15:00
1701c000-1701cfff : APMC0D14:00
1a800000-1a800fff : APMC0D0D:00
1a800000-1a800fff : APMC0D0D:00
1c000200-1c0002ff : APMC0D06:00
1c021000-1c0210ff : APMC0D08:00
1c021000-1c02101f : serial
1c024000-1c024fff : APMC0D07:00
1f230000-1f230fff : APMC0D0D:00
1f230000-1f230fff : APMC0D0D:00
1f23d000-1f23dfff : APMC0D0D:00
1f23d000-1f23dfff : APMC0D0D:00
1f23e000-1f23efff : APMC0D0D:00
1f23e000-1f23efff : APMC0D0D:00
1f2a0000-1f31ffff : APMC0D06:00
1f500000-1f50ffff : PNP0A08:00
78800000-78800fff : APMC0D13:00
78800000-78800fff : APMC0D12:03
78800000-78800fff : APMC0D12:02
78800000-78800fff : APMC0D12:01
78800000-78800fff : APMC0D12:00
78800000-78800fff : APMC0D11:00
78800000-78800fff : APMC0D10:03
78800000-78800fff : APMC0D10:02
78800000-78800fff : APMC0D10:01
78800000-78800fff : APMC0D10:00
79000000-798fffff : APMC0D0E:00
7c000000-7c1fffff : APMC0D12:00
7c200000-7c3fffff : APMC0D12:01
7c400000-7c5fffff : APMC0D12:02
7c600000-7c7fffff : APMC0D12:03
7e000000-7e000fff : APMC0D13:00
7e200000-7e200fff : APMC0D10:03
7e200000-7e200fff : APMC0D10:02
7e200000-7e200fff : APMC0D10:01
7e200000-7e200fff : APMC0D10:00
7e600000-7e600fff : APMC0D11:00
7e700000-7e700fff : APMC0D10:03
7e700000-7e700fff : APMC0D10:02
7e700000-7e700fff : APMC0D10:01
7e700000-7e700fff : APMC0D10:00
7e720000-7e720fff : APMC0D10:03
7e720000-7e720fff : APMC0D10:02
7e720000-7e720fff : APMC0D10:01
7e720000-7e720fff : APMC0D10:00
7e800000-7e800fff : APMC0D10:00
7e840000-7e840fff : APMC0D10:01
7e880000-7e880fff : APMC0D10:02
7e8c0000-7e8c0fff : APMC0D10:03
7e930000-7e930fff : APMC0D13:00
4000000000-4001ffffff : System RAM
4000080000-4000c3ffff : Kernel code
4000db0000-400165ffff : Kernel data
40023a0000-4ff733ffff : System RAM
4ff7340000-4ff77cffff : reserved
4ff77d0000-4ff79cffff : System RAM
4ff79d0000-4ff7e7ffff : reserved
4ff7e80000-4ff7e8ffff : System RAM
4ff7e90000-4ff7efffff : reserved
4ff7f10000-4ff800ffff : reserved
4ff8010000-4fffffffff : System RAM
a020000000-a03fffffff : PCI Bus 0000:00
a020000000-a0201fffff : PCI Bus 0000:01
a020000000-a0200fffff : 0000:01:00.0
a020000000-a0200fffff : mlx4_core
a020100000-a0201fffff : 0000:01:00.0
a060000000-a07fffffff : PCI Bus 0000:00
a0d0000000-a0dfffffff : PCI ECAM
a110000000-a14fffffff : PCI Bus 0000:00
a110000000-a121ffffff : PCI Bus 0000:01
a110000000-a111ffffff : 0000:01:00.0
a110000000-a111ffffff : mlx4_core
a112000000-a121ffffff : 0000:01:00.0
Tested-by: Jon Masters <jcm@redhat.com>
--
Computer Architect | Sent from my Fedora powered laptop
^ permalink raw reply
* [resend v2: PATCH 1/2] dt-bindings: Document the hi3660 reset bindings
From: Rob Herring @ 2016-12-05 23:40 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1480687813.2460.19.camel@pengutronix.de>
On Fri, Dec 02, 2016 at 03:10:13PM +0100, Philipp Zabel wrote:
> Am Freitag, den 02.12.2016, 13:32 +0100 schrieb Arnd Bergmann:
> > On Friday, December 2, 2016 8:21:33 AM CET zhangfei wrote:
> > > Hi, Arnd
> > >
> > > On 2016年12月01日 20:05, Arnd Bergmann wrote:
> > > > On Thursday, December 1, 2016 8:48:40 AM CET Zhangfei Gao wrote:
> > > >> + hisi,reset-bits = <0x20 0x8 /* 0: i2c0 */
> > > >> + 0x20 0x10 /* 1: i2c1 */
> > > >> + 0x20 0x20 /* 2: i2c2 */
> > > >> + 0x20 0x8000000>; /* 3: i2c6 */
> > > >> + };
> > > >> +
> > > >> +Specifying reset lines connected to IP modules
> > > >> +==============================================
> > > >> +example:
> > > >> +
> > > >> + i2c0: i2c at ..... {
> > > >> + ...
> > > >> + resets = <&iomcu_rst 0>;
> > > >> + ...
> > > >> + };
> > > > I don't really like this approach, since now the information is
> > > > in two places. Why not put the data into the reset specifier
> > > > directly when it is used?
>
> From my point of view, with the binding above, all reset controller
> register/bit layout information is in a single place and can be easily
> compared to a list in the reference manual, whereas with your suggestion
> the description of the reset controller register layout is spread
> throughout one or even several dtsi files.
Which can be solved by tools.
> Also, since no two reset controllers are exactly the same, we get a
> proliferation of slightly different phandle argument meanings.
phandle args are supposed to be specific to the phandle it points to.
Otherwise, we'd never need more than 1 cell and everything could be a
lookup table.
>
> > > Any example, still not understand.
> > > They are consumer and provider.
> >
> > I mean in the i2c node, have
> >
> > i2c0: i2c at ..... {
> > ...
> > resets = <&iomcu_rst 0x20 0x8>;
> > ...
> > }
>
> There already are a few drivers that use this, and I fear people having
> to change their bindings because new flags are needed that have not been
> previously thought of.
>
Drivers that use what?
Rob
^ permalink raw reply
* [PATCH 07/12] usb: sunxi: Uses the resource-managed extcon API when registering extcon notifier
From: Chanwoo Choi @ 2016-12-06 0:21 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <20161205163240.GA5783@uda0271908>
On 2016년 12월 06일 01:32, Bin Liu wrote:
> On Wed, Nov 30, 2016 at 09:45:03AM +0100, Maxime Ripard wrote:
>> On Wed, Nov 30, 2016 at 02:57:35PM +0900, Chanwoo Choi wrote:
>>> This patch just uses the resource-managed extcon API when registering
>>> the extcon notifier.
>>>
>>> Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
>>
>> Acked-by: Maxime Ripard <maxime.ripard@free-electrons.com>
>
> It would be ideal if the subject was "usb: musb: sunxi: ...".
>
> Acked-by: Bin Liu <b-liu@ti.com>
>
Thanks for the review. I'll change the subject.
--
Best Regards,
Chanwoo Choi
^ permalink raw reply
* [PATCH] dt: bindings: zx: Add header for PM domains specifiers
From: Baoyou Xie @ 2016-12-06 0:21 UTC (permalink / raw)
To: linux-arm-kernel
This patch adds a header with values used for ZTE 2967
SoC's power domain driver.
Signed-off-by: Baoyou Xie <baoyou.xie@linaro.org>
---
include/dt-bindings/arm/zte_pm_domains.h | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
create mode 100644 include/dt-bindings/arm/zte_pm_domains.h
diff --git a/include/dt-bindings/arm/zte_pm_domains.h b/include/dt-bindings/arm/zte_pm_domains.h
new file mode 100644
index 0000000..1485e8d
--- /dev/null
+++ b/include/dt-bindings/arm/zte_pm_domains.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2015 Linaro Ltd.
+ *
+ * Author: Baoyou Xie <baoyou.xie@linaro.org>
+ * License terms: GNU General Public License (GPL) version 2
+ */
+#ifndef _DT_BINDINGS_ARM_ZTE_PM_DOMAINS_H
+#define _DT_BINDINGS_ARM_ZTE_PM_DOMAINS_H
+
+#define DM_ZX296718_SAPPU 0
+#define DM_ZX296718_VDE 1 /*g1v6*/
+#define DM_ZX296718_VCE 2 /*h1v6*/
+#define DM_ZX296718_HDE 3 /*g2v2*/
+#define DM_ZX296718_VIU 4
+#define DM_ZX296718_USB20 5
+#define DM_ZX296718_USB21 6
+#define DM_ZX296718_USB30 7
+#define DM_ZX296718_HSIC 8
+#define DM_ZX296718_GMAC 9
+#define DM_ZX296718_TS 10
+#define DM_ZX296718_VOU 11
+
+#endif /* _DT_BINDINGS_ARM_ZTE_PM_DOMAINS_H */
--
2.7.4
^ permalink raw reply related
* [PATCH 01/12] phy: rcar-gen3-usb2: Replace the deprecated extcon API
From: Chanwoo Choi @ 2016-12-06 0:25 UTC (permalink / raw)
To: linux-arm-kernel
In-Reply-To: <1480485460-2663-2-git-send-email-cw00.choi@samsung.com>
Hi Kishon,
Could you review and pick the patch1/2 for phy driver?
Best Regards,
Chanwoo Choi
On 2016년 11월 30일 14:57, Chanwoo Choi wrote:
> This patch replaces the deprecated extcon API as following:
> - extcon_set_cable_state_() -> extcon_set_state_sync()
>
> Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
> ---
> drivers/phy/phy-rcar-gen3-usb2.c | 8 ++++----
> 1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/phy/phy-rcar-gen3-usb2.c b/drivers/phy/phy-rcar-gen3-usb2.c
> index bd2430d7339c..7f8081f157f4 100644
> --- a/drivers/phy/phy-rcar-gen3-usb2.c
> +++ b/drivers/phy/phy-rcar-gen3-usb2.c
> @@ -93,11 +93,11 @@ static void rcar_gen3_phy_usb2_work(struct work_struct *work)
> work);
>
> if (ch->extcon_host) {
> - extcon_set_cable_state_(ch->extcon, EXTCON_USB_HOST, true);
> - extcon_set_cable_state_(ch->extcon, EXTCON_USB, false);
> + extcon_set_state_sync(ch->extcon, EXTCON_USB_HOST, true);
> + extcon_set_state_sync(ch->extcon, EXTCON_USB, false);
> } else {
> - extcon_set_cable_state_(ch->extcon, EXTCON_USB_HOST, false);
> - extcon_set_cable_state_(ch->extcon, EXTCON_USB, true);
> + extcon_set_state_sync(ch->extcon, EXTCON_USB_HOST, false);
> + extcon_set_state_sync(ch->extcon, EXTCON_USB, true);
> }
> }
>
>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox