LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/3] module: Rename module_alloc() to text_alloc() and move to kernel proper
From: Jarkko Sakkinen @ 2020-07-14  9:45 UTC (permalink / raw)
  To: linux-kernel
  Cc: Catalin Marinas, Kefeng Wang, Paul Mackerras, Zong Li, Andi Kleen,
	Paul Burton, Vincent Whitchurch, Petr Mladek, Brian Gerst,
	Andy Lutomirski, Thomas Gleixner, Jiri Kosina, Anup Patel,
	Philipp Rudo, Torsten Duwe, Masami Hiramatsu, Andrew Morton,
	Mark Rutland, James E.J. Bottomley, Vincent Chen, Omar Sandoval,
	open list:S390, Joe Lawrence, Helge Deller, John Fastabend,
	Anil S Keshavamurthy, Yonghong Song, Iurii Zaikin,
	Andrii Nakryiko, Thomas Huth, Vasily Gorbik,
	moderated list:ARM PORT, Daniel Axtens, Damien Le Moal,
	Martin KaFai Lau, Song Liu, Paul Walmsley, Heiko Carstens,
	Alexei Starovoitov, Jarkko Sakkinen, Atish Patra, Will Deacon,
	Daniel Borkmann, Masahiro Yamada, Nayna Jain, Ley Foon Tan,
	Christian Borntraeger, Sami Tolvanen, Naveen N. Rao, Mao Han,
	Marco Elver, Steven Rostedt, Babu Moger, Borislav Petkov,
	Greentime Hu, Ben Dooks, Guan Xuetao, Thomas Bogendoerfer,
	open list:PARISC ARCHITECTURE, Jessica Yu,
	open list:BPF JIT for MIPS 32-BIT AND 64-BIT, David S. Miller,
	Thiago Jung Bauermann, Peter Zijlstra, David Howells,
	open list:SPARC + UltraSPARC sparc/sparc64, Sandipan Das,
	H. Peter Anvin, Amit Daniel Kachhap, Tiezhu Yang, Miroslav Benes,
	Jiri Olsa, Ard Biesheuvel, Vincenzo Frascino, Anders Roxell,
	Sven Schnelle, maintainer:X86 ARCHITECTURE 32-BIT AND 64-BIT,
	Russell King, open list:RISC-V ARCHITECTURE, Mike Rapoport,
	Ingo Molnar, Albert Ou, Paul E. McKenney, Josh Poimboeuf,
	KP Singh, Dmitry Vyukov, Nick Hu,
	open list:BPF JIT for MIPS 32-BIT AND 64-BIT, open list:MIPS,
	Palmer Dabbelt, open list:LINUX FOR POWERPC 32-BIT AND 64-BIT
In-Reply-To: <20200714094625.1443261-1-jarkko.sakkinen@linux.intel.com>

Rename module_alloc() to text_alloc() and module_memfree() to
text_memfree(), and move them to kernel/text.c, which is unconditionally
compiled into the kernel proper. This allows kprobes, ftrace and bpf to
allocate space for executable code without requiring module support
(CONFIG_MODULES=y) to be compiled in.

Cc: Andi Kleen <ak@linux.intel.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 arch/arm/kernel/Makefile         |  3 +-
 arch/arm/kernel/module.c         | 21 -----------
 arch/arm/kernel/text.c           | 33 ++++++++++++++++++
 arch/arm64/kernel/Makefile       |  2 +-
 arch/arm64/kernel/module.c       | 42 ----------------------
 arch/arm64/kernel/text.c         | 54 ++++++++++++++++++++++++++++
 arch/mips/kernel/Makefile        |  2 +-
 arch/mips/kernel/module.c        |  9 -----
 arch/mips/kernel/text.c          | 19 ++++++++++
 arch/mips/net/bpf_jit.c          |  4 +--
 arch/nds32/kernel/Makefile       |  2 +-
 arch/nds32/kernel/module.c       |  7 ----
 arch/nds32/kernel/text.c         | 12 +++++++
 arch/nios2/kernel/Makefile       |  1 +
 arch/nios2/kernel/module.c       | 19 ----------
 arch/nios2/kernel/text.c         | 34 ++++++++++++++++++
 arch/parisc/kernel/Makefile      |  2 +-
 arch/parisc/kernel/module.c      | 11 ------
 arch/parisc/kernel/text.c        | 22 ++++++++++++
 arch/powerpc/net/bpf_jit_comp.c  |  4 +--
 arch/riscv/kernel/Makefile       |  1 +
 arch/riscv/kernel/module.c       | 12 -------
 arch/riscv/kernel/text.c         | 20 +++++++++++
 arch/s390/kernel/Makefile        |  2 +-
 arch/s390/kernel/ftrace.c        |  2 +-
 arch/s390/kernel/module.c        | 16 ---------
 arch/s390/kernel/text.c          | 23 ++++++++++++
 arch/sparc/kernel/Makefile       |  1 +
 arch/sparc/kernel/module.c       | 30 ----------------
 arch/sparc/kernel/text.c         | 39 +++++++++++++++++++++
 arch/sparc/net/bpf_jit_comp_32.c |  6 ++--
 arch/unicore32/kernel/Makefile   |  1 +
 arch/unicore32/kernel/module.c   |  7 ----
 arch/unicore32/kernel/text.c     | 18 ++++++++++
 arch/x86/kernel/Makefile         |  1 +
 arch/x86/kernel/ftrace.c         |  4 +--
 arch/x86/kernel/kprobes/core.c   |  4 +--
 arch/x86/kernel/module.c         | 49 --------------------------
 arch/x86/kernel/text.c           | 60 ++++++++++++++++++++++++++++++++
 include/linux/moduleloader.h     |  4 +--
 kernel/Makefile                  |  2 +-
 kernel/bpf/core.c                |  4 +--
 kernel/kprobes.c                 |  4 +--
 kernel/module.c                  | 37 ++++++--------------
 kernel/text.c                    | 25 +++++++++++++
 45 files changed, 400 insertions(+), 275 deletions(-)
 create mode 100644 arch/arm/kernel/text.c
 create mode 100644 arch/arm64/kernel/text.c
 create mode 100644 arch/mips/kernel/text.c
 create mode 100644 arch/nds32/kernel/text.c
 create mode 100644 arch/nios2/kernel/text.c
 create mode 100644 arch/parisc/kernel/text.c
 create mode 100644 arch/riscv/kernel/text.c
 create mode 100644 arch/s390/kernel/text.c
 create mode 100644 arch/sparc/kernel/text.c
 create mode 100644 arch/unicore32/kernel/text.c
 create mode 100644 arch/x86/kernel/text.c
 create mode 100644 kernel/text.c

diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 89e5d864e923..69bfacfd60ef 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -19,7 +19,8 @@ CFLAGS_REMOVE_return_address.o = -pg
 obj-y		:= elf.o entry-common.o irq.o opcodes.o \
 		   process.o ptrace.o reboot.o \
 		   setup.o signal.o sigreturn_codes.o \
-		   stacktrace.o sys_arm.o time.o traps.o
+		   stacktrace.o sys_arm.o time.o traps.o \
+		   text.o
 
 ifneq ($(CONFIG_ARM_UNWIND),y)
 obj-$(CONFIG_FRAME_POINTER)	+= return_address.o
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index e15444b25ca0..13e3442a6b9f 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -33,27 +33,6 @@
 #define MODULES_VADDR	(((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK)
 #endif
 
-#ifdef CONFIG_MMU
-void *module_alloc(unsigned long size)
-{
-	gfp_t gfp_mask = GFP_KERNEL;
-	void *p;
-
-	/* Silence the initial allocation */
-	if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
-		gfp_mask |= __GFP_NOWARN;
-
-	p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-	if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
-		return p;
-	return __vmalloc_node_range(size, 1,  VMALLOC_START, VMALLOC_END,
-				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-}
-#endif
-
 bool module_init_section(const char *name)
 {
 	return strstarts(name, ".init") ||
diff --git a/arch/arm/kernel/text.c b/arch/arm/kernel/text.c
new file mode 100644
index 000000000000..600143fb909d
--- /dev/null
+++ b/arch/arm/kernel/text.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  linux/arch/arm/kernel/text.c
+ *
+ *  Copyright (C) 2002 Russell King.
+ *  Modified for nommu by Hyok S. Choi
+ *
+ * Module allocation method suggested by Andi Kleen.
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+#ifdef CONFIG_MMU
+void *text_alloc(unsigned long size)
+{
+	gfp_t gfp_mask = GFP_KERNEL;
+	void *p;
+
+	/* Silence the initial allocation */
+	if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
+		gfp_mask |= __GFP_NOWARN;
+
+	p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+				gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				__builtin_return_address(0));
+	if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
+		return p;
+	return __vmalloc_node_range(size, 1,  VMALLOC_START, VMALLOC_END,
+				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				__builtin_return_address(0));
+}
+#endif
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index a561cbb91d4d..7765a45d71b5 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -19,7 +19,7 @@ obj-y			:= debug-monitors.o entry.o irq.o fpsimd.o		\
 			   return_address.o cpuinfo.o cpu_errata.o		\
 			   cpufeature.o alternative.o cacheinfo.o		\
 			   smp.o smp_spin_table.o topology.o smccc-call.o	\
-			   syscall.o
+			   syscall.o text.o
 
 targets			+= efi-entry.o
 
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 1cd1a4d0ed30..adde022f703c 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -20,48 +20,6 @@
 #include <asm/insn.h>
 #include <asm/sections.h>
 
-void *module_alloc(unsigned long size)
-{
-	u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
-	gfp_t gfp_mask = GFP_KERNEL;
-	void *p;
-
-	/* Silence the initial allocation */
-	if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
-		gfp_mask |= __GFP_NOWARN;
-
-	if (IS_ENABLED(CONFIG_KASAN))
-		/* don't exceed the static module region - see below */
-		module_alloc_end = MODULES_END;
-
-	p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
-				module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
-				NUMA_NO_NODE, __builtin_return_address(0));
-
-	if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
-	    !IS_ENABLED(CONFIG_KASAN))
-		/*
-		 * KASAN can only deal with module allocations being served
-		 * from the reserved module region, since the remainder of
-		 * the vmalloc region is already backed by zero shadow pages,
-		 * and punching holes into it is non-trivial. Since the module
-		 * region is not randomized when KASAN is enabled, it is even
-		 * less likely that the module region gets exhausted, so we
-		 * can simply omit this fallback in that case.
-		 */
-		p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
-				module_alloc_base + SZ_2G, GFP_KERNEL,
-				PAGE_KERNEL, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-
-	if (p && (kasan_module_alloc(p, size) < 0)) {
-		vfree(p);
-		return NULL;
-	}
-
-	return p;
-}
-
 enum aarch64_reloc_op {
 	RELOC_OP_NONE,
 	RELOC_OP_ABS,
diff --git a/arch/arm64/kernel/text.c b/arch/arm64/kernel/text.c
new file mode 100644
index 000000000000..64fc7e2d85df
--- /dev/null
+++ b/arch/arm64/kernel/text.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AArch64 loadable module support.
+ *
+ * Copyright (C) 2012 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+#include <linux/kasan.h>
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void *text_alloc(unsigned long size)
+{
+	u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
+	gfp_t gfp_mask = GFP_KERNEL;
+	void *p;
+
+	/* Silence the initial allocation */
+	if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
+		gfp_mask |= __GFP_NOWARN;
+
+	if (IS_ENABLED(CONFIG_KASAN))
+		/* don't exceed the static module region - see below */
+		module_alloc_end = MODULES_END;
+
+	p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
+				module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
+				NUMA_NO_NODE, __builtin_return_address(0));
+
+	if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
+	    !IS_ENABLED(CONFIG_KASAN))
+		/*
+		 * KASAN can only deal with module allocations being served
+		 * from the reserved module region, since the remainder of
+		 * the vmalloc region is already backed by zero shadow pages,
+		 * and punching holes into it is non-trivial. Since the module
+		 * region is not randomized when KASAN is enabled, it is even
+		 * less likely that the module region gets exhausted, so we
+		 * can simply omit this fallback in that case.
+		 */
+		p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
+				module_alloc_base + SZ_2G, GFP_KERNEL,
+				PAGE_KERNEL, 0, NUMA_NO_NODE,
+				__builtin_return_address(0));
+
+	if (p && (kasan_module_alloc(p, size) < 0)) {
+		vfree(p);
+		return NULL;
+	}
+
+	return p;
+}
diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile
index 8c7a043295ed..37ebf9a7f259 100644
--- a/arch/mips/kernel/Makefile
+++ b/arch/mips/kernel/Makefile
@@ -8,7 +8,7 @@ extra-y		:= head.o vmlinux.lds
 obj-y		+= cmpxchg.o cpu-probe.o branch.o elf.o entry.o genex.o idle.o irq.o \
 		   process.o prom.o ptrace.o reset.o setup.o signal.o \
 		   syscall.o time.o topology.o traps.o unaligned.o watch.o \
-		   vdso.o cacheinfo.o
+		   vdso.o cacheinfo.o text.o
 
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_ftrace.o = -pg
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 3c0c3d1260c1..f5ac4ebc4bad 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -31,15 +31,6 @@ struct mips_hi16 {
 static LIST_HEAD(dbe_list);
 static DEFINE_SPINLOCK(dbe_lock);
 
-#ifdef MODULE_START
-void *module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
-				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-}
-#endif
-
 static int apply_r_mips_none(struct module *me, u32 *location,
 			     u32 base, Elf_Addr v, bool rela)
 {
diff --git a/arch/mips/kernel/text.c b/arch/mips/kernel/text.c
new file mode 100644
index 000000000000..55ca87a421c3
--- /dev/null
+++ b/arch/mips/kernel/text.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *
+ *  Copyright (C) 2001 Rusty Russell.
+ *  Copyright (C) 2003, 2004 Ralf Baechle (ralf@linux-mips.org)
+ *  Copyright (C) 2005 Thiemo Seufer
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+#ifdef MODULE_START
+void *text_alloc(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
+				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
+				__builtin_return_address(0));
+}
+#endif
diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c
index 0af88622c619..2b03f7128809 100644
--- a/arch/mips/net/bpf_jit.c
+++ b/arch/mips/net/bpf_jit.c
@@ -1233,7 +1233,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
 	build_epilogue(&ctx);
 
 	alloc_size = 4 * ctx.idx;
-	ctx.target = module_alloc(alloc_size);
+	ctx.target = text_alloc(alloc_size);
 	if (ctx.target == NULL)
 		goto out;
 
@@ -1264,7 +1264,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
 void bpf_jit_free(struct bpf_prog *fp)
 {
 	if (fp->jited)
-		module_memfree(fp->bpf_func);
+		text_memfree(fp->bpf_func);
 
 	bpf_prog_unlock_free(fp);
 }
diff --git a/arch/nds32/kernel/Makefile b/arch/nds32/kernel/Makefile
index 394df3f6442c..fc15778c59d1 100644
--- a/arch/nds32/kernel/Makefile
+++ b/arch/nds32/kernel/Makefile
@@ -10,7 +10,7 @@ AFLAGS_head.o		:= -DTEXTADDR=$(TEXTADDR)
 obj-y			:= ex-entry.o ex-exit.o ex-scall.o irq.o \
 			process.o ptrace.o setup.o signal.o \
 			sys_nds32.o time.o traps.o cacheinfo.o \
-			dma.o syscall_table.o vdso.o
+			dma.o syscall_table.o vdso.o text.o
 
 obj-$(CONFIG_MODULES)		+= nds32_ksyms.o module.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
diff --git a/arch/nds32/kernel/module.c b/arch/nds32/kernel/module.c
index 3897fd14a21d..3d23a12ed535 100644
--- a/arch/nds32/kernel/module.c
+++ b/arch/nds32/kernel/module.c
@@ -7,13 +7,6 @@
 #include <linux/moduleloader.h>
 #include <linux/pgtable.h>
 
-void *module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				    GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
-				    __builtin_return_address(0));
-}
-
 void module_free(struct module *module, void *region)
 {
 	vfree(region);
diff --git a/arch/nds32/kernel/text.c b/arch/nds32/kernel/text.c
new file mode 100644
index 000000000000..6e86eff9aaf0
--- /dev/null
+++ b/arch/nds32/kernel/text.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2005-2017 Andes Technology Corporation
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void *text_alloc(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+				    GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
+				    __builtin_return_address(0));
+}
diff --git a/arch/nios2/kernel/Makefile b/arch/nios2/kernel/Makefile
index 0b645e1e3158..5476fc749f37 100644
--- a/arch/nios2/kernel/Makefile
+++ b/arch/nios2/kernel/Makefile
@@ -18,6 +18,7 @@ obj-y	+= setup.o
 obj-y	+= signal.o
 obj-y	+= sys_nios2.o
 obj-y	+= syscall_table.o
+obj-y	+= text.o
 obj-y	+= time.o
 obj-y	+= traps.o
 
diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c
index 76e0a42d6e36..20a0faf64e38 100644
--- a/arch/nios2/kernel/module.c
+++ b/arch/nios2/kernel/module.c
@@ -21,25 +21,6 @@
 
 #include <asm/cacheflush.h>
 
-/*
- * Modules should NOT be allocated with kmalloc for (obvious) reasons.
- * But we do it for now to avoid relocation issues. CALL26/PCREL26 cannot reach
- * from 0x80000000 (vmalloc area) to 0xc00000000 (kernel) (kmalloc returns
- * addresses in 0xc0000000)
- */
-void *module_alloc(unsigned long size)
-{
-	if (size == 0)
-		return NULL;
-	return kmalloc(size, GFP_KERNEL);
-}
-
-/* Free memory returned from module_alloc */
-void module_memfree(void *module_region)
-{
-	kfree(module_region);
-}
-
 int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab,
 			unsigned int symindex, unsigned int relsec,
 			struct module *mod)
diff --git a/arch/nios2/kernel/text.c b/arch/nios2/kernel/text.c
new file mode 100644
index 000000000000..af424174442f
--- /dev/null
+++ b/arch/nios2/kernel/text.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Kernel module support for Nios II.
+ *
+ * Copyright (C) 2004 Microtronix Datacom Ltd.
+ *   Written by Wentao Xu <xuwentao@microtronix.com>
+ * Copyright (C) 2001, 2003 Rusty Russell
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file COPYING in the main directory of this
+ * archive for more details.
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+/*
+ * Modules should NOT be allocated with kmalloc for (obvious) reasons.
+ * But we do it for now to avoid relocation issues. CALL26/PCREL26 cannot reach
+ * from 0x80000000 (vmalloc area) to 0xc0000000 (kernel) (kmalloc returns
+ * addresses in 0xc0000000)
+ */
+void *text_alloc(unsigned long size)
+{
+	if (size == 0)
+		return NULL;
+	return kmalloc(size, GFP_KERNEL);
+}
+
+/* Free memory returned from text_alloc */
+void text_memfree(void *module_region)
+{
+	kfree(module_region);
+}
diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile
index 068d90950d93..f71f7ffdae2e 100644
--- a/arch/parisc/kernel/Makefile
+++ b/arch/parisc/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y	     	:= cache.o pacache.o setup.o pdt.o traps.o time.o irq.o \
 		   ptrace.o hardware.o inventory.o drivers.o alternative.o \
 		   signal.o hpmc.o real2.o parisc_ksyms.o unaligned.o \
 		   process.o processor.o pdc_cons.o pdc_chassis.o unwind.o \
-		   patch.o
+		   patch.o text.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not profile debug and lowlevel utilities
diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
index 7df140545b22..c81e63e2549b 100644
--- a/arch/parisc/kernel/module.c
+++ b/arch/parisc/kernel/module.c
@@ -192,17 +192,6 @@ static inline int reassemble_22(int as22)
 		((as22 & 0x0003ff) << 3));
 }
 
-void *module_alloc(unsigned long size)
-{
-	/* using RWX means less protection for modules, but it's
-	 * easier than trying to map the text, data, init_text and
-	 * init_data correctly */
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-				    GFP_KERNEL,
-				    PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
-				    __builtin_return_address(0));
-}
-
 #ifndef CONFIG_64BIT
 static inline unsigned long count_gots(const Elf_Rela *rela, unsigned long n)
 {
diff --git a/arch/parisc/kernel/text.c b/arch/parisc/kernel/text.c
new file mode 100644
index 000000000000..9ff503084191
--- /dev/null
+++ b/arch/parisc/kernel/text.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *    Linux/PA-RISC Project
+ *    Copyright (C) 2003 Randolph Chung <tausq at debian . org>
+ *    Copyright (C) 2008 Helge Deller <deller@gmx.de>
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void *text_alloc(unsigned long size)
+{
+	/*
+	 * Using RWX means less protection for modules, but it's
+	 * easier than trying to map the text, data, init_text and
+	 * init_data correctly.
+	 */
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+				    GFP_KERNEL,
+				    PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
+				    __builtin_return_address(0));
+}
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 0acc9d5fb19e..ba1cef7a812d 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -634,7 +634,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
 
 	proglen = cgctx.idx * 4;
 	alloclen = proglen + FUNCTION_DESCR_SIZE;
-	image = module_alloc(alloclen);
+	image = text_alloc(alloclen);
 	if (!image)
 		goto out;
 
@@ -678,7 +678,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
 void bpf_jit_free(struct bpf_prog *fp)
 {
 	if (fp->jited)
-		module_memfree(fp->bpf_func);
+		text_memfree(fp->bpf_func);
 
 	bpf_prog_unlock_free(fp);
 }
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index b355cf485671..d0b30f286ce6 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -29,6 +29,7 @@ obj-y	+= riscv_ksyms.o
 obj-y	+= stacktrace.o
 obj-y	+= cacheinfo.o
 obj-y	+= patch.o
+obj-y	+= text.o
 obj-$(CONFIG_MMU) += vdso.o vdso/
 
 obj-$(CONFIG_RISCV_M_MODE)	+= clint.o traps_misaligned.o
diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
index 7191342c54da..f6aa66431c9e 100644
--- a/arch/riscv/kernel/module.c
+++ b/arch/riscv/kernel/module.c
@@ -390,15 +390,3 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 
 	return 0;
 }
-
-#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
-#define VMALLOC_MODULE_START \
-	 max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
-void *module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
-				    VMALLOC_END, GFP_KERNEL,
-				    PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				    __builtin_return_address(0));
-}
-#endif
diff --git a/arch/riscv/kernel/text.c b/arch/riscv/kernel/text.c
new file mode 100644
index 000000000000..201608a25641
--- /dev/null
+++ b/arch/riscv/kernel/text.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *
+ *  Copyright (C) 2017 Zihao Yu
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
+#define VMALLOC_MODULE_START \
+	 max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
+void *text_alloc(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
+				    VMALLOC_END, GFP_KERNEL,
+				    PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				    __builtin_return_address(0));
+}
+#endif
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index a8f136943deb..9f00c320b938 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -40,7 +40,7 @@ obj-y	+= sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o
 obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
 obj-y	+= entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
 obj-y	+= nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
-obj-y	+= smp.o
+obj-y	+= smp.o text.o
 
 extra-y				+= head64.o vmlinux.lds
 
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index b388e87a08bf..a752b1442846 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -134,7 +134,7 @@ static int __init ftrace_plt_init(void)
 {
 	unsigned int *ip;
 
-	ftrace_plt = (unsigned long) module_alloc(PAGE_SIZE);
+	ftrace_plt = (unsigned long) text_alloc(PAGE_SIZE);
 	if (!ftrace_plt)
 		panic("cannot allocate ftrace plt\n");
 	ip = (unsigned int *) ftrace_plt;
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 4055f1c49814..087cb5951de6 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -32,22 +32,6 @@
 
 #define PLT_ENTRY_SIZE 20
 
-void *module_alloc(unsigned long size)
-{
-	void *p;
-
-	if (PAGE_ALIGN(size) > MODULES_LEN)
-		return NULL;
-	p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
-				 GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				 __builtin_return_address(0));
-	if (p && (kasan_module_alloc(p, size) < 0)) {
-		vfree(p);
-		return NULL;
-	}
-	return p;
-}
-
 void module_arch_freeing_init(struct module *mod)
 {
 	if (is_livepatch_module(mod) &&
diff --git a/arch/s390/kernel/text.c b/arch/s390/kernel/text.c
new file mode 100644
index 000000000000..63aaa1ab727b
--- /dev/null
+++ b/arch/s390/kernel/text.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ *  Kernel module help for s390.
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void *text_alloc(unsigned long size)
+{
+	void *p;
+
+	if (PAGE_ALIGN(size) > MODULES_LEN)
+		return NULL;
+	p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
+				 GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				 __builtin_return_address(0));
+	if (p && (kasan_module_alloc(p, size) < 0)) {
+		vfree(p);
+		return NULL;
+	}
+	return p;
+}
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 97c0e19263d1..e025f9e1db4a 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -52,6 +52,7 @@ obj-y                   += prom_common.o
 obj-y                   += prom_$(BITS).o
 obj-y                   += of_device_common.o
 obj-y                   += of_device_$(BITS).o
+obj-y			+= text.o
 obj-$(CONFIG_SPARC64)   += prom_irqtrans.o
 
 obj-$(CONFIG_SPARC32)   += leon_kernel.o
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index df39580f398d..f2babc69f189 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -21,36 +21,6 @@
 
 #include "entry.h"
 
-#ifdef CONFIG_SPARC64
-
-#include <linux/jump_label.h>
-
-static void *module_map(unsigned long size)
-{
-	if (PAGE_ALIGN(size) > MODULES_LEN)
-		return NULL;
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-}
-#else
-static void *module_map(unsigned long size)
-{
-	return vmalloc(size);
-}
-#endif /* CONFIG_SPARC64 */
-
-void *module_alloc(unsigned long size)
-{
-	void *ret;
-
-	ret = module_map(size);
-	if (ret)
-		memset(ret, 0, size);
-
-	return ret;
-}
-
 /* Make generic code ignore STT_REGISTER dummy undefined symbols.  */
 int module_frob_arch_sections(Elf_Ehdr *hdr,
 			      Elf_Shdr *sechdrs,
diff --git a/arch/sparc/kernel/text.c b/arch/sparc/kernel/text.c
new file mode 100644
index 000000000000..d16663f2c6ba
--- /dev/null
+++ b/arch/sparc/kernel/text.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Kernel module help for sparc64.
+ *
+ * Copyright (C) 2001 Rusty Russell.
+ * Copyright (C) 2002 David S. Miller.
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+#ifdef CONFIG_SPARC64
+
+#include <linux/jump_label.h>
+
+static void *module_map(unsigned long size)
+{
+	if (PAGE_ALIGN(size) > MODULES_LEN)
+		return NULL;
+	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
+				__builtin_return_address(0));
+}
+#else
+static void *module_map(unsigned long size)
+{
+	return vmalloc(size);
+}
+#endif /* CONFIG_SPARC64 */
+
+void *text_alloc(unsigned long size)
+{
+	void *ret;
+
+	ret = module_map(size);
+	if (ret)
+		memset(ret, 0, size);
+
+	return ret;
+}
diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c
index c8eabb973b86..d9dd513b27b2 100644
--- a/arch/sparc/net/bpf_jit_comp_32.c
+++ b/arch/sparc/net/bpf_jit_comp_32.c
@@ -713,7 +713,7 @@ cond_branch:			f_offset = addrs[i + filter[i].jf];
 				if (unlikely(proglen + ilen > oldproglen)) {
 					pr_err("bpb_jit_compile fatal error\n");
 					kfree(addrs);
-					module_memfree(image);
+					text_memfree(image);
 					return;
 				}
 				memcpy(image + proglen, temp, ilen);
@@ -736,7 +736,7 @@ cond_branch:			f_offset = addrs[i + filter[i].jf];
 			break;
 		}
 		if (proglen == oldproglen) {
-			image = module_alloc(proglen);
+			image = text_alloc(proglen);
 			if (!image)
 				goto out;
 		}
@@ -758,7 +758,7 @@ cond_branch:			f_offset = addrs[i + filter[i].jf];
 void bpf_jit_free(struct bpf_prog *fp)
 {
 	if (fp->jited)
-		module_memfree(fp->bpf_func);
+		text_memfree(fp->bpf_func);
 
 	bpf_prog_unlock_free(fp);
 }
diff --git a/arch/unicore32/kernel/Makefile b/arch/unicore32/kernel/Makefile
index 2f79aa56735b..96eb8cfc8b1e 100644
--- a/arch/unicore32/kernel/Makefile
+++ b/arch/unicore32/kernel/Makefile
@@ -6,6 +6,7 @@
 # Object file lists.
 obj-y				:= dma.o elf.o entry.o process.o ptrace.o
 obj-y				+= setup.o signal.o sys.o stacktrace.o traps.o
+obj-y				+= text.o
 
 obj-$(CONFIG_MODULES)		+= ksyms.o module.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
diff --git a/arch/unicore32/kernel/module.c b/arch/unicore32/kernel/module.c
index 67c89ef2d6ee..e1e703c02379 100644
--- a/arch/unicore32/kernel/module.c
+++ b/arch/unicore32/kernel/module.c
@@ -18,13 +18,6 @@
 
 #include <asm/sections.h>
 
-void *module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-}
-
 int
 apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 	       unsigned int relindex, struct module *module)
diff --git a/arch/unicore32/kernel/text.c b/arch/unicore32/kernel/text.c
new file mode 100644
index 000000000000..b94aac824bb8
--- /dev/null
+++ b/arch/unicore32/kernel/text.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/arch/unicore32/kernel/text.c
+ *
+ * Code specific to PKUnity SoC and UniCore ISA
+ *
+ * Copyright (C) 2001-2010 GUAN Xue-tao
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void *text_alloc(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				__builtin_return_address(0));
+}
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77261db2391..2878e4b753a0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -68,6 +68,7 @@ obj-y			+= tsc.o tsc_msr.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
 obj-y			+= irqflags.o
+obj-y			+= text.o
 
 obj-y				+= process.o
 obj-y				+= fpu/
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 51504566b3a6..f76703ee96f2 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -265,11 +265,11 @@ int __init ftrace_dyn_arch_init(void)
 /* Module allocation simplifies allocating memory for code */
 static inline void *alloc_tramp(unsigned long size)
 {
-	return module_alloc(size);
+	return text_alloc(size);
 }
 static inline void tramp_free(void *tramp)
 {
-	module_memfree(tramp);
+	text_memfree(tramp);
 }
 #else
 /* Trampolines can only be created if modules are supported */
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index ada39ddbc922..e9ac7d3c658e 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -423,7 +423,7 @@ void *alloc_insn_page(void)
 {
 	void *page;
 
-	page = module_alloc(PAGE_SIZE);
+	page = text_alloc(PAGE_SIZE);
 	if (!page)
 		return NULL;
 
@@ -446,7 +446,7 @@ void *alloc_insn_page(void)
 /* Recover page to RW mode before releasing it */
 void free_insn_page(void *page)
 {
-	module_memfree(page);
+	text_memfree(page);
 }
 
 static int arch_copy_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 34b153cbd4ac..261df078f127 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -36,55 +36,6 @@ do {							\
 } while (0)
 #endif
 
-#ifdef CONFIG_RANDOMIZE_BASE
-static unsigned long module_load_offset;
-
-/* Mutex protects the module_load_offset. */
-static DEFINE_MUTEX(module_kaslr_mutex);
-
-static unsigned long int get_module_load_offset(void)
-{
-	if (kaslr_enabled()) {
-		mutex_lock(&module_kaslr_mutex);
-		/*
-		 * Calculate the module_load_offset the first time this
-		 * code is called. Once calculated it stays the same until
-		 * reboot.
-		 */
-		if (module_load_offset == 0)
-			module_load_offset =
-				(get_random_int() % 1024 + 1) * PAGE_SIZE;
-		mutex_unlock(&module_kaslr_mutex);
-	}
-	return module_load_offset;
-}
-#else
-static unsigned long int get_module_load_offset(void)
-{
-	return 0;
-}
-#endif
-
-void *module_alloc(unsigned long size)
-{
-	void *p;
-
-	if (PAGE_ALIGN(size) > MODULES_LEN)
-		return NULL;
-
-	p = __vmalloc_node_range(size, MODULE_ALIGN,
-				    MODULES_VADDR + get_module_load_offset(),
-				    MODULES_END, GFP_KERNEL,
-				    PAGE_KERNEL, 0, NUMA_NO_NODE,
-				    __builtin_return_address(0));
-	if (p && (kasan_module_alloc(p, size) < 0)) {
-		vfree(p);
-		return NULL;
-	}
-
-	return p;
-}
-
 #ifdef CONFIG_X86_32
 int apply_relocate(Elf32_Shdr *sechdrs,
 		   const char *strtab,
diff --git a/arch/x86/kernel/text.c b/arch/x86/kernel/text.c
new file mode 100644
index 000000000000..724ab2d93ac5
--- /dev/null
+++ b/arch/x86/kernel/text.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *  Kernel module help for x86.
+ *  Copyright (C) 2001 Rusty Russell.
+ */
+#include <linux/kasan.h>
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/random.h>
+#include <linux/vmalloc.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_RANDOMIZE_BASE
+static unsigned long module_load_offset;
+
+/* Mutex protects the module_load_offset. */
+static DEFINE_MUTEX(module_kaslr_mutex);
+
+static unsigned long get_module_load_offset(void)
+{
+	if (kaslr_enabled()) {
+		mutex_lock(&module_kaslr_mutex);
+		/*
+		 * Calculate the module_load_offset the first time this
+		 * code is called. Once calculated it stays the same until
+		 * reboot.
+		 */
+		if (module_load_offset == 0)
+			module_load_offset =
+				(get_random_int() % 1024 + 1) * PAGE_SIZE;
+		mutex_unlock(&module_kaslr_mutex);
+	}
+	return module_load_offset;
+}
+#else
+static unsigned long get_module_load_offset(void)
+{
+	return 0;
+}
+#endif
+
+void *text_alloc(unsigned long size)
+{
+	void *p;
+
+	if (PAGE_ALIGN(size) > MODULES_LEN)
+		return NULL;
+
+	p = __vmalloc_node_range(size, MODULE_ALIGN,
+				    MODULES_VADDR + get_module_load_offset(),
+				    MODULES_END, GFP_KERNEL,
+				    PAGE_KERNEL, 0, NUMA_NO_NODE,
+				    __builtin_return_address(0));
+	if (p && (kasan_module_alloc(p, size) < 0)) {
+		vfree(p);
+		return NULL;
+	}
+
+	return p;
+}
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 4fa67a8b2265..4e8b9ba431ee 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -24,10 +24,10 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section);
 
 /* Allocator used for allocating struct module, core sections and init
    sections.  Returns NULL on failure. */
-void *module_alloc(unsigned long size);
+void *text_alloc(unsigned long size);
 
 /* Free memory returned from module_alloc. */
-void module_memfree(void *module_region);
+void text_memfree(void *module_region);
 
 /* Determines if the section name is an init section (that is only used during
  * module loading).
diff --git a/kernel/Makefile b/kernel/Makefile
index f3218bc5ec69..9e88e81f68ef 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    extable.o params.o \
 	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
-	    async.o range.o smpboot.o ucount.o
+	    async.o range.o smpboot.o ucount.o text.o
 
 obj-$(CONFIG_MODULES) += kmod.o
 obj-$(CONFIG_MULTIUSER) += groups.o
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9df4cc9a2907..febd55019a8a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -840,12 +840,12 @@ static void bpf_jit_uncharge_modmem(u32 pages)
 
 void *__weak bpf_jit_alloc_exec(unsigned long size)
 {
-	return module_alloc(size);
+	return text_alloc(size);
 }
 
 void __weak bpf_jit_free_exec(void *addr)
 {
-	module_memfree(addr);
+	text_memfree(addr);
 }
 
 struct bpf_binary_header *
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 4a904cc56d68..d1c354ec89de 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -111,12 +111,12 @@ enum kprobe_slot_state {
 
 void __weak *alloc_insn_page(void)
 {
-	return module_alloc(PAGE_SIZE);
+	return text_alloc(PAGE_SIZE);
 }
 
 void __weak free_insn_page(void *page)
 {
-	module_memfree(page);
+	text_memfree(page);
 }
 
 struct kprobe_insn_cache kprobe_insn_slots = {
diff --git a/kernel/module.c b/kernel/module.c
index bee1c25ca5c5..bdb3773f3668 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2141,16 +2141,6 @@ static void free_module_elf(struct module *mod)
 }
 #endif /* CONFIG_LIVEPATCH */
 
-void __weak module_memfree(void *module_region)
-{
-	/*
-	 * This memory may be RO, and freeing RO memory in an interrupt is not
-	 * supported by vmalloc.
-	 */
-	WARN_ON(in_interrupt());
-	vfree(module_region);
-}
-
 void __weak module_arch_cleanup(struct module *mod)
 {
 }
@@ -2200,7 +2190,7 @@ static void free_module(struct module *mod)
 
 	/* This may be empty, but that's OK */
 	module_arch_freeing_init(mod);
-	module_memfree(mod->init_layout.base);
+	text_memfree(mod->init_layout.base);
 	kfree(mod->args);
 	percpu_modfree(mod);
 
@@ -2208,7 +2198,7 @@ static void free_module(struct module *mod)
 	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
 
 	/* Finally, free the core (containing the module structure) */
-	module_memfree(mod->core_layout.base);
+	text_memfree(mod->core_layout.base);
 }
 
 void *__symbol_get(const char *symbol)
@@ -2781,13 +2771,6 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
 		ddebug_remove_module(mod->name);
 }
 
-void * __weak module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
-			NUMA_NO_NODE, __builtin_return_address(0));
-}
-
 bool __weak module_init_section(const char *name)
 {
 	return strstarts(name, ".init");
@@ -3246,7 +3229,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	void *ptr;
 
 	/* Do the allocs. */
-	ptr = module_alloc(mod->core_layout.size);
+	ptr = text_alloc(mod->core_layout.size);
 	/*
 	 * The pointer to this block is stored in the module structure
 	 * which is inside the block. Just mark it as not being a
@@ -3260,7 +3243,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	mod->core_layout.base = ptr;
 
 	if (mod->init_layout.size) {
-		ptr = module_alloc(mod->init_layout.size);
+		ptr = text_alloc(mod->init_layout.size);
 		/*
 		 * The pointer to this block is stored in the module structure
 		 * which is inside the block. This block doesn't need to be
@@ -3269,7 +3252,7 @@ static int move_module(struct module *mod, struct load_info *info)
 		 */
 		kmemleak_ignore(ptr);
 		if (!ptr) {
-			module_memfree(mod->core_layout.base);
+			text_memfree(mod->core_layout.base);
 			return -ENOMEM;
 		}
 		memset(ptr, 0, mod->init_layout.size);
@@ -3452,8 +3435,8 @@ static void module_deallocate(struct module *mod, struct load_info *info)
 {
 	percpu_modfree(mod);
 	module_arch_freeing_init(mod);
-	module_memfree(mod->init_layout.base);
-	module_memfree(mod->core_layout.base);
+	text_memfree(mod->init_layout.base);
+	text_memfree(mod->core_layout.base);
 }
 
 int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -3527,7 +3510,7 @@ static void do_free_init(struct work_struct *w)
 
 	llist_for_each_safe(pos, n, list) {
 		initfree = container_of(pos, struct mod_initfree, node);
-		module_memfree(initfree->module_init);
+		text_memfree(initfree->module_init);
 		kfree(initfree);
 	}
 }
@@ -3626,10 +3609,10 @@ static noinline int do_init_module(struct module *mod)
 	 * We want to free module_init, but be aware that kallsyms may be
 	 * walking this with preempt disabled.  In all the failure paths, we
 	 * call synchronize_rcu(), but we don't want to slow down the success
-	 * path. module_memfree() cannot be called in an interrupt, so do the
+	 * path. text_memfree() cannot be called in an interrupt, so do the
 	 * work and call synchronize_rcu() in a work queue.
 	 *
-	 * Note that module_alloc() on most architectures creates W+X page
+	 * Note that text_alloc() on most architectures creates W+X page
 	 * mappings which won't be cleaned up until do_free_init() runs.  Any
 	 * code such as mark_rodata_ro() which depends on those mappings to
 	 * be cleaned up needs to sync with the queued work - ie
diff --git a/kernel/text.c b/kernel/text.c
new file mode 100644
index 000000000000..9a12c508ded5
--- /dev/null
+++ b/kernel/text.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *  Copyright (C) 2002 Richard Henderson
+ *  Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void __weak text_memfree(void *module_region)
+{
+	/*
+	 * This memory may be RO, and freeing RO memory in an interrupt is not
+	 * supported by vmalloc.
+	 */
+	WARN_ON(in_interrupt());
+	vfree(module_region);
+}
+
+void * __weak text_alloc(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+			NUMA_NO_NODE, __builtin_return_address(0));
+}
-- 
2.25.1


^ permalink raw reply related

* [PATCH v2 0/3] kprobes: Remove MODULE dependency
From: Jarkko Sakkinen @ 2020-07-14  9:45 UTC (permalink / raw)
  To: linux-kernel
  Cc: Catalin Marinas, Song Liu, Paul Mackerras, Zong Li, Paul Burton,
	Aneesh Kumar K.V, Vincent Whitchurch, Omar Sandoval, Petr Mladek,
	Brian Gerst, Andy Lutomirski, Thomas Gleixner, Iurii Zaikin,
	Jiri Kosina, Anup Patel, Philipp Rudo, Vincent Chen, Torsten Duwe,
	Masami Hiramatsu, Andrew Morton, Mark Rutland, Ingo Molnar,
	open list:S390, Joe Lawrence, Alexandre Ghiti, Helge Deller,
	John Fastabend, Yonghong Song, Borislav Petkov, Andrii Nakryiko,
	Vasily Gorbik, moderated list:ARM PORT, Daniel Axtens,
	Damien Le Moal, Stephen Boyd, Sean Christopherson,
	Martin KaFai Lau, Kefeng Wang, Palmer Dabbelt, Heiko Carstens,
	Jarkko Sakkinen, Atish Patra, Will Deacon, Masahiro Yamada,
	Nayna Jain, Krzysztof Kozlowski, Christian Borntraeger,
	Sami Tolvanen, Mao Han, Marco Elver, Kees Cook, Arnd Bergmann,
	Steven Rostedt (VMware), Babu Moger, Russell King, Ben Dooks,
	Peter Collingbourne, Tiezhu Yang, Thomas Bogendoerfer,
	open list:PARISC ARCHITECTURE, Jessica Yu,
	open list:BPF JIT for MIPS 32-BIT AND 64-BIT,
	Thiago Jung Bauermann, Peter Zijlstra,
	open list:SPARC + UltraSPARC sparc/sparc64, H. Peter Anvin,
	Amit Daniel Kachhap, open list:LIVE PATCHING,
	open list:RISC-V ARCHITECTURE, Miroslav Benes, Jiri Olsa,
	Ard Biesheuvel, Vincenzo Frascino, Anders Roxell, Sven Schnelle,
	Mike Rapoport, Paul E. McKenney, Frederic Weisbecker,
	Paul Walmsley, KP Singh, Gerald Schaefer, Josh Poimboeuf,
	open list:BPF JIT for MIPS 32-BIT AND 64-BIT, open list:MIPS,
	Sergey Senozhatsky, open list:LINUX FOR POWERPC 32-BIT AND 64-BIT

Remove the MODULES dependency by creating module-subsystem-independent
text_alloc() and text_memfree() to allocate space for executable code.

Right now one has to compile in modules support just to enable kprobes. This
increases the barrier to using them in test kernels and, I'd guess, also in
some embedded kernels (the former is my use case).

v2:
* Added the missing cover letter.

Jarkko Sakkinen (3):
  module: Rename module_alloc() to text_alloc() and move to kernel
    proper
  module: Add lock_modules() and unlock_modules()
  kprobes: Flag out CONFIG_MODULES dependent code

 arch/Kconfig                     |  1 -
 arch/arm/kernel/Makefile         |  3 +-
 arch/arm/kernel/module.c         | 21 -------
 arch/arm/kernel/text.c           | 33 +++++++++++
 arch/arm64/kernel/Makefile       |  2 +-
 arch/arm64/kernel/module.c       | 42 --------------
 arch/arm64/kernel/text.c         | 54 ++++++++++++++++++
 arch/mips/kernel/Makefile        |  2 +-
 arch/mips/kernel/module.c        |  9 ---
 arch/mips/kernel/text.c          | 19 +++++++
 arch/mips/net/bpf_jit.c          |  4 +-
 arch/nds32/kernel/Makefile       |  2 +-
 arch/nds32/kernel/module.c       |  7 ---
 arch/nds32/kernel/text.c         | 12 ++++
 arch/nios2/kernel/Makefile       |  1 +
 arch/nios2/kernel/module.c       | 19 -------
 arch/nios2/kernel/text.c         | 34 +++++++++++
 arch/parisc/kernel/Makefile      |  2 +-
 arch/parisc/kernel/module.c      | 11 ----
 arch/parisc/kernel/text.c        | 22 ++++++++
 arch/powerpc/net/bpf_jit_comp.c  |  4 +-
 arch/riscv/kernel/Makefile       |  1 +
 arch/riscv/kernel/module.c       | 12 ----
 arch/riscv/kernel/text.c         | 20 +++++++
 arch/s390/kernel/Makefile        |  2 +-
 arch/s390/kernel/ftrace.c        |  2 +-
 arch/s390/kernel/module.c        | 16 ------
 arch/s390/kernel/text.c          | 23 ++++++++
 arch/sparc/kernel/Makefile       |  1 +
 arch/sparc/kernel/module.c       | 30 ----------
 arch/sparc/kernel/text.c         | 39 +++++++++++++
 arch/sparc/net/bpf_jit_comp_32.c |  6 +-
 arch/unicore32/kernel/Makefile   |  1 +
 arch/unicore32/kernel/module.c   |  7 ---
 arch/unicore32/kernel/text.c     | 18 ++++++
 arch/x86/kernel/Makefile         |  1 +
 arch/x86/kernel/ftrace.c         |  4 +-
 arch/x86/kernel/kprobes/core.c   |  4 +-
 arch/x86/kernel/module.c         | 49 ----------------
 arch/x86/kernel/text.c           | 60 ++++++++++++++++++++
 include/linux/module.h           | 29 +++++++---
 include/linux/moduleloader.h     |  4 +-
 kernel/Makefile                  |  2 +-
 kernel/bpf/core.c                |  4 +-
 kernel/kprobes.c                 | 17 ++++--
 kernel/livepatch/core.c          |  8 +--
 kernel/module.c                  | 97 +++++++++++++-------------------
 kernel/text.c                    | 25 ++++++++
 kernel/trace/trace_kprobe.c      | 20 ++++++-
 49 files changed, 484 insertions(+), 322 deletions(-)
 create mode 100644 arch/arm/kernel/text.c
 create mode 100644 arch/arm64/kernel/text.c
 create mode 100644 arch/mips/kernel/text.c
 create mode 100644 arch/nds32/kernel/text.c
 create mode 100644 arch/nios2/kernel/text.c
 create mode 100644 arch/parisc/kernel/text.c
 create mode 100644 arch/riscv/kernel/text.c
 create mode 100644 arch/s390/kernel/text.c
 create mode 100644 arch/sparc/kernel/text.c
 create mode 100644 arch/unicore32/kernel/text.c
 create mode 100644 arch/x86/kernel/text.c
 create mode 100644 kernel/text.c

-- 
2.25.1


^ permalink raw reply

* Re: [PATCH 06/20] Documentation: gpu/komeda-kms: eliminate duplicated word
From: Daniel Vetter @ 2020-07-14 10:11 UTC (permalink / raw)
  To: james qian wang (Arm Technology China), Randy Dunlap,
	linux-kernel, Jonathan Corbet, linux-doc, linux-mm, Mike Rapoport,
	Jens Axboe, linux-block, Jason Wessel, Daniel Thompson,
	Douglas Anderson, kgdb-bugreport, Wu Hao, linux-fpga, Liviu Dudau,
	Mihail Atanassov, Mali DP Maintainers, David Airlie, dri-devel,
	Srinivas Pandruvada, Jiri Kosina, linux-input, Wolfram Sang,
	linux-i2c, Masahiro Yamada, Michal Marek, linux-kbuild,
	Jacek Anaszewski, Pavel Machek, Dan Murphy, linux-leds,
	Dan Williams, Paul Cercueil, Thomas Bogendoerfer, linux-mips,
	Derek Kiernan, Dragan Cvetic, Michael Ellerman,
	Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev,
	Tony Krowiak, Pierre Morel, Halil Pasic, linux-s390,
	Matthew Wilcox, Hannes Reinecke, linux-scsi, James E.J. Bottomley,
	Martin K. Petersen, Jarkko Sakkinen, Mimi Zohar, linux-integrity,
	keyrings, Paolo Bonzini, kvm, Andrew Morton, nd
In-Reply-To: <20200714101005.GA3278063@phenom.ffwll.local>

On Tue, Jul 14, 2020 at 12:10:05PM +0200, Daniel Vetter wrote:
> This and next patch merged to drm-misc-next, thanks.

Oops strike this, I just noticed that Jon said he's picked it all up.

Oh well, the confusion, I managed to stop the script before it published
anything at least :-)
-Daniel

> 
> On Wed, Jul 08, 2020 at 01:08:21PM +0800, james qian wang (Arm Technology China) wrote:
> > Hi Randy
> > 
> > On Tue, Jul 07, 2020 at 11:04:00AM -0700, Randy Dunlap wrote:
> > > Drop the doubled word "and".
> > > 
> > > Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
> > > Cc: Jonathan Corbet <corbet@lwn.net>
> > > Cc: linux-doc@vger.kernel.org
> > > Cc: James (Qian) Wang <james.qian.wang@arm.com>
> > > Cc: Liviu Dudau <liviu.dudau@arm.com>
> > > Cc: Mihail Atanassov <mihail.atanassov@arm.com>
> > > Cc: Mali DP Maintainers <malidp@foss.arm.com>
> > > ---
> > >  Documentation/gpu/komeda-kms.rst |    2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > --- linux-next-20200701.orig/Documentation/gpu/komeda-kms.rst
> > > +++ linux-next-20200701/Documentation/gpu/komeda-kms.rst
> > > @@ -41,7 +41,7 @@ Compositor blends multiple layers or pix
> > >  frame. its output frame can be fed into post image processor for showing it on
> > >  the monitor or fed into wb_layer and written to memory at the same time.
> > >  user can also insert a scaler between compositor and wb_layer to down scale
> > > -the display frame first and and then write to memory.
> > > +the display frame first and then write to memory.
> > 
> > Thank you for the patch.
> > 
> > Reviewed-by: James Qian Wang <james.qian.wang@arm.com>
> 
> James, for simple patches like this just go ahead and merge them. You're
> the maintainer for this, just slapping an r-b onto a patch and no
> indication whether you will pick it up only confuses people and increases
> the risk that patches get lost.
> 
> So either pick up right away, or state clearly that you will pick it up
> later, or that you expect someone else to merge this.
> 
> Thanks, Daniel
> > 
> > >  Writeback Layer (wb_layer)
> > >  --------------------------
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply

* Re: [PATCH 05/15] powerpc/powernv/sriov: Move SR-IOV into a seperate file
From: Alexey Kardashevskiy @ 2020-07-14  9:16 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200710052340.737567-6-oohall@gmail.com>



On 10/07/2020 15:23, Oliver O'Halloran wrote:
> pci-ioda.c is getting a bit unwieldy due to the amount of stuff jammed in
> there. The SR-IOV support can be extracted easily enough and is mostly
> standalone, so move it into a separate file.
> 
> This patch also moves the PowerNV SR-IOV specific fields out of pci_dn and
> into a platform specific structure. I'm not sure how they ended up in there
> in the first place, but leaking platform specifics into common code has
> proven to be a terrible idea so far, so let's stop doing that.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
> The pci_dn change and the pci-sriov.c changes were originally separate patches.
> I accidentally squashed them together while rebasing and fixing that seemed
> like more pain than it was worth. I kind of like it this way though since
> they did cause a lot of churn on the same set of functions.
> 
> I'll split them up again if you really want (please don't want this).


Nah, it's not worth splitting it this way. However, it would be nice not
to have a (small?) functional change in the same patch; there is a small
new piece (below).


> ---
>  arch/powerpc/include/asm/device.h          |   3 +
>  arch/powerpc/platforms/powernv/Makefile    |   1 +
>  arch/powerpc/platforms/powernv/pci-ioda.c  | 673 +--------------------
>  arch/powerpc/platforms/powernv/pci-sriov.c | 642 ++++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci.h       |  74 +++
>  5 files changed, 738 insertions(+), 655 deletions(-)
>  create mode 100644 arch/powerpc/platforms/powernv/pci-sriov.c
> 
> diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
> index 266542769e4b..4d8934db7ef5 100644
> --- a/arch/powerpc/include/asm/device.h
> +++ b/arch/powerpc/include/asm/device.h
> @@ -49,6 +49,9 @@ struct dev_archdata {
>  #ifdef CONFIG_CXL_BASE
>  	struct cxl_context	*cxl_ctx;
>  #endif
> +#ifdef CONFIG_PCI_IOV
> +	void *iov_data;
> +#endif
>  };
>  
>  struct pdev_archdata {
> diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
> index fe3f0fb5aeca..2eb6ae150d1f 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -11,6 +11,7 @@ obj-$(CONFIG_FA_DUMP)	+= opal-fadump.o
>  obj-$(CONFIG_PRESERVE_FA_DUMP)	+= opal-fadump.o
>  obj-$(CONFIG_OPAL_CORE)	+= opal-core.o
>  obj-$(CONFIG_PCI)	+= pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
> +obj-$(CONFIG_PCI_IOV)   += pci-sriov.o
>  obj-$(CONFIG_CXL_BASE)	+= pci-cxl.o
>  obj-$(CONFIG_EEH)	+= eeh-powernv.o
>  obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 8fb17676d914..2d36a9ebf0e9 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -115,26 +115,6 @@ static int __init pci_reset_phbs_setup(char *str)
>  
>  early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);
>  
> -static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
> -{
> -	/*
> -	 * WARNING: We cannot rely on the resource flags. The Linux PCI
> -	 * allocation code sometimes decides to put a 64-bit prefetchable
> -	 * BAR in the 32-bit window, so we have to compare the addresses.
> -	 *
> -	 * For simplicity we only test resource start.
> -	 */
> -	return (r->start >= phb->ioda.m64_base &&
> -		r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
> -}
> -
> -static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
> -{
> -	unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
> -
> -	return (resource_flags & flags) == flags;
> -}
> -
>  static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
>  {
>  	s64 rc;
> @@ -172,7 +152,7 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
>  	pnv_ioda_init_pe(phb, pe_no);
>  }
>  
> -static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
> +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
>  {
>  	long pe;
>  
> @@ -184,7 +164,7 @@ static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
>  	return NULL;
>  }
>  
> -static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
> +void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
>  {
>  	struct pnv_phb *phb = pe->phb;
>  	unsigned int pe_num = pe->pe_number;
> @@ -816,7 +796,7 @@ static void pnv_ioda_unset_peltv(struct pnv_phb *phb,
>  		pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc);
>  }
>  
> -static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
> +int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>  {
>  	struct pci_dev *parent;
>  	uint8_t bcomp, dcomp, fcomp;
> @@ -887,7 +867,7 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>  	return 0;
>  }
>  
> -static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
> +int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>  {
>  	struct pci_dev *parent;
>  	uint8_t bcomp, dcomp, fcomp;
> @@ -982,91 +962,6 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>  	return 0;
>  }
>  
> -#ifdef CONFIG_PCI_IOV
> -static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
> -{
> -	struct pci_dn *pdn = pci_get_pdn(dev);
> -	int i;
> -	struct resource *res, res2;
> -	resource_size_t size;
> -	u16 num_vfs;
> -
> -	if (!dev->is_physfn)
> -		return -EINVAL;
> -
> -	/*
> -	 * "offset" is in VFs.  The M64 windows are sized so that when they
> -	 * are segmented, each segment is the same size as the IOV BAR.
> -	 * Each segment is in a separate PE, and the high order bits of the
> -	 * address are the PE number.  Therefore, each VF's BAR is in a
> -	 * separate PE, and changing the IOV BAR start address changes the
> -	 * range of PEs the VFs are in.
> -	 */
> -	num_vfs = pdn->num_vfs;
> -	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> -		res = &dev->resource[i + PCI_IOV_RESOURCES];
> -		if (!res->flags || !res->parent)
> -			continue;
> -
> -		/*
> -		 * The actual IOV BAR range is determined by the start address
> -		 * and the actual size for num_vfs VFs BAR.  This check is to
> -		 * make sure that after shifting, the range will not overlap
> -		 * with another device.
> -		 */
> -		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> -		res2.flags = res->flags;
> -		res2.start = res->start + (size * offset);
> -		res2.end = res2.start + (size * num_vfs) - 1;
> -
> -		if (res2.end > res->end) {
> -			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
> -				i, &res2, res, num_vfs, offset);
> -			return -EBUSY;
> -		}
> -	}
> -
> -	/*
> -	 * Since M64 BAR shares segments among all possible 256 PEs,
> -	 * we have to shift the beginning of PF IOV BAR to make it start from
> -	 * the segment which belongs to the PE number assigned to the first VF.
> -	 * This creates a "hole" in the /proc/iomem which could be used for
> -	 * allocating other resources so we reserve this area below and
> -	 * release when IOV is released.
> -	 */
> -	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> -		res = &dev->resource[i + PCI_IOV_RESOURCES];
> -		if (!res->flags || !res->parent)
> -			continue;
> -
> -		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> -		res2 = *res;
> -		res->start += size * offset;
> -
> -		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
> -			 i, &res2, res, (offset > 0) ? "En" : "Dis",
> -			 num_vfs, offset);
> -
> -		if (offset < 0) {
> -			devm_release_resource(&dev->dev, &pdn->holes[i]);
> -			memset(&pdn->holes[i], 0, sizeof(pdn->holes[i]));
> -		}
> -
> -		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
> -
> -		if (offset > 0) {
> -			pdn->holes[i].start = res2.start;
> -			pdn->holes[i].end = res2.start + size * offset - 1;
> -			pdn->holes[i].flags = IORESOURCE_BUS;
> -			pdn->holes[i].name = "pnv_iov_reserved";
> -			devm_request_resource(&dev->dev, res->parent,
> -					&pdn->holes[i]);
> -		}
> -	}
> -	return 0;
> -}
> -#endif /* CONFIG_PCI_IOV */
> -
>  static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>  {
>  	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
> @@ -1294,406 +1189,9 @@ static void pnv_pci_ioda_setup_nvlink(void)
>  #endif
>  }
>  
> -#ifdef CONFIG_PCI_IOV
> -static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
> -{
> -	struct pnv_phb        *phb;
> -	struct pci_dn         *pdn;
> -	int                    i, j;
> -	int                    m64_bars;
> -
> -	phb = pci_bus_to_pnvhb(pdev->bus);
> -	pdn = pci_get_pdn(pdev);
> -
> -	if (pdn->m64_single_mode)
> -		m64_bars = num_vfs;
> -	else
> -		m64_bars = 1;
> -
> -	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
> -		for (j = 0; j < m64_bars; j++) {
> -			if (pdn->m64_map[j][i] == IODA_INVALID_M64)
> -				continue;
> -			opal_pci_phb_mmio_enable(phb->opal_id,
> -				OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
> -			clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
> -			pdn->m64_map[j][i] = IODA_INVALID_M64;
> -		}
> -
> -	kfree(pdn->m64_map);
> -	return 0;
> -}
> -
> -static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
> -{
> -	struct pnv_phb        *phb;
> -	struct pci_dn         *pdn;
> -	unsigned int           win;
> -	struct resource       *res;
> -	int                    i, j;
> -	int64_t                rc;
> -	int                    total_vfs;
> -	resource_size_t        size, start;
> -	int                    pe_num;
> -	int                    m64_bars;
> -
> -	phb = pci_bus_to_pnvhb(pdev->bus);
> -	pdn = pci_get_pdn(pdev);
> -	total_vfs = pci_sriov_get_totalvfs(pdev);
> -
> -	if (pdn->m64_single_mode)
> -		m64_bars = num_vfs;
> -	else
> -		m64_bars = 1;
> -
> -	pdn->m64_map = kmalloc_array(m64_bars,
> -				     sizeof(*pdn->m64_map),
> -				     GFP_KERNEL);
> -	if (!pdn->m64_map)
> -		return -ENOMEM;
> -	/* Initialize the m64_map to IODA_INVALID_M64 */
> -	for (i = 0; i < m64_bars ; i++)
> -		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
> -			pdn->m64_map[i][j] = IODA_INVALID_M64;
> -
> -
> -	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> -		res = &pdev->resource[i + PCI_IOV_RESOURCES];
> -		if (!res->flags || !res->parent)
> -			continue;
> -
> -		for (j = 0; j < m64_bars; j++) {
> -			do {
> -				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
> -						phb->ioda.m64_bar_idx + 1, 0);
> -
> -				if (win >= phb->ioda.m64_bar_idx + 1)
> -					goto m64_failed;
> -			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
> -
> -			pdn->m64_map[j][i] = win;
> -
> -			if (pdn->m64_single_mode) {
> -				size = pci_iov_resource_size(pdev,
> -							PCI_IOV_RESOURCES + i);
> -				start = res->start + size * j;
> -			} else {
> -				size = resource_size(res);
> -				start = res->start;
> -			}
> -
> -			/* Map the M64 here */
> -			if (pdn->m64_single_mode) {
> -				pe_num = pdn->pe_num_map[j];
> -				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> -						pe_num, OPAL_M64_WINDOW_TYPE,
> -						pdn->m64_map[j][i], 0);
> -			}
> -
> -			rc = opal_pci_set_phb_mem_window(phb->opal_id,
> -						 OPAL_M64_WINDOW_TYPE,
> -						 pdn->m64_map[j][i],
> -						 start,
> -						 0, /* unused */
> -						 size);
> -
> -
> -			if (rc != OPAL_SUCCESS) {
> -				dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
> -					win, rc);
> -				goto m64_failed;
> -			}
> -
> -			if (pdn->m64_single_mode)
> -				rc = opal_pci_phb_mmio_enable(phb->opal_id,
> -				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
> -			else
> -				rc = opal_pci_phb_mmio_enable(phb->opal_id,
> -				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
> -
> -			if (rc != OPAL_SUCCESS) {
> -				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
> -					win, rc);
> -				goto m64_failed;
> -			}
> -		}
> -	}
> -	return 0;
> -
> -m64_failed:
> -	pnv_pci_vf_release_m64(pdev, num_vfs);
> -	return -EBUSY;
> -}
> -
> -static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe);
> -
> -static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
> -{
> -	struct pnv_phb        *phb;
> -	struct pnv_ioda_pe    *pe, *pe_n;
> -	struct pci_dn         *pdn;
> -
> -	phb = pci_bus_to_pnvhb(pdev->bus);
> -	pdn = pci_get_pdn(pdev);
> -
> -	if (!pdev->is_physfn)
> -		return;
> -
> -	/* FIXME: Use pnv_ioda_release_pe()? */
> -	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
> -		if (pe->parent_dev != pdev)
> -			continue;
> -
> -		pnv_pci_ioda2_release_pe_dma(pe);
> -
> -		/* Remove from list */
> -		mutex_lock(&phb->ioda.pe_list_mutex);
> -		list_del(&pe->list);
> -		mutex_unlock(&phb->ioda.pe_list_mutex);
> -
> -		pnv_ioda_deconfigure_pe(phb, pe);
> -
> -		pnv_ioda_free_pe(pe);
> -	}
> -}
> -
> -static void pnv_pci_sriov_disable(struct pci_dev *pdev)
> -{
> -	struct pnv_phb        *phb;
> -	struct pnv_ioda_pe    *pe;
> -	struct pci_dn         *pdn;
> -	u16                    num_vfs, i;
> -
> -	phb = pci_bus_to_pnvhb(pdev->bus);
> -	pdn = pci_get_pdn(pdev);
> -	num_vfs = pdn->num_vfs;
> -
> -	/* Release VF PEs */
> -	pnv_ioda_release_vf_PE(pdev);
> -
> -	if (phb->type == PNV_PHB_IODA2) {
> -		if (!pdn->m64_single_mode)
> -			pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
> -
> -		/* Release M64 windows */
> -		pnv_pci_vf_release_m64(pdev, num_vfs);
> -
> -		/* Release PE numbers */
> -		if (pdn->m64_single_mode) {
> -			for (i = 0; i < num_vfs; i++) {
> -				if (pdn->pe_num_map[i] == IODA_INVALID_PE)
> -					continue;
> -
> -				pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
> -				pnv_ioda_free_pe(pe);
> -			}
> -		} else
> -			bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
> -		/* Releasing pe_num_map */
> -		kfree(pdn->pe_num_map);
> -	}
> -}
> -
> -static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> -				       struct pnv_ioda_pe *pe);
> -static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
> -{
> -	struct pnv_phb        *phb;
> -	struct pnv_ioda_pe    *pe;
> -	int                    pe_num;
> -	u16                    vf_index;
> -	struct pci_dn         *pdn;
> -
> -	phb = pci_bus_to_pnvhb(pdev->bus);
> -	pdn = pci_get_pdn(pdev);
> -
> -	if (!pdev->is_physfn)
> -		return;
> -
> -	/* Reserve PE for each VF */
> -	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
> -		int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
> -		int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
> -		struct pci_dn *vf_pdn;
> -
> -		if (pdn->m64_single_mode)
> -			pe_num = pdn->pe_num_map[vf_index];
> -		else
> -			pe_num = *pdn->pe_num_map + vf_index;
> -
> -		pe = &phb->ioda.pe_array[pe_num];
> -		pe->pe_number = pe_num;
> -		pe->phb = phb;
> -		pe->flags = PNV_IODA_PE_VF;
> -		pe->pbus = NULL;
> -		pe->parent_dev = pdev;
> -		pe->mve_number = -1;
> -		pe->rid = (vf_bus << 8) | vf_devfn;
> -
> -		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
> -			pci_domain_nr(pdev->bus), pdev->bus->number,
> -			PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
> -
> -		if (pnv_ioda_configure_pe(phb, pe)) {
> -			/* XXX What do we do here ? */
> -			pnv_ioda_free_pe(pe);
> -			pe->pdev = NULL;
> -			continue;
> -		}
> -
> -		/* Put PE to the list */
> -		mutex_lock(&phb->ioda.pe_list_mutex);
> -		list_add_tail(&pe->list, &phb->ioda.pe_list);
> -		mutex_unlock(&phb->ioda.pe_list_mutex);
> -
> -		/* associate this pe to it's pdn */
> -		list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) {
> -			if (vf_pdn->busno == vf_bus &&
> -			    vf_pdn->devfn == vf_devfn) {
> -				vf_pdn->pe_number = pe_num;
> -				break;
> -			}
> -		}
> -
> -		pnv_pci_ioda2_setup_dma_pe(phb, pe);
> -	}
> -}
> -
> -static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> -{
> -	struct pnv_phb        *phb;
> -	struct pnv_ioda_pe    *pe;
> -	struct pci_dn         *pdn;
> -	int                    ret;
> -	u16                    i;
> -
> -	phb = pci_bus_to_pnvhb(pdev->bus);
> -	pdn = pci_get_pdn(pdev);
> -
> -	if (phb->type == PNV_PHB_IODA2) {
> -		if (!pdn->vfs_expanded) {
> -			dev_info(&pdev->dev, "don't support this SRIOV device"
> -				" with non 64bit-prefetchable IOV BAR\n");
> -			return -ENOSPC;
> -		}
> -
> -		/*
> -		 * When M64 BARs functions in Single PE mode, the number of VFs
> -		 * could be enabled must be less than the number of M64 BARs.
> -		 */
> -		if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
> -			dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
> -			return -EBUSY;
> -		}
> -
> -		/* Allocating pe_num_map */
> -		if (pdn->m64_single_mode)
> -			pdn->pe_num_map = kmalloc_array(num_vfs,
> -							sizeof(*pdn->pe_num_map),
> -							GFP_KERNEL);
> -		else
> -			pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
> -
> -		if (!pdn->pe_num_map)
> -			return -ENOMEM;
> -
> -		if (pdn->m64_single_mode)
> -			for (i = 0; i < num_vfs; i++)
> -				pdn->pe_num_map[i] = IODA_INVALID_PE;
> -
> -		/* Calculate available PE for required VFs */
> -		if (pdn->m64_single_mode) {
> -			for (i = 0; i < num_vfs; i++) {
> -				pe = pnv_ioda_alloc_pe(phb);
> -				if (!pe) {
> -					ret = -EBUSY;
> -					goto m64_failed;
> -				}
> -
> -				pdn->pe_num_map[i] = pe->pe_number;
> -			}
> -		} else {
> -			mutex_lock(&phb->ioda.pe_alloc_mutex);
> -			*pdn->pe_num_map = bitmap_find_next_zero_area(
> -				phb->ioda.pe_alloc, phb->ioda.total_pe_num,
> -				0, num_vfs, 0);
> -			if (*pdn->pe_num_map >= phb->ioda.total_pe_num) {
> -				mutex_unlock(&phb->ioda.pe_alloc_mutex);
> -				dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
> -				kfree(pdn->pe_num_map);
> -				return -EBUSY;
> -			}
> -			bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
> -			mutex_unlock(&phb->ioda.pe_alloc_mutex);
> -		}
> -		pdn->num_vfs = num_vfs;
> -
> -		/* Assign M64 window accordingly */
> -		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
> -		if (ret) {
> -			dev_info(&pdev->dev, "Not enough M64 window resources\n");
> -			goto m64_failed;
> -		}
> -
> -		/*
> -		 * When using one M64 BAR to map one IOV BAR, we need to shift
> -		 * the IOV BAR according to the PE# allocated to the VFs.
> -		 * Otherwise, the PE# for the VF will conflict with others.
> -		 */
> -		if (!pdn->m64_single_mode) {
> -			ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
> -			if (ret)
> -				goto m64_failed;
> -		}
> -	}
> -
> -	/* Setup VF PEs */
> -	pnv_ioda_setup_vf_PE(pdev, num_vfs);
> -
> -	return 0;
> -
> -m64_failed:
> -	if (pdn->m64_single_mode) {
> -		for (i = 0; i < num_vfs; i++) {
> -			if (pdn->pe_num_map[i] == IODA_INVALID_PE)
> -				continue;
> -
> -			pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
> -			pnv_ioda_free_pe(pe);
> -		}
> -	} else
> -		bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
> -
> -	/* Releasing pe_num_map */
> -	kfree(pdn->pe_num_map);
> -
> -	return ret;
> -}
> -
> -static int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
> -{
> -	pnv_pci_sriov_disable(pdev);
> -
> -	/* Release PCI data */
> -	remove_sriov_vf_pdns(pdev);
> -	return 0;
> -}
> -
> -static int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> -{
> -	/* Allocate PCI data */
> -	add_sriov_vf_pdns(pdev);
> -
> -	return pnv_pci_sriov_enable(pdev, num_vfs);
> -}
> -#endif /* CONFIG_PCI_IOV */
> -
>  static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
>  				       struct pnv_ioda_pe *pe);
>  
> -static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> -				       struct pnv_ioda_pe *pe);
> -
>  static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
>  {
>  	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
> @@ -2559,8 +2057,8 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
>  };
>  #endif
>  
> -static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> -				       struct pnv_ioda_pe *pe)
> +void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> +				struct pnv_ioda_pe *pe)
>  {
>  	int64_t rc;
>  
> @@ -2737,117 +2235,6 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
>  		count, phb->msi_base);
>  }
>  
> -#ifdef CONFIG_PCI_IOV
> -static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> -{
> -	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
> -	const resource_size_t gate = phb->ioda.m64_segsize >> 2;
> -	struct resource *res;
> -	int i;
> -	resource_size_t size, total_vf_bar_sz;
> -	struct pci_dn *pdn;
> -	int mul, total_vfs;
> -
> -	pdn = pci_get_pdn(pdev);
> -	pdn->vfs_expanded = 0;
> -	pdn->m64_single_mode = false;
> -
> -	total_vfs = pci_sriov_get_totalvfs(pdev);
> -	mul = phb->ioda.total_pe_num;
> -	total_vf_bar_sz = 0;
> -
> -	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> -		res = &pdev->resource[i + PCI_IOV_RESOURCES];
> -		if (!res->flags || res->parent)
> -			continue;
> -		if (!pnv_pci_is_m64_flags(res->flags)) {
> -			dev_warn(&pdev->dev, "Don't support SR-IOV with"
> -					" non M64 VF BAR%d: %pR. \n",
> -				 i, res);
> -			goto truncate_iov;
> -		}
> -
> -		total_vf_bar_sz += pci_iov_resource_size(pdev,
> -				i + PCI_IOV_RESOURCES);
> -
> -		/*
> -		 * If bigger than quarter of M64 segment size, just round up
> -		 * power of two.
> -		 *
> -		 * Generally, one M64 BAR maps one IOV BAR. To avoid conflict
> -		 * with other devices, IOV BAR size is expanded to be
> -		 * (total_pe * VF_BAR_size).  When VF_BAR_size is half of M64
> -		 * segment size , the expanded size would equal to half of the
> -		 * whole M64 space size, which will exhaust the M64 Space and
> -		 * limit the system flexibility.  This is a design decision to
> -		 * set the boundary to quarter of the M64 segment size.
> -		 */
> -		if (total_vf_bar_sz > gate) {
> -			mul = roundup_pow_of_two(total_vfs);
> -			dev_info(&pdev->dev,
> -				"VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
> -				total_vf_bar_sz, gate, mul);
> -			pdn->m64_single_mode = true;
> -			break;
> -		}
> -	}
> -
> -	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> -		res = &pdev->resource[i + PCI_IOV_RESOURCES];
> -		if (!res->flags || res->parent)
> -			continue;
> -
> -		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
> -		/*
> -		 * On PHB3, the minimum size alignment of M64 BAR in single
> -		 * mode is 32MB.
> -		 */
> -		if (pdn->m64_single_mode && (size < SZ_32M))
> -			goto truncate_iov;
> -		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
> -		res->end = res->start + size * mul - 1;
> -		dev_dbg(&pdev->dev, "                       %pR\n", res);
> -		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
> -			 i, res, mul);
> -	}
> -	pdn->vfs_expanded = mul;
> -
> -	return;
> -
> -truncate_iov:
> -	/* To save MMIO space, IOV BAR is truncated. */
> -	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> -		res = &pdev->resource[i + PCI_IOV_RESOURCES];
> -		res->flags = 0;
> -		res->end = res->start - 1;
> -	}
> -}
> -
> -static void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev)
> -{
> -	if (WARN_ON(pci_dev_is_added(pdev)))
> -		return;
> -
> -	if (pdev->is_virtfn) {
> -		struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
> -
> -		/*
> -		 * VF PEs are single-device PEs so their pdev pointer needs to
> -		 * be set. The pdev doesn't exist when the PE is allocated (in
> -		 * (pcibios_sriov_enable()) so we fix it up here.
> -		 */
> -		pe->pdev = pdev;
> -		WARN_ON(!(pe->flags & PNV_IODA_PE_VF));
> -	} else if (pdev->is_physfn) {
> -		/*
> -		 * For PFs adjust their allocated IOV resources to match what
> -		 * the PHB can support using it's M64 BAR table.
> -		 */
> -		pnv_pci_ioda_fixup_iov_resources(pdev);
> -	}
> -}
> -#endif /* CONFIG_PCI_IOV */
> -
>  static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
>  				  struct resource *res)
>  {
> @@ -3192,41 +2579,6 @@ static resource_size_t pnv_pci_default_alignment(void)
>  	return PAGE_SIZE;
>  }
>  
> -#ifdef CONFIG_PCI_IOV
> -static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
> -						      int resno)
> -{
> -	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
> -	struct pci_dn *pdn = pci_get_pdn(pdev);
> -	resource_size_t align;
> -
> -	/*
> -	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
> -	 * SR-IOV. While from hardware perspective, the range mapped by M64
> -	 * BAR should be size aligned.
> -	 *
> -	 * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra
> -	 * powernv-specific hardware restriction is gone. But if just use the
> -	 * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with
> -	 * in one segment of M64 #15, which introduces the PE conflict between
> -	 * PF and VF. Based on this, the minimum alignment of an IOV BAR is
> -	 * m64_segsize.
> -	 *
> -	 * This function returns the total IOV BAR size if M64 BAR is in
> -	 * Shared PE mode or just VF BAR size if not.
> -	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
> -	 * M64 segment size if IOV BAR size is less.
> -	 */
> -	align = pci_iov_resource_size(pdev, resno);
> -	if (!pdn->vfs_expanded)
> -		return align;
> -	if (pdn->m64_single_mode)
> -		return max(align, (resource_size_t)phb->ioda.m64_segsize);
> -
> -	return pdn->vfs_expanded * align;
> -}
> -#endif /* CONFIG_PCI_IOV */
> -
>  /* Prevent enabling devices for which we couldn't properly
>   * assign a PE
>   */
> @@ -3323,7 +2675,7 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
>  	iommu_tce_table_put(tbl);
>  }
>  
> -static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
> +void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
>  {
>  	struct iommu_table *tbl = pe->table_group.tables[0];
>  	int64_t rc;
> @@ -3436,12 +2788,23 @@ static void pnv_pci_release_device(struct pci_dev *pdev)
>  	struct pci_dn *pdn = pci_get_pdn(pdev);
>  	struct pnv_ioda_pe *pe;
>  
> +	/* The VF PE state is torn down when sriov_disable() is called */
>  	if (pdev->is_virtfn)
>  		return;
>  
>  	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
>  		return;
>  
> +#ifdef CONFIG_PCI_IOV
> +	/*
> +	 * FIXME: Try move this to sriov_disable(). It's here since we allocate
> +	 * the iov state at probe time since we need to fiddle with the IOV
> +	 * resources.
> +	 */
> +	if (pdev->is_physfn)
> +		kfree(pdev->dev.archdata.iov_data);
> +#endif
> +
>  	/*
>  	 * PCI hotplug can happen as part of EEH error recovery. The @pdn
>  	 * isn't removed and added afterwards in this scenario. We should
> diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
> new file mode 100644
> index 000000000000..080ea39f5a83
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/pci-sriov.c
> @@ -0,0 +1,642 @@
> +// SPDX-License-Identifier: GPL-2.0

Shouldn't this be SPDX-License-Identifier: GPL-2.0-or-later?


> +
> +#include <linux/kernel.h>
> +#include <linux/ioport.h>
> +#include <linux/bitmap.h>
> +#include <linux/pci.h>
> +
> +#include <asm/opal.h>
> +
> +#include "pci.h"
> +
> +/* for pci_dev_is_added() */
> +#include "../../../../drivers/pci/pci.h"
> +
> +
> +static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> +{
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
> +	const resource_size_t gate = phb->ioda.m64_segsize >> 2;
> +	struct resource *res;
> +	int i;
> +	resource_size_t size, total_vf_bar_sz;
> +	struct pnv_iov_data *iov;
> +	int mul, total_vfs;
> +
> +	iov = kzalloc(sizeof(*iov), GFP_KERNEL);
> +	if (!iov)
> +		goto truncate_iov;
> +	pdev->dev.archdata.iov_data = iov;
> +
> +	total_vfs = pci_sriov_get_totalvfs(pdev);
> +	mul = phb->ioda.total_pe_num;
> +	total_vf_bar_sz = 0;
> +
> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> +		res = &pdev->resource[i + PCI_IOV_RESOURCES];
> +		if (!res->flags || res->parent)
> +			continue;
> +		if (!pnv_pci_is_m64_flags(res->flags)) {
> +			dev_warn(&pdev->dev, "Don't support SR-IOV with"
> +					" non M64 VF BAR%d: %pR. \n",
> +				 i, res);
> +			goto truncate_iov;
> +		}
> +
> +		total_vf_bar_sz += pci_iov_resource_size(pdev,
> +				i + PCI_IOV_RESOURCES);
> +
> +		/*
> +		 * If bigger than quarter of M64 segment size, just round up
> +		 * power of two.
> +		 *
> +		 * Generally, one M64 BAR maps one IOV BAR. To avoid conflict
> +		 * with other devices, IOV BAR size is expanded to be
> +		 * (total_pe * VF_BAR_size).  When VF_BAR_size is half of M64
> +		 * segment size , the expanded size would equal to half of the
> +		 * whole M64 space size, which will exhaust the M64 Space and
> +		 * limit the system flexibility.  This is a design decision to
> +		 * set the boundary to quarter of the M64 segment size.
> +		 */
> +		if (total_vf_bar_sz > gate) {
> +			mul = roundup_pow_of_two(total_vfs);
> +			dev_info(&pdev->dev,
> +				"VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
> +				total_vf_bar_sz, gate, mul);
> +			iov->m64_single_mode = true;
> +			break;
> +		}
> +	}
> +
> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> +		res = &pdev->resource[i + PCI_IOV_RESOURCES];
> +		if (!res->flags || res->parent)
> +			continue;
> +
> +		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
> +		/*
> +		 * On PHB3, the minimum size alignment of M64 BAR in single
> +		 * mode is 32MB.
> +		 */
> +		if (iov->m64_single_mode && (size < SZ_32M))
> +			goto truncate_iov;
> +		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
> +		res->end = res->start + size * mul - 1;
> +		dev_dbg(&pdev->dev, "                       %pR\n", res);
> +		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
> +			 i, res, mul);
> +	}
> +	iov->vfs_expanded = mul;
> +
> +	return;
> +
> +truncate_iov:
> +	/* To save MMIO space, IOV BAR is truncated. */
> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> +		res = &pdev->resource[i + PCI_IOV_RESOURCES];
> +		res->flags = 0;
> +		res->end = res->start - 1;
> +	}
> +
> +	pdev->dev.archdata.iov_data = NULL;
> +	kfree(iov);
> +}
> +
> +void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev)
> +{
> +	if (WARN_ON(pci_dev_is_added(pdev)))
> +		return;
> +
> +	if (pdev->is_virtfn) {
> +		struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
> +
> +		/*
> +		 * VF PEs are single-device PEs so their pdev pointer needs to
> +		 * be set. The pdev doesn't exist when the PE is allocated (in
> +		 * (pcibios_sriov_enable()) so we fix it up here.
> +		 */
> +		pe->pdev = pdev;
> +		WARN_ON(!(pe->flags & PNV_IODA_PE_VF));
> +	} else if (pdev->is_physfn) {
> +		/*
> +		 * For PFs adjust their allocated IOV resources to match what
> +		 * the PHB can support using it's M64 BAR table.
> +		 */
> +		pnv_pci_ioda_fixup_iov_resources(pdev);
> +	}
> +}
> +
> +resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
> +						      int resno)
> +{
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
> +	struct pnv_iov_data *iov = pnv_iov_get(pdev);
> +	resource_size_t align;
> +
> +	/*
> +	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
> +	 * SR-IOV. While from hardware perspective, the range mapped by M64
> +	 * BAR should be size aligned.
> +	 *
> +	 * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra
> +	 * powernv-specific hardware restriction is gone. But if just use the
> +	 * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with
> +	 * in one segment of M64 #15, which introduces the PE conflict between
> +	 * PF and VF. Based on this, the minimum alignment of an IOV BAR is
> +	 * m64_segsize.
> +	 *
> +	 * This function returns the total IOV BAR size if M64 BAR is in
> +	 * Shared PE mode or just VF BAR size if not.
> +	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
> +	 * M64 segment size if IOV BAR size is less.
> +	 */
> +	align = pci_iov_resource_size(pdev, resno);
> +
> +	/*
> +	 * iov can be null if we have an SR-IOV device with IOV BAR that can't
> +	 * be placed in the m64 space (i.e. The BAR is 32bit or non-prefetch).
> +	 * In that case we don't allow VFs to be enabled so just return the
> +	 * default alignment.
> +	 */
> +	if (!iov)
> +		return align;


This is a new chunk. What would have happened before this change? Would a
non-prefetchable BAR still have gone to the m64 space?

The rest is accurate.




-- 
Alexey

^ permalink raw reply

* Re: [PATCH 14/14] powerpc/eeh: Move PE tree setup into the platform
From: Alexey Kardashevskiy @ 2020-07-14  9:10 UTC (permalink / raw)
  To: Oliver O'Halloran; +Cc: linuxppc-dev, Mahesh J Salgaonkar
In-Reply-To: <CAOSf1CGRXox-fO7cJNA+0APjrQOUve01jUbjLmXWqjH=07cGxg@mail.gmail.com>



On 14/07/2020 13:08, Oliver O'Halloran wrote:
> On Tue, Jul 14, 2020 at 11:50 AM Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>
>> On 06/07/2020 11:36, Oliver O'Halloran wrote:
>>>  /**
>>>   * eeh_pe_tree_insert - Add EEH device to parent PE
>>>   * @edev: EEH device
>>> + * @new_pe_parent: PE to create additional PEs under
>>>   *
>>> - * Add EEH device to the parent PE. If the parent PE already
>>> - * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
>>> - * we have to create new PE to hold the EEH device and the new
>>> - * PE will be linked to its parent PE as well.
>>> + * Add EEH device to the PE in edev->pe_config_addr. If a PE already
>>> + * exists with that address then @edev is added to that PE. Otherwise
>>> + * a new PE is created and inserted into the PE tree as a child of
>>> + * @new_pe_parent.
>>> + *
>>> + * If @new_pe_parent is NULL then the new PE will be inserted
>>> + * directly under the PHB.
>>>   */
>>> -int eeh_pe_tree_insert(struct eeh_dev *edev)
>>> +int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent)
>>>  {
>>>       struct pci_controller *hose = edev->controller;
>>>       struct eeh_pe *pe, *parent;
>>
>>
>> We can ditch this "parent" in that single place now and use "pe"
>> instead. And name the new parameter simply "parent". Dunno if it
>> improves things though.
> 
> I did this at some point and then decided not to. It added a bunch of
> noise to the diff and calling the parameter "parent" ended up being
> pretty unreadable. The parameter is "the parent of the PE that will be
> created to contain edev", or "parent of the parent PE". It's pretty
> unwieldy.

OK, fine, but we still do not need both pe and parent in that function
(maybe one day...).


> 
>>> @@ -399,7 +366,7 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
>>>                       }
>>>
>>>                       eeh_edev_dbg(edev,
>>> -                                  "Added to device PE (parent: PE#%x)\n",
>>> +                                  "Added to existing PE (parent: PE#%x)\n",
>>>                                    pe->parent->addr);
>>>               } else {
>>>                       /* Mark the PE as type of PCI bus */
>>> @@ -431,10 +398,9 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
>>>        * to PHB directly. Otherwise, we have to associate the
>>>        * PE with its parent.
>>>        */
>>> -     parent = eeh_pe_get_parent(edev);
>>> -     if (!parent) {
>>> -             parent = eeh_phb_pe_get(hose);
>>> -             if (!parent) {
>>> +     if (!new_pe_parent) {
>>> +             new_pe_parent = eeh_phb_pe_get(hose);
>>> +             if (!new_pe_parent) {
>>
>>
>>
>> afaict only pseries can realistically pass new_pe_parent==NULL so this
>> chunk could go to pseries_eeh_pe_get_parent.
> 
> pnv_eeh_get_upstream_pe() will never return the PHB PE so
> new_pe_parent will be NULL for the first PE created under a PowerNV
> PHB. I guess we could move the PHB PE handling into the platform too,
> but I think that just results in having to special case PHB PEs in two
> places rather than one.
> 
>>> +static struct eeh_pe *pseries_eeh_pe_get_parent(struct eeh_dev *edev)
>>> +{
>>> +     struct eeh_dev *parent;
>>> +     struct pci_dn *pdn = eeh_dev_to_pdn(edev);
>>> +
>>> +     /*
>>> +      * It might have the case for the indirect parent
>>> +      * EEH device already having associated PE, but
>>> +      * the direct parent EEH device doesn't have yet.
>>> +      */
>>> +     if (edev->physfn)
>>> +             pdn = pci_get_pdn(edev->physfn);
>>> +     else
>>> +             pdn = pdn ? pdn->parent : NULL;
>>> +     while (pdn) {
>>> +             /* We're poking out of PCI territory */
>>
>>
>> We are traversing up PCI hierarchy here - pci_dn->parent, how is this
>> out of PCI territory? Or I understand "out of" incorrectly?
>>
>>
>>> +             parent = pdn_to_eeh_dev(pdn);
>>> +             if (!parent)
>>> +                     return NULL;
> 
> If there's no eeh dev then the node we're looking at is a PHB rather
> than an actual PCI device so it stops looking. I think. The comment
> was copied over from the existing code and I haven't spent a whole lot
> of time parsing its meaning.


I noticed the cut-n-paste. Maybe just ditch it if nobody can parse it?

> 
> 
> 
>>> @@ -301,6 +343,8 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
>>>       if (ret) {
>>>               eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret);
>>>       } else {
>>> +             struct eeh_pe *parent;
>>> +
>>>               /* Retrieve PE address */
>>>               edev->pe_config_addr = pseries_eeh_get_pe_addr(pdn);
>>>               pe.addr = edev->pe_config_addr;
>>> @@ -313,16 +357,23 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
>>>               if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT)
>>>                       enable = 1;
>>>
>>> -             if (enable) {
>>> +             /* This device doesn't support EEH, but it may have an
>>> +              * EEH parent, in which case we mark it as supported.
>>> +              */
>>> +             parent = pseries_eeh_pe_get_parent(edev);
>>> +             if (parent && !enable)
>>> +                     edev->pe_config_addr = parent->addr;
>>
>>
>> What if pseries_eeh_pe_get_parent() returned NULL - we won't write
>> edev->pe_config_addr so it remains 0 which is fine just by accident? :)
> 
> edev->pe_config_addr is set above when we call
> pseries_eeh_get_pe_addr(). The check there is mainly to cover the case
> where pseries_eeh_get_pe_addr() fails because the device is on a
> subordinate bus rather than the root bus of the PE. PAPR says the
> get-pe-addr-info RTAS call can fail in that situation and that you're
> supposed to traverse up the DT to find the pe_config_addr, which is
> what pe_get_parent() does. Yeah it's confusing, but that's what it
> does today too.
> 
>> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>
>>> +
>>> +             if (enable || parent) {
>>>                       eeh_add_flag(EEH_ENABLED);
>>> -                     eeh_pe_tree_insert(edev);
>>> +                     eeh_pe_tree_insert(edev, parent);
> 
>>>               } else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) &&
>>>                          (pdn_to_eeh_dev(pdn->parent))->pe) {
>>>                       /* This device doesn't support EEH, but it may have an
>>>                        * EEH parent, in which case we mark it as supported.
>>>                        */
>>>                       edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr;
>>> -                     eeh_pe_tree_insert(edev);
>>> +                     eeh_pe_tree_insert(edev, parent);
> 
> I think I was supposed to delete this hunk and then forgot to since it
> handles the same case mentioned above.

A-ha!


> 
>>>               }
>>>               eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n",
>>>                            (enable ? "enabled" : "unsupported"), ret);
>>>
>>
>> --
>> Alexey

-- 
Alexey

^ permalink raw reply

* [PATCH 3/3] ASoC: fsl-asoc-card: Support Headphone and Microphone Jack detection
From: Shengjiu Wang @ 2020-07-14  9:05 UTC (permalink / raw)
  To: perex, tiwai, lgirdwood, broonie, kuninori.morimoto.gx, katsuhiro,
	samuel, alsa-devel, robh+dt, devicetree, timur, nicoleotsuka,
	Xiubo.Lee, festevam
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1594717536-5188-1-git-send-email-shengjiu.wang@nxp.com>

Use the asoc_simple_init_jack function from simple card utils to implement
Headphone and Microphone detection.
Register a notifier to disable the Speaker when a Headphone is plugged in
and enable the Speaker when the Headphone is unplugged.
Register a notifier to disable the Digital Microphone when an Analog Microphone
is plugged in and enable the DMIC when the Analog Microphone is unplugged.

Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
---
 sound/soc/fsl/Kconfig         |  1 +
 sound/soc/fsl/fsl-asoc-card.c | 69 ++++++++++++++++++++++++++++++++++-
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/sound/soc/fsl/Kconfig b/sound/soc/fsl/Kconfig
index ea7b4787a8af..1c4ca5ec8caf 100644
--- a/sound/soc/fsl/Kconfig
+++ b/sound/soc/fsl/Kconfig
@@ -315,6 +315,7 @@ config SND_SOC_FSL_ASOC_CARD
 	depends on OF && I2C
 	# enforce SND_SOC_FSL_ASOC_CARD=m if SND_AC97_CODEC=m:
 	depends on SND_AC97_CODEC || SND_AC97_CODEC=n
+	select SND_SIMPLE_CARD_UTILS
 	select SND_SOC_IMX_AUDMUX
 	select SND_SOC_IMX_PCM_DMA
 	select SND_SOC_FSL_ESAI
diff --git a/sound/soc/fsl/fsl-asoc-card.c b/sound/soc/fsl/fsl-asoc-card.c
index faac6ce9a82c..313058789ea9 100644
--- a/sound/soc/fsl/fsl-asoc-card.c
+++ b/sound/soc/fsl/fsl-asoc-card.c
@@ -15,6 +15,8 @@
 #endif
 #include <sound/pcm_params.h>
 #include <sound/soc.h>
+#include <sound/jack.h>
+#include <sound/simple_card_utils.h>
 
 #include "fsl_esai.h"
 #include "fsl_sai.h"
@@ -65,6 +67,8 @@ struct cpu_priv {
 /**
  * struct fsl_asoc_card_priv - Freescale Generic ASOC card private data
  * @dai_link: DAI link structure including normal one and DPCM link
+ * @hp_jack: Headphone Jack structure
+ * @mic_jack: Microphone Jack structure
  * @pdev: platform device pointer
  * @codec_priv: CODEC private data
  * @cpu_priv: CPU private data
@@ -79,6 +83,8 @@ struct cpu_priv {
 
 struct fsl_asoc_card_priv {
 	struct snd_soc_dai_link dai_link[3];
+	struct asoc_simple_jack hp_jack;
+	struct asoc_simple_jack mic_jack;
 	struct platform_device *pdev;
 	struct codec_priv codec_priv;
 	struct cpu_priv cpu_priv;
@@ -445,6 +451,44 @@ static int fsl_asoc_card_audmux_init(struct device_node *np,
 	return 0;
 }
 
+static int hp_jack_event(struct notifier_block *nb, unsigned long event,
+			 void *data)
+{
+	struct snd_soc_jack *jack = (struct snd_soc_jack *)data;
+	struct snd_soc_dapm_context *dapm = &jack->card->dapm;
+
+	if (event & SND_JACK_HEADPHONE)
+		/* Disable speaker if headphone is plugged in */
+		snd_soc_dapm_disable_pin(dapm, "Ext Spk");
+	else
+		snd_soc_dapm_enable_pin(dapm, "Ext Spk");
+
+	return 0;
+}
+
+static struct notifier_block hp_jack_nb = {
+	.notifier_call = hp_jack_event,
+};
+
+static int mic_jack_event(struct notifier_block *nb, unsigned long event,
+			  void *data)
+{
+	struct snd_soc_jack *jack = (struct snd_soc_jack *)data;
+	struct snd_soc_dapm_context *dapm = &jack->card->dapm;
+
+	if (event & SND_JACK_MICROPHONE)
+		/* Disable dmic if microphone is plugged in */
+		snd_soc_dapm_disable_pin(dapm, "DMIC");
+	else
+		snd_soc_dapm_enable_pin(dapm, "DMIC");
+
+	return 0;
+}
+
+static struct notifier_block mic_jack_nb = {
+	.notifier_call = mic_jack_event,
+};
+
 static int fsl_asoc_card_late_probe(struct snd_soc_card *card)
 {
 	struct fsl_asoc_card_priv *priv = snd_soc_card_get_drvdata(card);
@@ -745,8 +789,29 @@ static int fsl_asoc_card_probe(struct platform_device *pdev)
 	snd_soc_card_set_drvdata(&priv->card, priv);
 
 	ret = devm_snd_soc_register_card(&pdev->dev, &priv->card);
-	if (ret && ret != -EPROBE_DEFER)
-		dev_err(&pdev->dev, "snd_soc_register_card failed (%d)\n", ret);
+	if (ret) {
+		if (ret != -EPROBE_DEFER)
+			dev_err(&pdev->dev, "snd_soc_register_card failed (%d)\n", ret);
+		goto asrc_fail;
+	}
+
+	if (of_property_read_bool(np, "hp-det-gpio")) {
+		ret = asoc_simple_init_jack(&priv->card, &priv->hp_jack,
+					    1, NULL, "Headphone Jack");
+		if (ret)
+			goto asrc_fail;
+
+		snd_soc_jack_notifier_register(&priv->hp_jack.jack, &hp_jack_nb);
+	}
+
+	if (of_property_read_bool(np, "mic-det-gpio")) {
+		ret = asoc_simple_init_jack(&priv->card, &priv->mic_jack,
+					    0, NULL, "Mic Jack");
+		if (ret)
+			goto asrc_fail;
+
+		snd_soc_jack_notifier_register(&priv->mic_jack.jack, &mic_jack_nb);
+	}
 
 asrc_fail:
 	of_node_put(asrc_np);
-- 
2.27.0


^ permalink raw reply related

* [PATCH 2/3] ASoC: bindings: fsl-asoc-card: Support hp-det-gpio and mic-det-gpio
From: Shengjiu Wang @ 2020-07-14  9:05 UTC (permalink / raw)
  To: perex, tiwai, lgirdwood, broonie, kuninori.morimoto.gx, katsuhiro,
	samuel, alsa-devel, robh+dt, devicetree, timur, nicoleotsuka,
	Xiubo.Lee, festevam
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1594717536-5188-1-git-send-email-shengjiu.wang@nxp.com>

Add headphone and microphone detection GPIO support.

Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
---
 Documentation/devicetree/bindings/sound/fsl-asoc-card.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/devicetree/bindings/sound/fsl-asoc-card.txt b/Documentation/devicetree/bindings/sound/fsl-asoc-card.txt
index 133d7e14a4d0..8a6a3d0fda5e 100644
--- a/Documentation/devicetree/bindings/sound/fsl-asoc-card.txt
+++ b/Documentation/devicetree/bindings/sound/fsl-asoc-card.txt
@@ -69,6 +69,9 @@ Optional properties:
 			        coexisting in order to support the old bindings
 				of wm8962 and sgtl5000.
 
+  - hp-det-gpio		: The GPIO that detects whether headphones are plugged in
+  - mic-det-gpio	: The GPIO that detects whether microphones are plugged in
+
 Optional unless SSI is selected as a CPU DAI:
 
   - mux-int-port	: The internal port of the i.MX audio muxer (AUDMUX)
-- 
2.27.0


^ permalink raw reply related

* [PATCH 1/3] ASoC: simple-card-utils: Support configure pin_name for asoc_simple_init_jack
From: Shengjiu Wang @ 2020-07-14  9:05 UTC (permalink / raw)
  To: perex, tiwai, lgirdwood, broonie, kuninori.morimoto.gx, katsuhiro,
	samuel, alsa-devel, robh+dt, devicetree, timur, nicoleotsuka,
	Xiubo.Lee, festevam
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1594717536-5188-1-git-send-email-shengjiu.wang@nxp.com>

Currently the pin_name is fixed in asoc_simple_init_jack, but some drivers
may use a different pin_name. So add a new parameter to
asoc_simple_init_jack for configuring pin_name.

If this parameter is NULL, then the default pin_name is used.

Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
---
 include/sound/simple_card_utils.h     | 6 +++---
 sound/soc/generic/simple-card-utils.c | 7 ++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/sound/simple_card_utils.h b/include/sound/simple_card_utils.h
index bbdd1542d6f1..86a1e956991e 100644
--- a/include/sound/simple_card_utils.h
+++ b/include/sound/simple_card_utils.h
@@ -12,9 +12,9 @@
 #include <sound/soc.h>
 
 #define asoc_simple_init_hp(card, sjack, prefix) \
-	asoc_simple_init_jack(card, sjack, 1, prefix)
+	asoc_simple_init_jack(card, sjack, 1, prefix, NULL)
 #define asoc_simple_init_mic(card, sjack, prefix) \
-	asoc_simple_init_jack(card, sjack, 0, prefix)
+	asoc_simple_init_jack(card, sjack, 0, prefix, NULL)
 
 struct asoc_simple_dai {
 	const char *name;
@@ -131,7 +131,7 @@ int asoc_simple_parse_pin_switches(struct snd_soc_card *card,
 
 int asoc_simple_init_jack(struct snd_soc_card *card,
 			       struct asoc_simple_jack *sjack,
-			       int is_hp, char *prefix);
+			       int is_hp, char *prefix, char *pin);
 int asoc_simple_init_priv(struct asoc_simple_priv *priv,
 			       struct link_info *li);
 
diff --git a/sound/soc/generic/simple-card-utils.c b/sound/soc/generic/simple-card-utils.c
index 8c54dc6710fe..b408cb5ed644 100644
--- a/sound/soc/generic/simple-card-utils.c
+++ b/sound/soc/generic/simple-card-utils.c
@@ -540,7 +540,8 @@ EXPORT_SYMBOL_GPL(asoc_simple_parse_pin_switches);
 
 int asoc_simple_init_jack(struct snd_soc_card *card,
 			  struct asoc_simple_jack *sjack,
-			  int is_hp, char *prefix)
+			  int is_hp, char *prefix,
+			  char *pin)
 {
 	struct device *dev = card->dev;
 	enum of_gpio_flags flags;
@@ -557,12 +558,12 @@ int asoc_simple_init_jack(struct snd_soc_card *card,
 
 	if (is_hp) {
 		snprintf(prop, sizeof(prop), "%shp-det-gpio", prefix);
-		pin_name	= "Headphones";
+		pin_name	= pin ? pin : "Headphones";
 		gpio_name	= "Headphone detection";
 		mask		= SND_JACK_HEADPHONE;
 	} else {
 		snprintf(prop, sizeof(prop), "%smic-det-gpio", prefix);
-		pin_name	= "Mic Jack";
+		pin_name	= pin ? pin : "Mic Jack";
 		gpio_name	= "Mic detection";
 		mask		= SND_JACK_MICROPHONE;
 	}
-- 
2.27.0


^ permalink raw reply related

* [PATCH 0/3] ASoC: fsl-asoc-card: Support hp and mic detection
From: Shengjiu Wang @ 2020-07-14  9:05 UTC (permalink / raw)
  To: perex, tiwai, lgirdwood, broonie, kuninori.morimoto.gx, katsuhiro,
	samuel, alsa-devel, robh+dt, devicetree, timur, nicoleotsuka,
	Xiubo.Lee, festevam
  Cc: linuxppc-dev, linux-kernel

Support hp and mic detection.
Add a parameter for asoc_simple_init_jack.

Shengjiu Wang (3):
  ASoC: simple-card-utils: Support configure pin_name for
    asoc_simple_init_jack
  ASoC: bindings: fsl-asoc-card: Support hp-det-gpio and mic-det-gpio
  ASoC: fsl-asoc-card: Support Headphone and Microphone Jack detection

 .../bindings/sound/fsl-asoc-card.txt          |  3 +
 include/sound/simple_card_utils.h             |  6 +-
 sound/soc/fsl/Kconfig                         |  1 +
 sound/soc/fsl/fsl-asoc-card.c                 | 69 ++++++++++++++++++-
 sound/soc/generic/simple-card-utils.c         |  7 +-
 5 files changed, 78 insertions(+), 8 deletions(-)

-- 
2.27.0


^ permalink raw reply

* Re: [PATCH 04/15] powerpc/powernv/pci: Initialise M64 for IODA1 as a 1-1 window
From: Alexey Kardashevskiy @ 2020-07-14  7:39 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200710052340.737567-5-oohall@gmail.com>



On 10/07/2020 15:23, Oliver O'Halloran wrote:
> We pre-configure the m64 window for IODA1 as a 1-1 segment-PE mapping,
> similar to PHB3. Currently the actual mapping of segments occurs in
> pnv_ioda_pick_m64_pe(), but we can move it into pnv_ioda1_init_m64() and
> drop the IODA1 specific code paths in the PE setup / teardown.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>



Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>


> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 55 +++++++++++------------
>  1 file changed, 25 insertions(+), 30 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index bb9c1cc60c33..8fb17676d914 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -311,6 +311,28 @@ static int pnv_ioda1_init_m64(struct pnv_phb *phb)
>  		}
>  	}
>  
> +	for (index = 0; index < phb->ioda.total_pe_num; index++) {
> +		int64_t rc;
> +
> +		/*
> +		 * P7IOC supports M64DT, which helps mapping M64 segment
> +		 * to one particular PE#. However, PHB3 has fixed mapping
> +		 * between M64 segment and PE#. In order to have same logic
> +		 * for P7IOC and PHB3, we enforce fixed mapping between M64
> +		 * segment and PE# on P7IOC.
> +		 */
> +		rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> +				index, OPAL_M64_WINDOW_TYPE,
> +				index / PNV_IODA1_M64_SEGS,
> +				index % PNV_IODA1_M64_SEGS);
> +		if (rc != OPAL_SUCCESS) {
> +			pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
> +				__func__, rc, phb->hose->global_number,
> +				index);
> +			goto fail;
> +		}
> +	}
> +
>  	/*
>  	 * Exclude the segments for reserved and root bus PE, which
>  	 * are first or last two PEs.
> @@ -402,26 +424,6 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
>  			pe->master = master_pe;
>  			list_add_tail(&pe->list, &master_pe->slaves);
>  		}
> -
> -		/*
> -		 * P7IOC supports M64DT, which helps mapping M64 segment
> -		 * to one particular PE#. However, PHB3 has fixed mapping
> -		 * between M64 segment and PE#. In order to have same logic
> -		 * for P7IOC and PHB3, we enforce fixed mapping between M64
> -		 * segment and PE# on P7IOC.
> -		 */
> -		if (phb->type == PNV_PHB_IODA1) {
> -			int64_t rc;
> -
> -			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> -					pe->pe_number, OPAL_M64_WINDOW_TYPE,
> -					pe->pe_number / PNV_IODA1_M64_SEGS,
> -					pe->pe_number % PNV_IODA1_M64_SEGS);
> -			if (rc != OPAL_SUCCESS)
> -				pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
> -					__func__, rc, phb->hose->global_number,
> -					pe->pe_number);
> -		}
>  	}
>  
>  	kfree(pe_alloc);
> @@ -3354,14 +3356,8 @@ static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
>  		if (map[idx] != pe->pe_number)
>  			continue;
>  
> -		if (win == OPAL_M64_WINDOW_TYPE)
> -			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> -					phb->ioda.reserved_pe_idx, win,
> -					idx / PNV_IODA1_M64_SEGS,
> -					idx % PNV_IODA1_M64_SEGS);
> -		else
> -			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> -					phb->ioda.reserved_pe_idx, win, 0, idx);
> +		rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> +				phb->ioda.reserved_pe_idx, win, 0, idx);
>  
>  		if (rc != OPAL_SUCCESS)
>  			pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n",
> @@ -3380,8 +3376,7 @@ static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
>  				     phb->ioda.io_segmap);
>  		pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
>  				     phb->ioda.m32_segmap);
> -		pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE,
> -				     phb->ioda.m64_segmap);
> +		/* M64 is pre-configured by pnv_ioda1_init_m64() */
>  	} else if (phb->type == PNV_PHB_IODA2) {
>  		pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
>  				     phb->ioda.m32_segmap);
> 

-- 
Alexey

^ permalink raw reply

* [PATCH] spi: ppc4xx: Convert to use GPIO descriptors
From: Linus Walleij @ 2020-07-14  7:22 UTC (permalink / raw)
  To: Mark Brown, linux-spi; +Cc: Linus Walleij, linuxppc-dev

This converts the PPC4xx SPI driver to use GPIO descriptors.

The driver is already just picking some GPIOs from the device
tree so the conversion is pretty straightforward. However
this driver is looking for a pure "gpios" property rather
than the standard binding "cs-gpios" so we need to add a new
exception to the gpiolib OF parser to allow this for this
driver's compatibles.

Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib-of.c |  10 ++--
 drivers/spi/spi-ppc4xx.c  | 106 ++++----------------------------------
 2 files changed, 17 insertions(+), 99 deletions(-)

diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
index 219eb0054233..e3e88510aec7 100644
--- a/drivers/gpio/gpiolib-of.c
+++ b/drivers/gpio/gpiolib-of.c
@@ -26,7 +26,7 @@
 /**
  * of_gpio_spi_cs_get_count() - special GPIO counting for SPI
  * Some elder GPIO controllers need special quirks. Currently we handle
- * the Freescale GPIO controller with bindings that doesn't use the
+ * the Freescale and PPC GPIO controller with bindings that doesn't use the
  * established "cs-gpios" for chip selects but instead rely on
  * "gpios" for the chip select lines. If we detect this, we redirect
  * the counting of "cs-gpios" to count "gpios" transparent to the
@@ -41,7 +41,8 @@ static int of_gpio_spi_cs_get_count(struct device *dev, const char *con_id)
 	if (!con_id || strcmp(con_id, "cs"))
 		return 0;
 	if (!of_device_is_compatible(np, "fsl,spi") &&
-	    !of_device_is_compatible(np, "aeroflexgaisler,spictrl"))
+	    !of_device_is_compatible(np, "aeroflexgaisler,spictrl") &&
+	    !of_device_is_compatible(np, "ibm,ppc4xx-spi"))
 		return 0;
 	return of_gpio_named_count(np, "gpios");
 }
@@ -405,9 +406,10 @@ static struct gpio_desc *of_find_spi_cs_gpio(struct device *dev,
 	if (!IS_ENABLED(CONFIG_SPI_MASTER))
 		return ERR_PTR(-ENOENT);
 
-	/* Allow this specifically for Freescale devices */
+	/* Allow this specifically for Freescale and PPC devices */
 	if (!of_device_is_compatible(np, "fsl,spi") &&
-	    !of_device_is_compatible(np, "aeroflexgaisler,spictrl"))
+	    !of_device_is_compatible(np, "aeroflexgaisler,spictrl") &&
+	    !of_device_is_compatible(np, "ibm,ppc4xx-spi"))
 		return ERR_PTR(-ENOENT);
 	/* Allow only if asking for "cs-gpios" */
 	if (!con_id || strcmp(con_id, "cs"))
diff --git a/drivers/spi/spi-ppc4xx.c b/drivers/spi/spi-ppc4xx.c
index 0ea2d9a369d9..d8ee363fb714 100644
--- a/drivers/spi/spi-ppc4xx.c
+++ b/drivers/spi/spi-ppc4xx.c
@@ -28,11 +28,9 @@
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
-#include <linux/of_gpio.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 
-#include <linux/gpio.h>
 #include <linux/spi/spi.h>
 #include <linux/spi/spi_bitbang.h>
 
@@ -127,8 +125,6 @@ struct ppc4xx_spi {
 	const unsigned char *tx;
 	unsigned char *rx;
 
-	int *gpios;
-
 	struct spi_ppc4xx_regs __iomem *regs; /* pointer to the registers */
 	struct spi_master *master;
 	struct device *dev;
@@ -260,27 +256,6 @@ static int spi_ppc4xx_setup(struct spi_device *spi)
 	return 0;
 }
 
-static void spi_ppc4xx_chipsel(struct spi_device *spi, int value)
-{
-	struct ppc4xx_spi *hw = spi_master_get_devdata(spi->master);
-	unsigned int cs = spi->chip_select;
-	unsigned int cspol;
-
-	/*
-	 * If there are no chip selects at all, or if this is the special
-	 * case of a non-existent (dummy) chip select, do nothing.
-	 */
-
-	if (!hw->master->num_chipselect || hw->gpios[cs] == -EEXIST)
-		return;
-
-	cspol = spi->mode & SPI_CS_HIGH ? 1 : 0;
-	if (value == BITBANG_CS_INACTIVE)
-		cspol = !cspol;
-
-	gpio_set_value(hw->gpios[cs], cspol);
-}
-
 static irqreturn_t spi_ppc4xx_int(int irq, void *dev_id)
 {
 	struct ppc4xx_spi *hw;
@@ -359,19 +334,6 @@ static void spi_ppc4xx_enable(struct ppc4xx_spi *hw)
 	dcri_clrset(SDR0, SDR0_PFC1, 0x80000000 >> 14, 0);
 }
 
-static void free_gpios(struct ppc4xx_spi *hw)
-{
-	if (hw->master->num_chipselect) {
-		int i;
-		for (i = 0; i < hw->master->num_chipselect; i++)
-			if (gpio_is_valid(hw->gpios[i]))
-				gpio_free(hw->gpios[i]);
-
-		kfree(hw->gpios);
-		hw->gpios = NULL;
-	}
-}
-
 /*
  * platform_device layer stuff...
  */
@@ -385,7 +347,6 @@ static int spi_ppc4xx_of_probe(struct platform_device *op)
 	struct device *dev = &op->dev;
 	struct device_node *opbnp;
 	int ret;
-	int num_gpios;
 	const unsigned int *clk;
 
 	master = spi_alloc_master(dev, sizeof *hw);
@@ -399,74 +360,32 @@ static int spi_ppc4xx_of_probe(struct platform_device *op)
 
 	init_completion(&hw->done);
 
-	/*
-	 * A count of zero implies a single SPI device without any chip-select.
-	 * Note that of_gpio_count counts all gpios assigned to this spi master.
-	 * This includes both "null" gpio's and real ones.
-	 */
-	num_gpios = of_gpio_count(np);
-	if (num_gpios > 0) {
-		int i;
-
-		hw->gpios = kcalloc(num_gpios, sizeof(*hw->gpios), GFP_KERNEL);
-		if (!hw->gpios) {
-			ret = -ENOMEM;
-			goto free_master;
-		}
-
-		for (i = 0; i < num_gpios; i++) {
-			int gpio;
-			enum of_gpio_flags flags;
-
-			gpio = of_get_gpio_flags(np, i, &flags);
-			hw->gpios[i] = gpio;
-
-			if (gpio_is_valid(gpio)) {
-				/* Real CS - set the initial state. */
-				ret = gpio_request(gpio, np->name);
-				if (ret < 0) {
-					dev_err(dev,
-						"can't request gpio #%d: %d\n",
-						i, ret);
-					goto free_gpios;
-				}
-
-				gpio_direction_output(gpio,
-						!!(flags & OF_GPIO_ACTIVE_LOW));
-			} else if (gpio == -EEXIST) {
-				; /* No CS, but that's OK. */
-			} else {
-				dev_err(dev, "invalid gpio #%d: %d\n", i, gpio);
-				ret = -EINVAL;
-				goto free_gpios;
-			}
-		}
-	}
-
 	/* Setup the state for the bitbang driver */
 	bbp = &hw->bitbang;
 	bbp->master = hw->master;
 	bbp->setup_transfer = spi_ppc4xx_setupxfer;
-	bbp->chipselect = spi_ppc4xx_chipsel;
 	bbp->txrx_bufs = spi_ppc4xx_txrx;
 	bbp->use_dma = 0;
 	bbp->master->setup = spi_ppc4xx_setup;
 	bbp->master->cleanup = spi_ppc4xx_cleanup;
 	bbp->master->bits_per_word_mask = SPI_BPW_MASK(8);
+	bbp->master->use_gpio_descriptors = true;
+	/*
+	 * The SPI core will count the number of GPIO descriptors to figure
+	 * out the number of chip selects available on the platform.
+	 */
+	bbp->master->num_chipselect = 0;
 
 	/* the spi->mode bits understood by this driver: */
 	bbp->master->mode_bits =
 		SPI_CPHA | SPI_CPOL | SPI_CS_HIGH | SPI_LSB_FIRST;
 
-	/* this many pins in all GPIO controllers */
-	bbp->master->num_chipselect = num_gpios > 0 ? num_gpios : 0;
-
 	/* Get the clock for the OPB */
 	opbnp = of_find_compatible_node(NULL, NULL, "ibm,opb");
 	if (opbnp == NULL) {
 		dev_err(dev, "OPB: cannot find node\n");
 		ret = -ENODEV;
-		goto free_gpios;
+		goto free_master;
 	}
 	/* Get the clock (Hz) for the OPB */
 	clk = of_get_property(opbnp, "clock-frequency", NULL);
@@ -474,7 +393,7 @@ static int spi_ppc4xx_of_probe(struct platform_device *op)
 		dev_err(dev, "OPB: no clock-frequency property set\n");
 		of_node_put(opbnp);
 		ret = -ENODEV;
-		goto free_gpios;
+		goto free_master;
 	}
 	hw->opb_freq = *clk;
 	hw->opb_freq >>= 2;
@@ -483,7 +402,7 @@ static int spi_ppc4xx_of_probe(struct platform_device *op)
 	ret = of_address_to_resource(np, 0, &resource);
 	if (ret) {
 		dev_err(dev, "error while parsing device node resource\n");
-		goto free_gpios;
+		goto free_master;
 	}
 	hw->mapbase = resource.start;
 	hw->mapsize = resource_size(&resource);
@@ -492,7 +411,7 @@ static int spi_ppc4xx_of_probe(struct platform_device *op)
 	if (hw->mapsize < sizeof(struct spi_ppc4xx_regs)) {
 		dev_err(dev, "too small to map registers\n");
 		ret = -EINVAL;
-		goto free_gpios;
+		goto free_master;
 	}
 
 	/* Request IRQ */
@@ -501,7 +420,7 @@ static int spi_ppc4xx_of_probe(struct platform_device *op)
 			  0, "spi_ppc4xx_of", (void *)hw);
 	if (ret) {
 		dev_err(dev, "unable to allocate interrupt\n");
-		goto free_gpios;
+		goto free_master;
 	}
 
 	if (!request_mem_region(hw->mapbase, hw->mapsize, DRIVER_NAME)) {
@@ -538,8 +457,6 @@ static int spi_ppc4xx_of_probe(struct platform_device *op)
 	release_mem_region(hw->mapbase, hw->mapsize);
 request_mem_error:
 	free_irq(hw->irqnum, hw);
-free_gpios:
-	free_gpios(hw);
 free_master:
 	spi_master_put(master);
 
@@ -556,7 +473,6 @@ static int spi_ppc4xx_of_remove(struct platform_device *op)
 	release_mem_region(hw->mapbase, hw->mapsize);
 	free_irq(hw->irqnum, hw);
 	iounmap(hw->regs);
-	free_gpios(hw);
 	spi_master_put(master);
 	return 0;
 }
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH 03/15] powerpc/powernv/pci: Add explicit tracking of the DMA setup state
From: Alexey Kardashevskiy @ 2020-07-14  7:21 UTC (permalink / raw)
  To: Oliver O'Halloran; +Cc: linuxppc-dev
In-Reply-To: <CAOSf1CESRPypebf6+rnkZkNmi6+xL4+QP1xgAS1szGsZDBcs8A@mail.gmail.com>



On 14/07/2020 15:58, Oliver O'Halloran wrote:
> On Tue, Jul 14, 2020 at 3:37 PM Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>
>> On 10/07/2020 15:23, Oliver O'Halloran wrote:
>>> There's an optimisation in the PE setup which skips performing DMA
>>> setup for a PE if we only have bridges in a PE. The assumption being
>>> that only "real" devices will DMA to system memory, which is probably
>>> fair. However, if we start off with only bridge devices in a PE then
>>> add a non-bridge device the new device won't be able to use DMA  because
>>> we never configured it.
>>>
>>> Fix this (admittedly pretty weird) edge case by tracking whether we've done
>>> the DMA setup for the PE or not. If a non-bridge device is added to the PE
>>> (via rescan or hotplug, or whatever) we can set up DMA on demand.
>>
>> So hotplug does not work on powernv then, right? I thought you tested it
>> a while ago, or this patch is the result of that attempt? If it is, then
> 
> It mostly works. Just the really niche case of hot plugging a bridge,
> then later on hot plugging a device into the same bus which wouldn't
> work.

Don't you have to have a slot (which is a bridge) for hotplug in the
first place, to hotplug the bridge?

> 
>> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>
>>
>>> This also means the only remaining user of the old "DMA Weight" code is
>>> the IODA1 DMA setup code that it was originally added for, which is good.
>>
>>
>> Is ditching IODA1 in the plan? :)
> 
> That or separating out the pci_controller_ops for IODA1 and IODA2 so
> we can stop any IODA2 specific changes from breaking it.

Is IODA1 tested at all these days? Or, is anyone running upstream
kernels anywhere and keeps shouting when it does not work on IODA1? Thanks,



> For the most
> part keeping around IODA1 support isn't hurting anyone, but I wanted
> to re-work how the BDFN->PE assignment works so that we'd delay
> assigning a BDFN to a PE until the device is probed. Right now when
> we're configuring the PE for a bus we map all 255 devfn's to that PE.
> This is mostly fine, but if you do a bus rescan and there's no device
> present we'll get a spurious EEH on that PE since the PHB sees that
> there's no device responding to the CFG cycle. We stop the spurious
> EEH freeze today by only allowing config cycles if we can find a
> pci_dn for that bdfn, but I want to get rid of pci_dn.
> 
> Mapping each BDFN to a PE after the device is probed is easy enough to
> do on PHB3 and above since the mapping is handled by an in-memory
> table which is indexed by the BDFN. Earlier PHBs (i.e. IODA1) use a
> table of base & mask values which match on the BDFN, so assigning a
> whole bus at once is easy, but adding individual BDFNs is hard. It's
> still possible to do in the HW, but the way the OPAL API works makes
> it impossible.
> 
>>>
>>> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
>>> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
>>> ---
>>> Alexey, do we need to have the IOMMU API stuff set/clear this flag?
>>
>>
>> I'd say no as that API only cares if a device is in a PE and for those
>> the PE DMA setup  optimization is skipped. Thanks,
> 
> Ok cool.
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH 4/5] dma-mapping: add a dma_ops_bypass flag to struct device
From: Alexey Kardashevskiy @ 2020-07-14  7:12 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Björn Töpel, Daniel Borkmann, Greg Kroah-Hartman,
	Joerg Roedel, linuxppc-dev, linux-kernel, iommu,
	Jesper Dangaard Brouer, Robin Murphy, Lu Baolu
In-Reply-To: <20200714070757.GA776@lst.de>



On 14/07/2020 17:07, Christoph Hellwig wrote:
> On Mon, Jul 13, 2020 at 02:59:39PM +1000, Alexey Kardashevskiy wrote:
>>
>>
>> On 09/07/2020 01:24, Christoph Hellwig wrote:
>>> Several IOMMU drivers have a bypass mode where they can use a direct
>>> mapping if the devices DMA mask is large enough.  Add generic support
>>> to the core dma-mapping code to do that to switch those drivers to
>>> a common solution.
>>>
>>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>>> ---
>>>  include/linux/device.h |  8 +++++
>>>  kernel/dma/Kconfig     |  8 +++++
>>>  kernel/dma/mapping.c   | 74 +++++++++++++++++++++++++++++-------------
>>>  3 files changed, 68 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/include/linux/device.h b/include/linux/device.h
>>> index 4c4af98321ebd6..1f71acf37f78d7 100644
>>> --- a/include/linux/device.h
>>> +++ b/include/linux/device.h
>>> @@ -523,6 +523,11 @@ struct dev_links_info {
>>>   *		  sync_state() callback.
>>>   * @dma_coherent: this particular device is dma coherent, even if the
>>>   *		architecture supports non-coherent devices.
>>> + * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
>>> + *		streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
>>> + *		and optionall (if the coherent mask is large enough) also
>>
>>
>> s/optionall/optional/g
>>
>> Otherwise the series looks good and works well on powernv and pseries.
>> Thanks,
> 
> Can you give a formal ACK?

It did never matter before but sure :)

Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>

or you want me to reply to individual patches?  Thanks,


-- 
Alexey

^ permalink raw reply

* Re: [PATCH 4/5] dma-mapping: add a dma_ops_bypass flag to struct device
From: Christoph Hellwig @ 2020-07-14  7:07 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Björn Töpel, Daniel Borkmann, Greg Kroah-Hartman,
	Joerg Roedel, Robin Murphy, linux-kernel, iommu,
	Jesper Dangaard Brouer, linuxppc-dev, Christoph Hellwig, Lu Baolu
In-Reply-To: <9bff7460-e6fa-f765-dcb4-cc96eb86d92c@ozlabs.ru>

On Mon, Jul 13, 2020 at 02:59:39PM +1000, Alexey Kardashevskiy wrote:
> 
> 
> On 09/07/2020 01:24, Christoph Hellwig wrote:
> > Several IOMMU drivers have a bypass mode where they can use a direct
> > mapping if the devices DMA mask is large enough.  Add generic support
> > to the core dma-mapping code to do that to switch those drivers to
> > a common solution.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  include/linux/device.h |  8 +++++
> >  kernel/dma/Kconfig     |  8 +++++
> >  kernel/dma/mapping.c   | 74 +++++++++++++++++++++++++++++-------------
> >  3 files changed, 68 insertions(+), 22 deletions(-)
> > 
> > diff --git a/include/linux/device.h b/include/linux/device.h
> > index 4c4af98321ebd6..1f71acf37f78d7 100644
> > --- a/include/linux/device.h
> > +++ b/include/linux/device.h
> > @@ -523,6 +523,11 @@ struct dev_links_info {
> >   *		  sync_state() callback.
> >   * @dma_coherent: this particular device is dma coherent, even if the
> >   *		architecture supports non-coherent devices.
> > + * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
> > + *		streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
> > + *		and optionall (if the coherent mask is large enough) also
> 
> 
> s/optionall/optional/g
> 
> Otherwise the series looks good and works well on powernv and pseries.
> Thanks,

Can you give a formal ACK?

^ permalink raw reply

* Re: /sys/kernel/debug/kmemleak empty despite kmemleak reports
From: Paul Menzel @ 2020-07-14  6:59 UTC (permalink / raw)
  To: Catalin Marinas; +Cc: linuxppc-dev
In-Reply-To: <20200713182735.GH15829@gaia>

Dear Catalin,


Am 13.07.20 um 20:27 schrieb Catalin Marinas:
> On Thu, Jul 09, 2020 at 11:08:52PM +0200, Paul Menzel wrote:
>> Am 09.07.20 um 19:57 schrieb Catalin Marinas:
>>> On Thu, Jul 09, 2020 at 04:37:10PM +0200, Paul Menzel wrote:
>>>> Despite Linux 5.8-rc4 reporting memory leaks on the IBM POWER 8 S822LC, the
>>>> file does not contain more information.
>>>>
>>>>> $ dmesg
>>>>> […] > [48662.953323] perf: interrupt took too long (2570 > 2500), lowering kernel.perf_event_max_sample_rate to 77750
>>>>> [48854.810636] perf: interrupt took too long (3216 > 3212), lowering kernel.perf_event_max_sample_rate to 62000
>>>>> [52300.044518] perf: interrupt took too long (4244 > 4020), lowering kernel.perf_event_max_sample_rate to 47000
>>>>> [52751.373083] perf: interrupt took too long (5373 > 5305), lowering kernel.perf_event_max_sample_rate to 37000
>>>>> [53354.000363] perf: interrupt took too long (6793 > 6716), lowering kernel.perf_event_max_sample_rate to 29250
>>>>> [53850.215606] perf: interrupt took too long (8672 > 8491), lowering kernel.perf_event_max_sample_rate to 23000
>>>>> [57542.266099] perf: interrupt took too long (10940 > 10840), lowering kernel.perf_event_max_sample_rate to 18250
>>>>> [57559.645404] perf: interrupt took too long (13714 > 13675), lowering kernel.perf_event_max_sample_rate to 14500
>>>>> [61608.697728] Can't find PMC that caused IRQ
>>>>> [71774.463111] kmemleak: 12 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>>>>> [92372.044785] process '@/usr/bin/gnatmake-5' started with executable stack
>>>>> [92849.380672] FS-Cache: Loaded
>>>>> [92849.417269] FS-Cache: Netfs 'nfs' registered for caching
>>>>> [92849.595974] NFS: Registering the id_resolver key type
>>>>> [92849.596000] Key type id_resolver registered
>>>>> [92849.596000] Key type id_legacy registered
>>>>> [101808.079143] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>>>>> [106904.323471] Can't find PMC that caused IRQ
>>>>> [129416.391456] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>>>>> [158171.604221] kmemleak: 34 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>>>>> $ sudo cat /sys/kernel/debug/kmemleak
>>>
>>> When they are no longer present, they are most likely false positives.
>>
>> How can this be? Shouldn’t the false positive also be logged in
>> `/sys/kernel/debug/kmemleak`?
> 
> Sorry, I wasn't clear. It can be a transient false positive. At a
> subsequent scan, kmemleak found pointers referring to the previously
> reported objects and no longer shows them.

Interesting. Is it possible to print a message in that case to avoid 
confusion?

>>> Was this triggered during boot? Or under some workload?
>>
>>  From the timestamps it looks like under some load.
> 
> Was it during boot? I put a delay of 60s to avoid this but, depending on
> the platform, it can still trigger.

No, it happened after several hours of runtime.


Kind regards,

Paul

^ permalink raw reply

* Re: [PATCH v3 4/6] powerpc/pseries/iommu: Remove default DMA window before creating DDW
From: Leonardo Bras @ 2020-07-14  6:46 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <eb357d42f5605a2b0234c04de477e171134c24f5.camel@gmail.com>

In fact, the changes over the last patch are more complex than the
current patch. 
Just for reference, that's how enable_ddw() currently patches:

@@ -1087,7 +1119,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct
device_node *pdn)
        struct device_node *dn;
        u32 ddw_avail[DDW_APPLICABLE_SIZE];
        struct direct_window *window;
-       struct property *win64;
+       struct property *win64, *default_win = NULL;
        struct dynamic_dma_window_prop *ddwprop;
        struct failed_ddw_pdn *fpdn;
 
@@ -1133,14 +1165,38 @@ static u64 enable_ddw(struct pci_dev *dev,
struct device_node *pdn)
        if (ret != 0)
                goto out_failed;
 
+       /*
+        * If there is no window available, remove the default DMA
window,
+        * if it's present. This will make all the resources available
to the
+        * new DDW window.
+        * If anything fails after this, we need to restore it, so also
check
+        * for extensions presence.
+        */
        if (query.windows_available == 0) {
-               /*
-                * no additional windows are available for this device.
-                * We might be able to reallocate the existing window,
-                * trading in for a larger page size.
-                */
-               dev_dbg(&dev->dev, "no free dynamic windows");
-               goto out_failed;
+               int reset_win_ext;
+
+               default_win = of_find_property(pdn, "ibm,dma-window",
NULL);
+               if (!default_win)
+                       goto out_failed;
+
+               reset_win_ext = ddw_read_ext(pdn,
DDW_EXT_RESET_DMA_WIN, NULL);
+               if (reset_win_ext) {
+                       default_win = NULL;
+                       goto out_failed;
+               }
+
+               remove_dma_window(pdn, ddw_avail, default_win);
+
+               /* Query again, to check if the window is available */
+               ret = query_ddw(dev, ddw_avail, &query, pdn);
+               if (ret != 0)
+                       goto out_failed;
+
+               if (query.windows_available == 0) {
+                       /* no windows are available for this device. */
+                       dev_dbg(&dev->dev, "no free dynamic windows");
+                       goto out_failed;
+               }
        }
        if (query.page_size & 4) {
                page_shift = 24; /* 16MB */
@@ -1231,6 +1287,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct
device_node *pdn)
        kfree(win64);
 
 out_failed:
+       if (default_win)
+               reset_dma_window(dev, pdn);
 
        fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
        if (!fpdn)


^ permalink raw reply

* Re: [PATCH 1/3] module: Rename module_alloc() to text_alloc() and move to kernel proper
From: Ard Biesheuvel @ 2020-07-14  6:35 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Catalin Marinas, Kefeng Wang, Paul Mackerras, Zong Li, Andi Kleen,
	Paul Burton, Vincent Whitchurch, Petr Mladek, Brian Gerst,
	Andy Lutomirski, Yonghong Song, Thomas Gleixner, Jiri Kosina,
	Anup Patel, Linux Kernel Mailing List, Philipp Rudo, Torsten Duwe,
	Masami Hiramatsu, Andrew Morton, Mark Rutland,
	James E.J. Bottomley, Vincent Chen, open list:S390, Joe Lawrence,
	Helge Deller, John Fastabend, Anil S Keshavamurthy,
	Andrey Ryabinin, Iurii Zaikin, Andrii Nakryiko, Vasily Gorbik,
	moderated list:ARM PORT, Daniel Axtens, Damien Le Moal,
	Peter Oberparleiter, Sean Christopherson, Martin KaFai Lau,
	Song Liu, Paul Walmsley, Heiko Carstens, Alexei Starovoitov,
	Jarkko Sakkinen, Atish Patra, Will Deacon, Daniel Borkmann,
	Masahiro Yamada, Nayna Jain, Ley Foon Tan, Christian Borntraeger,
	Dmitry Vyukov, Sami Tolvanen, Naveen N. Rao, Mao Han, Marco Elver,
	Babu Moger, Borislav Petkov, Greentime Hu, Ben Dooks, Guan Xuetao,
	Thomas Bogendoerfer, open list:PARISC ARCHITECTURE, Jessica Yu,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT), David S. Miller,
	Thiago Jung Bauermann, Peter Zijlstra,
	open list:SPARC + UltraSPARC (sparc/sparc64), Sandipan Das,
	H. Peter Anvin, Amit Daniel Kachhap, Tiezhu Yang, Miroslav Benes,
	Jiri Olsa, open list:RISC-V ARCHITECTURE, Vincenzo Frascino,
	Anders Roxell, Sven Schnelle,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT), Russell King,
	Mike Rapoport, Ingo Molnar, Albert Ou, Paul E. McKenney,
	Josh Poimboeuf, KP Singh, Gerald Schaefer, Nick Hu,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT), open list:MIPS,
	Sergey Senozhatsky, Palmer Dabbelt,
	open list:LINUX FOR POWERPC (32-BIT AND 64-BIT)
In-Reply-To: <20200713220436.2f21d366@oasis.local.home>

On Tue, 14 Jul 2020 at 05:04, Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Mon, 13 Jul 2020 22:49:48 +0300
> Ard Biesheuvel <ardb@kernel.org> wrote:
>
> > On arm64, we no longer use module_alloc for bpf or kprobes, to avoid
> > wasting va space on code that does not need to be loaded close to the
> > kernel. Also, module_alloc() allocates kasan shadow, which is
> > unnecessary for kprobes or bpf programs, which don't have kasan
> > instrumentation.
> >
> > This patch suggests that there are other reasons why conflating
> > allocation of module space and allocating  text pages for other uses
> > is a bad idea, but switching all users to text_alloc() is a step in
> > the wrong direction. It would be better to stop using module_alloc()
> > in core code except in the module loader, and have a generic
> > text_alloc() that can be overridden by the arch if necessary. Note
> > that x86  and s390 are the only architectures that use module_alloc()
> > in ftrace code.
> >
> > Please have a look at alloc_insn_page() or bpf_jit_alloc_exec() in the
> > arm64 tree to see what I mean.
>
> Hmm, so you have another method for allocating memory for trampolines?
> (I haven't looked at those functions you pointed out, out of sheer
> laziness ;-)
>
> It would be nice to implement the trampoline optimization in arm, which
> x86 has (see arch_ftrace_update_trampoline() and
> arch_ftrace_trampoline_func()).
>
> It helps when you have two different callbacks for different functions
> (like having live patching enabled and function tracing enabled, or
> kprobes using ftrace). Each callback will get its own allocated
> trampoline to jump to instead of jumping to the a trampoline that calls
> a looping function that tests to see which callback wants to be called
> by the traced function.
>

So in what sense are ftrace trampolines like kernel modules, apart
from the fact that they are executable pages that live in the vmalloc
space?

^ permalink raw reply

* Re: [PATCH v2] powerpc/pseries: detect secure and trusted boot state of the system.
From: Daniel Axtens @ 2020-07-14  6:38 UTC (permalink / raw)
  To: Nayna Jain, linuxppc-dev; +Cc: Nayna Jain, linux-kernel, Mimi Zohar
In-Reply-To: <1594434329-31219-1-git-send-email-nayna@linux.ibm.com>

Hi Nayna,

Thanks! Would you be able to fold in some of the information from my
reply to v1 into the changelog? Until we have a public PAPR release with
it, that information is the extent of the public documentation. It would
be good to get it into the git log rather than just floating around in
the mail archives!

A couple of small nits:

> +	if (enabled)
> +		goto out;
> +
> +	if (!of_property_read_u32(of_root, "ibm,secure-boot", &secureboot)) {
> +		if (secureboot)
> +			enabled = (secureboot > 1) ? true : false;

Your tests double up here - you don't need both the 'if' statement and
the 'secureboot > 1' ternary operator.

Just

+	if (!of_property_read_u32(of_root, "ibm,secure-boot", &secureboot)) {
+		enabled = (secureboot > 1) ? true : false;

or even

+	if (!of_property_read_u32(of_root, "ibm,secure-boot", &secureboot)) {
+		enabled = (secureboot > 1);

would work.

> +	if (!of_property_read_u32(of_root, "ibm,trusted-boot", &trustedboot)) {
> +		if (trustedboot)
> +			enabled = (trustedboot > 0) ? true : false;

Likewise for trusted boot.

Regards,
Daniel

P.S. please could you add me to the cc: list for future revisions?

> +	}
> +
> +out:
>  	pr_info("Trusted boot mode %s\n", enabled ? "enabled" : "disabled");
>  
>  	return enabled;
> -- 
> 2.26.2

^ permalink raw reply

* Re: [RFC PATCH 7/7] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Nicholas Piggin @ 2020-07-14  6:31 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, Andy Lutomirski, linuxppc-dev
In-Reply-To: <1594701900.gcgdq8p13l.astroid@bobo.none>

Excerpts from Nicholas Piggin's message of July 14, 2020 3:04 pm:
> Excerpts from Andy Lutomirski's message of July 14, 2020 4:18 am:
>> 
>>> On Jul 13, 2020, at 9:48 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>> 
>>> Excerpts from Andy Lutomirski's message of July 14, 2020 1:59 am:
>>>>> On Thu, Jul 9, 2020 at 6:57 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>>>>> 
>>>>> On big systems, the mm refcount can become highly contented when doing
>>>>> a lot of context switching with threaded applications (particularly
>>>>> switching between the idle thread and an application thread).
>>>>> 
>>>>> Abandoning lazy tlb slows switching down quite a bit in the important
>>>>> user->idle->user cases, so so instead implement a non-refcounted scheme
>>>>> that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
>>>>> any remaining lazy ones.
>>>>> 
>>>>> On a 16-socket 192-core POWER8 system, a context switching benchmark
>>>>> with as many software threads as CPUs (so each switch will go in and
>>>>> out of idle), upstream can achieve a rate of about 1 million context
>>>>> switches per second. After this patch it goes up to 118 million.
>>>>> 
>>>> 
>>>> I read the patch a couple of times, and I have a suggestion that could
>>>> be nonsense.  You are, effectively, using mm_cpumask() as a sort of
>>>> refcount.  You're saying "hey, this mm has no more references, but it
>>>> still has nonempty mm_cpumask(), so let's send an IPI and shoot down
>>>> those references too."  I'm wondering whether you actually need the
>>>> IPI.  What if, instead, you actually treated mm_cpumask as a refcount
>>>> for real?  Roughly, in __mmdrop(), you would only free the page tables
>>>> if mm_cpumask() is empty.  And, in the code that removes a CPU from
>>>> mm_cpumask(), you would check if mm_users == 0 and, if so, check if
>>>> you just removed the last bit from mm_cpumask and potentially free the
>>>> mm.
>>>> 
>>>> Getting the locking right here could be a bit tricky -- you need to
>>>> avoid two CPUs simultaneously exiting lazy TLB and thinking they
>>>> should free the mm, and you also need to avoid an mm with mm_users
>>>> hitting zero concurrently with the last remote CPU using it lazily
>>>> exiting lazy TLB.  Perhaps this could be resolved by having mm_count
>>>> == 1 mean "mm_cpumask() is might contain bits and, if so, it owns the
>>>> mm" and mm_count == 0 meaning "now it's dead" and using some careful
>>>> cmpxchg or dec_return to make sure that only one CPU frees it.
>>>> 
>>>> Or maybe you'd need a lock or RCU for this, but the idea would be to
>>>> only ever take the lock after mm_users goes to zero.
>>> 
>>> I don't think it's nonsense, it could be a good way to avoid IPIs.
>>> 
>>> I haven't seen much problem here that made me too concerned about IPIs 
>>> yet, so I think the simple patch may be good enough to start with
>>> for powerpc. I'm looking at avoiding/reducing the IPIs by combining the
>>> unlazying with the exit TLB flush without doing anything fancy with
>>> ref counting, but we'll see.
>> 
>> I would be cautious with benchmarking here. I would expect that the
>> nasty cases may affect power consumption more than performance — the 
>> specific issue is IPIs hitting idle cores, and the main effects are to 
>> slow down exit() a bit but also to kick the idle core out of idle. 
>> Although, if the idle core is in a deep sleep, that IPI could be 
>> *very* slow.
> 
> It will tend to be self-limiting to some degree (deeper idle cores
> would tend to have less chance of IPI) but we have bigger issues on
> powerpc with that, like broadcast IPIs to the mm cpumask for THP
> management. Power hasn't really shown up as an issue but powerpc
> CPUs may have their own requirements and issues there, shall we say.
> 
>> So I think it’s worth at least giving this a try.
> 
> To be clear it's not a complete solution itself. The problem is of 
> course that mm cpumask gives you false negatives, so the bits
> won't always clean up after themselves as CPUs switch away from their
> lazy tlb mms.

^^

False positives: CPU is in the mm_cpumask, but is not using the mm
as a lazy tlb. So there can be bits left and never freed.

If you closed the false positives, you're back to a shared mm cache
line on lazy mm context switches.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v3 4/6] powerpc/pseries/iommu: Remove default DMA window before creating DDW
From: Leonardo Bras @ 2020-07-14  6:30 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <18fd94d2-4365-16d1-7c85-af07d5c9a0f3@ozlabs.ru>

On Tue, 2020-07-14 at 14:52 +1000, Alexey Kardashevskiy wrote:
> 
> On 14/07/2020 12:40, Leonardo Bras wrote:
> > Thank you for this feedback Alexey!
> > 
> > On Mon, 2020-07-13 at 17:33 +1000, Alexey Kardashevskiy wrote:
> > > [...]
> > > > -	int len, ret;
> > > > +	int len, ret, reset_win_ext;
> > > 
> > > Make it "reset_token".
> > 
> > Oh, it's not a token here, it just checks if the reset_win extension
> > exists. The token would be returned in *value, but since we did not
> > need it here, it's not copied.
> 
> ah right, so it is a bool actually.

In fact I made it an int, as it's the return value of ddw_read_ext(),
which can return 0 on success and -error otherwise.

> > > > [...]
> > > > -out_failed:
> > > > +out_restore_defwin:
> > > > +	if (default_win && reset_win_ext == 0)
> > > 
> > > reset_win_ext potentially may be uninitialized here. Yeah I know it is
> > > tied to default_win but still.
> > 
> > I can't see it being used uninitialized here, as you said it's tied to
> > default_win. 
> 
> Where it is declared - it is not initialized so in theory it can skip
> "if (query.windows_available == 0)".

Humm, I thought doing if (default_win && reset_win_ext == 0) would
guarantee default_win to be tested before reset_win_ext is ever tested,
so I could control it using default_win. 

> 
> 
> > Could you please tell me how it can be used uninitialized here, or what
> > is bad by doing this way?
> > 
> > > After looking at this function for a few minutes, it could use some
> > > refactoring (way too many gotos)  such as:
> > 
> > Yes, I agree.
> > 
> > > 1. move (query.page_size & xx) checks before "if
> > > (query.windows_available == 0)"
> > 
> > Moving 'page_size selection' above 'checking windows available' will
> > need us to duplicate the 'page_size selection' after the new query,
> > inside the if.
> 
> page_size selection is not going to change, why?

In theory, a query after freeing the default DMA window could have a
different (bigger) page size, so we should test again.

> 
> 
> > I mean, as query will be done again, it will need to get the (new) page
> > size.
> > 
> > > 2. move "win64 = kzalloc(sizeof(struct property), GFP_KERNEL)" before
> > > "if (query.windows_available == 0)"
> > > 3. call "reset_dma_window(dev, pdn)" inside the "if
> > > (query.windows_available == 0)" branch.
> > > Then you can drop all "goto out_restore_defwin" and move default_win and
> > > reset_win_ext inside "if (query.windows_available == 0)".
> > 
> > I did all changes suggested locally and did some analysis in the
> > result:
> > 
> > I did not see a way to put default_win and reset_win_ext inside 
> > "if (query.windows_available == 0)", because if we still need a way to
> > know if the default window was removed, and if so, restore in case
> > anything ever fails ahead (like creating the node property). 
> 
> Ah, I missed that new out_restore_defwin label is between other exit
> labels. Sorry :-/
> 
> 
> >                 reset_win_ext = ddw_read_ext(pdn,
> > DDW_EXT_RESET_DMA_WIN, NULL);
> > -               if (reset_win_ext)
> > +               if (reset_win_ext){
> > +                       default_win = NULL;
> >                         goto out_failed;
> > +               }
> 
> This says "if we can reset, then we fail", no?

Here ddw_read_ext() should return 0 if extension was found, and 
(-EINVAL, -ENODATA or -EOVERFLOW) otherwise.
So it should return nonzero if we can't find the extension, in which
case we should fail.

> 
> >                 remove_dma_window(pdn, ddw_avail, default_win);
> 
> I think you can do "default_win=NULL" here and later at
> out_restore_defwin check if it is NULL - then call reset.

Currently I initialize 'default_win = NULL', and it only changes when I
read the default DMA window. If reset is not available I restore it to
NULL, so it will be not-NULL only when we have removed the default DMA
window. 

If I make it NULL here, we either never reset the default DMA window
(as it is now "if (default_win)" ) or we may always reset it (in case
 "if (default_win == NULL)"). 

If you think it's better, I can create a bool variable like
"default_win_removed", initialized with 'false', which can be assigned
here with 'true' and test in the end if(default_win_removed) reset();

This would allow to move default_win inside this 'if block'.

What do you think?

> > [...]
> >  
> > -out_restore_defwin:
> > -       if (default_win && reset_win_ext == 0)
> > +out_failed:
> > +       if (default_win)
> >                 reset_dma_window(dev, pdn);
> >  
> > -out_failed:
> >         fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
> >         if (!fpdn)
> >                 goto out_unlock;
> > 
> > #####
> > 
> > What do you think?
> > 
> > 
> > 
> > > The rest of the series is good as it is,
> > 
> > Thank you :)
> > 
> > >  however it may conflict with
> > > https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20200713062348.100552-1-aik@ozlabs.ru/
> > > and the patchset it is made on top of -
> > > https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=188385 .
> > 
> > <From the message of the first link>
> > > (do not rush, let me finish reviewing this first) 
> > 
> > Ok, I have no problem rebasing on top of those patchsets, but what
> > would you suggest to be done?
> 
> Polish this patch one more time and if by the time when you reposted it
> the other patchset is not in upstream, I'll ask Michael to take yours first.

Ok :)

> 
> > Would it be ok doing a big multi-author patchset, so we guarantee it
> > being applied in the correct order?
> > > (You probably want me to rebase my patchset on top of Hellwig + yours,
> > right?) 
> 
> Nah, at least not yet.

Thank you!


^ permalink raw reply

* Re: [PATCH 05/11] powerpc/smp: Dont assume l2-cache to be superset of sibling
From: Srikar Dronamraju @ 2020-07-14  6:30 UTC (permalink / raw)
  To: Oliver O'Halloran
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <CAOSf1CGmHuyiW_s6DgaNbBEzUhq0qsuQ0ODPYvH+X9je3VWxwA@mail.gmail.com>

* Oliver O'Halloran <oohall@gmail.com> [2020-07-14 15:40:09]:

> On Tue, Jul 14, 2020 at 2:45 PM Srikar Dronamraju
> <srikar@linux.vnet.ibm.com> wrote:
> >
> > Current code assumes that cpumask of cpus sharing a l2-cache mask will
> > always be a superset of cpu_sibling_mask.
> >
> > Lets stop that assumption.
> 
> It's been a while since I looked, but I'm pretty sure the scheduler
> requires child domains to be subsets of their parents. Why is this
> necessary or a good idea?

Thanks for looking into the patches.

Yes the scheduler requires the child domains to be subsets of their parents.

Current code assumes that the l2_cache is always a superset of sibling mask.
However, there may be processors in future whose l2-cache mask may not be a
superset of the sibling mask.

Let's say, for example, we have a chip with 16 threads where 8 threads share
l2-cache, i.e 8 threads are acting like a small core and 16 threads are
acting like a big core. Then the assumption that l2-cache mask is a superset
of cpu_sibling mask would be wrong.

> 
> > Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> > Cc: Michael Ellerman <michaele@au1.ibm.com>
> > Cc: Nick Piggin <npiggin@au1.ibm.com>
> > Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> > Cc: Nathan Lynch <nathanl@linux.ibm.com>
> > Cc: Michael Neuling <mikey@linux.ibm.com>
> > Cc: Anton Blanchard <anton@au1.ibm.com>
> > Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> > Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> > Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> > ---
> >  arch/powerpc/kernel/smp.c | 28 +++++++++++++++-------------
> >  1 file changed, 15 insertions(+), 13 deletions(-)
> >
> > diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> > index 7d430fc536cc..875f57e41355 100644
> > --- a/arch/powerpc/kernel/smp.c
> > +++ b/arch/powerpc/kernel/smp.c
> > @@ -1198,6 +1198,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
> >         struct device_node *l2_cache, *np;
> >         int i;
> >
> > +       cpumask_set_cpu(cpu, mask_fn(cpu));
> 
> ?

At the time the cpumasks are updated, the cpu is not yet part of the
cpu_online_mask. So when we online/offline the cpus, the masks will end up
not containing the cpu itself, which causes the scheduler to bork.

Previously (as we can note in the code below that's removed), we were doing
this as part of updating all cpus that were part of the cpu_sibling_mask
before calling update_mask_by_l2.

> 
> >         l2_cache = cpu_to_l2cache(cpu);
> >         if (!l2_cache)
> >                 return false;
> > @@ -1284,29 +1285,30 @@ static void add_cpu_to_masks(int cpu)
> >          * add it to it's own thread sibling mask.
> >          */
> >         cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
> > +       cpumask_set_cpu(cpu, cpu_core_mask(cpu));
> >
> >         for (i = first_thread; i < first_thread + threads_per_core; i++)
> >                 if (cpu_online(i))
> >                         set_cpus_related(i, cpu, cpu_sibling_mask);
> >
> >         add_cpu_to_smallcore_masks(cpu);
> > -       /*
> > -        * Copy the thread sibling mask into the cache sibling mask
> > -        * and mark any CPUs that share an L2 with this CPU.
> > -        */
> > -       for_each_cpu(i, cpu_sibling_mask(cpu))
> > -               set_cpus_related(cpu, i, cpu_l2_cache_mask);

I am referring to this code above. This would have updated the self in its
cpumask. For the rest of the cpus in the cpu_sibling_mask, they get updated
correctly in the update_mask_by_l2.

> >         update_mask_by_l2(cpu, cpu_l2_cache_mask);
> >
> > -       /*
> > -        * Copy the cache sibling mask into core sibling mask and mark
> > -        * any CPUs on the same chip as this CPU.
> > -        */

-- 
Thanks and Regards
Srikar Dronamraju

^ permalink raw reply

* Re: [PATCH v3] powerpc/perf: Use SIER_USER_MASK while updating SPRN_SIER for EBB events
From: Michael Ellerman @ 2020-07-14  6:08 UTC (permalink / raw)
  To: Athira Rajeev; +Cc: maddy, linuxppc-dev
In-Reply-To: <8C50DF8B-1CBB-4365-B068-C8DA5B7D1148@linux.vnet.ibm.com>

Athira Rajeev <atrajeev@linux.vnet.ibm.com> writes:
>> On 19-Mar-2020, at 4:22 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
>> 
>> Hi Athira,
>> 
>> Athira Rajeev <atrajeev@linux.vnet.ibm.com> writes:
>>> Sampled Instruction Event Register (SIER), is a PMU register,
>>                                                               ^
>>                                                               that
>>> captures architecture state for a given sample. And sier_user_mask
>>           ^                                          ^
>>           don't think we need "architecture"         SIER_USER_MASK
>> 
>>> defined in commit 330a1eb7775b ("powerpc/perf: Core EBB support for 64-bit
>>> book3s") defines the architected bits that needs to be saved from the SPR.
>> 
>> Not quite, it defines the bits that are visible to userspace.
>> 
>> And I think it's true that for EBB events the bits we need/want to save
>> are only the user visible bits.
>> 
>>> Currently all of the bits from SIER are saved for EBB events. Patch fixes
>>> this by ANDing the "sier_user_mask" to data from SIER in ebb_switch_out().
>>> This will force save only architected bits from the SIER.
>> 
>> s/architected/user visible/
>> 
>> 
>> But, why does it matter? The kernel saves the user visible bits, as well
>> as the kernel-only bits into the thread struct. And then later the
>> kernel restores that value into the hardware before returning to
>> userspace.
>> 
>> But the hardware enforces the visibility of the bits, so userspace can't
>> observe any bits that it shouldn't.
>> 
>> Or is there some other mechanism whereby userspace can see those bits? ;)
>> 
>> If there was, what would the security implications of that be?
>
> Hi Michael,
>
> Thanks for your comments. 
>
> In ebb_switch_in, we set PMCC bit [MMCR0 44:45 ] to 10 which means
> SIER ( Group B ) register is readable in problem state. Hence the
> intention of the patch was to make sure we are not exposing the bits
> which the userspace shouldn't be reading.
>
> But following your comment about "hardware enforcing the visibility of
> bits", I did try an "ebb" experiment which showed that reading
> SPRN_SIER didn't expose any bits other than the user visible bits.
> Sorry for the confusion here.

That's OK. Thanks for following my trail of clues :)

> In that case, Can we drop the existing definition of SIER_USER_MASK if
> it is no more needed ?

I think it is still needed, and I think this change to use it is good, because
SIER is visible via ptrace.

What we need to do, is look at what information in SIER we are currently
exposing to userspace via ptrace, and what the security implications (if
any) of that are.

cheers

^ permalink raw reply

* Re: [PATCH 03/15] powerpc/powernv/pci: Add explicit tracking of the DMA setup state
From: Oliver O'Halloran @ 2020-07-14  5:58 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linuxppc-dev
In-Reply-To: <ee5a00db-badd-12fe-1c46-eaba5afc8dea@ozlabs.ru>

On Tue, Jul 14, 2020 at 3:37 PM Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>
> On 10/07/2020 15:23, Oliver O'Halloran wrote:
> > There's an optimisation in the PE setup which skips performing DMA
> > setup for a PE if we only have bridges in a PE. The assumption being
> > that only "real" devices will DMA to system memory, which is probably
> > fair. However, if we start off with only bridge devices in a PE then
> > add a non-bridge device the new device won't be able to use DMA  because
> > we never configured it.
> >
> > Fix this (admittedly pretty weird) edge case by tracking whether we've done
> > the DMA setup for the PE or not. If a non-bridge device is added to the PE
> > (via rescan or hotplug, or whatever) we can set up DMA on demand.
>
> So hotplug does not work on powernv then, right? I thought you tested it
> a while ago, or this patch is the result of that attempt? If it is, then

It mostly works. Just the really niche case of hot plugging a bridge,
then later on hot plugging a device into the same bus which wouldn't
work.

> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
>
> > This also means the only remaining user of the old "DMA Weight" code is
> > the IODA1 DMA setup code that it was originally added for, which is good.
>
>
> Is ditching IODA1 in the plan? :)

That or separating out the pci_controller_ops for IODA1 and IODA2 so
we can stop any IODA2 specific changes from breaking it. For the most
part keeping around IODA1 support isn't hurting anyone, but I wanted
to re-work how the BDFN->PE assignment works so that we'd delay
assigning a BDFN to a PE until the device is probed. Right now when
we're configuring the PE for a bus we map all 255 devfn's to that PE.
This is mostly fine, but if you do a bus rescan and there's no device
present we'll get a spurious EEH on that PE since the PHB sees that
there's no device responding to the CFG cycle. We stop the spurious
EEH freeze today by only allowing config cycles if we can find a
pci_dn for that bdfn, but I want to get rid of pci_dn.

Mapping each BDFN to a PE after the device is probed is easy enough to
do on PHB3 and above since the mapping is handled by an in-memory
table which is indexed by the BDFN. Earlier PHBs (i.e. IODA1) use a
table of base & mask values which match on the BDFN, so assigning a
whole bus at once is easy, but adding individual BDFNs is hard. It's
still possible to do in the HW, but the way the OPAL API works makes
it impossible.

> >
> > Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> > Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> > ---
> > Alexey, do we need to have the IOMMU API stuff set/clear this flag?
>
>
> I'd say no as that API only cares if a device is in a PE and for those
> the PE DMA setup  optimization is skipped. Thanks,

Ok cool.

^ permalink raw reply

* Re: [PATCH 05/11] powerpc/smp: Dont assume l2-cache to be superset of sibling
From: Oliver O'Halloran @ 2020-07-14  5:40 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-6-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 2:45 PM Srikar Dronamraju
<srikar@linux.vnet.ibm.com> wrote:
>
> Current code assumes that cpumask of cpus sharing a l2-cache mask will
> always be a superset of cpu_sibling_mask.
>
> Lets stop that assumption.

It's been a while since I looked, but I'm pretty sure the scheduler
requires child domains to be subsets of their parents. Why is this
necessary or a good idea?

> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/kernel/smp.c | 28 +++++++++++++++-------------
>  1 file changed, 15 insertions(+), 13 deletions(-)
>
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 7d430fc536cc..875f57e41355 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1198,6 +1198,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
>         struct device_node *l2_cache, *np;
>         int i;
>
> +       cpumask_set_cpu(cpu, mask_fn(cpu));

?

>         l2_cache = cpu_to_l2cache(cpu);
>         if (!l2_cache)
>                 return false;
> @@ -1284,29 +1285,30 @@ static void add_cpu_to_masks(int cpu)
>          * add it to it's own thread sibling mask.
>          */
>         cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
> +       cpumask_set_cpu(cpu, cpu_core_mask(cpu));
>
>         for (i = first_thread; i < first_thread + threads_per_core; i++)
>                 if (cpu_online(i))
>                         set_cpus_related(i, cpu, cpu_sibling_mask);
>
>         add_cpu_to_smallcore_masks(cpu);
> -       /*
> -        * Copy the thread sibling mask into the cache sibling mask
> -        * and mark any CPUs that share an L2 with this CPU.
> -        */
> -       for_each_cpu(i, cpu_sibling_mask(cpu))
> -               set_cpus_related(cpu, i, cpu_l2_cache_mask);
>         update_mask_by_l2(cpu, cpu_l2_cache_mask);
>
> -       /*
> -        * Copy the cache sibling mask into core sibling mask and mark
> -        * any CPUs on the same chip as this CPU.
> -        */
> -       for_each_cpu(i, cpu_l2_cache_mask(cpu))
> -               set_cpus_related(cpu, i, cpu_core_mask);
> +       if (pkg_id == -1) {
> +               struct cpumask *(*mask)(int) = cpu_sibling_mask;
> +
> +               /*
> +                * Copy the sibling mask into core sibling mask and
> +                * mark any CPUs on the same chip as this CPU.
> +                */
> +               if (shared_caches)
> +                       mask = cpu_l2_cache_mask;
> +
> +               for_each_cpu(i, mask(cpu))
> +                       set_cpus_related(cpu, i, cpu_core_mask);
>
> -       if (pkg_id == -1)
>                 return;
> +       }
>
>         for_each_cpu(i, cpu_online_mask)
>                 if (get_physical_package_id(i) == pkg_id)
> --
> 2.17.1
>

^ permalink raw reply

* Re: [PATCH 03/15] powerpc/powernv/pci: Add explicit tracking of the DMA setup state
From: Alexey Kardashevskiy @ 2020-07-14  5:37 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200710052340.737567-4-oohall@gmail.com>



On 10/07/2020 15:23, Oliver O'Halloran wrote:
> There's an optimisation in the PE setup which skips performing DMA
> setup for a PE if we only have bridges in a PE. The assumption being
> that only "real" devices will DMA to system memory, which is probably
> fair. However, if we start off with only bridge devices in a PE then
> add a non-bridge device the new device won't be able to use DMA  because
> we never configured it.
> 
> Fix this (admittedly pretty weird) edge case by tracking whether we've done
> the DMA setup for the PE or not. If a non-bridge device is added to the PE
> (via rescan or hotplug, or whatever) we can set up DMA on demand.

So hotplug does not work on powernv then, right? I thought you tested it
a while ago, or this patch is the result of that attempt? If it is, then

Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>


> This also means the only remaining user of the old "DMA Weight" code is
> the IODA1 DMA setup code that it was originally added for, which is good.


Is ditching IODA1 in the plan? :)

> 
> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
> Alexey, do we need to have the IOMMU API stuff set/clear this flag?


I'd say no as that API only cares if a device is in a PE and for those
the PE DMA setup  optimization is skipped. Thanks,




> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 48 ++++++++++++++---------
>  arch/powerpc/platforms/powernv/pci.h      |  7 ++++
>  2 files changed, 36 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index bfb40607aa0e..bb9c1cc60c33 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -141,6 +141,7 @@ static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
>  
>  	phb->ioda.pe_array[pe_no].phb = phb;
>  	phb->ioda.pe_array[pe_no].pe_number = pe_no;
> +	phb->ioda.pe_array[pe_no].dma_setup_done = false;
>  
>  	/*
>  	 * Clear the PE frozen state as it might be put into frozen state
> @@ -1685,6 +1686,12 @@ static int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>  }
>  #endif /* CONFIG_PCI_IOV */
>  
> +static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
> +				       struct pnv_ioda_pe *pe);
> +
> +static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> +				       struct pnv_ioda_pe *pe);
> +
>  static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
>  {
>  	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
> @@ -1713,6 +1720,24 @@ static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
>  		pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number);
>  	}
>  
> +	/*
> +	 * We assume that bridges *probably* don't need to do any DMA so we can
> +	 * skip allocating a TCE table, etc unless we get a non-bridge device.
> +	 */
> +	if (!pe->dma_setup_done && !pci_is_bridge(pdev)) {
> +		switch (phb->type) {
> +		case PNV_PHB_IODA1:
> +			pnv_pci_ioda1_setup_dma_pe(phb, pe);
> +			break;
> +		case PNV_PHB_IODA2:
> +			pnv_pci_ioda2_setup_dma_pe(phb, pe);
> +			break;
> +		default:
> +			pr_warn("%s: No DMA for PHB#%x (type %d)\n",
> +				__func__, phb->hose->global_number, phb->type);
> +		}
> +	}
> +
>  	if (pdn)
>  		pdn->pe_number = pe->pe_number;
>  	pe->device_count++;
> @@ -2222,6 +2247,7 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
>  	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
>  	iommu_init_table(tbl, phb->hose->node, 0, 0);
>  
> +	pe->dma_setup_done = true;
>  	return;
>   fail:
>  	/* XXX Failure: Try to fallback to 64-bit only ? */
> @@ -2536,9 +2562,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  {
>  	int64_t rc;
>  
> -	if (!pnv_pci_ioda_pe_dma_weight(pe))
> -		return;
> -
>  	/* TVE #1 is selected by PCI address bit 59 */
>  	pe->tce_bypass_base = 1ull << 59;
>  
> @@ -2563,6 +2586,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  	iommu_register_group(&pe->table_group, phb->hose->global_number,
>  			     pe->pe_number);
>  #endif
> +	pe->dma_setup_done = true;
>  }
>  
>  int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
> @@ -3136,7 +3160,6 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
>  
>  static void pnv_pci_configure_bus(struct pci_bus *bus)
>  {
> -	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
>  	struct pci_dev *bridge = bus->self;
>  	struct pnv_ioda_pe *pe;
>  	bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
> @@ -3160,17 +3183,6 @@ static void pnv_pci_configure_bus(struct pci_bus *bus)
>  		return;
>  
>  	pnv_ioda_setup_pe_seg(pe);
> -	switch (phb->type) {
> -	case PNV_PHB_IODA1:
> -		pnv_pci_ioda1_setup_dma_pe(phb, pe);
> -		break;
> -	case PNV_PHB_IODA2:
> -		pnv_pci_ioda2_setup_dma_pe(phb, pe);
> -		break;
> -	default:
> -		pr_warn("%s: No DMA for PHB#%x (type %d)\n",
> -			__func__, phb->hose->global_number, phb->type);
> -	}
>  }
>  
>  static resource_size_t pnv_pci_default_alignment(void)
> @@ -3289,11 +3301,10 @@ static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
>  
>  static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
>  {
> -	unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
>  	struct iommu_table *tbl = pe->table_group.tables[0];
>  	int64_t rc;
>  
> -	if (!weight)
> +	if (!pe->dma_setup_done)
>  		return;
>  
>  	rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
> @@ -3313,10 +3324,9 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
>  static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
>  {
>  	struct iommu_table *tbl = pe->table_group.tables[0];
> -	unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
>  	int64_t rc;
>  
> -	if (!weight)
> +	if (pe->dma_setup_done)
>  		return;
>  
>  	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index 0727dec9a0d1..6aa6aefb637d 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -87,6 +87,13 @@ struct pnv_ioda_pe {
>  	bool			tce_bypass_enabled;
>  	uint64_t		tce_bypass_base;
>  
> +	/*
> +	 * Used to track whether we've done DMA setup for this PE or not. We
> +	 * want to defer allocating TCE tables, etc until we've added a
> +	 * non-bridge device to the PE.
> +	 */
> +	bool			dma_setup_done;
> +
>  	/* MSIs. MVE index is identical for for 32 and 64 bit MSI
>  	 * and -1 if not supported. (It's actually identical to the
>  	 * PE number)
> 

-- 
Alexey

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox