LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* 4.17.0-10146-gf0dc7f9c6dd9: hw csum failure on powerpc+sungem
From: Meelis Roos @ 2018-06-11 10:57 UTC (permalink / raw)
  To: netdev, linuxppc-dev; +Cc: Linux Kernel list

I am seeing this on PowerMac G4 with sungem ethernet driver. 4.17 was 
OK, 4.17.0-10146-gf0dc7f9c6dd9 is problematic.

[  140.518664] eth0: hw csum failure
[  140.518699] CPU: 0 PID: 1237 Comm: postconf Not tainted 4.17.0-10146-gf0dc7f9c6dd9 #83
[  140.518707] Call Trace:
[  140.518734] [effefd90] [c03d6db8] __skb_checksum_complete+0xd8/0xdc (unreliable)
[  140.518759] [effefdb0] [c04c1284] icmpv6_rcv+0x248/0x4ec
[  140.518775] [effefdd0] [c049a448] ip6_input_finish.constprop.0+0x11c/0x5f4
[  140.518786] [effefe10] [c049b1c0] ip6_mc_input+0xcc/0x100
[  140.518807] [effefe20] [c03e110c] __netif_receive_skb_core+0x310/0x944
[  140.518820] [effefe70] [c03e76ec] napi_gro_receive+0xd0/0xe8
[  140.518845] [effefe80] [f3e1f66c] gem_poll+0x618/0x1274 [sungem]
[  140.518856] [effeff30] [c03e6f0c] net_rx_action+0x198/0x374
[  140.518872] [effeff90] [c0501a88] __do_softirq+0x120/0x278
[  140.518890] [effeffe0] [c0036188] irq_exit+0xd8/0xdc
[  140.518908] [effefff0] [c000f478] call_do_irq+0x24/0x3c
[  140.518925] [d05a5d30] [c0007120] do_IRQ+0x74/0xf0
[  140.518941] [d05a5d50] [c0012474] ret_from_except+0x0/0x14
[  140.518960] --- interrupt: 501 at copy_page+0x40/0x90
                   LR = copy_user_page+0x18/0x30
[  140.518973] [d05a5e10] [d058cd80] 0xd058cd80 (unreliable)
[  140.518989] [d05a5e20] [c00fa2bc] wp_page_copy+0xec/0x654
[  140.519002] [d05a5e60] [c00fd3a4] do_wp_page+0xa8/0x5b4
[  140.519013] [d05a5e90] [c00fe934] handle_mm_fault+0x564/0xa84
[  140.519025] [d05a5f00] [c0016230] do_page_fault+0x1bc/0x7e8
[  140.519037] [d05a5f40] [c0012300] handle_page_fault+0x14/0x40
[  140.519048] --- interrupt: 301 at 0xb78b6864
                   LR = 0xb78b6c54


-- 
Meelis Roos (mroos@linux.ee)

^ permalink raw reply

* [PATCH v2 1/3] powerpc/fsl: Disable the speculation barrier from the command line
From: Diana Craciun @ 2018-06-11 12:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: mpe, oss, leoyang.li, bharat.bhushan, Diana Craciun
In-Reply-To: <1528721608-15443-1-git-send-email-diana.craciun@nxp.com>

The speculation barrier can be disabled from the command line
with the parameter: "nospectre_v1".

Signed-off-by: Diana Craciun <diana.craciun@nxp.com>
---
 arch/powerpc/kernel/security.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
index 3eb9c45..c55e102 100644
--- a/arch/powerpc/kernel/security.c
+++ b/arch/powerpc/kernel/security.c
@@ -16,6 +16,7 @@
 unsigned long powerpc_security_features __read_mostly = SEC_FTR_DEFAULT;
 
 bool barrier_nospec_enabled;
+static bool no_nospec;
 
 static void enable_barrier_nospec(bool enable)
 {
@@ -42,9 +43,18 @@ void setup_barrier_nospec(void)
 	enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
 		 security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR);
 
-	enable_barrier_nospec(enable);
+	if (!no_nospec)
+		enable_barrier_nospec(enable);
 }
 
+static int __init handle_nospectre_v1(char *p)
+{
+	no_nospec = true;
+
+	return 0;
+}
+early_param("nospectre_v1", handle_nospectre_v1);
+
 #ifdef CONFIG_DEBUG_FS
 static int barrier_nospec_set(void *data, u64 val)
 {
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 0/3] powerpc/fsl: Speculation barrier for NXP PowerPC Book3E
From: Diana Craciun @ 2018-06-11 12:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: mpe, oss, leoyang.li, bharat.bhushan, Diana Craciun

Implement barrier_nospec for NXP PowerPC Book3E processors. 

Diana Craciun (3):
  Disable the speculation barrier from the command line
  Add barrier_nospec implementation for NXP PowerPC Book3E
  Implement cpu_show_spectre_v1/v2 for NXP PowerPC Book3E

 arch/powerpc/Kconfig               |  2 +-
 arch/powerpc/include/asm/barrier.h | 10 +++++++++
 arch/powerpc/include/asm/setup.h   |  2 +-
 arch/powerpc/kernel/Makefile       |  2 +-
 arch/powerpc/kernel/module.c       |  5 +++--
 arch/powerpc/kernel/security.c     | 42 +++++++++++++++++++++++++++++++++++++-
 arch/powerpc/kernel/setup_32.c     |  5 +++++
 arch/powerpc/kernel/setup_64.c     |  6 ++++++
 arch/powerpc/kernel/vmlinux.lds.S  |  4 +++-
 arch/powerpc/lib/feature-fixups.c  | 35 ++++++++++++++++++++++++++++++-
 10 files changed, 105 insertions(+), 8 deletions(-)

--
History:

v1 --> v2
- added implementation for cpu_show_spectre_x functions
- the mitigation is no longer enabled through device tree options
2.5.5

^ permalink raw reply

* [PATCH v2 2/3] powerpc/fsl: Add barrier_nospec implementation for NXP PowerPC Book3E
From: Diana Craciun @ 2018-06-11 12:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: mpe, oss, leoyang.li, bharat.bhushan, Diana Craciun
In-Reply-To: <1528721608-15443-1-git-send-email-diana.craciun@nxp.com>

Implement the barrier_nospec as an isync;sync instruction sequence.
The implementation uses the infrastructure built for BOOK3S 64.

Signed-off-by: Diana Craciun <diana.craciun@nxp.com>
---
 arch/powerpc/include/asm/barrier.h | 10 ++++++++++
 arch/powerpc/include/asm/setup.h   |  2 +-
 arch/powerpc/kernel/Makefile       |  2 +-
 arch/powerpc/kernel/module.c       |  5 +++--
 arch/powerpc/kernel/security.c     | 15 +++++++++++++++
 arch/powerpc/kernel/setup_32.c     |  5 +++++
 arch/powerpc/kernel/setup_64.c     |  6 ++++++
 arch/powerpc/kernel/vmlinux.lds.S  |  4 +++-
 arch/powerpc/lib/feature-fixups.c  | 35 ++++++++++++++++++++++++++++++++++-
 9 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index f67b3f6..405d572 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -86,6 +86,16 @@ do {									\
 // This also acts as a compiler barrier due to the memory clobber.
 #define barrier_nospec() asm (stringify_in_c(barrier_nospec_asm) ::: "memory")
 
+#elif defined(CONFIG_PPC_FSL_BOOK3E)
+/*
+ * Prevent the execution of subsequent instructions speculatively using a
+ * isync;sync instruction sequence.
+ */
+#define barrier_nospec_asm NOSPEC_BARRIER_FIXUP_SECTION; nop; nop
+
+// This also acts as a compiler barrier due to the memory clobber.
+#define barrier_nospec() asm (stringify_in_c(barrier_nospec_asm) ::: "memory")
+
 #else /* !CONFIG_PPC_BOOK3S_64 */
 #define barrier_nospec_asm
 #define barrier_nospec()
diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
index 8721fd0..67a2810 100644
--- a/arch/powerpc/include/asm/setup.h
+++ b/arch/powerpc/include/asm/setup.h
@@ -56,7 +56,7 @@ void setup_barrier_nospec(void);
 void do_barrier_nospec_fixups(bool enable);
 extern bool barrier_nospec_enabled;
 
-#ifdef CONFIG_PPC_BOOK3S_64
+#if defined(CONFIG_PPC_BOOK3S_64) || defined(CONFIG_PPC_FSL_BOOK3E)
 void do_barrier_nospec_fixups_range(bool enable, void *start, void *end);
 #else
 static inline void do_barrier_nospec_fixups_range(bool enable, void *start, void *end) { };
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 2b4c40b2..d9dee43 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -76,7 +76,7 @@ endif
 obj64-$(CONFIG_HIBERNATION)	+= swsusp_asm64.o
 obj-$(CONFIG_MODULES)		+= module.o module_$(BITS).o
 obj-$(CONFIG_44x)		+= cpu_setup_44x.o
-obj-$(CONFIG_PPC_FSL_BOOK3E)	+= cpu_setup_fsl_booke.o
+obj-$(CONFIG_PPC_FSL_BOOK3E)	+= cpu_setup_fsl_booke.o security.o
 obj-$(CONFIG_PPC_DOORBELL)	+= dbell.o
 obj-$(CONFIG_JUMP_LABEL)	+= jump_label.o
 
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index 1b3c683..96a9821 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -72,13 +72,14 @@ int module_finalize(const Elf_Ehdr *hdr,
 		do_feature_fixups(powerpc_firmware_features,
 				  (void *)sect->sh_addr,
 				  (void *)sect->sh_addr + sect->sh_size);
-
+#endif /* CONFIG_PPC64 */
+#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_FSL_BOOK3E)
 	sect = find_section(hdr, sechdrs, "__spec_barrier_fixup");
 	if (sect != NULL)
 		do_barrier_nospec_fixups_range(barrier_nospec_enabled,
 				  (void *)sect->sh_addr,
 				  (void *)sect->sh_addr + sect->sh_size);
-#endif
+#endif /* CONFIG_PPC64 || CONFIG_PPC_FSL_BOOK3E */
 
 	sect = find_section(hdr, sechdrs, "__lwsync_fixup");
 	if (sect != NULL)
diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
index c55e102..797c975 100644
--- a/arch/powerpc/kernel/security.c
+++ b/arch/powerpc/kernel/security.c
@@ -13,7 +13,9 @@
 #include <asm/setup.h>
 
 
+#ifdef CONFIG_PPC_BOOK3S_64
 unsigned long powerpc_security_features __read_mostly = SEC_FTR_DEFAULT;
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 bool barrier_nospec_enabled;
 static bool no_nospec;
@@ -24,6 +26,7 @@ static void enable_barrier_nospec(bool enable)
 	do_barrier_nospec_fixups(enable);
 }
 
+#ifdef CONFIG_PPC_BOOK3S_64
 void setup_barrier_nospec(void)
 {
 	bool enable;
@@ -46,6 +49,15 @@ void setup_barrier_nospec(void)
 	if (!no_nospec)
 		enable_barrier_nospec(enable);
 }
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+void setup_barrier_nospec(void)
+{
+	if (!no_nospec)
+		enable_barrier_nospec(true);
+}
+#endif /* CONFIG_PPC_FSL_BOOK3E */
 
 static int __init handle_nospectre_v1(char *p)
 {
@@ -92,6 +104,7 @@ static __init int barrier_nospec_debugfs_init(void)
 device_initcall(barrier_nospec_debugfs_init);
 #endif /* CONFIG_DEBUG_FS */
 
+#ifdef CONFIG_PPC_BOOK3S_64
 ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	bool thread_priv;
@@ -168,3 +181,5 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, c
 
 	return s.len;
 }
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 7445748..80c1e6e 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -116,6 +116,11 @@ notrace void __init machine_init(u64 dt_ptr)
 	/* Do some early initialization based on the flat device tree */
 	early_init_devtree(__va(dt_ptr));
 
+	/* Apply the speculation barrier fixup */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	setup_barrier_nospec();
+#endif
+
 	early_init_mmu();
 
 	setup_kdump_trampoline();
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 7a7ce8a..b2a644a 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -327,6 +327,12 @@ void __init early_setup(unsigned long dt_ptr)
 
 	/* Apply all the dynamic patching */
 	apply_feature_fixups();
+
+	/* Apply the speculation barrier fixup */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	setup_barrier_nospec();
+#endif /* CONFIG_PPC_FSL_BOOK3E */
+
 	setup_feature_keys();
 
 	/* Initialize the hash table or TLB handling */
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index ff73f49..af513e6 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -139,14 +139,16 @@ SECTIONS
 		*(__rfi_flush_fixup)
 		__stop___rfi_flush_fixup = .;
 	}
+#endif /* CONFIG_PPC64 */
 
+#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_FSL_BOOK3E)
 	. = ALIGN(8);
 	__spec_barrier_fixup : AT(ADDR(__spec_barrier_fixup) - LOAD_OFFSET) {
 		__start___barrier_nospec_fixup = .;
 		*(__barrier_nospec_fixup)
 		__stop___barrier_nospec_fixup = .;
 	}
-#endif
+#endif /* CONFIG_PPC64 || CONFIG_PPC_FSL_BOOK3E */
 
 	EXCEPTION_TABLE(0)
 
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index 2b9173d..bea2b87 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -188,7 +188,40 @@ void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_
 
 	printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i);
 }
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_end)
+{
+	unsigned int instr[2], *dest;
+	long *start, *end;
+	int i;
+
+	start = fixup_start;
+	end = fixup_end;
+
+	instr[0] = PPC_INST_NOP;
+	instr[1] = PPC_INST_NOP;
+
+	if (enable) {
+		pr_info("barrier_nospec; using isync; sync as a speculation barrier\n");
+		instr[0] = PPC_INST_ISYNC;
+		instr[1] = PPC_INST_SYNC;
+	}
+
+	for (i = 0; start < end; start++, i++) {
+		dest = (void *)start + *start;
+		pr_devel("patching dest %lx\n", (unsigned long)dest);
 
+		patch_instruction(dest, instr[0]);
+		patch_instruction(dest + 1, instr[1]);
+	}
+
+	pr_debug("barrier-nospec: patched %d locations\n", i);
+}
+#endif /* CONFIG_PPC_FSL_BOOK3E */
+
+#if defined(CONFIG_PPC_BOOK3S_64) || defined(CONFIG_PPC_FSL_BOOK3E)
 void do_barrier_nospec_fixups(bool enable)
 {
 	void *start, *end;
@@ -199,7 +232,7 @@ void do_barrier_nospec_fixups(bool enable)
 	do_barrier_nospec_fixups_range(enable, start, end);
 }
 
-#endif /* CONFIG_PPC_BOOK3S_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 || CONFIG_PPC_FSL_BOOK3E */
 
 void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
 {
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 3/3] powerpc/fsl: Implement cpu_show_spectre_v1/v2 for NXP PowerPC Book3E
From: Diana Craciun @ 2018-06-11 12:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: mpe, oss, leoyang.li, bharat.bhushan, Diana Craciun
In-Reply-To: <1528721608-15443-1-git-send-email-diana.craciun@nxp.com>

Signed-off-by: Diana Craciun <diana.craciun@nxp.com>
---
 arch/powerpc/Kconfig           |  2 +-
 arch/powerpc/kernel/security.c | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 940c955..a781d60 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -170,7 +170,7 @@ config PPC
 	select GENERIC_CLOCKEVENTS_BROADCAST	if SMP
 	select GENERIC_CMOS_UPDATE
 	select GENERIC_CPU_AUTOPROBE
-	select GENERIC_CPU_VULNERABILITIES	if PPC_BOOK3S_64
+	select GENERIC_CPU_VULNERABILITIES	if PPC_BOOK3S_64 || PPC_FSL_BOOK3E
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IRQ_SHOW_LEVEL
 	select GENERIC_SMP_IDLE_THREAD
diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
index 797c975..aceaadc 100644
--- a/arch/powerpc/kernel/security.c
+++ b/arch/powerpc/kernel/security.c
@@ -183,3 +183,18 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, c
 }
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	if (barrier_nospec_enabled)
+		return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+
+	return sprintf(buf, "Vulnerable\n");
+}
+
+ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "Vulnerable\n");
+}
+#endif /* CONFIG_PPC_FSL_BOOK3E */
+
-- 
2.5.5

^ permalink raw reply related

* Re: [PATCH v11 00/26] Speculative page faults
From: Laurent Dufour @ 2018-06-11 15:15 UTC (permalink / raw)
  To: Song, HaiyanX
  Cc: akpm@linux-foundation.org, mhocko@kernel.org,
	peterz@infradead.org, kirill@shutemov.name, ak@linux.intel.com,
	dave@stgolabs.net, jack@suse.cz, Matthew Wilcox,
	khandual@linux.vnet.ibm.com, aneesh.kumar@linux.vnet.ibm.com,
	benh@kernel.crashing.org, mpe@ellerman.id.au, paulus@samba.org,
	Thomas Gleixner, Ingo Molnar, hpa@zytor.com, Will Deacon,
	Sergey Senozhatsky, sergey.senozhatsky.work@gmail.com,
	Andrea Arcangeli, Alexei Starovoitov, Wang, Kemi, Daniel Jordan,
	David Rientjes, Jerome Glisse, Ganesh Mahendran, Minchan Kim,
	Punit Agrawal, vinayak menon, Yang Shi,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	haren@linux.vnet.ibm.com, npiggin@gmail.com,
	bsingharora@gmail.com, paulmck@linux.vnet.ibm.com, Tim Chen,
	linuxppc-dev@lists.ozlabs.org, x86@kernel.org
In-Reply-To: <9FE19350E8A7EE45B64D8D63D368C8966B847F54@SHSMSX101.ccr.corp.intel.com>

Hi Haiyan,

I don't have access to the same hardware you ran the tests on, but I gave those
tests a try on a Power8 system (2 sockets, 5 cores/s, 8 threads/c, 80 CPUs, 32G).
I ran each will-it-scale test 10 times and computed the average.

test THP enabled		4.17.0-rc4-mm1	spf		delta
page_fault3_threads		2697.7		2683.5		-0.53%
page_fault2_threads		170660.6	169574.1	-0.64%
context_switch1_threads		6915269.2	6877507.3	-0.55%
context_switch1_processes	6478076.2	6529493.5	0.79%
brk1				243391.2	238527.5	-2.00%

Tests were launched with the arguments '-t 80 -s 5'; only the average report is
taken into account. Note that the page size is 64K by default on ppc64.

It would be nice if you could capture some perf data to figure out why the
page_fault2/3 are showing such a performance regression.

Thanks,
Laurent.

On 11/06/2018 09:49, Song, HaiyanX wrote:
> Hi Laurent,
> 
> Regression tests for the v11 patch series have been run; some regressions were found by LKP-tools (linux kernel performance)
> tested on an Intel 4s Skylake platform. This time we only tested the cases which had been run and showed regressions on
> the V9 patch series.
> 
> The regression result is sorted by the metric will-it-scale.per_thread_ops.
> branch: Laurent-Dufour/Speculative-page-faults/20180520-045126
> commit id:
>   head commit : a7a8993bfe3ccb54ad468b9f1799649e4ad1ff12
>   base commit : ba98a1cdad71d259a194461b3a61471b49b14df1
> Benchmark: will-it-scale
> Download link: https://github.com/antonblanchard/will-it-scale/tree/master
> 
> Metrics:
>   will-it-scale.per_process_ops=processes/nr_cpu
>   will-it-scale.per_thread_ops=threads/nr_cpu
>   test box: lkp-skl-4sp1(nr_cpu=192,memory=768G)
> THP: enable / disable
> nr_task:100%
> 
> 1. Regressions:
> 
> a). Enable THP
> testcase                          base           change      head           metric
> page_fault3/enable THP           10519          -20.5%        836      will-it-scale.per_thread_ops
> page_fault2/enable THP            8281          -18.8%       6728      will-it-scale.per_thread_ops
> brk1/enable THP                 998475           -2.2%     976893      will-it-scale.per_process_ops
> context_switch1/enable THP      223910           -1.3%     220930      will-it-scale.per_process_ops
> context_switch1/enable THP      233722           -1.0%     231288      will-it-scale.per_thread_ops
> 
> b). Disable THP
> page_fault3/disable THP          10856          -23.1%       8344      will-it-scale.per_thread_ops
> page_fault2/disable THP           8147          -18.8%       6613      will-it-scale.per_thread_ops
> brk1/disable THP                   957           -7.9%        881      will-it-scale.per_thread_ops
> context_switch1/disable THP     237006           -2.2%     231907      will-it-scale.per_thread_ops
> brk1/disable THP                997317           -2.0%     977778      will-it-scale.per_process_ops
> page_fault3/disable THP         467454           -1.8%     459251      will-it-scale.per_process_ops
> context_switch1/disable THP     224431           -1.3%     221567      will-it-scale.per_process_ops
> 
> Notes: for the above test result values, higher is better.
> 
> 2. Improvements: no improvement was found for the selected test cases.
> 
> 
> Best regards
> Haiyan Song
> ________________________________________
> From: owner-linux-mm@kvack.org [owner-linux-mm@kvack.org] on behalf of Laurent Dufour [ldufour@linux.vnet.ibm.com]
> Sent: Monday, May 28, 2018 4:54 PM
> To: Song, HaiyanX
> Cc: akpm@linux-foundation.org; mhocko@kernel.org; peterz@infradead.org; kirill@shutemov.name; ak@linux.intel.com; dave@stgolabs.net; jack@suse.cz; Matthew Wilcox; khandual@linux.vnet.ibm.com; aneesh.kumar@linux.vnet.ibm.com; benh@kernel.crashing.org; mpe@ellerman.id.au; paulus@samba.org; Thomas Gleixner; Ingo Molnar; hpa@zytor.com; Will Deacon; Sergey Senozhatsky; sergey.senozhatsky.work@gmail.com; Andrea Arcangeli; Alexei Starovoitov; Wang, Kemi; Daniel Jordan; David Rientjes; Jerome Glisse; Ganesh Mahendran; Minchan Kim; Punit Agrawal; vinayak menon; Yang Shi; linux-kernel@vger.kernel.org; linux-mm@kvack.org; haren@linux.vnet.ibm.com; npiggin@gmail.com; bsingharora@gmail.com; paulmck@linux.vnet.ibm.com; Tim Chen; linuxppc-dev@lists.ozlabs.org; x86@kernel.org
> Subject: Re: [PATCH v11 00/26] Speculative page faults
> 
> On 28/05/2018 10:22, Haiyan Song wrote:
>> Hi Laurent,
>>
>> Yes, these tests are done on V9 patch.
> 
> Do you plan to give this V11 a run ?
> 
>>
>>
>> Best regards,
>> Haiyan Song
>>
>> On Mon, May 28, 2018 at 09:51:34AM +0200, Laurent Dufour wrote:
>>> On 28/05/2018 07:23, Song, HaiyanX wrote:
>>>>
>>>> Some regressions and improvements were found by LKP-tools (linux kernel performance) on the V9 patch series
>>>> tested on an Intel 4s Skylake platform.
>>>
>>> Hi,
>>>
>>> Thanks for reporting these benchmark results, but you mentioned the "V9 patch
>>> series" while responding to the v11 header series...
>>> Were these tests done on v9 or v11 ?
>>>
>>> Cheers,
>>> Laurent.
>>>
>>>>
>>>> The regression result is sorted by the metric will-it-scale.per_thread_ops.
>>>> Branch: Laurent-Dufour/Speculative-page-faults/20180316-151833 (V9 patch series)
>>>> Commit id:
>>>>     base commit: d55f34411b1b126429a823d06c3124c16283231f
>>>>     head commit: 0355322b3577eeab7669066df42c550a56801110
>>>> Benchmark suite: will-it-scale
>>>> Download link:
>>>> https://github.com/antonblanchard/will-it-scale/tree/master/tests
>>>> Metrics:
>>>>     will-it-scale.per_process_ops=processes/nr_cpu
>>>>     will-it-scale.per_thread_ops=threads/nr_cpu
>>>> test box: lkp-skl-4sp1(nr_cpu=192,memory=768G)
>>>> THP: enable / disable
>>>> nr_task: 100%
>>>>
>>>> 1. Regressions:
>>>> a) THP enabled:
>>>> testcase                        base            change          head       metric
>>>> page_fault3/ enable THP         10092           -17.5%          8323       will-it-scale.per_thread_ops
>>>> page_fault2/ enable THP          8300           -17.2%          6869       will-it-scale.per_thread_ops
>>>> brk1/ enable THP                  957.67         -7.6%           885       will-it-scale.per_thread_ops
>>>> page_fault3/ enable THP        172821            -5.3%        163692       will-it-scale.per_process_ops
>>>> signal1/ enable THP              9125            -3.2%          8834       will-it-scale.per_process_ops
>>>>
>>>> b) THP disabled:
>>>> testcase                        base            change          head       metric
>>>> page_fault3/ disable THP        10107           -19.1%          8180       will-it-scale.per_thread_ops
>>>> page_fault2/ disable THP         8432           -17.8%          6931       will-it-scale.per_thread_ops
>>>> context_switch1/ disable THP   215389            -6.8%        200776       will-it-scale.per_thread_ops
>>>> brk1/ disable THP                 939.67         -6.6%           877.33    will-it-scale.per_thread_ops
>>>> page_fault3/ disable THP       173145            -4.7%        165064       will-it-scale.per_process_ops
>>>> signal1/ disable THP             9162            -3.9%          8802       will-it-scale.per_process_ops
>>>>
>>>> 2. Improvements:
>>>> a) THP enabled:
>>>> testcase                        base            change          head       metric
>>>> malloc1/ enable THP               66.33        +469.8%           383.67    will-it-scale.per_thread_ops
>>>> writeseek3/ enable THP          2531             +4.5%          2646       will-it-scale.per_thread_ops
>>>> signal1/ enable THP              989.33          +2.8%          1016       will-it-scale.per_thread_ops
>>>>
>>>> b) THP disabled:
>>>> testcase                        base            change          head       metric
>>>> malloc1/ disable THP              90.33        +417.3%           467.33    will-it-scale.per_thread_ops
>>>> read2/ disable THP             58934            +39.2%         82060       will-it-scale.per_thread_ops
>>>> page_fault1/ disable THP        8607            +36.4%         11736       will-it-scale.per_thread_ops
>>>> read1/ disable THP            314063            +12.7%        353934       will-it-scale.per_thread_ops
>>>> writeseek3/ disable THP         2452            +12.5%          2759       will-it-scale.per_thread_ops
>>>> signal1/ disable THP             971.33          +5.5%          1024       will-it-scale.per_thread_ops
>>>>
>>>> Notes: for above values in column "change", the higher value means that the related testcase result
>>>> on head commit is better than that on base commit for this benchmark.
>>>>
>>>>
>>>> Best regards
>>>> Haiyan Song
>>>>
>>>> ________________________________________
>>>> From: owner-linux-mm@kvack.org [owner-linux-mm@kvack.org] on behalf of Laurent Dufour [ldufour@linux.vnet.ibm.com]
>>>> Sent: Thursday, May 17, 2018 7:06 PM
>>>> To: akpm@linux-foundation.org; mhocko@kernel.org; peterz@infradead.org; kirill@shutemov.name; ak@linux.intel.com; dave@stgolabs.net; jack@suse.cz; Matthew Wilcox; khandual@linux.vnet.ibm.com; aneesh.kumar@linux.vnet.ibm.com; benh@kernel.crashing.org; mpe@ellerman.id.au; paulus@samba.org; Thomas Gleixner; Ingo Molnar; hpa@zytor.com; Will Deacon; Sergey Senozhatsky; sergey.senozhatsky.work@gmail.com; Andrea Arcangeli; Alexei Starovoitov; Wang, Kemi; Daniel Jordan; David Rientjes; Jerome Glisse; Ganesh Mahendran; Minchan Kim; Punit Agrawal; vinayak menon; Yang Shi
>>>> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org; haren@linux.vnet.ibm.com; npiggin@gmail.com; bsingharora@gmail.com; paulmck@linux.vnet.ibm.com; Tim Chen; linuxppc-dev@lists.ozlabs.org; x86@kernel.org
>>>> Subject: [PATCH v11 00/26] Speculative page faults
>>>>
>>>> This is a port on kernel 4.17 of the work done by Peter Zijlstra to handle
>>>> page fault without holding the mm semaphore [1].
>>>>
>>>> The idea is to try to handle user space page faults without holding the
>>>> mmap_sem. This should allow better concurrency for massively threaded
>>>> process since the page fault handler will not wait for other threads memory
>>>> layout change to be done, assuming that this change is done in another part
>>>> of the process's memory space. This type of page fault is named speculative
>>>> page fault. If the speculative page fault fails because a concurrent change is
>>>> detected or because the underlying PMD or PTE tables are not yet allocated, its
>>>> processing is aborted and a classic page fault is then tried.
>>>>
>>>> The speculative page fault (SPF) has to look for the VMA matching the fault
>>>> address without holding the mmap_sem, this is done by introducing a rwlock
>>>> which protects the access to the mm_rb tree. Previously this was done using
>>>> SRCU but it was introducing a lot of scheduling to process the VMA's
>>>> freeing operation which was hitting the performance by 20% as reported by
>>>> Kemi Wang [2]. Using a rwlock to protect access to the mm_rb tree is
>>>> limiting the locking contention to these operations which are expected to
>>>> be in a O(log n) order. In addition, to ensure that the VMA is not freed
>>>> behind our back, a reference count is added and 2 services (get_vma() and
>>>> put_vma()) are introduced to handle the reference count. Once a VMA is
>>>> fetched from the RB tree using get_vma(), it must be later freed using
>>>> put_vma(). I can no longer see the overhead I previously got with the
>>>> will-it-scale benchmark.
>>>>
>>>> The VMA's attributes checked during the speculative page fault processing
>>>> have to be protected against parallel changes. This is done by using a per
>>>> VMA sequence lock. This sequence lock allows the speculative page fault
>>>> handler to fast check for parallel changes in progress and to abort the
>>>> speculative page fault in that case.
>>>>
>>>> Once the VMA has been found, the speculative page fault handler would check
>>>> for the VMA's attributes to verify that the page fault has to be handled
>>>> correctly or not. Thus, the VMA is protected through a sequence lock which
>>>> allows fast detection of concurrent VMA changes. If such a change is
>>>> detected, the speculative page fault is aborted and a *classic* page fault
>>>> is tried.  VMA sequence lockings are added when VMA attributes which are
>>>> checked during the page fault are modified.
>>>>
>>>> When the PTE is fetched, the VMA is checked to see if it has been changed,
>>>> so once the page table is locked, the VMA is valid, so any other changes
>>>> leading to touching this PTE will need to lock the page table, so no
>>>> parallel change is possible at this time.
>>>>
>>>> The locking of the PTE is done with interrupts disabled, this allows
>>>> checking for the PMD to ensure that there is not an ongoing collapsing
>>>> operation. Since khugepaged first sets the PMD to pmd_none and then
>>>> waits for the other CPU to have caught the IPI interrupt, if the pmd is
>>>> valid at the time the PTE is locked, we have the guarantee that the
>>>> collapsing operation will have to wait on the PTE lock to move forward.
>>>> This allows the SPF handler to map the PTE safely. If the PMD value is
>>>> different from the one recorded at the beginning of the SPF operation, the
>>>> classic page fault handler will be called to handle the operation while
>>>> holding the mmap_sem. As the PTE lock is done with the interrupts disabled,
>>>> the lock is done using spin_trylock() to avoid dead lock when handling a
>>>> page fault while a TLB invalidate is requested by another CPU holding the
>>>> PTE.
>>>>
>>>> In pseudo code, this could be seen as:
>>>>     speculative_page_fault()
>>>>     {
>>>>             vma = get_vma()
>>>>             check vma sequence count
>>>>             check vma's support
>>>>             disable interrupt
>>>>                   check pgd,p4d,...,pte
>>>>                   save pmd and pte in vmf
>>>>                   save vma sequence counter in vmf
>>>>             enable interrupt
>>>>             check vma sequence count
>>>>             handle_pte_fault(vma)
>>>>                     ..
>>>>                     page = alloc_page()
>>>>                     pte_map_lock()
>>>>                             disable interrupt
>>>>                                     abort if sequence counter has changed
>>>>                                     abort if pmd or pte has changed
>>>>                                     pte map and lock
>>>>                             enable interrupt
>>>>                     if abort
>>>>                        free page
>>>>                        abort
>>>>                     ...
>>>>     }
>>>>
>>>>     arch_fault_handler()
>>>>     {
>>>>             if (speculative_page_fault(&vma))
>>>>                goto done
>>>>     again:
>>>>             lock(mmap_sem)
>>>>             vma = find_vma();
>>>>             handle_pte_fault(vma);
>>>>             if retry
>>>>                unlock(mmap_sem)
>>>>                goto again;
>>>>     done:
>>>>             handle fault error
>>>>     }
>>>>
>>>> Support for THP is not done because when checking for the PMD, we can be
>>>> confused by an in progress collapsing operation done by khugepaged. The
>>>> issue is that pmd_none() could be true either if the PMD is not already
>>>> populated or if the underlying PTE are in the way to be collapsed. So we
>>>> cannot safely allocate a PMD if pmd_none() is true.
>>>>
>>>> This series add a new software performance event named 'speculative-faults'
>>>> or 'spf'. It counts the number of successful page fault event handled
>>>> speculatively. When recording 'faults,spf' events, the faults one is
>>>> counting the total number of page fault events while 'spf' is only counting
>>>> the part of the faults processed speculatively.
>>>>
>>>> There are some trace events introduced by this series. They allow
>>>> identifying why the page faults were not processed speculatively. This
>>>> doesn't take into account the faults generated by a monothreaded process,
>>>> which are directly processed while holding the mmap_sem. These trace events are
>>>> grouped in a system named 'pagefault'; they are:
>>>>  - pagefault:spf_vma_changed : if the VMA has been changed behind our back
>>>>  - pagefault:spf_vma_noanon : the vma->anon_vma field was not yet set.
>>>>  - pagefault:spf_vma_notsup : the VMA's type is not supported
>>>>  - pagefault:spf_vma_access : the VMA's access rights are not respected
>>>>  - pagefault:spf_pmd_changed : the upper PMD pointer has changed behind our
>>>>    back.
>>>>
>>>> To record all the related events, the easiest way is to run perf with the
>>>> following arguments:
>>>> $ perf stat -e 'faults,spf,pagefault:*' <command>
>>>>
>>>> There is also a dedicated vmstat counter showing the number of successful
>>>> page faults handled speculatively. It can be seen this way:
>>>> $ grep speculative_pgfault /proc/vmstat
>>>>
>>>> This series builds on top of v4.16-mmotm-2018-04-13-17-28 and is functional
>>>> on x86, PowerPC and arm64.
>>>>
>>>> ---------------------
>>>> Real Workload results
>>>>
>>>> As mentioned in previous email, we did unofficial runs using a "popular
>>>> in memory multithreaded database product" on 176 cores SMT8 Power system
>>>> which showed a 30% improvement in the number of transactions processed per
>>>> second. This run has been done on the v6 series, but changes introduced in
>>>> this new version should not impact the performance boost seen.
>>>>
>>>> Here are the perf data captured during 2 of these runs on top of the v8
>>>> series:
>>>>                 vanilla         spf
>>>> faults          89.418          101.364         +13%
>>>> spf                n/a           97.989
>>>>
>>>> With the SPF kernel, most of the page faults were processed in a speculative
>>>> way.
>>>>
>>>> Ganesh Mahendran had backported the series on top of a 4.9 kernel and gave
>>>> it a try on an android device. He reported that the application launch time
>>>> was improved on average by 6%, and for large applications (~100 threads) by
>>>> 20%.
>>>>
>>>> Here are the launch times Ganesh measured on Android 8.0 on top of a Qcom
>>>> MSM845 (8 cores) with 6GB (lower is better):
>>>>
>>>> Application                             4.9     4.9+spf delta
>>>> com.tencent.mm                          416     389     -7%
>>>> com.eg.android.AlipayGphone             1135    986     -13%
>>>> com.tencent.mtt                         455     454     0%
>>>> com.qqgame.hlddz                        1497    1409    -6%
>>>> com.autonavi.minimap                    711     701     -1%
>>>> com.tencent.tmgp.sgame                  788     748     -5%
>>>> com.immomo.momo                         501     487     -3%
>>>> com.tencent.peng                        2145    2112    -2%
>>>> com.smile.gifmaker                      491     461     -6%
>>>> com.baidu.BaiduMap                      479     366     -23%
>>>> com.taobao.taobao                       1341    1198    -11%
>>>> com.baidu.searchbox                     333     314     -6%
>>>> com.tencent.mobileqq                    394     384     -3%
>>>> com.sina.weibo                          907     906     0%
>>>> com.youku.phone                         816     731     -11%
>>>> com.happyelements.AndroidAnimal.qq      763     717     -6%
>>>> com.UCMobile                            415     411     -1%
>>>> com.tencent.tmgp.ak                     1464    1431    -2%
>>>> com.tencent.qqmusic                     336     329     -2%
>>>> com.sankuai.meituan                     1661    1302    -22%
>>>> com.netease.cloudmusic                  1193    1200    1%
>>>> air.tv.douyu.android                    4257    4152    -2%
>>>>
>>>> ------------------
>>>> Benchmarks results
>>>>
>>>> Base kernel is v4.17.0-rc4-mm1
>>>> SPF is BASE + this series
>>>>
>>>> Kernbench:
>>>> ----------
>>>> Here are the results on a 16 CPUs X86 guest using kernbench on a 4.15
>>>> kernel (kernel is built 5 times):
>>>>
>>>> Average Half load -j 8
>>>>                  Run    (std deviation)
>>>>                  BASE                   SPF
>>>> Elapsed Time     1448.65 (5.72312)      1455.84 (4.84951)       0.50%
>>>> User    Time     10135.4 (30.3699)      10148.8 (31.1252)       0.13%
>>>> System  Time     900.47  (2.81131)      923.28  (7.52779)       2.53%
>>>> Percent CPU      761.4   (1.14018)      760.2   (0.447214)      -0.16%
>>>> Context Switches 85380   (3419.52)      84748   (1904.44)       -0.74%
>>>> Sleeps           105064  (1240.96)      105074  (337.612)       0.01%
>>>>
>>>> Average Optimal load -j 16
>>>>                  Run    (std deviation)
>>>>                  BASE                   SPF
>>>> Elapsed Time     920.528 (10.1212)      927.404 (8.91789)       0.75%
>>>> User    Time     11064.8 (981.142)      11085   (990.897)       0.18%
>>>> System  Time     979.904 (84.0615)      1001.14 (82.5523)       2.17%
>>>> Percent CPU      1089.5  (345.894)      1086.1  (343.545)       -0.31%
>>>> Context Switches 159488  (78156.4)      158223  (77472.1)       -0.79%
>>>> Sleeps           110566  (5877.49)      110388  (5617.75)       -0.16%
>>>>
>>>>
>>>> During a run on the SPF, perf events were captured:
>>>>  Performance counter stats for '../kernbench -M':
>>>>          526743764      faults
>>>>                210      spf
>>>>                  3      pagefault:spf_vma_changed
>>>>                  0      pagefault:spf_vma_noanon
>>>>               2278      pagefault:spf_vma_notsup
>>>>                  0      pagefault:spf_vma_access
>>>>                  0      pagefault:spf_pmd_changed
>>>>
>>>> Very few speculative page faults were recorded as most of the processes
>>>> involved are monothreaded (it seems that on this architecture some threads
>>>> were created during the kernel build processing).
>>>>
>>>> Here are the kernbench results on a 80 CPUs Power8 system:
>>>>
>>>> Average Half load -j 40
>>>>                  Run    (std deviation)
>>>>                  BASE                   SPF
>>>> Elapsed Time     117.152 (0.774642)     117.166 (0.476057)      0.01%
>>>> User    Time     4478.52 (24.7688)      4479.76 (9.08555)       0.03%
>>>> System  Time     131.104 (0.720056)     134.04  (0.708414)      2.24%
>>>> Percent CPU      3934    (19.7104)      3937.2  (19.0184)       0.08%
>>>> Context Switches 92125.4 (576.787)      92581.6 (198.622)       0.50%
>>>> Sleeps           317923  (652.499)      318469  (1255.59)       0.17%
>>>>
>>>> Average Optimal load -j 80
>>>>                  Run    (std deviation)
>>>>                  BASE                   SPF
>>>> Elapsed Time     107.73  (0.632416)     107.31  (0.584936)      -0.39%
>>>> User    Time     5869.86 (1466.72)      5871.71 (1467.27)       0.03%
>>>> System  Time     153.728 (23.8573)      157.153 (24.3704)       2.23%
>>>> Percent CPU      5418.6  (1565.17)      5436.7  (1580.91)       0.33%
>>>> Context Switches 223861  (138865)       225032  (139632)        0.52%
>>>> Sleeps           330529  (13495.1)      332001  (14746.2)       0.45%
>>>>
>>>> During a run on the SPF, perf events were captured:
>>>>  Performance counter stats for '../kernbench -M':
>>>>          116730856      faults
>>>>                  0      spf
>>>>                  3      pagefault:spf_vma_changed
>>>>                  0      pagefault:spf_vma_noanon
>>>>                476      pagefault:spf_vma_notsup
>>>>                  0      pagefault:spf_vma_access
>>>>                  0      pagefault:spf_pmd_changed
>>>>
>>>> Most of the processes involved are monothreaded so SPF is not activated but
>>>> there is no impact on the performance.
>>>>
>>>> Ebizzy:
>>>> -------
>>>> The test counts the number of records per second it can manage; higher
>>>> is better. I ran it like this: 'ebizzy -mTt <nrcpus>'. To get
>>>> consistent results I repeated the test 100 times and measured the average
>>>> result. The number is the records processed per second; higher is
>>>> better.
>>>>
>>>>                 BASE            SPF             delta
>>>> 16 CPUs x86 VM  742.57          1490.24         100.69%
>>>> 80 CPUs P8 node 13105.4         24174.23        84.46%
>>>>
>>>> Here are the performance counter read during a run on a 16 CPUs x86 VM:
>>>>  Performance counter stats for './ebizzy -mTt 16':
>>>>            1706379      faults
>>>>            1674599      spf
>>>>              30588      pagefault:spf_vma_changed
>>>>                  0      pagefault:spf_vma_noanon
>>>>                363      pagefault:spf_vma_notsup
>>>>                  0      pagefault:spf_vma_access
>>>>                  0      pagefault:spf_pmd_changed
>>>>
>>>> And the ones captured during a run on a 80 CPUs Power node:
>>>>  Performance counter stats for './ebizzy -mTt 80':
>>>>            1874773      faults
>>>>            1461153      spf
>>>>             413293      pagefault:spf_vma_changed
>>>>                  0      pagefault:spf_vma_noanon
>>>>                200      pagefault:spf_vma_notsup
>>>>                  0      pagefault:spf_vma_access
>>>>                  0      pagefault:spf_pmd_changed
>>>>
>>>> In ebizzy's case most of the page faults were handled in a speculative way,
>>>> leading to the ebizzy performance boost.
>>>>
>>>> ------------------
>>>> Changes since v10 (https://lkml.org/lkml/2018/4/17/572):
>>>>  - Accounted for all review feedbacks from Punit Agrawal, Ganesh Mahendran
>>>>    and Minchan Kim, hopefully.
>>>>  - Remove unneeded check on CONFIG_SPECULATIVE_PAGE_FAULT in
>>>>    __do_page_fault().
>>>>  - Loop in pte_spinlock() and pte_map_lock() when pte try lock fails
>>>>    instead of aborting the speculative page fault handling. Dropping the
>>>>    now useless trace event pagefault:spf_pte_lock.
>>>>  - No more try to reuse the fetched VMA during the speculative page fault
>>>>    handling when retrying is needed. This adds a lot of complexity and
>>>>    additional tests done didn't show a significant performance improvement.
>>>>  - Convert IS_ENABLED(CONFIG_NUMA) back to #ifdef due to build error.
>>>>
>>>> [1] http://linux-kernel.2935.n7.nabble.com/RFC-PATCH-0-6-Another-go-at-speculative-page-faults-tt965642.html#none
>>>> [2] https://patchwork.kernel.org/patch/9999687/
>>>>
>>>>
>>>> Laurent Dufour (20):
>>>>   mm: introduce CONFIG_SPECULATIVE_PAGE_FAULT
>>>>   x86/mm: define ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
>>>>   powerpc/mm: set ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
>>>>   mm: introduce pte_spinlock for FAULT_FLAG_SPECULATIVE
>>>>   mm: make pte_unmap_same compatible with SPF
>>>>   mm: introduce INIT_VMA()
>>>>   mm: protect VMA modifications using VMA sequence count
>>>>   mm: protect mremap() against SPF hanlder
>>>>   mm: protect SPF handler against anon_vma changes
>>>>   mm: cache some VMA fields in the vm_fault structure
>>>>   mm/migrate: Pass vm_fault pointer to migrate_misplaced_page()
>>>>   mm: introduce __lru_cache_add_active_or_unevictable
>>>>   mm: introduce __vm_normal_page()
>>>>   mm: introduce __page_add_new_anon_rmap()
>>>>   mm: protect mm_rb tree with a rwlock
>>>>   mm: adding speculative page fault failure trace events
>>>>   perf: add a speculative page fault sw event
>>>>   perf tools: add support for the SPF perf event
>>>>   mm: add speculative page fault vmstats
>>>>   powerpc/mm: add speculative page fault
>>>>
>>>> Mahendran Ganesh (2):
>>>>   arm64/mm: define ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
>>>>   arm64/mm: add speculative page fault
>>>>
>>>> Peter Zijlstra (4):
>>>>   mm: prepare for FAULT_FLAG_SPECULATIVE
>>>>   mm: VMA sequence count
>>>>   mm: provide speculative fault infrastructure
>>>>   x86/mm: add speculative pagefault handling
>>>>
>>>>  arch/arm64/Kconfig                    |   1 +
>>>>  arch/arm64/mm/fault.c                 |  12 +
>>>>  arch/powerpc/Kconfig                  |   1 +
>>>>  arch/powerpc/mm/fault.c               |  16 +
>>>>  arch/x86/Kconfig                      |   1 +
>>>>  arch/x86/mm/fault.c                   |  27 +-
>>>>  fs/exec.c                             |   2 +-
>>>>  fs/proc/task_mmu.c                    |   5 +-
>>>>  fs/userfaultfd.c                      |  17 +-
>>>>  include/linux/hugetlb_inline.h        |   2 +-
>>>>  include/linux/migrate.h               |   4 +-
>>>>  include/linux/mm.h                    | 136 +++++++-
>>>>  include/linux/mm_types.h              |   7 +
>>>>  include/linux/pagemap.h               |   4 +-
>>>>  include/linux/rmap.h                  |  12 +-
>>>>  include/linux/swap.h                  |  10 +-
>>>>  include/linux/vm_event_item.h         |   3 +
>>>>  include/trace/events/pagefault.h      |  80 +++++
>>>>  include/uapi/linux/perf_event.h       |   1 +
>>>>  kernel/fork.c                         |   5 +-
>>>>  mm/Kconfig                            |  22 ++
>>>>  mm/huge_memory.c                      |   6 +-
>>>>  mm/hugetlb.c                          |   2 +
>>>>  mm/init-mm.c                          |   3 +
>>>>  mm/internal.h                         |  20 ++
>>>>  mm/khugepaged.c                       |   5 +
>>>>  mm/madvise.c                          |   6 +-
>>>>  mm/memory.c                           | 612 +++++++++++++++++++++++++++++-----
>>>>  mm/mempolicy.c                        |  51 ++-
>>>>  mm/migrate.c                          |   6 +-
>>>>  mm/mlock.c                            |  13 +-
>>>>  mm/mmap.c                             | 229 ++++++++++---
>>>>  mm/mprotect.c                         |   4 +-
>>>>  mm/mremap.c                           |  13 +
>>>>  mm/nommu.c                            |   2 +-
>>>>  mm/rmap.c                             |   5 +-
>>>>  mm/swap.c                             |   6 +-
>>>>  mm/swap_state.c                       |   8 +-
>>>>  mm/vmstat.c                           |   5 +-
>>>>  tools/include/uapi/linux/perf_event.h |   1 +
>>>>  tools/perf/util/evsel.c               |   1 +
>>>>  tools/perf/util/parse-events.c        |   4 +
>>>>  tools/perf/util/parse-events.l        |   1 +
>>>>  tools/perf/util/python.c              |   1 +
>>>>  44 files changed, 1161 insertions(+), 211 deletions(-)
>>>>  create mode 100644 include/trace/events/pagefault.h
>>>>
>>>> --
>>>> 2.7.4
>>>>
>>>>
>>>
>>
> 

^ permalink raw reply

* Re: pkeys on POWER: Access rights not reset on execve
From: Ram Pai @ 2018-06-11 17:23 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Linux-MM, linuxppc-dev, Andy Lutomirski, Dave Hansen
In-Reply-To: <2858a8eb-c9b5-42ce-5cfc-74a4b3ad6aa9@redhat.com>

On Fri, Jun 08, 2018 at 07:53:51AM +0200, Florian Weimer wrote:
> On 06/08/2018 04:34 AM, Ram Pai wrote:
> >>
> >>So the remaining question at this point is whether the Intel
> >>behavior (default-deny instead of default-allow) is preferable.
> >
> >Florian, remind me what behavior needs to be fixed?
> 
> See the other thread.  The Intel register equivalent to the AMR by
> default disallows access to yet-unallocated keys, so that threads
> which are created before key allocation do not magically gain access
> to a key allocated by another thread.

Are you referring to the thread
'[PATCH] pkeys: Introduce PKEY_ALLOC_SIGNALINHERIT and change signal semantics'

If yes, I will wait for your next version of the patch.

Otherwise please point me to the URL of that thread. Sorry and thankx. :)
RP

^ permalink raw reply

* Re: pkeys on POWER: Access rights not reset on execve
From: Florian Weimer @ 2018-06-11 17:29 UTC (permalink / raw)
  To: Ram Pai; +Cc: Linux-MM, linuxppc-dev, Andy Lutomirski, Dave Hansen
In-Reply-To: <20180611172305.GB5697@ram.oc3035372033.ibm.com>

On 06/11/2018 07:23 PM, Ram Pai wrote:
> On Fri, Jun 08, 2018 at 07:53:51AM +0200, Florian Weimer wrote:
>> On 06/08/2018 04:34 AM, Ram Pai wrote:
>>>>
>>>> So the remaining question at this point is whether the Intel
>>>> behavior (default-deny instead of default-allow) is preferable.
>>>
>>> Florian, remind me what behavior needs to be fixed?
>>
>> See the other thread.  The Intel register equivalent to the AMR by
>> default disallows access to yet-unallocated keys, so that threads
>> which are created before key allocation do not magically gain access
>> to a key allocated by another thread.
> 
> Are you referring to the thread
> '[PATCH] pkeys: Introduce PKEY_ALLOC_SIGNALINHERIT and change signal semantics'

> Otherwise please point me to the URL of that thread. Sorry and thankx. :)

No, it's this issue:

   <https://lists.ozlabs.org/pipermail/linuxppc-dev/2018-May/173157.html>

The UAMOR part has been fixed (thanks), but I think processes still 
start out with default-allow AMR.

Thanks,
Florian

^ permalink raw reply

* Re: [v3, 03/10] dt-binding: ptp_qoriq: add DPAA FMan support
From: Rob Herring @ 2018-06-11 18:25 UTC (permalink / raw)
  To: Yangbo Lu
  Cc: netdev, madalin.bucur, Richard Cochran, Shawn Guo,
	David S . Miller, devicetree, linuxppc-dev, linux-arm-kernel,
	linux-kernel
In-Reply-To: <20180607092050.46128-4-yangbo.lu@nxp.com>

On Thu, Jun 07, 2018 at 05:20:43PM +0800, Yangbo Lu wrote:
> This patch is to add bindings description for DPAA
> FMan 1588 timer, and also remove its description in
> fsl-fman dt-bindings document.
> 
> Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
> ---
> Changes for v2:
> 	- None.
> Changes for v3:
> 	- None.
> ---
>  Documentation/devicetree/bindings/net/fsl-fman.txt |   25 +-------------------
>  .../devicetree/bindings/ptp/ptp-qoriq.txt          |   15 +++++++++--
>  2 files changed, 13 insertions(+), 27 deletions(-)

Reviewed-by: Rob Herring <robh@kernel.org>

^ permalink raw reply

* Re: pkeys on POWER: Access rights not reset on execve
From: Ram Pai @ 2018-06-11 20:08 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Linux-MM, linuxppc-dev, Andy Lutomirski, Dave Hansen
In-Reply-To: <30f5cb0e-e09a-15e6-f77d-a3afa422a651@redhat.com>

On Mon, Jun 11, 2018 at 07:29:33PM +0200, Florian Weimer wrote:
> On 06/11/2018 07:23 PM, Ram Pai wrote:
> >On Fri, Jun 08, 2018 at 07:53:51AM +0200, Florian Weimer wrote:
> >>On 06/08/2018 04:34 AM, Ram Pai wrote:
> >>>>
> >>>>So the remaining question at this point is whether the Intel
> >>>>behavior (default-deny instead of default-allow) is preferable.
> >>>
> >>>Florian, remind me what behavior needs to be fixed?
> >>
> >>See the other thread.  The Intel register equivalent to the AMR by
> >>default disallows access to yet-unallocated keys, so that threads
> >>which are created before key allocation do not magically gain access
> >>to a key allocated by another thread.
> >
> >Are you referring to the thread
> >'[PATCH] pkeys: Introduce PKEY_ALLOC_SIGNALINHERIT and change signal semantics'
> 
> >Otherwise please point me to the URL of that thread. Sorry and thankx. :)
> 
> No, it's this issue:
> 
>   ...

Ok. try this patch. This patch is on top of the 5 patches that I had
sent last week i.e  "[PATCH  0/5] powerpc/pkeys: fixes to pkeys"

The following is a draft patch though to check if it meets your
expectations.

commit fe53b5fe2dcb3139ea27ade3ae7cbbe43c4af3be
Author: Ram Pai <linuxram@us.ibm.com>
Date:   Mon Jun 11 14:57:34 2018 -0500

    powerpc/pkeys: Deny read/write/execute by default
    
    Deny everything for all keys; with some exceptions. Do not do this for
    pkey-0, or else everything will come to a screeching halt.  Also by
    default, do not deny execute for execute-only key.
    
    This is a draft-patch for now.
    
    Signed-off-by: Ram Pai <linuxram@us.ibm.com>

diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 8225263..289aafd 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -128,13 +128,13 @@ int pkey_initialize(void)
 
 	/* register mask is in BE format */
 	pkey_amr_mask = ~0x0ul;
-	pkey_iamr_mask = ~0x0ul;
+	pkey_amr_mask &= ~(0x3ul << pkeyshift(PKEY_0));
+	pkey_amr_mask &= ~(0x3ul << pkeyshift(1));
 
-	for (i = 0; i < (pkeys_total - os_reserved); i++) {
-		pkey_amr_mask &= ~(0x3ul << pkeyshift(i));
-		pkey_iamr_mask &= ~(0x1ul << pkeyshift(i));
-	}
-	pkey_amr_mask |= (AMR_RD_BIT|AMR_WR_BIT) << pkeyshift(EXECUTE_ONLY_KEY);
+	pkey_iamr_mask = ~0x0ul;
+	pkey_iamr_mask &= ~(0x3ul << pkeyshift(PKEY_0));
+	pkey_iamr_mask &= ~(0x3ul << pkeyshift(1));
+	pkey_iamr_mask &= ~(0x3ul << pkeyshift(EXECUTE_ONLY_KEY));
 
 	pkey_uamor_mask = ~0x0ul;
 	pkey_uamor_mask &= ~(0x3ul << pkeyshift(PKEY_0));

-- 
Ram Pai

^ permalink raw reply related

* Re: 4.17.0-10146-gf0dc7f9c6dd9: hw csum failure on powerpc+sungem
From: Mathieu Malaterre @ 2018-06-11 20:20 UTC (permalink / raw)
  To: Meelis Roos; +Cc: netdev, Mauro Carvalho Chehab, linuxppc-dev, LKML
In-Reply-To: <alpine.LRH.2.21.1806111352330.17091@math.ut.ee>

Hi Meelis,

On Mon, Jun 11, 2018 at 1:21 PM Meelis Roos <mroos@linux.ee> wrote:
>
> I am seeing this on PowerMac G4 with sungem ethernet driver. 4.17 was
> OK, 4.17.0-10146-gf0dc7f9c6dd9 is problematic.

Same here.

> [  140.518664] eth0: hw csum failure
> [  140.518699] CPU: 0 PID: 1237 Comm: postconf Not tainted 4.17.0-10146-gf0dc7f9c6dd9 #83
> [  140.518707] Call Trace:
> [  140.518734] [effefd90] [c03d6db8] __skb_checksum_complete+0xd8/0xdc (unreliable)
> [  140.518759] [effefdb0] [c04c1284] icmpv6_rcv+0x248/0x4ec
> [  140.518775] [effefdd0] [c049a448] ip6_input_finish.constprop.0+0x11c/0x5f4
> [  140.518786] [effefe10] [c049b1c0] ip6_mc_input+0xcc/0x100
> [  140.518807] [effefe20] [c03e110c] __netif_receive_skb_core+0x310/0x944
> [  140.518820] [effefe70] [c03e76ec] napi_gro_receive+0xd0/0xe8
> [  140.518845] [effefe80] [f3e1f66c] gem_poll+0x618/0x1274 [sungem]
> [  140.518856] [effeff30] [c03e6f0c] net_rx_action+0x198/0x374
> [  140.518872] [effeff90] [c0501a88] __do_softirq+0x120/0x278
> [  140.518890] [effeffe0] [c0036188] irq_exit+0xd8/0xdc
> [  140.518908] [effefff0] [c000f478] call_do_irq+0x24/0x3c
> [  140.518925] [d05a5d30] [c0007120] do_IRQ+0x74/0xf0
> [  140.518941] [d05a5d50] [c0012474] ret_from_except+0x0/0x14
> [  140.518960] --- interrupt: 501 at copy_page+0x40/0x90
>                    LR = copy_user_page+0x18/0x30
> [  140.518973] [d05a5e10] [d058cd80] 0xd058cd80 (unreliable)
> [  140.518989] [d05a5e20] [c00fa2bc] wp_page_copy+0xec/0x654
> [  140.519002] [d05a5e60] [c00fd3a4] do_wp_page+0xa8/0x5b4
> [  140.519013] [d05a5e90] [c00fe934] handle_mm_fault+0x564/0xa84
> [  140.519025] [d05a5f00] [c0016230] do_page_fault+0x1bc/0x7e8
> [  140.519037] [d05a5f40] [c0012300] handle_page_fault+0x14/0x40
> [  140.519048] --- interrupt: 301 at 0xb78b6864
>                    LR = 0xb78b6c54
>

For some reason if I do a git bisect it returns that:

$ git bisect good
3036bc45364f98515a2c446d7fac2c34dcfbeff4 is the first bad commit

Could you also check on your side please.

> --
> Meelis Roos (mroos@linux.ee)

^ permalink raw reply

* [PATCH] misc: ocxl: Change return type for fault handler
From: Souptick Joarder @ 2018-06-11 20:29 UTC (permalink / raw)
  To: willy, fbarrat, andrew.donnellan, arnd, gregkh
  Cc: linuxppc-dev, linux-kernel, sabyasachi.linux, brajeswar.linux

Use new return type vm_fault_t for fault handler. For
now, this is just documenting that the function returns
a VM_FAULT value rather than an errno. Once all instances
are converted, vm_fault_t will become a distinct type.

Ref-> commit 1c8f422059ae ("mm: change return type to vm_fault_t")

There is an existing bug where vm_insert_pfn() can return
ENOMEM, which was ignored and VM_FAULT_NOPAGE returned as
default. The new inline vmf_insert_pfn() has removed
this inefficiency by returning the correct vm_fault_t type.

Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
---
 drivers/misc/ocxl/context.c | 22 +++++++++++-----------
 drivers/misc/ocxl/sysfs.c   |  5 ++---
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c
index 909e880..98daf91 100644
--- a/drivers/misc/ocxl/context.c
+++ b/drivers/misc/ocxl/context.c
@@ -83,7 +83,7 @@ int ocxl_context_attach(struct ocxl_context *ctx, u64 amr)
 	return rc;
 }
 
-static int map_afu_irq(struct vm_area_struct *vma, unsigned long address,
+static vm_fault_t map_afu_irq(struct vm_area_struct *vma, unsigned long address,
 		u64 offset, struct ocxl_context *ctx)
 {
 	u64 trigger_addr;
@@ -92,15 +92,15 @@ static int map_afu_irq(struct vm_area_struct *vma, unsigned long address,
 	if (!trigger_addr)
 		return VM_FAULT_SIGBUS;
 
-	vm_insert_pfn(vma, address, trigger_addr >> PAGE_SHIFT);
-	return VM_FAULT_NOPAGE;
+	return vmf_insert_pfn(vma, address, trigger_addr >> PAGE_SHIFT);
 }
 
-static int map_pp_mmio(struct vm_area_struct *vma, unsigned long address,
+static vm_fault_t map_pp_mmio(struct vm_area_struct *vma, unsigned long address,
 		u64 offset, struct ocxl_context *ctx)
 {
 	u64 pp_mmio_addr;
 	int pasid_off;
+	vm_fault_t ret;
 
 	if (offset >= ctx->afu->config.pp_mmio_stride)
 		return VM_FAULT_SIGBUS;
@@ -118,27 +118,27 @@ static int map_pp_mmio(struct vm_area_struct *vma, unsigned long address,
 		pasid_off * ctx->afu->config.pp_mmio_stride +
 		offset;
 
-	vm_insert_pfn(vma, address, pp_mmio_addr >> PAGE_SHIFT);
+	ret = vmf_insert_pfn(vma, address, pp_mmio_addr >> PAGE_SHIFT);
 	mutex_unlock(&ctx->status_mutex);
-	return VM_FAULT_NOPAGE;
+	return ret;
 }
 
-static int ocxl_mmap_fault(struct vm_fault *vmf)
+static vm_fault_t ocxl_mmap_fault(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct ocxl_context *ctx = vma->vm_file->private_data;
 	u64 offset;
-	int rc;
+	vm_fault_t ret;
 
 	offset = vmf->pgoff << PAGE_SHIFT;
 	pr_debug("%s: pasid %d address 0x%lx offset 0x%llx\n", __func__,
 		ctx->pasid, vmf->address, offset);
 
 	if (offset < ctx->afu->irq_base_offset)
-		rc = map_pp_mmio(vma, vmf->address, offset, ctx);
+		ret = map_pp_mmio(vma, vmf->address, offset, ctx);
 	else
-		rc = map_afu_irq(vma, vmf->address, offset, ctx);
-	return rc;
+		ret = map_afu_irq(vma, vmf->address, offset, ctx);
+	return ret;
 }
 
 static const struct vm_operations_struct ocxl_vmops = {
diff --git a/drivers/misc/ocxl/sysfs.c b/drivers/misc/ocxl/sysfs.c
index d9753a1..0ab1fd1 100644
--- a/drivers/misc/ocxl/sysfs.c
+++ b/drivers/misc/ocxl/sysfs.c
@@ -64,7 +64,7 @@ static ssize_t global_mmio_read(struct file *filp, struct kobject *kobj,
 	return count;
 }
 
-static int global_mmio_fault(struct vm_fault *vmf)
+static vm_fault_t global_mmio_fault(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct ocxl_afu *afu = vma->vm_private_data;
@@ -75,8 +75,7 @@ static int global_mmio_fault(struct vm_fault *vmf)
 
 	offset = vmf->pgoff;
 	offset += (afu->global_mmio_start >> PAGE_SHIFT);
-	vm_insert_pfn(vma, vmf->address, offset);
-	return VM_FAULT_NOPAGE;
+	return vmf_insert_pfn(vma, vmf->address, offset);
 }
 
 static const struct vm_operations_struct global_mmio_vmops = {
-- 
1.9.1

^ permalink raw reply related

* linux-next: build warnings from Linus' tree
From: Stephen Rothwell @ 2018-06-11 22:14 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, PowerPC
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 548 bytes --]

Hi all,

Building Linus' tree, today's linux-next build (powerpc ppc64_defconfig)
produced these warnings:

ld: warning: orphan section `.gnu.hash' from `linker stubs' being placed in section `.gnu.hash'.
ld: warning: orphan section `.gnu.hash' from `linker stubs' being placed in section `.gnu.hash'.
ld: warning: orphan section `.gnu.hash' from `linker stubs' being placed in section `.gnu.hash'.

This may just be because I have started building using the native Debian
gcc for the powerpc builds ...

-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH v2 08/12] macintosh/via-pmu68k: Don't load driver on unsupported hardware
From: Finn Thain @ 2018-06-11 23:47 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Michael Schmitz, Andreas Schwab, linuxppc-dev, linux-m68k,
	linux-kernel, Geert Uytterhoeven, Laurent Vivier
In-Reply-To: <3490f7d6f45ec2586858e4547e97474d93bd2d4d.camel@kernel.crashing.org>

On Sun, 10 Jun 2018, Benjamin Herrenschmidt wrote:

> Pre-PCI is basically "NUBUS" based even in absence of an actual NuBus 
> slot :-) It has to do with the internal HW architecture. The only ones 
> that aren't are the even older designs (the 68000 based ones).
> 

There is already some disagreement in the comments in the nubus-pmac code 
about the suitability of "PMU_NUBUS_BASED" as opposed to e.g. 
"PMU_WHITNEY_BASED".

Point is, the PMU driver doesn't care about the expansion slots or 
architecture (Whitney-based PMU appears on m68k and powerpc). So NuBus vs. 
PCI is a red herring here. The pmu_kind relates to backlight, buttons and 
battery.

(Leaving aside the PMU driver, if a pre-OpenFirmware Mac has a "slot zero" 
ROM, one can argue that it is actually a NuBus machine, regardless of any 
actual expansion slots.)

> What's the situation with those NuBus things ? What do they use as a 
> bootloader ? The old Apple one or BootX ? We should merge that port if 
> it's maintained.
> 

I agree that this code should not languish out-of-tree. But it would need 
more work before it could reasonably be submitted to reviewers.

I do have some nubus-pmac hardware but I also have more mac/68k driver 
work to do before I can tackle another architecture.

I don't know what the bootloader situation is, but it looks messy...
http://nubus-pmac.sourceforge.net/#booters

Laurent, does Emile work on these machines?

-- 

> Cheers,
> Ben.
>  

^ permalink raw reply

* linux-next: build failure in Linus' tree
From: Stephen Rothwell @ 2018-06-12  2:26 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List, David Miller,
	Networking, Michael Ellerman, Benjamin Herrenschmidt, PowerPC

[-- Attachment #1: Type: text/plain, Size: 528 bytes --]

Hi all,

Building Linus' tree, today's linux-next build (powerpc allyesconfig)
failed like this:

ld: net/bpfilter/bpfilter_umh.o: compiled for a little endian system and target is big endian
ld: failed to merge target specific data of file net/bpfilter/bpfilter_umh.o

This has come to light since I started using a native compiler (i.e. one
that can build executables, not just the kernel) for my PowerPC builds
on a powerpcle host.

I have switched back to my limited compiler.

-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH kernel 4/6] powerpc/powernv: Add indirect levels to it_userspace
From: David Gibson @ 2018-06-12  2:26 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linuxppc-dev, kvm-ppc, kvm, Alex Williamson,
	Benjamin Herrenschmidt
In-Reply-To: <20180608054633.18659-5-aik@ozlabs.ru>

[-- Attachment #1: Type: text/plain, Size: 16294 bytes --]

On Fri, Jun 08, 2018 at 03:46:31PM +1000, Alexey Kardashevskiy wrote:
> We want to support sparse memory and therefore huge chunks of DMA windows
> do not need to be mapped. If a DMA window is big enough to require 2 or more
> indirect levels, and a DMA window is used to map all RAM (which is
> a default case for 64bit window), we can actually save some memory by
> not allocating TCEs for regions which we are not going to map anyway.
> 
> The hardware tables already support indirect levels but we also keep a
> host-physical-to-userspace translation array which is allocated by
> vmalloc() and is a flat array which might use quite some memory.
> 
> This converts it_userspace from vmalloc'ed array to a multi level table.
> 
> As the format becomes platform dependent, this replaces the direct access
> to it_userspace with an iommu_table_ops::useraddrptr hook which returns
> a pointer to the userspace copy of a TCE; future extension will return
> NULL if the level was not allocated.
> 
> This should not change non-KVM handling of TCE tables and it_userspace
> will not be allocated for non-KVM tables.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
>  arch/powerpc/include/asm/iommu.h              |  6 +--
>  arch/powerpc/platforms/powernv/pci.h          |  3 +-
>  arch/powerpc/kvm/book3s_64_vio_hv.c           |  8 ----
>  arch/powerpc/platforms/powernv/pci-ioda-tce.c | 65 +++++++++++++++++++++------
>  arch/powerpc/platforms/powernv/pci-ioda.c     | 31 ++++++++++---
>  drivers/vfio/vfio_iommu_spapr_tce.c           | 46 -------------------
>  6 files changed, 81 insertions(+), 78 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index 803ac70..4bdcf22 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -69,6 +69,8 @@ struct iommu_table_ops {
>  			long index,
>  			unsigned long *hpa,
>  			enum dma_data_direction *direction);
> +
> +	__be64 *(*useraddrptr)(struct iommu_table *tbl, long index);
>  #endif
>  	void (*clear)(struct iommu_table *tbl,
>  			long index, long npages);
> @@ -123,9 +125,7 @@ struct iommu_table {
>  };
>  
>  #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
> -		((tbl)->it_userspace ? \
> -			&((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
> -			NULL)
> +		((tbl)->it_ops->useraddrptr((tbl), (entry)))
>  
>  /* Pure 2^n version of get_order */
>  static inline __attribute_const__
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index f507baf..5e02408 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -268,11 +268,12 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
>  extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
>  extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
>  		unsigned long *hpa, enum dma_data_direction *direction);
> +extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index);
>  extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
>  
>  extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>  		__u32 page_shift, __u64 window_size, __u32 levels,
> -		struct iommu_table *tbl);
> +		bool alloc_userspace_copy, struct iommu_table *tbl);
>  extern void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
>  
>  extern long pnv_pci_link_table_and_group(int node, int num,
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 18109f3..db0490c 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -206,10 +206,6 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
>  		/* it_userspace allocation might be delayed */
>  		return H_TOO_HARD;
>  
> -	pua = (void *) vmalloc_to_phys(pua);
> -	if (WARN_ON_ONCE_RM(!pua))
> -		return H_HARDWARE;
> -
>  	mem = mm_iommu_lookup_rm(kvm->mm, be64_to_cpu(*pua), pgsize);
>  	if (!mem)
>  		return H_TOO_HARD;
> @@ -282,10 +278,6 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
>  	if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa)))
>  		return H_HARDWARE;
>  
> -	pua = (void *) vmalloc_to_phys(pua);
> -	if (WARN_ON_ONCE_RM(!pua))
> -		return H_HARDWARE;
> -
>  	if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
>  		return H_CLOSED;
>  
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> index 700ceb1..f14b282 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> @@ -31,9 +31,9 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
>  	tbl->it_type = TCE_PCI;
>  }
>  
> -static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
> +static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx)
>  {
> -	__be64 *tmp = ((__be64 *)tbl->it_base);
> +	__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
>  	int  level = tbl->it_indirect_levels;
>  	const long shift = ilog2(tbl->it_level_size);
>  	unsigned long mask = (tbl->it_level_size - 1) << (level * shift);
> @@ -67,7 +67,7 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
>  			((rpn + i) << tbl->it_page_shift);
>  		unsigned long idx = index - tbl->it_offset + i;
>  
> -		*(pnv_tce(tbl, idx)) = cpu_to_be64(newtce);
> +		*(pnv_tce(tbl, false, idx)) = cpu_to_be64(newtce);
>  	}
>  
>  	return 0;
> @@ -86,12 +86,21 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index,
>  	if (newtce & TCE_PCI_WRITE)
>  		newtce |= TCE_PCI_READ;
>  
> -	oldtce = be64_to_cpu(xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce)));
> +	oldtce = be64_to_cpu(xchg(pnv_tce(tbl, false, idx),
> +				  cpu_to_be64(newtce)));
>  	*hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
>  	*direction = iommu_tce_direction(oldtce);
>  
>  	return 0;
>  }
> +
> +__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index)
> +{
> +	if (WARN_ON_ONCE(!tbl->it_userspace))
> +		return NULL;
> +
> +	return pnv_tce(tbl, true, index - tbl->it_offset);
> +}
>  #endif
>  
>  void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
> @@ -101,13 +110,15 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
>  	for (i = 0; i < npages; i++) {
>  		unsigned long idx = index - tbl->it_offset + i;
>  
> -		*(pnv_tce(tbl, idx)) = cpu_to_be64(0);
> +		*(pnv_tce(tbl, false, idx)) = cpu_to_be64(0);
>  	}
>  }
>  
>  unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
>  {
> -	return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset)));
> +	__be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset);
> +
> +	return be64_to_cpu(*ptce);
>  }
>  
>  static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
> @@ -144,6 +155,10 @@ void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
>  
>  	pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size,
>  			tbl->it_indirect_levels);
> +	if (tbl->it_userspace) {
> +		pnv_pci_ioda2_table_do_free_pages(tbl->it_userspace, size,
> +				tbl->it_indirect_levels);
> +	}
>  }
>  
>  static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned int shift,
> @@ -191,10 +206,11 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned int shift,
>  
>  long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>  		__u32 page_shift, __u64 window_size, __u32 levels,
> -		struct iommu_table *tbl)
> +		bool alloc_userspace_copy, struct iommu_table *tbl)
>  {
> -	void *addr;
> +	void *addr, *uas = NULL;
>  	unsigned long offset = 0, level_shift, total_allocated = 0;
> +	unsigned long total_allocated_uas = 0;
>  	const unsigned int window_shift = ilog2(window_size);
>  	unsigned int entries_shift = window_shift - page_shift;
>  	unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
> @@ -228,10 +244,20 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>  	 * we did not allocate as much as we wanted,
>  	 * release partially allocated table.
>  	 */
> -	if (offset < tce_table_size) {
> -		pnv_pci_ioda2_table_do_free_pages(addr,
> -				1ULL << (level_shift - 3), levels - 1);
> -		return -ENOMEM;
> +	if (offset < tce_table_size)
> +		goto free_tces_exit;
> +
> +	/* Allocate userspace view of the TCE table */
> +	if (alloc_userspace_copy) {
> +		offset = 0;
> +		uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
> +				levels, tce_table_size, &offset,
> +				&total_allocated_uas);
> +		if (!uas)
> +			goto free_tces_exit;
> +		if (offset < tce_table_size ||
> +				total_allocated_uas != total_allocated)
> +			goto free_uas_exit;
>  	}
>  
>  	/* Setup linux iommu table */
> @@ -240,11 +266,22 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>  	tbl->it_level_size = 1ULL << (level_shift - 3);
>  	tbl->it_indirect_levels = levels - 1;
>  	tbl->it_allocated_size = total_allocated;
> +	tbl->it_userspace = uas;
>  
> -	pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
> -			window_size, tce_table_size, bus_offset);
> +	pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d\n",
> +			window_size, tce_table_size, bus_offset, tbl->it_base,
> +			tbl->it_userspace, levels);
>  
>  	return 0;
> +
> +free_uas_exit:
> +	pnv_pci_ioda2_table_do_free_pages(uas,
> +			1ULL << (level_shift - 3), levels - 1);
> +free_tces_exit:
> +	pnv_pci_ioda2_table_do_free_pages(addr,
> +			1ULL << (level_shift - 3), levels - 1);
> +
> +	return -ENOMEM;
>  }
>  
>  static void pnv_iommu_table_group_link_free(struct rcu_head *head)
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 9577059..c61c04d 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -2043,6 +2043,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
>  #ifdef CONFIG_IOMMU_API
>  	.exchange = pnv_ioda1_tce_xchg,
>  	.exchange_rm = pnv_ioda1_tce_xchg_rm,
> +	.useraddrptr = pnv_tce_useraddrptr,
>  #endif
>  	.clear = pnv_ioda1_tce_free,
>  	.get = pnv_tce_get,
> @@ -2207,6 +2208,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
>  #ifdef CONFIG_IOMMU_API
>  	.exchange = pnv_ioda2_tce_xchg,
>  	.exchange_rm = pnv_ioda2_tce_xchg_rm,
> +	.useraddrptr = pnv_tce_useraddrptr,
>  #endif
>  	.clear = pnv_ioda2_tce_free,
>  	.get = pnv_tce_get,
> @@ -2460,9 +2462,9 @@ void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
>  		pe->tce_bypass_enabled = enable;
>  }
>  
> -static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
> +static long pnv_pci_ioda2_do_create_table(struct iommu_table_group *table_group,
>  		int num, __u32 page_shift, __u64 window_size, __u32 levels,
> -		struct iommu_table **ptbl)
> +		bool alloc_userspace_copy, struct iommu_table **ptbl)
>  {
>  	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
>  			table_group);
> @@ -2479,7 +2481,7 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
>  
>  	ret = pnv_pci_ioda2_table_alloc_pages(nid,
>  			bus_offset, page_shift, window_size,
> -			levels, tbl);
> +			levels, alloc_userspace_copy, tbl);
>  	if (ret) {
>  		iommu_tce_table_put(tbl);
>  		return ret;
> @@ -2599,7 +2601,24 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
>  				tce_table_size, direct_table_size);
>  	}
>  
> -	return bytes;
> +	return bytes + bytes; /* one for HW table, one for userspace copy */
> +}
> +
> +static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
> +		int num, __u32 page_shift, __u64 window_size, __u32 levels,
> +		struct iommu_table **ptbl)
> +{
> +	return pnv_pci_ioda2_do_create_table(table_group,
> +			num, page_shift, window_size, levels, false, ptbl);
> +}
> +
> +static long pnv_pci_ioda2_create_table_userspace(
> +		struct iommu_table_group *table_group,
> +		int num, __u32 page_shift, __u64 window_size, __u32 levels,
> +		struct iommu_table **ptbl)
> +{
> +	return pnv_pci_ioda2_do_create_table(table_group,
> +			num, page_shift, window_size, levels, true, ptbl);
>  }
>  
>  static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
> @@ -2628,7 +2647,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
>  
>  static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
>  	.get_table_size = pnv_pci_ioda2_get_table_size,
> -	.create_table = pnv_pci_ioda2_create_table,
> +	.create_table = pnv_pci_ioda2_create_table_userspace,
>  	.set_window = pnv_pci_ioda2_set_window,
>  	.unset_window = pnv_pci_ioda2_unset_window,
>  	.take_ownership = pnv_ioda2_take_ownership,
> @@ -2733,7 +2752,7 @@ static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
>  
>  static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
>  	.get_table_size = pnv_pci_ioda2_get_table_size,
> -	.create_table = pnv_pci_ioda2_create_table,
> +	.create_table = pnv_pci_ioda2_create_table_userspace,
>  	.set_window = pnv_pci_ioda2_npu_set_window,
>  	.unset_window = pnv_pci_ioda2_npu_unset_window,
>  	.take_ownership = pnv_ioda2_npu_take_ownership,
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 81f48114..628a948 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -212,44 +212,6 @@ static long tce_iommu_register_pages(struct tce_container *container,
>  	return 0;
>  }
>  
> -static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl,
> -		struct mm_struct *mm)
> -{
> -	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
> -			tbl->it_size, PAGE_SIZE);
> -	unsigned long *uas;
> -	long ret;
> -
> -	BUG_ON(tbl->it_userspace);
> -
> -	ret = try_increment_locked_vm(mm, cb >> PAGE_SHIFT);
> -	if (ret)
> -		return ret;
> -
> -	uas = vzalloc(cb);
> -	if (!uas) {
> -		decrement_locked_vm(mm, cb >> PAGE_SHIFT);
> -		return -ENOMEM;
> -	}
> -	tbl->it_userspace = (__be64 *) uas;
> -
> -	return 0;
> -}
> -
> -static void tce_iommu_userspace_view_free(struct iommu_table *tbl,
> -		struct mm_struct *mm)
> -{
> -	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
> -			tbl->it_size, PAGE_SIZE);
> -
> -	if (!tbl->it_userspace)
> -		return;
> -
> -	vfree(tbl->it_userspace);
> -	tbl->it_userspace = NULL;
> -	decrement_locked_vm(mm, cb >> PAGE_SHIFT);
> -}
> -
>  static bool tce_page_is_contained(unsigned long hpa, unsigned page_shift)
>  {
>  	struct page *page = __va(realmode_pfn_to_page(hpa >> PAGE_SHIFT));
> @@ -608,12 +570,6 @@ static long tce_iommu_build_v2(struct tce_container *container,
>  	unsigned long hpa;
>  	enum dma_data_direction dirtmp;
>  
> -	if (!tbl->it_userspace) {
> -		ret = tce_iommu_userspace_view_alloc(tbl, container->mm);
> -		if (ret)
> -			return ret;
> -	}
> -
>  	for (i = 0; i < pages; ++i) {
>  		struct mm_iommu_table_group_mem_t *mem = NULL;
>  		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);
> @@ -693,7 +649,6 @@ static void tce_iommu_free_table(struct tce_container *container,
>  {
>  	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
>  
> -	tce_iommu_userspace_view_free(tbl, container->mm);
>  	iommu_tce_table_put(tbl);
>  	decrement_locked_vm(container->mm, pages);
>  }
> @@ -1208,7 +1163,6 @@ static void tce_iommu_release_ownership(struct tce_container *container,
>  			continue;
>  
>  		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
> -		tce_iommu_userspace_view_free(tbl, container->mm);
>  		if (tbl->it_map)
>  			iommu_release_ownership(tbl);
>  

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH kernel 5/6] powerpc/powernv: Rework TCE level allocation
From: David Gibson @ 2018-06-12  2:40 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linuxppc-dev, kvm-ppc, kvm, Alex Williamson,
	Benjamin Herrenschmidt
In-Reply-To: <20180608054633.18659-6-aik@ozlabs.ru>

[-- Attachment #1: Type: text/plain, Size: 2605 bytes --]

On Fri, Jun 08, 2018 at 03:46:32PM +1000, Alexey Kardashevskiy wrote:
> This moves actual pages allocation to a separate function which is going
> to be reused later in on-demand TCE allocation.
> 
> While we are at it, remove unnecessary level size round up as the caller
> does this already.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
>  arch/powerpc/platforms/powernv/pci-ioda-tce.c | 30 +++++++++++++++++----------
>  1 file changed, 19 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> index f14b282..36c2eb0 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> @@ -31,6 +31,23 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
>  	tbl->it_type = TCE_PCI;
>  }
>  
> +static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
> +{
> +	struct page *tce_mem = NULL;
> +	__be64 *addr;
> +
> +	tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT);
> +	if (!tce_mem) {
> +		pr_err("Failed to allocate a TCE memory, level shift=%d\n",
> +				shift);
> +		return NULL;
> +	}
> +	addr = page_address(tce_mem);
> +	memset(addr, 0, 1UL << shift);
> +
> +	return addr;
> +}
> +
>  static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx)
>  {
>  	__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
> @@ -165,21 +182,12 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned int shift,
>  		unsigned int levels, unsigned long limit,
>  		unsigned long *current_offset, unsigned long *total_allocated)
>  {
> -	struct page *tce_mem = NULL;
>  	__be64 *addr, *tmp;
> -	unsigned int order = max_t(unsigned int, shift, PAGE_SHIFT) -
> -			PAGE_SHIFT;
> -	unsigned long allocated = 1UL << (order + PAGE_SHIFT);
> +	unsigned long allocated = 1UL << shift;
>  	unsigned int entries = 1UL << (shift - 3);
>  	long i;
>  
> -	tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
> -	if (!tce_mem) {
> -		pr_err("Failed to allocate a TCE memory, order=%d\n", order);
> -		return NULL;
> -	}
> -	addr = page_address(tce_mem);
> -	memset(addr, 0, allocated);
> +	addr = pnv_alloc_tce_level(nid, shift);
>  	*total_allocated += allocated;
>  
>  	--levels;

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* RE: [PATCH v2 3/3] powerpc/fsl: Implement cpu_show_spectre_v1/v2 for NXP PowerPC Book3E
From: Bharat Bhushan @ 2018-06-12  2:59 UTC (permalink / raw)
  To: Diana Madalina Craciun, linuxppc-dev@lists.ozlabs.org
  Cc: mpe@ellerman.id.au, oss@buserror.net, Leo Li,
	Diana Madalina Craciun
In-Reply-To: <1528721608-15443-4-git-send-email-diana.craciun@nxp.com>

Hi Diana,

> -----Original Message-----
> From: Diana Craciun [mailto:diana.craciun@nxp.com]
> Sent: Monday, June 11, 2018 6:23 PM
> To: linuxppc-dev@lists.ozlabs.org
> Cc: mpe@ellerman.id.au; oss@buserror.net; Leo Li <leoyang.li@nxp.com>;
> Bharat Bhushan <bharat.bhushan@nxp.com>; Diana Madalina Craciun
> <diana.craciun@nxp.com>
> Subject: [PATCH v2 3/3] powerpc/fsl: Implement cpu_show_spectre_v1/v2 for
> NXP PowerPC Book3E

Please add some description

>
> Signed-off-by: Diana Craciun <diana.craciun@nxp.com>
> ---
>  arch/powerpc/Kconfig           |  2 +-
>  arch/powerpc/kernel/security.c | 15 +++++++++++++++
>  2 files changed, 16 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index
> 940c955..a781d60 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -170,7 +170,7 @@ config PPC
>  	select GENERIC_CLOCKEVENTS_BROADCAST	if SMP
>  	select GENERIC_CMOS_UPDATE
>  	select GENERIC_CPU_AUTOPROBE
> -	select GENERIC_CPU_VULNERABILITIES	if PPC_BOOK3S_64
> +	select GENERIC_CPU_VULNERABILITIES	if PPC_BOOK3S_64 ||
> PPC_FSL_BOOK3E
>  	select GENERIC_IRQ_SHOW
>  	select GENERIC_IRQ_SHOW_LEVEL
>  	select GENERIC_SMP_IDLE_THREAD
> diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
> index 797c975..aceaadc 100644
> --- a/arch/powerpc/kernel/security.c
> +++ b/arch/powerpc/kernel/security.c
> @@ -183,3 +183,18 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
> struct device_attribute *attr, c  }  #endif /* CONFIG_PPC_BOOK3S_64 */
>
> +#ifdef CONFIG_PPC_FSL_BOOK3E
> +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute
> +*attr, char *buf) {
> +	if (barrier_nospec_enabled)
> +		return sprintf(buf, "Mitigation: __user pointer sanitization\n");
> +
> +	return sprintf(buf, "Vulnerable\n");
> +}
> +
> +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute
> +*attr, char *buf) {
> +	return sprintf(buf, "Vulnerable\n");
> +}
> +#endif /* CONFIG_PPC_FSL_BOOK3E */
> +
> --
> 2.5.5

^ permalink raw reply

* [PATCH v5 0/4] resource: Use list_head to link sibling resource
From: Baoquan He @ 2018-06-12  3:28 UTC (permalink / raw)
  To: linux-kernel, akpm, robh+dt, dan.j.williams, nicolas.pitre, josh,
	fengguang.wu, bp
  Cc: patrik.r.jakobsson, airlied, kys, haiyangz, sthemmin,
	dmitry.torokhov, frowand.list, keith.busch, jonathan.derrick,
	lorenzo.pieralisi, bhelgaas, tglx, brijesh.singh, jglisse,
	thomas.lendacky, gregkh, baiyaowei, richard.weiyang, devel,
	linux-input, linux-nvdimm, devicetree, linux-pci, ebiederm,
	vgoyal, dyoung, yinghai, kexec, monstr, davem, chris, jcmvbkbc,
	gustavo, maarten.lankhorst, seanpaul, linux-parisc, linuxppc-dev,
	Baoquan He

This patchset is doing:
1) Convert struct resource's sibling list from a singly linked list to
list_head, clearing out the pointer operations that the singly linked
list required, for better code readability.
2) Based on the list_head replacement, add a new function
walk_system_ram_res_rev() which can do reverse iteration over
iomem_resource's siblings.
3) Change kexec_file loading to search system RAM top down for kernel
loading, using walk_system_ram_res_rev().

Note:
This patchset passed testing on my kvm guest, x86_64 arch with network
enabled. The thing we need to pay attention to is that a root resource's
child member needs to be initialized explicitly, with LIST_HEAD_INIT() if
statically defined or INIT_LIST_HEAD() if dynamically defined, just
like we do for iomem_resource/ioport_resource, or the change in
get_pci_domain_busn_res().

Links to the old posts (Boris pointed out that we should use
https://lkml.kernel.org/r/Message-ID, but it can't be opened from
my side, so both forms are pasted here):
v4:
https://lkml.kernel.org/r/20180507063224.24229-1-bhe@redhat.com
https://lkml.org/lkml/2018/5/7/36

v3:
https://lkml.kernel.org/r/20180419001848.3041-1-bhe@redhat.com
https://lkml.org/lkml/2018/4/18/767

v2:
https://lkml.kernel.org/r/20180408024724.16812-1-bhe@redhat.com
https://lkml.org/lkml/2018/4/7/169

v1:
https://lkml.kernel.org/r/20180322033722.9279-1-bhe@redhat.com
https://lkml.org/lkml/2018/3/21/952

Changelog:
v4->v5:
  Add new patch 0001 to move duplicated reparent_resources() to
  kernel/resource.c to make it be shared by different ARCH-es.

  Fix several code bugs reported by test robot on ARCH powerpc and
  microblaze.
v3->v4:
  Fix several bugs test robot reported. Rewrite cover letter and patch
  log according to reviewer's comment.

v2->v3:
  Rename resource functions first_child() and sibling() to
  resource_first_child() and resource_sibling(). Dan suggested this.

  Move resource_first_child() and resource_sibling() to linux/ioport.h
  and make them inline functions. Rob suggested this. Accordingly,
  include linux/list.h in linux/ioport.h; please help review whether this
  brings efficiency degradation or code redundancy.

  The change to struct resource {} brings a size increase of two
  pointers; mention this in the git log to be more specific. Rob
  suggested this.

v1->v2:
  Use list_head instead to link resource siblings. This is suggested by
  Andrew.

  Rewrite walk_system_ram_res_rev() after list_head is taken to link
  resource siblings.

Baoquan He (4):
  resource: Move reparent_resources() to kernel/resource.c and make it
    public
  resource: Use list_head to link sibling resource
  resource: add walk_system_ram_res_rev()
  kexec_file: Load kernel at top of system RAM if required

 arch/arm/plat-samsung/pm-check.c            |   6 +-
 arch/microblaze/pci/pci-common.c            |  41 +----
 arch/powerpc/kernel/pci-common.c            |  39 +----
 arch/sparc/kernel/ioport.c                  |   2 +-
 arch/xtensa/include/asm/pci-bridge.h        |   4 +-
 drivers/eisa/eisa-bus.c                     |   2 +
 drivers/gpu/drm/drm_memory.c                |   3 +-
 drivers/gpu/drm/gma500/gtt.c                |   5 +-
 drivers/hv/vmbus_drv.c                      |  52 +++---
 drivers/input/joystick/iforce/iforce-main.c |   4 +-
 drivers/nvdimm/namespace_devs.c             |   6 +-
 drivers/nvdimm/nd.h                         |   5 +-
 drivers/of/address.c                        |   4 +-
 drivers/parisc/lba_pci.c                    |   4 +-
 drivers/pci/host/vmd.c                      |   8 +-
 drivers/pci/probe.c                         |   2 +
 drivers/pci/setup-bus.c                     |   2 +-
 include/linux/ioport.h                      |  21 ++-
 kernel/kexec_file.c                         |   2 +
 kernel/resource.c                           | 259 ++++++++++++++++++----------
 20 files changed, 244 insertions(+), 227 deletions(-)

-- 
2.13.6

^ permalink raw reply

* [PATCH v5 1/4] resource: Move reparent_resources() to kernel/resource.c and make it public
From: Baoquan He @ 2018-06-12  3:28 UTC (permalink / raw)
  To: linux-kernel, akpm, robh+dt, dan.j.williams, nicolas.pitre, josh,
	fengguang.wu, bp
  Cc: patrik.r.jakobsson, airlied, kys, haiyangz, sthemmin,
	dmitry.torokhov, frowand.list, keith.busch, jonathan.derrick,
	lorenzo.pieralisi, bhelgaas, tglx, brijesh.singh, jglisse,
	thomas.lendacky, gregkh, baiyaowei, richard.weiyang, devel,
	linux-input, linux-nvdimm, devicetree, linux-pci, ebiederm,
	vgoyal, dyoung, yinghai, kexec, monstr, davem, chris, jcmvbkbc,
	gustavo, maarten.lankhorst, seanpaul, linux-parisc, linuxppc-dev,
	Baoquan He, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman
In-Reply-To: <20180612032831.29747-1-bhe@redhat.com>

reparent_resources() is duplicated in arch/microblaze/pci/pci-common.c
and arch/powerpc/kernel/pci-common.c, so move it to kernel/resource.c
so that it's shared. Later its code also needs to be updated to use
list_head in place of the singly linked list.

Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
---
v4->v5:
  Fix several code bugs reported by test robot on ARCH powerpc and
  microblaze.

v3->v4:
  Fix several bugs test robot reported. And change patch log.

v2->v3:
  Rename resource functions first_child() and sibling() to
  resource_first_child() and resource_sibling(). Dan suggested this.

  Move resource_first_child() and resource_sibling() to linux/ioport.h
  and make them inline functions. Rob suggested this. Accordingly,
  include linux/list.h in linux/ioport.h; please help review whether this
  brings efficiency degradation or code redundancy.

  The change to struct resource {} brings a size increase of two
  pointers; mention this in the git log to be more specific. Rob
  suggested this.

 arch/microblaze/pci/pci-common.c | 37 -------------------------------------
 arch/powerpc/kernel/pci-common.c | 35 -----------------------------------
 include/linux/ioport.h           |  1 +
 kernel/resource.c                | 36 ++++++++++++++++++++++++++++++++++++
 4 files changed, 37 insertions(+), 72 deletions(-)

diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index f34346d56095..7899bafab064 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -619,43 +619,6 @@ int pcibios_add_device(struct pci_dev *dev)
 EXPORT_SYMBOL(pcibios_add_device);
 
 /*
- * Reparent resource children of pr that conflict with res
- * under res, and make res replace those children.
- */
-static int __init reparent_resources(struct resource *parent,
-				     struct resource *res)
-{
-	struct resource *p, **pp;
-	struct resource **firstpp = NULL;
-
-	for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) {
-		if (p->end < res->start)
-			continue;
-		if (res->end < p->start)
-			break;
-		if (p->start < res->start || p->end > res->end)
-			return -1;	/* not completely contained */
-		if (firstpp == NULL)
-			firstpp = pp;
-	}
-	if (firstpp == NULL)
-		return -1;	/* didn't find any conflicting entries? */
-	res->parent = parent;
-	res->child = *firstpp;
-	res->sibling = *pp;
-	*firstpp = res;
-	*pp = NULL;
-	for (p = res->child; p != NULL; p = p->sibling) {
-		p->parent = res;
-		pr_debug("PCI: Reparented %s [%llx..%llx] under %s\n",
-			 p->name,
-			 (unsigned long long)p->start,
-			 (unsigned long long)p->end, res->name);
-	}
-	return 0;
-}
-
-/*
  *  Handle resources of PCI devices.  If the world were perfect, we could
  *  just allocate all the resource regions and do nothing more.  It isn't.
  *  On the other hand, we cannot just re-allocate all devices, as it would
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index fe9733ffffaa..926035bb378d 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1088,41 +1088,6 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res,
 EXPORT_SYMBOL(pcibios_align_resource);
 
 /*
- * Reparent resource children of pr that conflict with res
- * under res, and make res replace those children.
- */
-static int reparent_resources(struct resource *parent,
-				     struct resource *res)
-{
-	struct resource *p, **pp;
-	struct resource **firstpp = NULL;
-
-	for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) {
-		if (p->end < res->start)
-			continue;
-		if (res->end < p->start)
-			break;
-		if (p->start < res->start || p->end > res->end)
-			return -1;	/* not completely contained */
-		if (firstpp == NULL)
-			firstpp = pp;
-	}
-	if (firstpp == NULL)
-		return -1;	/* didn't find any conflicting entries? */
-	res->parent = parent;
-	res->child = *firstpp;
-	res->sibling = *pp;
-	*firstpp = res;
-	*pp = NULL;
-	for (p = res->child; p != NULL; p = p->sibling) {
-		p->parent = res;
-		pr_debug("PCI: Reparented %s %pR under %s\n",
-			 p->name, p, res->name);
-	}
-	return 0;
-}
-
-/*
  *  Handle resources of PCI devices.  If the world were perfect, we could
  *  just allocate all the resource regions and do nothing more.  It isn't.
  *  On the other hand, we cannot just re-allocate all devices, as it would
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index da0ebaec25f0..dfdcd0bfe54e 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -192,6 +192,7 @@ extern int allocate_resource(struct resource *root, struct resource *new,
 struct resource *lookup_resource(struct resource *root, resource_size_t start);
 int adjust_resource(struct resource *res, resource_size_t start,
 		    resource_size_t size);
+int reparent_resources(struct resource *parent, struct resource *res);
 resource_size_t resource_alignment(struct resource *res);
 static inline resource_size_t resource_size(const struct resource *res)
 {
diff --git a/kernel/resource.c b/kernel/resource.c
index 30e1bc68503b..5e7c56d5d838 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -983,6 +983,42 @@ int adjust_resource(struct resource *res, resource_size_t start,
 }
 EXPORT_SYMBOL(adjust_resource);
 
+/*
+ * Reparent resource children of pr that conflict with res
+ * under res, and make res replace those children.
+ */
+static int reparent_resources(struct resource *parent,
+				     struct resource *res)
+{
+	struct resource *p, **pp;
+	struct resource **firstpp = NULL;
+
+	for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) {
+		if (p->end < res->start)
+			continue;
+		if (res->end < p->start)
+			break;
+		if (p->start < res->start || p->end > res->end)
+			return -1;	/* not completely contained */
+		if (firstpp == NULL)
+			firstpp = pp;
+	}
+	if (firstpp == NULL)
+		return -1;	/* didn't find any conflicting entries? */
+	res->parent = parent;
+	res->child = *firstpp;
+	res->sibling = *pp;
+	*firstpp = res;
+	*pp = NULL;
+	for (p = res->child; p != NULL; p = p->sibling) {
+		p->parent = res;
+		pr_debug("PCI: Reparented %s %pR under %s\n",
+			 p->name, p, res->name);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(reparent_resources);
+
 static void __init __reserve_region_with_split(struct resource *root,
 		resource_size_t start, resource_size_t end,
 		const char *name)
-- 
2.13.6

^ permalink raw reply related

* [PATCH v5 2/4] resource: Use list_head to link sibling resource
From: Baoquan He @ 2018-06-12  3:28 UTC (permalink / raw)
  To: linux-kernel, akpm, robh+dt, dan.j.williams, nicolas.pitre, josh,
	fengguang.wu, bp
  Cc: patrik.r.jakobsson, airlied, kys, haiyangz, sthemmin,
	dmitry.torokhov, frowand.list, keith.busch, jonathan.derrick,
	lorenzo.pieralisi, bhelgaas, tglx, brijesh.singh, jglisse,
	thomas.lendacky, gregkh, baiyaowei, richard.weiyang, devel,
	linux-input, linux-nvdimm, devicetree, linux-pci, ebiederm,
	vgoyal, dyoung, yinghai, kexec, monstr, davem, chris, jcmvbkbc,
	gustavo, maarten.lankhorst, seanpaul, linux-parisc, linuxppc-dev,
	Baoquan He
In-Reply-To: <20180612032831.29747-1-bhe@redhat.com>

The struct resource uses a singly linked list to link siblings, implemented
by pointer operations. Replace it with list_head for better code readability.

Based on this list_head replacement, it will be very easy to do reverse
iteration on iomem_resource's sibling list in a later patch.

Besides, the types of struct resource's member variables sibling and child
are changed from 'struct resource *' to 'struct list_head'. This increases
the size of struct resource by two pointers.

Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Patrik Jakobsson <patrik.r.jakobsson@gmail.com>
Cc: David Airlie <airlied@linux.ie>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Jonathan Derrick <jonathan.derrick@intel.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: devel@linuxdriverproject.org
Cc: linux-input@vger.kernel.org
Cc: linux-nvdimm@lists.01.org
Cc: devicetree@vger.kernel.org
Cc: linux-pci@vger.kernel.org
---
 arch/arm/plat-samsung/pm-check.c            |   6 +-
 arch/microblaze/pci/pci-common.c            |   4 +-
 arch/powerpc/kernel/pci-common.c            |   4 +-
 arch/sparc/kernel/ioport.c                  |   2 +-
 arch/xtensa/include/asm/pci-bridge.h        |   4 +-
 drivers/eisa/eisa-bus.c                     |   2 +
 drivers/gpu/drm/drm_memory.c                |   3 +-
 drivers/gpu/drm/gma500/gtt.c                |   5 +-
 drivers/hv/vmbus_drv.c                      |  52 +++----
 drivers/input/joystick/iforce/iforce-main.c |   4 +-
 drivers/nvdimm/namespace_devs.c             |   6 +-
 drivers/nvdimm/nd.h                         |   5 +-
 drivers/of/address.c                        |   4 +-
 drivers/parisc/lba_pci.c                    |   4 +-
 drivers/pci/host/vmd.c                      |   8 +-
 drivers/pci/probe.c                         |   2 +
 drivers/pci/setup-bus.c                     |   2 +-
 include/linux/ioport.h                      |  17 ++-
 kernel/resource.c                           | 211 ++++++++++++++--------------
 19 files changed, 176 insertions(+), 169 deletions(-)

diff --git a/arch/arm/plat-samsung/pm-check.c b/arch/arm/plat-samsung/pm-check.c
index cd2c02c68bc3..5494355b1c49 100644
--- a/arch/arm/plat-samsung/pm-check.c
+++ b/arch/arm/plat-samsung/pm-check.c
@@ -46,8 +46,8 @@ typedef u32 *(run_fn_t)(struct resource *ptr, u32 *arg);
 static void s3c_pm_run_res(struct resource *ptr, run_fn_t fn, u32 *arg)
 {
 	while (ptr != NULL) {
-		if (ptr->child != NULL)
-			s3c_pm_run_res(ptr->child, fn, arg);
+		if (!list_empty(&ptr->child))
+			s3c_pm_run_res(resource_first_child(&ptr->child), fn, arg);
 
 		if ((ptr->flags & IORESOURCE_SYSTEM_RAM)
 				== IORESOURCE_SYSTEM_RAM) {
@@ -57,7 +57,7 @@ static void s3c_pm_run_res(struct resource *ptr, run_fn_t fn, u32 *arg)
 			arg = (fn)(ptr, arg);
 		}
 
-		ptr = ptr->sibling;
+		ptr = resource_sibling(ptr);
 	}
 }
 
diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index 7899bafab064..2bf73e27e231 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -533,7 +533,9 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose,
 			res->flags = range.flags;
 			res->start = range.cpu_addr;
 			res->end = range.cpu_addr + range.size - 1;
-			res->parent = res->child = res->sibling = NULL;
+			res->parent = NULL;
+			INIT_LIST_HEAD(&res->child);
+			INIT_LIST_HEAD(&res->sibling);
 		}
 	}
 
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 926035bb378d..28fbe83c9daf 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -761,7 +761,9 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose,
 			res->flags = range.flags;
 			res->start = range.cpu_addr;
 			res->end = range.cpu_addr + range.size - 1;
-			res->parent = res->child = res->sibling = NULL;
+			res->parent = NULL;
+			INIT_LIST_HEAD(&res->child);
+			INIT_LIST_HEAD(&res->sibling);
 		}
 	}
 }
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index cca9134cfa7d..99efe4e98b16 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -669,7 +669,7 @@ static int sparc_io_proc_show(struct seq_file *m, void *v)
 	struct resource *root = m->private, *r;
 	const char *nm;
 
-	for (r = root->child; r != NULL; r = r->sibling) {
+	list_for_each_entry(r, &root->child, sibling) {
 		if ((nm = r->name) == NULL) nm = "???";
 		seq_printf(m, "%016llx-%016llx: %s\n",
 				(unsigned long long)r->start,
diff --git a/arch/xtensa/include/asm/pci-bridge.h b/arch/xtensa/include/asm/pci-bridge.h
index 0b68c76ec1e6..f487b06817df 100644
--- a/arch/xtensa/include/asm/pci-bridge.h
+++ b/arch/xtensa/include/asm/pci-bridge.h
@@ -71,8 +71,8 @@ static inline void pcibios_init_resource(struct resource *res,
 	res->flags = flags;
 	res->name = name;
 	res->parent = NULL;
-	res->sibling = NULL;
-	res->child = NULL;
+	INIT_LIST_HEAD(&res->child);
+	INIT_LIST_HEAD(&res->sibling);
 }
 
 
diff --git a/drivers/eisa/eisa-bus.c b/drivers/eisa/eisa-bus.c
index 1e8062f6dbfc..dba78f75fd06 100644
--- a/drivers/eisa/eisa-bus.c
+++ b/drivers/eisa/eisa-bus.c
@@ -408,6 +408,8 @@ static struct resource eisa_root_res = {
 	.start = 0,
 	.end   = 0xffffffff,
 	.flags = IORESOURCE_IO,
+	.sibling = LIST_HEAD_INIT(eisa_root_res.sibling),
+	.child  = LIST_HEAD_INIT(eisa_root_res.child),
 };
 
 static int eisa_bus_count;
diff --git a/drivers/gpu/drm/drm_memory.c b/drivers/gpu/drm/drm_memory.c
index 3c54044214db..53e300a993dc 100644
--- a/drivers/gpu/drm/drm_memory.c
+++ b/drivers/gpu/drm/drm_memory.c
@@ -155,9 +155,8 @@ u64 drm_get_max_iomem(void)
 	struct resource *tmp;
 	resource_size_t max_iomem = 0;
 
-	for (tmp = iomem_resource.child; tmp; tmp = tmp->sibling) {
+	list_for_each_entry(tmp, &iomem_resource.child, sibling)
 		max_iomem = max(max_iomem,  tmp->end);
-	}
 
 	return max_iomem;
 }
diff --git a/drivers/gpu/drm/gma500/gtt.c b/drivers/gpu/drm/gma500/gtt.c
index 3949b0990916..addd3bc009af 100644
--- a/drivers/gpu/drm/gma500/gtt.c
+++ b/drivers/gpu/drm/gma500/gtt.c
@@ -565,7 +565,7 @@ int psb_gtt_init(struct drm_device *dev, int resume)
 int psb_gtt_restore(struct drm_device *dev)
 {
 	struct drm_psb_private *dev_priv = dev->dev_private;
-	struct resource *r = dev_priv->gtt_mem->child;
+	struct resource *r;
 	struct gtt_range *range;
 	unsigned int restored = 0, total = 0, size = 0;
 
@@ -573,14 +573,13 @@ int psb_gtt_restore(struct drm_device *dev)
 	mutex_lock(&dev_priv->gtt_mutex);
 	psb_gtt_init(dev, 1);
 
-	while (r != NULL) {
+	list_for_each_entry(r, &dev_priv->gtt_mem->child, sibling) {
 		range = container_of(r, struct gtt_range, resource);
 		if (range->pages) {
 			psb_gtt_insert(dev, range, 1);
 			size += range->resource.end - range->resource.start;
 			restored++;
 		}
-		r = r->sibling;
 		total++;
 	}
 	mutex_unlock(&dev_priv->gtt_mutex);
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index b10fe26c4891..d87ec5a1bc4c 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1412,9 +1412,8 @@ static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
 {
 	resource_size_t start = 0;
 	resource_size_t end = 0;
-	struct resource *new_res;
+	struct resource *new_res, *tmp;
 	struct resource **old_res = &hyperv_mmio;
-	struct resource **prev_res = NULL;
 
 	switch (res->type) {
 
@@ -1461,44 +1460,36 @@ static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
 	/*
 	 * If two ranges are adjacent, merge them.
 	 */
-	do {
-		if (!*old_res) {
-			*old_res = new_res;
-			break;
-		}
-
-		if (((*old_res)->end + 1) == new_res->start) {
-			(*old_res)->end = new_res->end;
+	if (!*old_res) {
+		*old_res = new_res;
+		return AE_OK;
+	}
+	tmp = *old_res;
+	list_for_each_entry_from(tmp, &tmp->parent->child, sibling) {
+		if ((tmp->end + 1) == new_res->start) {
+			tmp->end = new_res->end;
 			kfree(new_res);
 			break;
 		}
 
-		if ((*old_res)->start == new_res->end + 1) {
-			(*old_res)->start = new_res->start;
+		if (tmp->start == new_res->end + 1) {
+			tmp->start = new_res->start;
 			kfree(new_res);
 			break;
 		}
 
-		if ((*old_res)->start > new_res->end) {
-			new_res->sibling = *old_res;
-			if (prev_res)
-				(*prev_res)->sibling = new_res;
-			*old_res = new_res;
+		if (tmp->start > new_res->end) {
+			list_add(&new_res->sibling, tmp->sibling.prev);
 			break;
 		}
-
-		prev_res = old_res;
-		old_res = &(*old_res)->sibling;
-
-	} while (1);
+	}
 
 	return AE_OK;
 }
 
 static int vmbus_acpi_remove(struct acpi_device *device)
 {
-	struct resource *cur_res;
-	struct resource *next_res;
+	struct resource *res;
 
 	if (hyperv_mmio) {
 		if (fb_mmio) {
@@ -1507,10 +1498,9 @@ static int vmbus_acpi_remove(struct acpi_device *device)
 			fb_mmio = NULL;
 		}
 
-		for (cur_res = hyperv_mmio; cur_res; cur_res = next_res) {
-			next_res = cur_res->sibling;
-			kfree(cur_res);
-		}
+		res = hyperv_mmio;
+		list_for_each_entry_from(res, &res->parent->child, sibling)
+			kfree(res);
 	}
 
 	return 0;
@@ -1596,7 +1586,8 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
 		}
 	}
 
-	for (iter = hyperv_mmio; iter; iter = iter->sibling) {
+	iter = hyperv_mmio;
+	list_for_each_entry_from(iter, &iter->parent->child, sibling) {
 		if ((iter->start >= max) || (iter->end <= min))
 			continue;
 
@@ -1639,7 +1630,8 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size)
 	struct resource *iter;
 
 	down(&hyperv_mmio_lock);
-	for (iter = hyperv_mmio; iter; iter = iter->sibling) {
+	iter = hyperv_mmio;
+	list_for_each_entry_from(iter, &iter->parent->child, sibling) {
 		if ((iter->start >= start + size) || (iter->end <= start))
 			continue;
 
diff --git a/drivers/input/joystick/iforce/iforce-main.c b/drivers/input/joystick/iforce/iforce-main.c
index daeeb4c7e3b0..5c0be27b33ff 100644
--- a/drivers/input/joystick/iforce/iforce-main.c
+++ b/drivers/input/joystick/iforce/iforce-main.c
@@ -305,8 +305,8 @@ int iforce_init_device(struct iforce *iforce)
 	iforce->device_memory.end = 200;
 	iforce->device_memory.flags = IORESOURCE_MEM;
 	iforce->device_memory.parent = NULL;
-	iforce->device_memory.child = NULL;
-	iforce->device_memory.sibling = NULL;
+	INIT_LIST_HEAD(&iforce->device_memory.child);
+	INIT_LIST_HEAD(&iforce->device_memory.sibling);
 
 /*
  * Wait until device ready - until it sends its first response.
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 28afdd668905..f53d410d9981 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -637,7 +637,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
  retry:
 	first = 0;
 	for_each_dpa_resource(ndd, res) {
-		struct resource *next = res->sibling, *new_res = NULL;
+		struct resource *next = resource_sibling(res), *new_res = NULL;
 		resource_size_t allocate, available = 0;
 		enum alloc_loc loc = ALLOC_ERR;
 		const char *action;
@@ -763,7 +763,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 	 * an initial "pmem-reserve pass".  Only do an initial BLK allocation
 	 * when none of the DPA space is reserved.
 	 */
-	if ((is_pmem || !ndd->dpa.child) && n == to_allocate)
+	if ((is_pmem || list_empty(&ndd->dpa.child)) && n == to_allocate)
 		return init_dpa_allocation(label_id, nd_region, nd_mapping, n);
 	return n;
 }
@@ -779,7 +779,7 @@ static int merge_dpa(struct nd_region *nd_region,
  retry:
 	for_each_dpa_resource(ndd, res) {
 		int rc;
-		struct resource *next = res->sibling;
+		struct resource *next = resource_sibling(res);
 		resource_size_t end = res->start + resource_size(res);
 
 		if (!next || strcmp(res->name, label_id->id) != 0
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 32e0364b48b9..da7da15e03e7 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -102,11 +102,10 @@ unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd);
 		(unsigned long long) (res ? res->start : 0), ##arg)
 
 #define for_each_dpa_resource(ndd, res) \
-	for (res = (ndd)->dpa.child; res; res = res->sibling)
+	list_for_each_entry(res, &(ndd)->dpa.child, sibling)
 
 #define for_each_dpa_resource_safe(ndd, res, next) \
-	for (res = (ndd)->dpa.child, next = res ? res->sibling : NULL; \
-			res; res = next, next = next ? next->sibling : NULL)
+	list_for_each_entry_safe(res, next, &(ndd)->dpa.child, sibling)
 
 struct nd_percpu_lane {
 	int count;
diff --git a/drivers/of/address.c b/drivers/of/address.c
index 53349912ac75..e2e25719ab52 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -330,7 +330,9 @@ int of_pci_range_to_resource(struct of_pci_range *range,
 {
 	int err;
 	res->flags = range->flags;
-	res->parent = res->child = res->sibling = NULL;
+	res->parent = NULL;
+	INIT_LIST_HEAD(&res->child);
+	INIT_LIST_HEAD(&res->sibling);
 	res->name = np->full_name;
 
 	if (res->flags & IORESOURCE_IO) {
diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c
index 69bd98421eb1..7482bdfd1959 100644
--- a/drivers/parisc/lba_pci.c
+++ b/drivers/parisc/lba_pci.c
@@ -170,8 +170,8 @@ lba_dump_res(struct resource *r, int d)
 	for (i = d; i ; --i) printk(" ");
 	printk(KERN_DEBUG "%p [%lx,%lx]/%lx\n", r,
 		(long)r->start, (long)r->end, r->flags);
-	lba_dump_res(r->child, d+2);
-	lba_dump_res(r->sibling, d);
+	lba_dump_res(resource_first_child(&r->child), d+2);
+	lba_dump_res(resource_sibling(r), d);
 }
 
 
diff --git a/drivers/pci/host/vmd.c b/drivers/pci/host/vmd.c
index 942b64fc7f1f..e3ace20345c7 100644
--- a/drivers/pci/host/vmd.c
+++ b/drivers/pci/host/vmd.c
@@ -542,14 +542,14 @@ static struct pci_ops vmd_ops = {
 
 static void vmd_attach_resources(struct vmd_dev *vmd)
 {
-	vmd->dev->resource[VMD_MEMBAR1].child = &vmd->resources[1];
-	vmd->dev->resource[VMD_MEMBAR2].child = &vmd->resources[2];
+	list_add(&vmd->resources[1].sibling, &vmd->dev->resource[VMD_MEMBAR1].child);
+	list_add(&vmd->resources[2].sibling, &vmd->dev->resource[VMD_MEMBAR2].child);
 }
 
 static void vmd_detach_resources(struct vmd_dev *vmd)
 {
-	vmd->dev->resource[VMD_MEMBAR1].child = NULL;
-	vmd->dev->resource[VMD_MEMBAR2].child = NULL;
+	INIT_LIST_HEAD(&vmd->dev->resource[VMD_MEMBAR1].child);
+	INIT_LIST_HEAD(&vmd->dev->resource[VMD_MEMBAR2].child);
 }
 
 /*
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ac876e32de4b..9624dd1dfd49 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -59,6 +59,8 @@ static struct resource *get_pci_domain_busn_res(int domain_nr)
 	r->res.start = 0;
 	r->res.end = 0xff;
 	r->res.flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED;
+	INIT_LIST_HEAD(&r->res.child);
+	INIT_LIST_HEAD(&r->res.sibling);
 
 	list_add_tail(&r->list, &pci_domain_busn_res_list);
 
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 79b1824e83b4..8e685af8938d 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -2107,7 +2107,7 @@ int pci_reassign_bridge_resources(struct pci_dev *bridge, unsigned long type)
 				continue;
 
 			/* Ignore BARs which are still in use */
-			if (res->child)
+			if (!list_empty(&res->child))
 				continue;
 
 			ret = add_to_list(&saved, bridge, res, 0, 0);
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index dfdcd0bfe54e..b7456ae889dd 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -12,6 +12,7 @@
 #ifndef __ASSEMBLY__
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <linux/list.h>
 /*
  * Resources are tree-like, allowing
  * nesting etc..
@@ -22,7 +23,8 @@ struct resource {
 	const char *name;
 	unsigned long flags;
 	unsigned long desc;
-	struct resource *parent, *sibling, *child;
+	struct list_head child, sibling;
+	struct resource *parent;
 };
 
 /*
@@ -216,7 +218,6 @@ static inline bool resource_contains(struct resource *r1, struct resource *r2)
 	return r1->start <= r2->start && r1->end >= r2->end;
 }
 
-
 /* Convenience shorthand with allocation */
 #define request_region(start,n,name)		__request_region(&ioport_resource, (start), (n), (name), 0)
 #define request_muxed_region(start,n,name)	__request_region(&ioport_resource, (start), (n), (name), IORESOURCE_MUXED)
@@ -287,6 +288,18 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
        return (r1->start <= r2->end && r1->end >= r2->start);
 }
 
+static inline struct resource *resource_sibling(struct resource *res)
+{
+	if (res->parent && !list_is_last(&res->sibling, &res->parent->child))
+		return list_next_entry(res, sibling);
+	return NULL;
+}
+
+static inline struct resource *resource_first_child(struct list_head *head)
+{
+	return list_first_entry_or_null(head, struct resource, sibling);
+}
+
 
 #endif /* __ASSEMBLY__ */
 #endif	/* _LINUX_IOPORT_H */
diff --git a/kernel/resource.c b/kernel/resource.c
index 5e7c56d5d838..ef9a20b75234 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -31,6 +31,8 @@ struct resource ioport_resource = {
 	.start	= 0,
 	.end	= IO_SPACE_LIMIT,
 	.flags	= IORESOURCE_IO,
+	.sibling = LIST_HEAD_INIT(ioport_resource.sibling),
+	.child  = LIST_HEAD_INIT(ioport_resource.child),
 };
 EXPORT_SYMBOL(ioport_resource);
 
@@ -39,6 +41,8 @@ struct resource iomem_resource = {
 	.start	= 0,
 	.end	= -1,
 	.flags	= IORESOURCE_MEM,
+	.sibling = LIST_HEAD_INIT(iomem_resource.sibling),
+	.child  = LIST_HEAD_INIT(iomem_resource.child),
 };
 EXPORT_SYMBOL(iomem_resource);
 
@@ -57,20 +61,20 @@ static DEFINE_RWLOCK(resource_lock);
  * by boot mem after the system is up. So for reusing the resource entry
  * we need to remember the resource.
  */
-static struct resource *bootmem_resource_free;
+static struct list_head bootmem_resource_free = LIST_HEAD_INIT(bootmem_resource_free);
 static DEFINE_SPINLOCK(bootmem_resource_lock);
 
 static struct resource *next_resource(struct resource *p, bool sibling_only)
 {
 	/* Caller wants to traverse through siblings only */
 	if (sibling_only)
-		return p->sibling;
+		return resource_sibling(p);
 
-	if (p->child)
-		return p->child;
-	while (!p->sibling && p->parent)
+	if (!list_empty(&p->child))
+		return resource_first_child(&p->child);
+	while (!resource_sibling(p) && p->parent)
 		p = p->parent;
-	return p->sibling;
+	return resource_sibling(p);
 }
 
 static void *r_next(struct seq_file *m, void *v, loff_t *pos)
@@ -90,7 +94,7 @@ static void *r_start(struct seq_file *m, loff_t *pos)
 	struct resource *p = PDE_DATA(file_inode(m->file));
 	loff_t l = 0;
 	read_lock(&resource_lock);
-	for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
+	for (p = resource_first_child(&p->child); p && l < *pos; p = r_next(m, p, &l))
 		;
 	return p;
 }
@@ -153,8 +157,7 @@ static void free_resource(struct resource *res)
 
 	if (!PageSlab(virt_to_head_page(res))) {
 		spin_lock(&bootmem_resource_lock);
-		res->sibling = bootmem_resource_free;
-		bootmem_resource_free = res;
+		list_add(&res->sibling, &bootmem_resource_free);
 		spin_unlock(&bootmem_resource_lock);
 	} else {
 		kfree(res);
@@ -166,10 +169,9 @@ static struct resource *alloc_resource(gfp_t flags)
 	struct resource *res = NULL;
 
 	spin_lock(&bootmem_resource_lock);
-	if (bootmem_resource_free) {
-		res = bootmem_resource_free;
-		bootmem_resource_free = res->sibling;
-	}
+	res = resource_first_child(&bootmem_resource_free);
+	if (res)
+		list_del(&res->sibling);
 	spin_unlock(&bootmem_resource_lock);
 
 	if (res)
@@ -177,6 +179,8 @@ static struct resource *alloc_resource(gfp_t flags)
 	else
 		res = kzalloc(sizeof(struct resource), flags);
 
+	INIT_LIST_HEAD(&res->child);
+	INIT_LIST_HEAD(&res->sibling);
 	return res;
 }
 
@@ -185,7 +189,7 @@ static struct resource * __request_resource(struct resource *root, struct resour
 {
 	resource_size_t start = new->start;
 	resource_size_t end = new->end;
-	struct resource *tmp, **p;
+	struct resource *tmp;
 
 	if (end < start)
 		return root;
@@ -193,64 +197,62 @@ static struct resource * __request_resource(struct resource *root, struct resour
 		return root;
 	if (end > root->end)
 		return root;
-	p = &root->child;
-	for (;;) {
-		tmp = *p;
-		if (!tmp || tmp->start > end) {
-			new->sibling = tmp;
-			*p = new;
+
+	if (list_empty(&root->child)) {
+		list_add(&new->sibling, &root->child);
+		new->parent = root;
+		INIT_LIST_HEAD(&new->child);
+		return NULL;
+	}
+
+	list_for_each_entry(tmp, &root->child, sibling) {
+		if (tmp->start > end) {
+			list_add(&new->sibling, tmp->sibling.prev);
 			new->parent = root;
+			INIT_LIST_HEAD(&new->child);
 			return NULL;
 		}
-		p = &tmp->sibling;
 		if (tmp->end < start)
 			continue;
 		return tmp;
 	}
+
+	list_add_tail(&new->sibling, &root->child);
+	new->parent = root;
+	INIT_LIST_HEAD(&new->child);
+	return NULL;
 }
 
 static int __release_resource(struct resource *old, bool release_child)
 {
-	struct resource *tmp, **p, *chd;
+	struct resource *tmp, *next, *chd;
 
-	p = &old->parent->child;
-	for (;;) {
-		tmp = *p;
-		if (!tmp)
-			break;
+	list_for_each_entry_safe(tmp, next, &old->parent->child, sibling) {
 		if (tmp == old) {
-			if (release_child || !(tmp->child)) {
-				*p = tmp->sibling;
+			if (release_child || list_empty(&tmp->child)) {
+				list_del(&tmp->sibling);
 			} else {
-				for (chd = tmp->child;; chd = chd->sibling) {
+				list_for_each_entry(chd, &tmp->child, sibling)
 					chd->parent = tmp->parent;
-					if (!(chd->sibling))
-						break;
-				}
-				*p = tmp->child;
-				chd->sibling = tmp->sibling;
+				list_splice(&tmp->child, tmp->sibling.prev);
+				list_del(&tmp->sibling);
 			}
+
 			old->parent = NULL;
 			return 0;
 		}
-		p = &tmp->sibling;
 	}
 	return -EINVAL;
 }
 
 static void __release_child_resources(struct resource *r)
 {
-	struct resource *tmp, *p;
+	struct resource *tmp, *next;
 	resource_size_t size;
 
-	p = r->child;
-	r->child = NULL;
-	while (p) {
-		tmp = p;
-		p = p->sibling;
-
+	list_for_each_entry_safe(tmp, next, &r->child, sibling) {
 		tmp->parent = NULL;
-		tmp->sibling = NULL;
+		INIT_LIST_HEAD(&tmp->sibling);
 		__release_child_resources(tmp);
 
 		printk(KERN_DEBUG "release child resource %pR\n", tmp);
@@ -259,6 +261,8 @@ static void __release_child_resources(struct resource *r)
 		tmp->start = 0;
 		tmp->end = size - 1;
 	}
+
+	INIT_LIST_HEAD(&tmp->child);
 }
 
 void release_child_resources(struct resource *r)
@@ -343,7 +347,8 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
 
 	read_lock(&resource_lock);
 
-	for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
+	for (p = resource_first_child(&iomem_resource.child); p;
+			p = next_resource(p, sibling_only)) {
 		if ((p->flags & res->flags) != res->flags)
 			continue;
 		if ((desc != IORES_DESC_NONE) && (desc != p->desc))
@@ -532,7 +537,7 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags,
 	struct resource *p;
 
 	read_lock(&resource_lock);
-	for (p = iomem_resource.child; p ; p = p->sibling) {
+	list_for_each_entry(p, &iomem_resource.child, sibling) {
 		bool is_type = (((p->flags & flags) == flags) &&
 				((desc == IORES_DESC_NONE) ||
 				 (desc == p->desc)));
@@ -586,7 +591,7 @@ static int __find_resource(struct resource *root, struct resource *old,
 			 resource_size_t  size,
 			 struct resource_constraint *constraint)
 {
-	struct resource *this = root->child;
+	struct resource *this = resource_first_child(&root->child);
 	struct resource tmp = *new, avail, alloc;
 
 	tmp.start = root->start;
@@ -596,7 +601,7 @@ static int __find_resource(struct resource *root, struct resource *old,
 	 */
 	if (this && this->start == root->start) {
 		tmp.start = (this == old) ? old->start : this->end + 1;
-		this = this->sibling;
+		this = resource_sibling(this);
 	}
 	for(;;) {
 		if (this)
@@ -632,7 +637,7 @@ next:		if (!this || this->end == root->end)
 
 		if (this != old)
 			tmp.start = this->end + 1;
-		this = this->sibling;
+		this = resource_sibling(this);
 	}
 	return -EBUSY;
 }
@@ -676,7 +681,7 @@ static int reallocate_resource(struct resource *root, struct resource *old,
 		goto out;
 	}
 
-	if (old->child) {
+	if (!list_empty(&old->child)) {
 		err = -EBUSY;
 		goto out;
 	}
@@ -757,7 +762,7 @@ struct resource *lookup_resource(struct resource *root, resource_size_t start)
 	struct resource *res;
 
 	read_lock(&resource_lock);
-	for (res = root->child; res; res = res->sibling) {
+	list_for_each_entry(res, &root->child, sibling) {
 		if (res->start == start)
 			break;
 	}
@@ -790,32 +795,27 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
 			break;
 	}
 
-	for (next = first; ; next = next->sibling) {
+	for (next = first; ; next = resource_sibling(next)) {
 		/* Partial overlap? Bad, and unfixable */
 		if (next->start < new->start || next->end > new->end)
 			return next;
-		if (!next->sibling)
+		if (!resource_sibling(next))
 			break;
-		if (next->sibling->start > new->end)
+		if (resource_sibling(next)->start > new->end)
 			break;
 	}
-
 	new->parent = parent;
-	new->sibling = next->sibling;
-	new->child = first;
+	list_add(&new->sibling, &next->sibling);
+	INIT_LIST_HEAD(&new->child);
 
-	next->sibling = NULL;
-	for (next = first; next; next = next->sibling)
+	/*
+	 * From first to next, they all fall into new's region, so change them
+	 * as new's children.
+	 */
+	list_cut_position(&new->child, first->sibling.prev, &next->sibling);
+	list_for_each_entry(next, &new->child, sibling)
 		next->parent = new;
 
-	if (parent->child == first) {
-		parent->child = new;
-	} else {
-		next = parent->child;
-		while (next->sibling != first)
-			next = next->sibling;
-		next->sibling = new;
-	}
 	return NULL;
 }
 
@@ -937,19 +937,17 @@ static int __adjust_resource(struct resource *res, resource_size_t start,
 	if ((start < parent->start) || (end > parent->end))
 		goto out;
 
-	if (res->sibling && (res->sibling->start <= end))
+	if (resource_sibling(res) && (resource_sibling(res)->start <= end))
 		goto out;
 
-	tmp = parent->child;
-	if (tmp != res) {
-		while (tmp->sibling != res)
-			tmp = tmp->sibling;
+	if (res->sibling.prev != &parent->child) {
+		tmp = list_prev_entry(res, sibling);
 		if (start <= tmp->end)
 			goto out;
 	}
 
 skip:
-	for (tmp = res->child; tmp; tmp = tmp->sibling)
+	list_for_each_entry(tmp, &res->child, sibling)
 		if ((tmp->start < start) || (tmp->end > end))
 			goto out;
 
@@ -987,31 +985,33 @@ EXPORT_SYMBOL(adjust_resource);
  * Reparent resource children of pr that conflict with res
  * under res, and make res replace those children.
  */
-static int reparent_resources(struct resource *parent,
-				     struct resource *res)
+int reparent_resources(struct resource *parent, struct resource *res)
 {
-	struct resource *p, **pp;
-	struct resource **firstpp = NULL;
+	struct resource *p, *first = NULL;
 
-	for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) {
+	list_for_each_entry(p, &parent->child, sibling) {
 		if (p->end < res->start)
 			continue;
 		if (res->end < p->start)
 			break;
 		if (p->start < res->start || p->end > res->end)
 			return -1;	/* not completely contained */
-		if (firstpp == NULL)
-			firstpp = pp;
+		if (first == NULL)
+			first = p;
 	}
-	if (firstpp == NULL)
+	if (first == NULL)
 		return -1;	/* didn't find any conflicting entries? */
 	res->parent = parent;
-	res->child = *firstpp;
-	res->sibling = *pp;
-	*firstpp = res;
-	*pp = NULL;
-	for (p = res->child; p != NULL; p = p->sibling) {
-		p->parent = res;
+	list_add(&res->sibling, &p->sibling.prev);
+	INIT_LIST_HEAD(&res->child);
+
+	/*
+	 * From first to p's previous sibling, they all fall into
+	 * res's region, change them as res's children.
+	 */
+	list_cut_position(&res->child, first->sibling.prev, res->sibling.prev);
+	list_for_each_entry(p, &new->child, sibling) {
+                p->parent = new;
 		pr_debug("PCI: Reparented %s %pR under %s\n",
 			 p->name, p, res->name);
 	}
@@ -1210,34 +1210,32 @@ EXPORT_SYMBOL(__request_region);
 void __release_region(struct resource *parent, resource_size_t start,
 			resource_size_t n)
 {
-	struct resource **p;
+	struct resource *res;
 	resource_size_t end;
 
-	p = &parent->child;
+	res = resource_first_child(&parent->child);
 	end = start + n - 1;
 
 	write_lock(&resource_lock);
 
 	for (;;) {
-		struct resource *res = *p;
-
 		if (!res)
 			break;
 		if (res->start <= start && res->end >= end) {
 			if (!(res->flags & IORESOURCE_BUSY)) {
-				p = &res->child;
+				res = resource_first_child(&res->child);
 				continue;
 			}
 			if (res->start != start || res->end != end)
 				break;
-			*p = res->sibling;
+			list_del(&res->sibling);
 			write_unlock(&resource_lock);
 			if (res->flags & IORESOURCE_MUXED)
 				wake_up(&muxed_resource_wait);
 			free_resource(res);
 			return;
 		}
-		p = &res->sibling;
+		res = resource_sibling(res);
 	}
 
 	write_unlock(&resource_lock);
@@ -1272,9 +1270,7 @@ EXPORT_SYMBOL(__release_region);
 int release_mem_region_adjustable(struct resource *parent,
 			resource_size_t start, resource_size_t size)
 {
-	struct resource **p;
-	struct resource *res;
-	struct resource *new_res;
+	struct resource *res, *new_res;
 	resource_size_t end;
 	int ret = -EINVAL;
 
@@ -1285,16 +1281,16 @@ int release_mem_region_adjustable(struct resource *parent,
 	/* The alloc_resource() result gets checked later */
 	new_res = alloc_resource(GFP_KERNEL);
 
-	p = &parent->child;
+	res = resource_first_child(&parent->child);
 	write_lock(&resource_lock);
 
-	while ((res = *p)) {
+	while ((res)) {
 		if (res->start >= end)
 			break;
 
 		/* look for the next resource if it does not fit into */
 		if (res->start > start || res->end < end) {
-			p = &res->sibling;
+			res = resource_sibling(res);
 			continue;
 		}
 
@@ -1302,14 +1298,14 @@ int release_mem_region_adjustable(struct resource *parent,
 			break;
 
 		if (!(res->flags & IORESOURCE_BUSY)) {
-			p = &res->child;
+			res = resource_first_child(&res->child);
 			continue;
 		}
 
 		/* found the target resource; let's adjust accordingly */
 		if (res->start == start && res->end == end) {
 			/* free the whole entry */
-			*p = res->sibling;
+			list_del(&res->sibling);
 			free_resource(res);
 			ret = 0;
 		} else if (res->start == start && res->end != end) {
@@ -1332,14 +1328,13 @@ int release_mem_region_adjustable(struct resource *parent,
 			new_res->flags = res->flags;
 			new_res->desc = res->desc;
 			new_res->parent = res->parent;
-			new_res->sibling = res->sibling;
-			new_res->child = NULL;
+			INIT_LIST_HEAD(&new_res->child);
 
 			ret = __adjust_resource(res, res->start,
 						start - res->start);
 			if (ret)
 				break;
-			res->sibling = new_res;
+			list_add(&new_res->sibling, &res->sibling);
 			new_res = NULL;
 		}
 
@@ -1520,7 +1515,7 @@ static int __init reserve_setup(char *str)
 			res->end = io_start + io_num - 1;
 			res->flags |= IORESOURCE_BUSY;
 			res->desc = IORES_DESC_NONE;
-			res->child = NULL;
+			INIT_LIST_HEAD(&res->child);
 			if (request_resource(parent, res) == 0)
 				reserved = x+1;
 		}
@@ -1540,7 +1535,7 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
 	loff_t l;
 
 	read_lock(&resource_lock);
-	for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+	for (p = resource_first_child(&p->child); p; p = r_next(NULL, p, &l)) {
 		/*
 		 * We can probably skip the resources without
 		 * IORESOURCE_IO attribute?
@@ -1596,7 +1591,7 @@ bool iomem_is_exclusive(u64 addr)
 	addr = addr & PAGE_MASK;
 
 	read_lock(&resource_lock);
-	for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+	for (p = resource_first_child(&p->child); p; p = r_next(NULL, p, &l)) {
 		/*
 		 * We can probably skip the resources without
 		 * IORESOURCE_IO attribute?
-- 
2.13.6

^ permalink raw reply related

* [PATCH v5 3/4] resource: add walk_system_ram_res_rev()
From: Baoquan He @ 2018-06-12  3:28 UTC (permalink / raw)
  To: linux-kernel, akpm, robh+dt, dan.j.williams, nicolas.pitre, josh,
	fengguang.wu, bp
  Cc: patrik.r.jakobsson, airlied, kys, haiyangz, sthemmin,
	dmitry.torokhov, frowand.list, keith.busch, jonathan.derrick,
	lorenzo.pieralisi, bhelgaas, tglx, brijesh.singh, jglisse,
	thomas.lendacky, gregkh, baiyaowei, richard.weiyang, devel,
	linux-input, linux-nvdimm, devicetree, linux-pci, ebiederm,
	vgoyal, dyoung, yinghai, kexec, monstr, davem, chris, jcmvbkbc,
	gustavo, maarten.lankhorst, seanpaul, linux-parisc, linuxppc-dev,
	Baoquan He
In-Reply-To: <20180612032831.29747-1-bhe@redhat.com>

This function, being a variant of walk_system_ram_res() introduced in
commit 8c86e70acead ("resource: provide new functions to walk through
resources"), walks through a list of all the resources of System RAM
in reversed order, i.e., from higher to lower.

It will be used in kexec_file code.

Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
---
 include/linux/ioport.h |  3 +++
 kernel/resource.c      | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index b7456ae889dd..066cc263e2cc 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -279,6 +279,9 @@ extern int
 walk_system_ram_res(u64 start, u64 end, void *arg,
 		    int (*func)(struct resource *, void *));
 extern int
+walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+			int (*func)(struct resource *, void *));
+extern int
 walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
 		    void *arg, int (*func)(struct resource *, void *));
 
diff --git a/kernel/resource.c b/kernel/resource.c
index ef9a20b75234..3128ac938f38 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -23,6 +23,8 @@
 #include <linux/pfn.h>
 #include <linux/mm.h>
 #include <linux/resource_ext.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
 #include <asm/io.h>
 
 
@@ -443,6 +445,44 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 }
 
 /*
+ * This function, being a variant of walk_system_ram_res(), calls the @func
+ * callback against all memory ranges of type System RAM which are marked as
+ * IORESOURCE_SYSTEM_RAM and IORESOURCE_BUSY in reversed order, i.e., from
+ * higher to lower.
+ */
+int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+				int (*func)(struct resource *, void *))
+{
+	unsigned long flags;
+	struct resource *res;
+	int ret = -1;
+
+	flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+	read_lock(&resource_lock);
+	list_for_each_entry_reverse(res, &iomem_resource.child, sibling) {
+		if (start >= end)
+			break;
+		if ((res->flags & flags) != flags)
+			continue;
+		if (res->desc != IORES_DESC_NONE)
+			continue;
+		if (res->end < start)
+			break;
+
+		if ((res->end >= start) && (res->start < end)) {
+			ret = (*func)(res, arg);
+			if (ret)
+				break;
+		}
+		end = res->start - 1;
+
+	}
+	read_unlock(&resource_lock);
+	return ret;
+}
+
+/*
  * This function calls the @func callback against all memory ranges, which
  * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
  */
-- 
2.13.6

^ permalink raw reply related

* [PATCH v5 4/4] kexec_file: Load kernel at top of system RAM if required
From: Baoquan He @ 2018-06-12  3:28 UTC (permalink / raw)
  To: linux-kernel, akpm, robh+dt, dan.j.williams, nicolas.pitre, josh,
	fengguang.wu, bp
  Cc: patrik.r.jakobsson, airlied, kys, haiyangz, sthemmin,
	dmitry.torokhov, frowand.list, keith.busch, jonathan.derrick,
	lorenzo.pieralisi, bhelgaas, tglx, brijesh.singh, jglisse,
	thomas.lendacky, gregkh, baiyaowei, richard.weiyang, devel,
	linux-input, linux-nvdimm, devicetree, linux-pci, ebiederm,
	vgoyal, dyoung, yinghai, kexec, monstr, davem, chris, jcmvbkbc,
	gustavo, maarten.lankhorst, seanpaul, linux-parisc, linuxppc-dev,
	Baoquan He
In-Reply-To: <20180612032831.29747-1-bhe@redhat.com>

For kexec_file loading, if kexec_buf.top_down is 'true', the memory which
is used to load kernel/initrd/purgatory is supposed to be allocated from
top to down. This is what we have been doing all along in the old kexec
loading interface and the kexec loading is still default setting in some
distributions. However, the current kexec_file loading interface doesn't
do like this. The function arch_kexec_walk_mem() it calls ignores checking
kexec_buf.top_down, but calls walk_system_ram_res() directly to go through
all resources of System RAM from bottom to up, to try to find memory region
which can contain the specific kexec buffer, then call locate_mem_hole_callback()
to allocate memory in that found memory region from top to down. This brings
confusion especially when KASLR is widely supported, users have to make clear
why kexec/kdump kernel loading position is different between these two
interfaces in order to exclude unnecessary noises. Hence these two interfaces
need be unified on behaviour.

Here add checking if kexec_buf.top_down is 'true' in arch_kexec_walk_mem(),
if yes, call the newly added walk_system_ram_res_rev() to find memory region
from top to down to load kernel.

Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: kexec@lists.infradead.org
---
 kernel/kexec_file.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 75d8e7cf040e..7a66d9d5a534 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -518,6 +518,8 @@ int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
 					   IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
 					   crashk_res.start, crashk_res.end,
 					   kbuf, func);
+	else if (kbuf->top_down)
+		return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func);
 	else
 		return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
 }
-- 
2.13.6

^ permalink raw reply related

* Re: [PATCH v5 1/4] resource: Move reparent_resources() to kernel/resource.c and make it public
From: Baoquan He @ 2018-06-12  3:34 UTC (permalink / raw)
  To: linux-kernel, akpm, robh+dt, dan.j.williams, nicolas.pitre, josh,
	fengguang.wu, bp
  Cc: patrik.r.jakobsson, airlied, kys, haiyangz, sthemmin,
	dmitry.torokhov, frowand.list, keith.busch, jonathan.derrick,
	lorenzo.pieralisi, bhelgaas, tglx, brijesh.singh, jglisse,
	thomas.lendacky, gregkh, baiyaowei, richard.weiyang, devel,
	linux-input, linux-nvdimm, devicetree, linux-pci, ebiederm,
	vgoyal, dyoung, yinghai, kexec, monstr, davem, chris, jcmvbkbc,
	gustavo, maarten.lankhorst, seanpaul, linux-parisc, linuxppc-dev,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
In-Reply-To: <20180612032831.29747-2-bhe@redhat.com>

On 06/12/18 at 11:28am, Baoquan He wrote:
> reparent_resources() is duplicated in arch/microblaze/pci/pci-common.c
> and arch/powerpc/kernel/pci-common.c, so move it to kernel/resource.c
> so that it's shared. Later its code also need be updated using list_head
> to replace singly linked list.
> 
> Signed-off-by: Baoquan He <bhe@redhat.com>
> Cc: Michal Simek <monstr@monstr.eu>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> ---
> v4->v5:
>   Fix several code bugs reported by test robot on ARCH powerpc and
>   microblaze.

Oops, I mistakenly added the patch change log of the current patch 0002
here. This patch is a newly added one.

> 
> v3->v4:
>   Fix several bugs test robot reported. And change patch log.
> 
> v2->v3:
>   Rename resource functions first_child() and sibling() to
>   resource_first_child() and resource_sibling(). Dan suggested this.
> 
>   Move resource_first_child() and resource_sibling() to linux/ioport.h
>   and make them as inline function. Rob suggested this. Accordingly add
>   linux/list.h including in linux/ioport.h, please help review if this
>   bring efficiency degradation or code redundancy.
> 
>   The change on struct resource {} bring two pointers of size increase,
>   mention this in git log to make it more specifically, Rob suggested
>   this.
> 
>  arch/microblaze/pci/pci-common.c | 37 -------------------------------------
>  arch/powerpc/kernel/pci-common.c | 35 -----------------------------------
>  include/linux/ioport.h           |  1 +
>  kernel/resource.c                | 36 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 37 insertions(+), 72 deletions(-)
> 
> diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
> index f34346d56095..7899bafab064 100644
> --- a/arch/microblaze/pci/pci-common.c
> +++ b/arch/microblaze/pci/pci-common.c
> @@ -619,43 +619,6 @@ int pcibios_add_device(struct pci_dev *dev)
>  EXPORT_SYMBOL(pcibios_add_device);
>  
>  /*
> - * Reparent resource children of pr that conflict with res
> - * under res, and make res replace those children.
> - */
> -static int __init reparent_resources(struct resource *parent,
> -				     struct resource *res)
> -{
> -	struct resource *p, **pp;
> -	struct resource **firstpp = NULL;
> -
> -	for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) {
> -		if (p->end < res->start)
> -			continue;
> -		if (res->end < p->start)
> -			break;
> -		if (p->start < res->start || p->end > res->end)
> -			return -1;	/* not completely contained */
> -		if (firstpp == NULL)
> -			firstpp = pp;
> -	}
> -	if (firstpp == NULL)
> -		return -1;	/* didn't find any conflicting entries? */
> -	res->parent = parent;
> -	res->child = *firstpp;
> -	res->sibling = *pp;
> -	*firstpp = res;
> -	*pp = NULL;
> -	for (p = res->child; p != NULL; p = p->sibling) {
> -		p->parent = res;
> -		pr_debug("PCI: Reparented %s [%llx..%llx] under %s\n",
> -			 p->name,
> -			 (unsigned long long)p->start,
> -			 (unsigned long long)p->end, res->name);
> -	}
> -	return 0;
> -}
> -
> -/*
>   *  Handle resources of PCI devices.  If the world were perfect, we could
>   *  just allocate all the resource regions and do nothing more.  It isn't.
>   *  On the other hand, we cannot just re-allocate all devices, as it would
> diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
> index fe9733ffffaa..926035bb378d 100644
> --- a/arch/powerpc/kernel/pci-common.c
> +++ b/arch/powerpc/kernel/pci-common.c
> @@ -1088,41 +1088,6 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res,
>  EXPORT_SYMBOL(pcibios_align_resource);
>  
>  /*
> - * Reparent resource children of pr that conflict with res
> - * under res, and make res replace those children.
> - */
> -static int reparent_resources(struct resource *parent,
> -				     struct resource *res)
> -{
> -	struct resource *p, **pp;
> -	struct resource **firstpp = NULL;
> -
> -	for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) {
> -		if (p->end < res->start)
> -			continue;
> -		if (res->end < p->start)
> -			break;
> -		if (p->start < res->start || p->end > res->end)
> -			return -1;	/* not completely contained */
> -		if (firstpp == NULL)
> -			firstpp = pp;
> -	}
> -	if (firstpp == NULL)
> -		return -1;	/* didn't find any conflicting entries? */
> -	res->parent = parent;
> -	res->child = *firstpp;
> -	res->sibling = *pp;
> -	*firstpp = res;
> -	*pp = NULL;
> -	for (p = res->child; p != NULL; p = p->sibling) {
> -		p->parent = res;
> -		pr_debug("PCI: Reparented %s %pR under %s\n",
> -			 p->name, p, res->name);
> -	}
> -	return 0;
> -}
> -
> -/*
>   *  Handle resources of PCI devices.  If the world were perfect, we could
>   *  just allocate all the resource regions and do nothing more.  It isn't.
>   *  On the other hand, we cannot just re-allocate all devices, as it would
> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> index da0ebaec25f0..dfdcd0bfe54e 100644
> --- a/include/linux/ioport.h
> +++ b/include/linux/ioport.h
> @@ -192,6 +192,7 @@ extern int allocate_resource(struct resource *root, struct resource *new,
>  struct resource *lookup_resource(struct resource *root, resource_size_t start);
>  int adjust_resource(struct resource *res, resource_size_t start,
>  		    resource_size_t size);
> +int reparent_resources(struct resource *parent, struct resource *res);
>  resource_size_t resource_alignment(struct resource *res);
>  static inline resource_size_t resource_size(const struct resource *res)
>  {
> diff --git a/kernel/resource.c b/kernel/resource.c
> index 30e1bc68503b..5e7c56d5d838 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -983,6 +983,42 @@ int adjust_resource(struct resource *res, resource_size_t start,
>  }
>  EXPORT_SYMBOL(adjust_resource);
>  
> +/*
> + * Reparent resource children of pr that conflict with res
> + * under res, and make res replace those children.
> + */
> +static int reparent_resources(struct resource *parent,
> +				     struct resource *res)
> +{
> +	struct resource *p, **pp;
> +	struct resource **firstpp = NULL;
> +
> +	for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) {
> +		if (p->end < res->start)
> +			continue;
> +		if (res->end < p->start)
> +			break;
> +		if (p->start < res->start || p->end > res->end)
> +			return -1;	/* not completely contained */
> +		if (firstpp == NULL)
> +			firstpp = pp;
> +	}
> +	if (firstpp == NULL)
> +		return -1;	/* didn't find any conflicting entries? */
> +	res->parent = parent;
> +	res->child = *firstpp;
> +	res->sibling = *pp;
> +	*firstpp = res;
> +	*pp = NULL;
> +	for (p = res->child; p != NULL; p = p->sibling) {
> +		p->parent = res;
> +		pr_debug("PCI: Reparented %s %pR under %s\n",
> +			 p->name, p, res->name);
> +	}
> +	return 0;
> +}
> +EXPORT_SYMBOL(reparent_resources);
> +
>  static void __init __reserve_region_with_split(struct resource *root,
>  		resource_size_t start, resource_size_t end,
>  		const char *name)
> -- 
> 2.13.6
> 

^ permalink raw reply

* Re: [PATCH v5 1/4] resource: Move reparent_resources() to kernel/resource.c and make it public
From: kbuild test robot @ 2018-06-12  3:55 UTC (permalink / raw)
  To: Baoquan He
  Cc: kbuild-all, linux-kernel, akpm, robh+dt, dan.j.williams,
	nicolas.pitre, josh, fengguang.wu, bp, patrik.r.jakobsson,
	airlied, kys, haiyangz, sthemmin, dmitry.torokhov, frowand.list,
	keith.busch, jonathan.derrick, lorenzo.pieralisi, bhelgaas, tglx,
	brijesh.singh, jglisse, thomas.lendacky, gregkh, baiyaowei,
	richard.weiyang, devel, linux-input, linux-nvdimm, devicetree,
	linux-pci, ebiederm, vgoyal, dyoung, yinghai, kexec, monstr,
	davem, chris, jcmvbkbc, gustavo, maarten.lankhorst, seanpaul,
	linux-parisc, linuxppc-dev, Baoquan He, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman
In-Reply-To: <20180612032831.29747-2-bhe@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 2814 bytes --]

Hi Baoquan,

I love your patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.17 next-20180608]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Baoquan-He/resource-Use-list_head-to-link-sibling-resource/20180612-113600
config: i386-tinyconfig (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

Note: the linux-review/Baoquan-He/resource-Use-list_head-to-link-sibling-resource/20180612-113600 HEAD 5545e79eef6387857faf41cdffa7be6b1f5d4efe builds fine.
      It only hurts bisectibility.

All errors (new ones prefixed by >>):

>> kernel/resource.c:990:12: error: static declaration of 'reparent_resources' follows non-static declaration
    static int reparent_resources(struct resource *parent,
               ^~~~~~~~~~~~~~~~~~
   In file included from kernel/resource.c:14:0:
   include/linux/ioport.h:195:5: note: previous declaration of 'reparent_resources' was here
    int reparent_resources(struct resource *parent, struct resource *res);
        ^~~~~~~~~~~~~~~~~~
   kernel/resource.c:990:12: warning: 'reparent_resources' defined but not used [-Wunused-function]
    static int reparent_resources(struct resource *parent,
               ^~~~~~~~~~~~~~~~~~

vim +/reparent_resources +990 kernel/resource.c

   985	
   986	/*
   987	 * Reparent resource children of pr that conflict with res
   988	 * under res, and make res replace those children.
   989	 */
 > 990	static int reparent_resources(struct resource *parent,
   991					     struct resource *res)
   992	{
   993		struct resource *p, **pp;
   994		struct resource **firstpp = NULL;
   995	
   996		for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) {
   997			if (p->end < res->start)
   998				continue;
   999			if (res->end < p->start)
  1000				break;
  1001			if (p->start < res->start || p->end > res->end)
  1002				return -1;	/* not completely contained */
  1003			if (firstpp == NULL)
  1004				firstpp = pp;
  1005		}
  1006		if (firstpp == NULL)
  1007			return -1;	/* didn't find any conflicting entries? */
  1008		res->parent = parent;
  1009		res->child = *firstpp;
  1010		res->sibling = *pp;
  1011		*firstpp = res;
  1012		*pp = NULL;
  1013		for (p = res->child; p != NULL; p = p->sibling) {
  1014			p->parent = res;
  1015			pr_debug("PCI: Reparented %s %pR under %s\n",
  1016				 p->name, p, res->name);
  1017		}
  1018		return 0;
  1019	}
  1020	EXPORT_SYMBOL(reparent_resources);
  1021	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 6347 bytes --]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox