LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 2/2] [PowerPC] Allow use of SIMD in interrupts from kernel code
From: Shawn Landden @ 2019-05-14  1:44 UTC (permalink / raw)
  Cc: Shawn Landden, Paul Mackerras, linuxppc-dev
In-Reply-To: <20190514014412.25373-1-shawn@git.icu>

This second patch is kept separate because it could be wrong:
I am not sure how kernel thread migration works,
and it even allows SIMD in preemptible kernel code.

Signed-off-by: Shawn Landden <shawn@git.icu>
---
 arch/powerpc/include/asm/simd.h      |  8 +++++
 arch/powerpc/include/asm/switch_to.h | 10 ++----
 arch/powerpc/kernel/process.c        | 50 ++++++++++++++++++++++++++--
 3 files changed, 57 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/simd.h b/arch/powerpc/include/asm/simd.h
index 2c02ad531..7b582b07e 100644
--- a/arch/powerpc/include/asm/simd.h
+++ b/arch/powerpc/include/asm/simd.h
@@ -7,7 +7,15 @@
  * It's always ok in process context (ie "not interrupt")
  * but it is sometimes ok even from an irq.
  */
+#ifdef CONFIG_ALTIVEC
+extern bool irq_simd_usable(void);
 static __must_check inline bool may_use_simd(void)
 {
 	return irq_simd_usable();
 }
+#else
+static inline bool may_use_simd(void)
+{
+	return false;
+}
+#endif
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 5b03d8a82..537998997 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -44,10 +44,7 @@ extern void enable_kernel_altivec(void);
 extern void flush_altivec_to_thread(struct task_struct *);
 extern void giveup_altivec(struct task_struct *);
 extern void save_altivec(struct task_struct *);
-static inline void disable_kernel_altivec(void)
-{
-	msr_check_and_clear(MSR_VEC);
-}
+extern void disable_kernel_altivec(void);
 #else
 static inline void save_altivec(struct task_struct *t) { }
 static inline void __giveup_altivec(struct task_struct *t) { }
@@ -56,10 +53,7 @@ static inline void __giveup_altivec(struct task_struct *t) { }
 #ifdef CONFIG_VSX
 extern void enable_kernel_vsx(void);
 extern void flush_vsx_to_thread(struct task_struct *);
-static inline void disable_kernel_vsx(void)
-{
-	msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
-}
+extern void disable_kernel_vsx(void);
 #endif
 
 #ifdef CONFIG_SPE
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index e436d708a..41a0ab500 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -267,6 +267,29 @@ static int restore_fp(struct task_struct *tsk) { return 0; }
 #ifdef CONFIG_ALTIVEC
 #define loadvec(thr) ((thr).load_vec)
 
+/*
+ * Track whether the kernel is using the SIMD state
+ * currently.
+ *
+ * This flag is used:
+ *
+ *   - by IRQ context code to potentially use the FPU
+ *     if it's unused.
+ *
+ *   - to debug kernel_altivec/vsx_begin()/end() correctness
+ */
+static DEFINE_PER_CPU(bool, in_kernel_simd);
+
+static bool kernel_simd_disabled(void)
+{
+	return this_cpu_read(in_kernel_simd);
+}
+
+static bool interrupted_kernel_simd_idle(void)
+{
+	return !kernel_simd_disabled();
+}
+
 static void __giveup_altivec(struct task_struct *tsk)
 {
 	unsigned long msr;
@@ -295,7 +318,9 @@ void enable_kernel_altivec(void)
 {
 	unsigned long cpumsr;
 
-	WARN_ON(preemptible());
+	WARN_ON_ONCE(preemptible());
+	WARN_ON_ONCE(this_cpu_read(in_kernel_simd));
+	this_cpu_write(in_kernel_simd, true);
 
 	cpumsr = msr_check_and_set(MSR_VEC);
 
@@ -316,6 +341,14 @@ void enable_kernel_altivec(void)
 }
 EXPORT_SYMBOL(enable_kernel_altivec);
 
+extern void disable_kernel_altivec(void)
+{
+	WARN_ON_ONCE(!this_cpu_read(in_kernel_simd));
+	this_cpu_write(in_kernel_simd, false);
+	msr_check_and_clear(MSR_VEC);
+}
+EXPORT_SYMBOL(disable_kernel_altivec);
+
 /*
  * Make sure the VMX/Altivec register state in the
  * the thread_struct is up to date for task tsk.
@@ -371,7 +404,8 @@ static bool interrupted_user_mode(void)
 bool irq_simd_usable(void)
 {
 	return !in_interrupt() ||
-		interrupted_user_mode();
+		interrupted_user_mode() ||
+		interrupted_kernel_simd_idle();
 }
 EXPORT_SYMBOL(irq_simd_usable);
 
@@ -411,7 +445,9 @@ void enable_kernel_vsx(void)
 {
 	unsigned long cpumsr;
 
-	WARN_ON(preemptible());
+	WARN_ON_ONCE(preemptible());
+	WARN_ON_ONCE(this_cpu_read(in_kernel_simd));
+	this_cpu_write(in_kernel_simd, true);
 
 	cpumsr = msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
 
@@ -433,6 +469,14 @@ void enable_kernel_vsx(void)
 }
 EXPORT_SYMBOL(enable_kernel_vsx);
 
+void disable_kernel_vsx(void)
+{
+	WARN_ON_ONCE(!this_cpu_read(in_kernel_simd));
+	this_cpu_write(in_kernel_simd, false);
+	msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
+}
+EXPORT_SYMBOL(disable_kernel_vsx);
+
 void flush_vsx_to_thread(struct task_struct *tsk)
 {
 	if (tsk->thread.regs) {
-- 
2.21.0.1020.gf2820cf01a


^ permalink raw reply related

* Re: [PATCH] vsprintf: Do not break early boot with probing addresses
From: Sergey Senozhatsky @ 2019-05-14  2:07 UTC (permalink / raw)
  To: Petr Mladek
  Cc: linux-arch@vger.kernel.org, Sergey Senozhatsky, Heiko Carstens,
	linux-s390@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	Rasmus Villemoes, linux-kernel@vger.kernel.org, Steven Rostedt,
	Michal Hocko, Sergey Senozhatsky, David Laight, Stephen Rothwell,
	Andy Shevchenko, Linus Torvalds, Martin Schwidefsky,
	Tobin C . Harding
In-Reply-To: <20190513124220.wty2qbnz4wo52h3x@pathway.suse.cz>

On (05/13/19 14:42), Petr Mladek wrote:
> > The "(null)" is good enough by itself and already an established
> > practice..
> 
> (efault) made more sense with the probe_kernel_read() that
> checked wide range of addresses. Well, I still think that
> it makes sense to distinguish a pure NULL. And it still
> used also for IS_ERR_VALUE().

Wouldn't anything within first PAGE_SIZE bytes be reported as
a NULL deref?

       char *p = (char *)(PAGE_SIZE - 2);
       *p = 'a';

gives

 kernel: BUG: kernel NULL pointer dereference, address = 0000000000000ffe
 kernel: #PF: supervisor-privileged write access from kernel code
 kernel: #PF: error_code(0x0002) - not-present page


And I like Steven's "(fault)" idea.
How about this:

	if ptr < PAGE_SIZE		-> "(null)"
	if IS_ERR_VALUE(ptr)		-> "(fault)"

	-ss

^ permalink raw reply

* [v2 1/2] [PowerPC] Add simd.h implementation
From: Shawn Landden @ 2019-05-14  2:23 UTC (permalink / raw)
  Cc: Shawn Landden, Paul Mackerras, linuxppc-dev
In-Reply-To: <20190513005104.20140-1-shawn@git.icu>

Based off the x86 one.

WireGuard really wants to be able to do SIMD in interrupts,
so it can accelerate its in-bound path.

Signed-off-by: Shawn Landden <shawn@git.icu>
---
 arch/powerpc/include/asm/simd.h | 10 ++++++++++
 arch/powerpc/kernel/process.c   | 30 ++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)
 create mode 100644 arch/powerpc/include/asm/simd.h

diff --git a/arch/powerpc/include/asm/simd.h b/arch/powerpc/include/asm/simd.h
new file mode 100644
index 000000000..9b066d633
--- /dev/null
+++ b/arch/powerpc/include/asm/simd.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+/*
+ * may_use_simd - whether it is allowable at this time to issue SIMD
+ *                instructions or access the SIMD register file
+ *
+ * It's always ok in process context (ie "not interrupt")
+ * but it is sometimes ok even from an irq.
+ */
+extern bool may_use_simd(void);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index dd9e0d538..ef534831f 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -345,6 +345,36 @@ static int restore_altivec(struct task_struct *tsk)
 	}
 	return 0;
 }
+
+/*
+ * Were we in user mode when we were
+ * interrupted?
+ *
+ * Doing kernel_altivec/vsx_begin/end() is ok if we are running
+ * in an interrupt context from user mode - we'll just
+ * save the FPU state as required.
+ */
+static bool interrupted_user_mode(void)
+{
+	struct pt_regs *regs = get_irq_regs();
+
+	return regs && user_mode(regs);
+}
+
+/*
+ * Can we use FPU in kernel mode with the
+ * whole "kernel_fpu/altivec/vsx_begin/end()" sequence?
+ *
+ * It's always ok in process context (ie "not interrupt")
+ * but it is sometimes ok even from an irq.
+ */
+bool may_use_simd(void)
+{
+	return !in_interrupt() ||
+		interrupted_user_mode();
+}
+EXPORT_SYMBOL(may_use_simd);
+
 #else
 #define loadvec(thr) 0
 static inline int restore_altivec(struct task_struct *tsk) { return 0; }
-- 
2.21.0.1020.gf2820cf01a


^ permalink raw reply related

* [v2 2/2] [PowerPC] Allow use of SIMD in interrupts from kernel code
From: Shawn Landden @ 2019-05-14  2:23 UTC (permalink / raw)
  Cc: Shawn Landden, Paul Mackerras, linuxppc-dev
In-Reply-To: <20190514022308.32363-1-shawn@git.icu>

This second patch is kept separate because it could be wrong:
I am not sure how kernel thread migration works,
and it even allows SIMD in preemptible kernel code.

Signed-off-by: Shawn Landden <shawn@git.icu>
---
 arch/powerpc/include/asm/switch_to.h | 15 ++-----
 arch/powerpc/kernel/process.c        | 60 ++++++++++++++++++++++++++--
 2 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 5b03d8a82..c79f7d24a 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -30,10 +30,7 @@ extern void enable_kernel_fp(void);
 extern void flush_fp_to_thread(struct task_struct *);
 extern void giveup_fpu(struct task_struct *);
 extern void save_fpu(struct task_struct *);
-static inline void disable_kernel_fp(void)
-{
-	msr_check_and_clear(MSR_FP);
-}
+extern void disable_kernel_fp(void);
 #else
 static inline void save_fpu(struct task_struct *t) { }
 static inline void flush_fp_to_thread(struct task_struct *t) { }
@@ -44,10 +41,7 @@ extern void enable_kernel_altivec(void);
 extern void flush_altivec_to_thread(struct task_struct *);
 extern void giveup_altivec(struct task_struct *);
 extern void save_altivec(struct task_struct *);
-static inline void disable_kernel_altivec(void)
-{
-	msr_check_and_clear(MSR_VEC);
-}
+extern void disable_kernel_altivec(void);
 #else
 static inline void save_altivec(struct task_struct *t) { }
 static inline void __giveup_altivec(struct task_struct *t) { }
@@ -56,10 +50,7 @@ static inline void __giveup_altivec(struct task_struct *t) { }
 #ifdef CONFIG_VSX
 extern void enable_kernel_vsx(void);
 extern void flush_vsx_to_thread(struct task_struct *);
-static inline void disable_kernel_vsx(void)
-{
-	msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
-}
+extern void disable_kernel_vsx(void);
 #endif
 
 #ifdef CONFIG_SPE
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ef534831f..d15f2cb51 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -170,6 +170,29 @@ void __msr_check_and_clear(unsigned long bits)
 EXPORT_SYMBOL(__msr_check_and_clear);
 
 #ifdef CONFIG_PPC_FPU
+/*
+ * Track whether the kernel is using the FPU state
+ * currently.
+ *
+ * This flag is used:
+ *
+ *   - by IRQ context code to potentially use the FPU
+ *     if it's unused.
+ *
+ *   - to debug kernel_fpu/altivec/vsx_begin()/end() correctness
+ */
+static DEFINE_PER_CPU(bool, in_kernel_fpu);
+
+static bool kernel_fpu_disabled(void)
+{
+        return this_cpu_read(in_kernel_fpu);
+}
+
+static bool interrupted_kernel_fpu_idle(void)
+{
+        return !kernel_fpu_disabled();
+}
+
 static void __giveup_fpu(struct task_struct *tsk)
 {
 	unsigned long msr;
@@ -230,7 +253,8 @@ void enable_kernel_fp(void)
 {
 	unsigned long cpumsr;
 
-	WARN_ON(preemptible());
+        WARN_ON_ONCE(this_cpu_read(in_kernel_fpu));
+        this_cpu_write(in_kernel_fpu, true);
 
 	cpumsr = msr_check_and_set(MSR_FP);
 
@@ -251,6 +275,15 @@ void enable_kernel_fp(void)
 }
 EXPORT_SYMBOL(enable_kernel_fp);
 
+void disable_kernel_fp(void)
+{
+        WARN_ON_ONCE(!this_cpu_read(in_kernel_fpu));
+        this_cpu_write(in_kernel_fpu, false);
+
+        msr_check_and_clear(MSR_FP);
+}
+EXPORT_SYMBOL(disable_kernel_fp);
+
 static int restore_fp(struct task_struct *tsk)
 {
 	if (tsk->thread.load_fp || tm_active_with_fp(tsk)) {
@@ -295,7 +328,8 @@ void enable_kernel_altivec(void)
 {
 	unsigned long cpumsr;
 
-	WARN_ON(preemptible());
+	WARN_ON_ONCE(this_cpu_read(in_kernel_fpu));
+	this_cpu_write(in_kernel_fpu, true);
 
 	cpumsr = msr_check_and_set(MSR_VEC);
 
@@ -316,6 +350,14 @@ void enable_kernel_altivec(void)
 }
 EXPORT_SYMBOL(enable_kernel_altivec);
 
+extern void disable_kernel_altivec(void)
+{
+	WARN_ON_ONCE(!this_cpu_read(in_kernel_fpu));
+	this_cpu_write(in_kernel_fpu, false);
+	msr_check_and_clear(MSR_VEC);
+}
+EXPORT_SYMBOL(disable_kernel_altivec);
+
 /*
  * Make sure the VMX/Altivec register state in the
  * the thread_struct is up to date for task tsk.
@@ -371,7 +413,8 @@ static bool interrupted_user_mode(void)
 bool may_use_simd(void)
 {
 	return !in_interrupt() ||
-		interrupted_user_mode();
+		interrupted_user_mode() ||
+		interrupted_kernel_fpu_idle();
 }
 EXPORT_SYMBOL(may_use_simd);
 
@@ -411,7 +454,8 @@ void enable_kernel_vsx(void)
 {
 	unsigned long cpumsr;
 
-	WARN_ON(preemptible());
+	WARN_ON_ONCE(this_cpu_read(in_kernel_fpu));
+	this_cpu_write(in_kernel_fpu, true);
 
 	cpumsr = msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
 
@@ -433,6 +477,14 @@ void enable_kernel_vsx(void)
 }
 EXPORT_SYMBOL(enable_kernel_vsx);
 
+void disable_kernel_vsx(void)
+{
+	WARN_ON_ONCE(!this_cpu_read(in_kernel_fpu));
+	this_cpu_write(in_kernel_fpu, false);
+	msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
+}
+EXPORT_SYMBOL(disable_kernel_vsx);
+
 void flush_vsx_to_thread(struct task_struct *tsk)
 {
 	if (tsk->thread.regs) {
-- 
2.21.0.1020.gf2820cf01a


^ permalink raw reply related

* Re: [PATCH] vsprintf: Do not break early boot with probing addresses
From: Sergey Senozhatsky @ 2019-05-14  2:25 UTC (permalink / raw)
  To: Petr Mladek
  Cc: linux-arch@vger.kernel.org, Sergey Senozhatsky, Heiko Carstens,
	linux-s390@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	Rasmus Villemoes, linux-kernel@vger.kernel.org, Steven Rostedt,
	Michal Hocko, Sergey Senozhatsky, David Laight, Stephen Rothwell,
	Andy Shevchenko, Linus Torvalds, Martin Schwidefsky,
	Tobin C . Harding
In-Reply-To: <20190514020730.GA651@jagdpanzerIV>

On (05/14/19 11:07), Sergey Senozhatsky wrote:
> How about this:
> 
> 	if ptr < PAGE_SIZE		-> "(null)"

No, this is totally stupid. Forget about it. Sorry.


> 	if IS_ERR_VALUE(ptr)		-> "(fault)"

But Steven's "(fault)" is nice.

	-ss

^ permalink raw reply

* [RFC PATCH] mm/nvdimm: Fix kernel crash on devm_mremap_pages_release
From: Aneesh Kumar K.V @ 2019-05-14  2:53 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-mm, linuxppc-dev, Aneesh Kumar K.V, linux-nvdimm

When we initialize the namespace, if we support altmap, we don't initialize all the
backing struct pages, whereas while releasing the namespace we look at some of
these uninitialized struct pages. This results in a kernel crash as below.

kernel BUG at include/linux/mm.h:1034!
cpu 0x2: Vector: 700 (Program Check) at [c00000024146b870]
    pc: c0000000003788f8: devm_memremap_pages_release+0x258/0x3a0
    lr: c0000000003788f4: devm_memremap_pages_release+0x254/0x3a0
    sp: c00000024146bb00
   msr: 800000000282b033
  current = 0xc000000241382f00
  paca    = 0xc00000003fffd680   irqmask: 0x03   irq_happened: 0x01
    pid   = 4114, comm = ndctl
 c0000000009bf8c0 devm_action_release+0x30/0x50
 c0000000009c0938 release_nodes+0x268/0x2d0
 c0000000009b95b4 device_release_driver_internal+0x164/0x230
 c0000000009b638c unbind_store+0x13c/0x190
 c0000000009b4f44 drv_attr_store+0x44/0x60
 c00000000058ccc0 sysfs_kf_write+0x70/0xa0
 c00000000058b52c kernfs_fop_write+0x1ac/0x290
 c0000000004a415c __vfs_write+0x3c/0x70
 c0000000004a85ac vfs_write+0xec/0x200
 c0000000004a8920 ksys_write+0x80/0x130
 c00000000000bee4 system_call+0x5c/0x70

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/page_alloc.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 59661106da16..892eabe1ec13 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5740,8 +5740,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 
 #ifdef CONFIG_ZONE_DEVICE
 	/*
-	 * Honor reservation requested by the driver for this ZONE_DEVICE
-	 * memory. We limit the total number of pages to initialize to just
+	 * We limit the total number of pages to initialize to just
 	 * those that might contain the memory mapping. We will defer the
 	 * ZONE_DEVICE page initialization until after we have released
 	 * the hotplug lock.
@@ -5750,8 +5749,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		if (!altmap)
 			return;
 
-		if (start_pfn == altmap->base_pfn)
-			start_pfn += altmap->reserve;
 		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
 	}
 #endif
-- 
2.21.0


^ permalink raw reply related

* [PATCH] mm/nvdimm: Pick the right alignment default when creating dax devices
From: Aneesh Kumar K.V @ 2019-05-14  2:54 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-mm, linuxppc-dev, Aneesh Kumar K.V, linux-nvdimm

Allow arch to provide the supported alignments and use hugepage alignment only
if we support hugepage. Right now we depend on compile time configs whereas this
patch switches this to runtime discovery.

Architectures like ppc64 can have THP enabled in code, but then can have
hugepage size disabled by the hypervisor. This allows us to create dax devices
with PAGE_SIZE alignment in this case.

Existing dax namespaces with alignment larger than PAGE_SIZE will fail to
initialize in this specific case. We still allow fsdax namespace initialization.

With respect to identifying whether to enable hugepage fault for a dax device,
if THP is enabled at compile time, we default to taking a hugepage fault, and in
the dax fault handler, if we find the fault size > alignment, we retry with
PAGE_SIZE fault size.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/libnvdimm.h |  9 ++++++++
 arch/powerpc/mm/Makefile             |  1 +
 arch/powerpc/mm/nvdimm.c             | 34 ++++++++++++++++++++++++++++
 arch/x86/include/asm/libnvdimm.h     | 19 ++++++++++++++++
 drivers/nvdimm/nd.h                  |  6 -----
 drivers/nvdimm/pfn_devs.c            | 32 +++++++++++++++++++++++++-
 include/linux/huge_mm.h              |  7 +++++-
 7 files changed, 100 insertions(+), 8 deletions(-)
 create mode 100644 arch/powerpc/include/asm/libnvdimm.h
 create mode 100644 arch/powerpc/mm/nvdimm.c
 create mode 100644 arch/x86/include/asm/libnvdimm.h

diff --git a/arch/powerpc/include/asm/libnvdimm.h b/arch/powerpc/include/asm/libnvdimm.h
new file mode 100644
index 000000000000..d35fd7f48603
--- /dev/null
+++ b/arch/powerpc/include/asm/libnvdimm.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_LIBNVDIMM_H
+#define _ASM_POWERPC_LIBNVDIMM_H
+
+#define nd_pfn_supported_alignments nd_pfn_supported_alignments
+extern unsigned long *nd_pfn_supported_alignments(void);
+extern unsigned long nd_pfn_default_alignment(void);
+
+#endif
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 0f499db315d6..42e4a399ba5d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_HIGHMEM)		+= highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)	+= copro_fault.o
 obj-$(CONFIG_PPC_PTDUMP)	+= ptdump/
 obj-$(CONFIG_KASAN)		+= kasan/
+obj-$(CONFIG_NVDIMM_PFN)		+= nvdimm.o
diff --git a/arch/powerpc/mm/nvdimm.c b/arch/powerpc/mm/nvdimm.c
new file mode 100644
index 000000000000..a29a4510715e
--- /dev/null
+++ b/arch/powerpc/mm/nvdimm.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+
+#include <linux/mm.h>
+/*
+ * We support only pte and pmd mappings for now.
+ */
+const unsigned long *nd_pfn_supported_alignments(void)
+{
+	static unsigned long supported_alignments[3];
+
+	supported_alignments[0] = PAGE_SIZE;
+
+	if (has_transparent_hugepage())
+		supported_alignments[1] = HPAGE_PMD_SIZE;
+	else
+		supported_alignments[1] = 0;
+
+	supported_alignments[2] = 0;
+	return supported_alignments;
+}
+
+/*
+ * Use pmd mapping if supported as default alignment
+ */
+unsigned long nd_pfn_default_alignment(void)
+{
+
+	if (has_transparent_hugepage())
+		return HPAGE_PMD_SIZE;
+	return PAGE_SIZE;
+}
diff --git a/arch/x86/include/asm/libnvdimm.h b/arch/x86/include/asm/libnvdimm.h
new file mode 100644
index 000000000000..3d5361db9164
--- /dev/null
+++ b/arch/x86/include/asm/libnvdimm.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_LIBNVDIMM_H
+#define _ASM_X86_LIBNVDIMM_H
+
+static inline unsigned long nd_pfn_default_alignment(void)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	return HPAGE_PMD_SIZE;
+#else
+	return PAGE_SIZE;
+#endif
+}
+
+static inline unsigned long nd_altmap_align_size(unsigned long nd_align)
+{
+	return PMD_SIZE;
+}
+
+#endif
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index a5ac3b240293..44fe923b2ee3 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -292,12 +292,6 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region)
 struct nd_pfn *to_nd_pfn(struct device *dev);
 #if IS_ENABLED(CONFIG_NVDIMM_PFN)
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PFN_DEFAULT_ALIGNMENT HPAGE_PMD_SIZE
-#else
-#define PFN_DEFAULT_ALIGNMENT PAGE_SIZE
-#endif
-
 int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns);
 bool is_nd_pfn(struct device *dev);
 struct device *nd_pfn_create(struct nd_region *nd_region);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 01f40672507f..347cab166376 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <asm/libnvdimm.h>
 #include "nd-core.h"
 #include "pfn.h"
 #include "nd.h"
@@ -111,6 +112,8 @@ static ssize_t align_show(struct device *dev,
 	return sprintf(buf, "%ld\n", nd_pfn->align);
 }
 
+#ifndef nd_pfn_supported_alignments
+#define nd_pfn_supported_alignments nd_pfn_supported_alignments
 static const unsigned long *nd_pfn_supported_alignments(void)
 {
 	/*
@@ -133,6 +136,7 @@ static const unsigned long *nd_pfn_supported_alignments(void)
 
 	return data;
 }
+#endif
 
 static ssize_t align_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
@@ -310,7 +314,7 @@ struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
 		return NULL;
 
 	nd_pfn->mode = PFN_MODE_NONE;
-	nd_pfn->align = PFN_DEFAULT_ALIGNMENT;
+	nd_pfn->align = nd_pfn_default_alignment();
 	dev = &nd_pfn->dev;
 	device_initialize(&nd_pfn->dev);
 	if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) {
@@ -420,6 +424,20 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn)
 	return 0;
 }
 
+static bool nd_supported_alignment(unsigned long align)
+{
+	int i;
+	const unsigned long *supported = nd_pfn_supported_alignments();
+
+	if (align == 0)
+		return false;
+
+	for (i = 0; supported[i]; i++)
+		if (align == supported[i])
+			return true;
+	return false;
+}
+
 int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 {
 	u64 checksum, offset;
@@ -474,6 +492,18 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 		align = 1UL << ilog2(offset);
 	mode = le32_to_cpu(pfn_sb->mode);
 
+	/*
+	 * Check whether the we support the alignment. For Dax if the
+	 * superblock alignment is not matching, we won't initialize
+	 * the device.
+	 */
+	if (!nd_supported_alignment(align) &&
+	    memcmp(pfn_sb->signature, DAX_SIG, PFN_SIG_LEN)) {
+		dev_err(&nd_pfn->dev, "init failed, settings mismatch\n");
+		dev_dbg(&nd_pfn->dev, "align: %lx:%lx\n", nd_pfn->align, align);
+		return -EINVAL;
+	}
+
 	if (!nd_pfn->uuid) {
 		/*
 		 * When probing a namepace via nd_pfn_probe() the uuid
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 381e872bfde0..d5cfea3d8b86 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -110,7 +110,12 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 
 	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
 		return true;
-
+	/*
+	 * For dax let's try to do hugepage fault always. If we don't support
+	 * hugepages we will not have enabled namespaces with hugepage alignment.
+	 * This also means we try to handle hugepage fault on device with
+	 * smaller alignment. But for then we will return with VM_FAULT_FALLBACK
+	 */
 	if (vma_is_dax(vma))
 		return true;
 
-- 
2.21.0


^ permalink raw reply related

* [PATCH] mm/nvdimm: Use correct alignment when looking at first pfn from a region
From: Aneesh Kumar K.V @ 2019-05-14  2:55 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-mm, linuxppc-dev, Aneesh Kumar K.V, linux-nvdimm

We already add the start_pad to the resource->start but fail to section
align the start. This makes sure that with altmap we compute the right first
pfn when start_pad is zero and we are doing an align down of the start address.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 kernel/memremap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/memremap.c b/kernel/memremap.c
index a856cb5ff192..23d77b60e728 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -59,9 +59,9 @@ static unsigned long pfn_first(struct dev_pagemap *pgmap)
 {
 	const struct resource *res = &pgmap->res;
 	struct vmem_altmap *altmap = &pgmap->altmap;
-	unsigned long pfn;
+	unsigned long pfn = PHYS_PFN(res->start);
 
-	pfn = res->start >> PAGE_SHIFT;
+	pfn = SECTION_ALIGN_DOWN(pfn);
 	if (pgmap->altmap_valid)
 		pfn += vmem_altmap_offset(altmap);
 	return pfn;
-- 
2.21.0


^ permalink raw reply related

* [PATCH] mm/nvdimm: Use correct #defines instead of opencoding
From: Aneesh Kumar K.V @ 2019-05-14  2:56 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-mm, linuxppc-dev, Aneesh Kumar K.V, linux-nvdimm

The npfns-related change is needed to fix the kernel message

"number of pfns truncated from 2617344 to 163584"

The change makes sure the npfns value stored in the superblock is the right value.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 drivers/nvdimm/pfn_devs.c    | 6 +++---
 drivers/nvdimm/region_devs.c | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 347cab166376..6751ff0296ef 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -777,8 +777,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 		 * when populating the vmemmap. This *should* be equal to
 		 * PMD_SIZE for most architectures.
 		 */
-		offset = ALIGN(start + reserve + 64 * npfns,
-				max(nd_pfn->align, PMD_SIZE)) - start;
+		offset = ALIGN(start + reserve + sizeof(struct page) * npfns,
+			       max(nd_pfn->align, PMD_SIZE)) - start;
 	} else if (nd_pfn->mode == PFN_MODE_RAM)
 		offset = ALIGN(start + reserve, nd_pfn->align) - start;
 	else
@@ -790,7 +790,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 		return -ENXIO;
 	}
 
-	npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
+	npfns = (size - offset - start_pad - end_trunc) / PAGE_SIZE;
 	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
 	pfn_sb->dataoff = cpu_to_le64(offset);
 	pfn_sb->npfns = cpu_to_le64(npfns);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index b4ef7d9ff22e..2d8facea5a03 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -994,10 +994,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 		struct nd_mapping_desc *mapping = &ndr_desc->mapping[i];
 		struct nvdimm *nvdimm = mapping->nvdimm;
 
-		if ((mapping->start | mapping->size) % SZ_4K) {
-			dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n",
-					caller, dev_name(&nvdimm->dev), i);
-
+		if ((mapping->start | mapping->size) % PAGE_SIZE) {
+			dev_err(&nvdimm_bus->dev,
+				"%s: %s mapping%d is not 4K aligned\n",
+				caller, dev_name(&nvdimm->dev), i);
 			return NULL;
 		}
 
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH] mm/nvdimm: Use correct #defines instead of opencoding
From: Dan Williams @ 2019-05-14  3:58 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Linux MM, linuxppc-dev, linux-nvdimm
In-Reply-To: <20190514025604.9997-1-aneesh.kumar@linux.ibm.com>

On Mon, May 13, 2019 at 7:56 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> The nfpn related change is needed to fix the kernel message
>
> "number of pfns truncated from 2617344 to 163584"
>
> The change makes sure the nfpns stored in the superblock is right value.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  drivers/nvdimm/pfn_devs.c    | 6 +++---
>  drivers/nvdimm/region_devs.c | 8 ++++----
>  2 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
> index 347cab166376..6751ff0296ef 100644
> --- a/drivers/nvdimm/pfn_devs.c
> +++ b/drivers/nvdimm/pfn_devs.c
> @@ -777,8 +777,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
>                  * when populating the vmemmap. This *should* be equal to
>                  * PMD_SIZE for most architectures.
>                  */
> -               offset = ALIGN(start + reserve + 64 * npfns,
> -                               max(nd_pfn->align, PMD_SIZE)) - start;
> +               offset = ALIGN(start + reserve + sizeof(struct page) * npfns,
> +                              max(nd_pfn->align, PMD_SIZE)) - start;

No, I think we need to record the page-size into the superblock format
otherwise this breaks in debug builds where the struct-page size is
extended.

>         } else if (nd_pfn->mode == PFN_MODE_RAM)
>                 offset = ALIGN(start + reserve, nd_pfn->align) - start;
>         else
> @@ -790,7 +790,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
>                 return -ENXIO;
>         }
>
> -       npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
> +       npfns = (size - offset - start_pad - end_trunc) / PAGE_SIZE;

Similar comment, if the page size is variable then the superblock
needs to explicitly account for it.

^ permalink raw reply

* Re: [RFC PATCH] mm/nvdimm: Fix kernel crash on devm_mremap_pages_release
From: Anshuman Khandual @ 2019-05-14  4:05 UTC (permalink / raw)
  To: Aneesh Kumar K.V, dan.j.williams; +Cc: linux-mm, linuxppc-dev, linux-nvdimm
In-Reply-To: <20190514025354.9108-1-aneesh.kumar@linux.ibm.com>

On 05/14/2019 08:23 AM, Aneesh Kumar K.V wrote:
> When we initialize the namespace, if we support altmap, we don't initialize all the
> backing struct page where as while releasing the namespace we look at some of
> these uninitilized struct page. This results in a kernel crash as below.
Yes, this has been problematic; I have also encountered it previously, but in a
slightly different way (while searching memory resources).

> 
> kernel BUG at include/linux/mm.h:1034!
What would that be? I did not see a corresponding BUG_ON() line in the file.

> cpu 0x2: Vector: 700 (Program Check) at [c00000024146b870]
>     pc: c0000000003788f8: devm_memremap_pages_release+0x258/0x3a0
>     lr: c0000000003788f4: devm_memremap_pages_release+0x254/0x3a0
>     sp: c00000024146bb00
>    msr: 800000000282b033
>   current = 0xc000000241382f00
>   paca    = 0xc00000003fffd680   irqmask: 0x03   irq_happened: 0x01
>     pid   = 4114, comm = ndctl
>  c0000000009bf8c0 devm_action_release+0x30/0x50
>  c0000000009c0938 release_nodes+0x268/0x2d0
>  c0000000009b95b4 device_release_driver_internal+0x164/0x230
>  c0000000009b638c unbind_store+0x13c/0x190
>  c0000000009b4f44 drv_attr_store+0x44/0x60
>  c00000000058ccc0 sysfs_kf_write+0x70/0xa0
>  c00000000058b52c kernfs_fop_write+0x1ac/0x290
>  c0000000004a415c __vfs_write+0x3c/0x70
>  c0000000004a85ac vfs_write+0xec/0x200
>  c0000000004a8920 ksys_write+0x80/0x130
>  c00000000000bee4 system_call+0x5c/0x70

I saw this as a memory hotplug problem with respect to ZONE_DEVICE based device memory.
Hence a slightly different explanation, which I never posted. I guess parts of the commit
message here can be used for a better, more comprehensive explanation of the problem.

mm/hotplug: Initialize struct pages for vmem_altmap reserved areas

The following ZONE_DEVICE ranges (altmap) have valid struct pages allocated
from within device memory memmap range.

A. Driver reserved area	[BASE -> BASE + RESV)
B. Device mmap area	[BASE + RESV -> BASE + RESV + FREE)
C. Device usable area	[BASE + RESV + FREE -> END]

BASE - pgmap->altmap.base_pfn (pgmap->res.start >> PAGE_SHIFT)
RESV - pgmap->altmap.reserve
FREE - pgmap->altmap.free
END  - pgmap->res.end >> PAGE_SHIFT

Struct page init for all areas happens in two phases, each of which detects the
altmap use case and initializes part of the device range.

1. memmap_init_zone		(Device mmap area)
2. memmap_init_zone_device	(Device usable area)

memmap_init_zone() skips driver reserved area and does not init the
struct pages. This is problematic primarily for two reasons.

Though NODE_DATA(device_node(dev))->node_zones[ZONE_DEVICE] contains the
device memory range in its entirety (in zone->spanned_pages), parts of this
range do not have the zone set to ZONE_DEVICE in their struct page.

__remove_pages() called directly or from within arch_remove_memory() during
the ZONE_DEVICE tear down procedure (devm_memremap_pages_release) hits an error
(like below) if there are reserved pages. This is because the first pfn of
the device range (invariably also the first pfn of the reserved area) cannot
be identified as belonging to ZONE_DEVICE. This erroneously leads to a range
search within the iomem_resource region, which never contained this device
memory region, and eventually ends up printing the following error.

Unable to release resource <0x0000000680000000-0x00000006bfffffff> (-22)

Initialize struct pages for the driver reserved range while still staying
clear of its contents.

> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  mm/page_alloc.c | 5 +----
>  1 file changed, 1 insertion(+), 4 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 59661106da16..892eabe1ec13 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5740,8 +5740,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>  
>  #ifdef CONFIG_ZONE_DEVICE
>  	/*
> -	 * Honor reservation requested by the driver for this ZONE_DEVICE
> -	 * memory. We limit the total number of pages to initialize to just
> +	 * We limit the total number of pages to initialize to just
The comment needs a small change to reflect the fact that both the driver reserved
area and the mapped area (containing altmap struct pages) need init here.

^ permalink raw reply

* Re: [PATCH] mm/nvdimm: Use correct #defines instead of opencoding
From: Aneesh Kumar K.V @ 2019-05-14  4:05 UTC (permalink / raw)
  To: Dan Williams; +Cc: Linux MM, linuxppc-dev, linux-nvdimm
In-Reply-To: <CAPcyv4iNgFbSq0Hqb+CStRhGWMHfXx7tL3vrDaQ95DcBBY8QCQ@mail.gmail.com>

On 5/14/19 9:28 AM, Dan Williams wrote:
> On Mon, May 13, 2019 at 7:56 PM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> The nfpn related change is needed to fix the kernel message
>>
>> "number of pfns truncated from 2617344 to 163584"
>>
>> The change makes sure the nfpns stored in the superblock is right value.
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>   drivers/nvdimm/pfn_devs.c    | 6 +++---
>>   drivers/nvdimm/region_devs.c | 8 ++++----
>>   2 files changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
>> index 347cab166376..6751ff0296ef 100644
>> --- a/drivers/nvdimm/pfn_devs.c
>> +++ b/drivers/nvdimm/pfn_devs.c
>> @@ -777,8 +777,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
>>                   * when populating the vmemmap. This *should* be equal to
>>                   * PMD_SIZE for most architectures.
>>                   */
>> -               offset = ALIGN(start + reserve + 64 * npfns,
>> -                               max(nd_pfn->align, PMD_SIZE)) - start;
>> +               offset = ALIGN(start + reserve + sizeof(struct page) * npfns,
>> +                              max(nd_pfn->align, PMD_SIZE)) - start;
> 
> No, I think we need to record the page-size into the superblock format
> otherwise this breaks in debug builds where the struct-page size is
> extended.
> 
>>          } else if (nd_pfn->mode == PFN_MODE_RAM)
>>                  offset = ALIGN(start + reserve, nd_pfn->align) - start;
>>          else
>> @@ -790,7 +790,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
>>                  return -ENXIO;
>>          }
>>
>> -       npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
>> +       npfns = (size - offset - start_pad - end_trunc) / PAGE_SIZE;
> 
> Similar comment, if the page size is variable then the superblock
> needs to explicitly account for it.
> 

PAGE_SIZE is not really variable. What we can run into is the issue you 
mentioned above. The size of struct page can change which means the 
reserved space for keeping vmemmap in device may not be sufficient for 
certain kernel builds.

I was planning to add another patch that fails namespace init if we 
don't have enough space to keep the struct page.

Why do you suggest we need to have PAGE_SIZE as part of pfn superblock?

-aneesh


^ permalink raw reply

* Re: [PATCH] mm/nvdimm: Use correct #defines instead of opencoding
From: Dan Williams @ 2019-05-14  4:12 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Linux MM, linuxppc-dev, linux-nvdimm
In-Reply-To: <f99c4f11-a43d-c2d3-ab4f-b7072d090351@linux.ibm.com>

On Mon, May 13, 2019 at 9:05 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> On 5/14/19 9:28 AM, Dan Williams wrote:
> > On Mon, May 13, 2019 at 7:56 PM Aneesh Kumar K.V
> > <aneesh.kumar@linux.ibm.com> wrote:
> >>
> >> The nfpn related change is needed to fix the kernel message
> >>
> >> "number of pfns truncated from 2617344 to 163584"
> >>
> >> The change makes sure the nfpns stored in the superblock is right value.
> >>
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >> ---
> >>   drivers/nvdimm/pfn_devs.c    | 6 +++---
> >>   drivers/nvdimm/region_devs.c | 8 ++++----
> >>   2 files changed, 7 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
> >> index 347cab166376..6751ff0296ef 100644
> >> --- a/drivers/nvdimm/pfn_devs.c
> >> +++ b/drivers/nvdimm/pfn_devs.c
> >> @@ -777,8 +777,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
> >>                   * when populating the vmemmap. This *should* be equal to
> >>                   * PMD_SIZE for most architectures.
> >>                   */
> >> -               offset = ALIGN(start + reserve + 64 * npfns,
> >> -                               max(nd_pfn->align, PMD_SIZE)) - start;
> >> +               offset = ALIGN(start + reserve + sizeof(struct page) * npfns,
> >> +                              max(nd_pfn->align, PMD_SIZE)) - start;
> >
> > No, I think we need to record the page-size into the superblock format
> > otherwise this breaks in debug builds where the struct-page size is
> > extended.
> >
> >>          } else if (nd_pfn->mode == PFN_MODE_RAM)
> >>                  offset = ALIGN(start + reserve, nd_pfn->align) - start;
> >>          else
> >> @@ -790,7 +790,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
> >>                  return -ENXIO;
> >>          }
> >>
> >> -       npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
> >> +       npfns = (size - offset - start_pad - end_trunc) / PAGE_SIZE;
> >
> > Similar comment, if the page size is variable then the superblock
> > needs to explicitly account for it.
> >
>
> PAGE_SIZE is not really variable. What we can run into is the issue you
> mentioned above. The size of struct page can change which means the
> reserved space for keeping vmemmap in device may not be sufficient for
> certain kernel builds.
>
> I was planning to add another patch that fails namespace init if we
> don't have enough space to keep the struct page.
>
> Why do you suggest we need to have PAGE_SIZE as part of pfn superblock?

So that the kernel has a chance to identify cases where the superblock
it is handling was created on a system with different PAGE_SIZE
assumptions.

^ permalink raw reply

* Re: [RFC PATCH] mm/nvdimm: Fix kernel crash on devm_mremap_pages_release
From: Dan Williams @ 2019-05-14  4:15 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Keith Busch, Linux MM, linuxppc-dev, linux-nvdimm
In-Reply-To: <20190514025354.9108-1-aneesh.kumar@linux.ibm.com>

[ add Keith who was looking at something similar ]

On Mon, May 13, 2019 at 7:54 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> When we initialize the namespace, if we support altmap, we don't initialize all the
> backing struct page where as while releasing the namespace we look at some of
> these uninitilized struct page. This results in a kernel crash as below.
>
> kernel BUG at include/linux/mm.h:1034!
> cpu 0x2: Vector: 700 (Program Check) at [c00000024146b870]
>     pc: c0000000003788f8: devm_memremap_pages_release+0x258/0x3a0
>     lr: c0000000003788f4: devm_memremap_pages_release+0x254/0x3a0
>     sp: c00000024146bb00
>    msr: 800000000282b033
>   current = 0xc000000241382f00
>   paca    = 0xc00000003fffd680   irqmask: 0x03   irq_happened: 0x01
>     pid   = 4114, comm = ndctl
>  c0000000009bf8c0 devm_action_release+0x30/0x50
>  c0000000009c0938 release_nodes+0x268/0x2d0
>  c0000000009b95b4 device_release_driver_internal+0x164/0x230
>  c0000000009b638c unbind_store+0x13c/0x190
>  c0000000009b4f44 drv_attr_store+0x44/0x60
>  c00000000058ccc0 sysfs_kf_write+0x70/0xa0
>  c00000000058b52c kernfs_fop_write+0x1ac/0x290
>  c0000000004a415c __vfs_write+0x3c/0x70
>  c0000000004a85ac vfs_write+0xec/0x200
>  c0000000004a8920 ksys_write+0x80/0x130
>  c00000000000bee4 system_call+0x5c/0x70
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  mm/page_alloc.c | 5 +----
>  1 file changed, 1 insertion(+), 4 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 59661106da16..892eabe1ec13 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5740,8 +5740,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>
>  #ifdef CONFIG_ZONE_DEVICE
>         /*
> -        * Honor reservation requested by the driver for this ZONE_DEVICE
> -        * memory. We limit the total number of pages to initialize to just
> +        * We limit the total number of pages to initialize to just
>          * those that might contain the memory mapping. We will defer the
>          * ZONE_DEVICE page initialization until after we have released
>          * the hotplug lock.
> @@ -5750,8 +5749,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>                 if (!altmap)
>                         return;
>
> -               if (start_pfn == altmap->base_pfn)
> -                       start_pfn += altmap->reserve;

If it's reserved then we should not be accessing it, even if the above
works in practice. Isn't the fix something more like this, to fix up
the assumptions at release time?

diff --git a/kernel/memremap.c b/kernel/memremap.c
index a856cb5ff192..9074ba14572c 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -90,6 +90,7 @@ static void devm_memremap_pages_release(void *data)
  struct device *dev = pgmap->dev;
  struct resource *res = &pgmap->res;
  resource_size_t align_start, align_size;
+ struct vmem_altmap *altmap = pgmap->altmap_valid ? &pgmap->altmap : NULL;
  unsigned long pfn;
  int nid;

@@ -102,7 +103,10 @@ static void devm_memremap_pages_release(void *data)
  align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
  - align_start;

- nid = page_to_nid(pfn_to_page(align_start >> PAGE_SHIFT));
+ pfn = align_start >> PAGE_SHIFT;
+ if (altmap)
+ pfn += vmem_altmap_offset(altmap);
+ nid = page_to_nid(pfn_to_page(pfn));

  mem_hotplug_begin();
  if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
@@ -110,8 +114,7 @@ static void devm_memremap_pages_release(void *data)
  __remove_pages(page_zone(pfn_to_page(pfn)), pfn,
  align_size >> PAGE_SHIFT, NULL);
  } else {
- arch_remove_memory(nid, align_start, align_size,
- pgmap->altmap_valid ? &pgmap->altmap : NULL);
+ arch_remove_memory(nid, align_start, align_size, altmap);
  kasan_remove_zero_shadow(__va(align_start), align_size);
  }
  mem_hotplug_done();

^ permalink raw reply related

* Re: [PATCH] mm/nvdimm: Use correct alignment when looking at first pfn from a region
From: Dan Williams @ 2019-05-14  4:29 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Linux MM, linuxppc-dev, linux-nvdimm
In-Reply-To: <20190514025512.9670-1-aneesh.kumar@linux.ibm.com>

On Mon, May 13, 2019 at 7:55 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> We already add the start_pad to the resource->start but fails to section
> align the start. This make sure with altmap we compute the right first
> pfn when start_pad is zero and we are doing an align down of start address.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  kernel/memremap.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/memremap.c b/kernel/memremap.c
> index a856cb5ff192..23d77b60e728 100644
> --- a/kernel/memremap.c
> +++ b/kernel/memremap.c
> @@ -59,9 +59,9 @@ static unsigned long pfn_first(struct dev_pagemap *pgmap)
>  {
>         const struct resource *res = &pgmap->res;
>         struct vmem_altmap *altmap = &pgmap->altmap;
> -       unsigned long pfn;
> +       unsigned long pfn = PHYS_PFN(res->start);
>
> -       pfn = res->start >> PAGE_SHIFT;
> +       pfn = SECTION_ALIGN_DOWN(pfn);

This does not seem right to me; it breaks the assumptions of where the
first expected valid pfn occurs in the passed-in range.

^ permalink raw reply

* Re: [RFC PATCH] mm/nvdimm: Fix kernel crash on devm_mremap_pages_release
From: Aneesh Kumar K.V @ 2019-05-14  4:40 UTC (permalink / raw)
  To: Dan Williams; +Cc: Keith Busch, Linux MM, linuxppc-dev, linux-nvdimm
In-Reply-To: <CAPcyv4hsTvyRnLGr3y4JB6zPzdxb7WGQgaWs=5vRqf=L1DYynQ@mail.gmail.com>

On 5/14/19 9:45 AM, Dan Williams wrote:
> [ add Keith who was looking at something similar ]
> 
> On Mon, May 13, 2019 at 7:54 PM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> When we initialize the namespace, if we support altmap, we don't initialize all the
>> backing struct page where as while releasing the namespace we look at some of
>> these uninitilized struct page. This results in a kernel crash as below.
>>
>> kernel BUG at include/linux/mm.h:1034!
>> cpu 0x2: Vector: 700 (Program Check) at [c00000024146b870]
>>      pc: c0000000003788f8: devm_memremap_pages_release+0x258/0x3a0
>>      lr: c0000000003788f4: devm_memremap_pages_release+0x254/0x3a0
>>      sp: c00000024146bb00
>>     msr: 800000000282b033
>>    current = 0xc000000241382f00
>>    paca    = 0xc00000003fffd680   irqmask: 0x03   irq_happened: 0x01
>>      pid   = 4114, comm = ndctl
>>   c0000000009bf8c0 devm_action_release+0x30/0x50
>>   c0000000009c0938 release_nodes+0x268/0x2d0
>>   c0000000009b95b4 device_release_driver_internal+0x164/0x230
>>   c0000000009b638c unbind_store+0x13c/0x190
>>   c0000000009b4f44 drv_attr_store+0x44/0x60
>>   c00000000058ccc0 sysfs_kf_write+0x70/0xa0
>>   c00000000058b52c kernfs_fop_write+0x1ac/0x290
>>   c0000000004a415c __vfs_write+0x3c/0x70
>>   c0000000004a85ac vfs_write+0xec/0x200
>>   c0000000004a8920 ksys_write+0x80/0x130
>>   c00000000000bee4 system_call+0x5c/0x70
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>   mm/page_alloc.c | 5 +----
>>   1 file changed, 1 insertion(+), 4 deletions(-)
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 59661106da16..892eabe1ec13 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -5740,8 +5740,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>>
>>   #ifdef CONFIG_ZONE_DEVICE
>>          /*
>> -        * Honor reservation requested by the driver for this ZONE_DEVICE
>> -        * memory. We limit the total number of pages to initialize to just
>> +        * We limit the total number of pages to initialize to just
>>           * those that might contain the memory mapping. We will defer the
>>           * ZONE_DEVICE page initialization until after we have released
>>           * the hotplug lock.
>> @@ -5750,8 +5749,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>>                  if (!altmap)
>>                          return;
>>
>> -               if (start_pfn == altmap->base_pfn)
>> -                       start_pfn += altmap->reserve;
> 
> If it's reserved then we should not be accessing, even if the above
> works in practice. Isn't the fix something more like this to fix up
> the assumptions at release time?
> 
> diff --git a/kernel/memremap.c b/kernel/memremap.c
> index a856cb5ff192..9074ba14572c 100644
> --- a/kernel/memremap.c
> +++ b/kernel/memremap.c
> @@ -90,6 +90,7 @@ static void devm_memremap_pages_release(void *data)
>    struct device *dev = pgmap->dev;
>    struct resource *res = &pgmap->res;
>    resource_size_t align_start, align_size;
> + struct vmem_altmap *altmap = pgmap->altmap_valid ? &pgmap->altmap : NULL;
>    unsigned long pfn;
>    int nid;
> 
> @@ -102,7 +103,10 @@ static void devm_memremap_pages_release(void *data)
>    align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
>    - align_start;
> 
> - nid = page_to_nid(pfn_to_page(align_start >> PAGE_SHIFT));
> + pfn = align_start >> PAGE_SHIFT;
> + if (altmap)
> + pfn += vmem_altmap_offset(altmap);
> + nid = page_to_nid(pfn_to_page(pfn));
> 
>    mem_hotplug_begin();
>    if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
> @@ -110,8 +114,7 @@ static void devm_memremap_pages_release(void *data)
>    __remove_pages(page_zone(pfn_to_page(pfn)), pfn,
>    align_size >> PAGE_SHIFT, NULL);
>    } else {
> - arch_remove_memory(nid, align_start, align_size,
> - pgmap->altmap_valid ? &pgmap->altmap : NULL);
> + arch_remove_memory(nid, align_start, align_size, altmap);
>    kasan_remove_zero_shadow(__va(align_start), align_size);
>    }
>    mem_hotplug_done();
> 
I did try that first, but I was not sure about it from the memory add vs 
remove perspective.

devm_memremap_pages:

align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
		- align_start;
align_end = align_start + align_size - 1;

error = arch_add_memory(nid, align_start, align_size, altmap,
				false);


devm_memremap_pages_release:

/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
		- align_start;

arch_remove_memory(nid, align_start, align_size,
		pgmap->altmap_valid ? &pgmap->altmap : NULL);


Now if we are fixing devm_memremap_pages_release, shouldn't we adjust 
align_start w.r.t. devm_memremap_pages too? I was not sure what that means 
w.r.t. add/remove alignment requirements.

What is the intended usage of the reserve area? I guess we want that part to 
be added? If so, shouldn't we remove it as well?


-aneesh


^ permalink raw reply

* Re: [PATCH] mm/nvdimm: Use correct alignment when looking at first pfn from a region
From: Aneesh Kumar K.V @ 2019-05-14  4:41 UTC (permalink / raw)
  To: Dan Williams; +Cc: Linux MM, linuxppc-dev, linux-nvdimm
In-Reply-To: <CAPcyv4hgNUDxjgYNkxOXJ9hfLb6z2+E1yasNoZNDKFUxkCzWLA@mail.gmail.com>

On 5/14/19 9:59 AM, Dan Williams wrote:
> On Mon, May 13, 2019 at 7:55 PM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> We already add the start_pad to the resource->start but fails to section
>> align the start. This make sure with altmap we compute the right first
>> pfn when start_pad is zero and we are doing an align down of start address.
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>   kernel/memremap.c | 4 ++--
>>   1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/memremap.c b/kernel/memremap.c
>> index a856cb5ff192..23d77b60e728 100644
>> --- a/kernel/memremap.c
>> +++ b/kernel/memremap.c
>> @@ -59,9 +59,9 @@ static unsigned long pfn_first(struct dev_pagemap *pgmap)
>>   {
>>          const struct resource *res = &pgmap->res;
>>          struct vmem_altmap *altmap = &pgmap->altmap;
>> -       unsigned long pfn;
>> +       unsigned long pfn = PHYS_PFN(res->start);
>>
>> -       pfn = res->start >> PAGE_SHIFT;
>> +       pfn = SECTION_ALIGN_DOWN(pfn);
> 
> This does not seem right to me it breaks the assumptions of where the
> first expected valid pfn occurs in the passed in range.
> 

How do we define the first valid pfn? Isn't that at pfn_sb->dataoff ?

-aneesh


^ permalink raw reply

* [Bug 203597] kernel 4.9.175 fails to boot on a PowerMac G4 3,6 at early stage
From: bugzilla-daemon @ 2019-05-14  4:43 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-203597-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=203597

Christophe Leroy (christophe.leroy@c-s.fr) changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |christophe.leroy@c-s.fr

--- Comment #2 from Christophe Leroy (christophe.leroy@c-s.fr) ---
You are missing the following commit:

https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b45ba4a51cd

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: [PATCH] mm/nvdimm: Use correct #defines instead of opencoding
From: Aneesh Kumar K.V @ 2019-05-14  4:46 UTC (permalink / raw)
  To: Dan Williams; +Cc: Linux MM, linuxppc-dev, linux-nvdimm
In-Reply-To: <CAPcyv4gOr8SFbdtBbWhMOU-wdYuMCQ4Jn2SznGRsv6Vku97Xnw@mail.gmail.com>

On 5/14/19 9:42 AM, Dan Williams wrote:
> On Mon, May 13, 2019 at 9:05 PM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> On 5/14/19 9:28 AM, Dan Williams wrote:
>>> On Mon, May 13, 2019 at 7:56 PM Aneesh Kumar K.V
>>> <aneesh.kumar@linux.ibm.com> wrote:
>>>>
>>>> The nfpn related change is needed to fix the kernel message
>>>>
>>>> "number of pfns truncated from 2617344 to 163584"
>>>>
>>>> The change makes sure the nfpns stored in the superblock is right value.
>>>>
>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>> ---
>>>>    drivers/nvdimm/pfn_devs.c    | 6 +++---
>>>>    drivers/nvdimm/region_devs.c | 8 ++++----
>>>>    2 files changed, 7 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
>>>> index 347cab166376..6751ff0296ef 100644
>>>> --- a/drivers/nvdimm/pfn_devs.c
>>>> +++ b/drivers/nvdimm/pfn_devs.c
>>>> @@ -777,8 +777,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
>>>>                    * when populating the vmemmap. This *should* be equal to
>>>>                    * PMD_SIZE for most architectures.
>>>>                    */
>>>> -               offset = ALIGN(start + reserve + 64 * npfns,
>>>> -                               max(nd_pfn->align, PMD_SIZE)) - start;
>>>> +               offset = ALIGN(start + reserve + sizeof(struct page) * npfns,
>>>> +                              max(nd_pfn->align, PMD_SIZE)) - start;
>>>
>>> No, I think we need to record the page-size into the superblock format
>>> otherwise this breaks in debug builds where the struct-page size is
>>> extended.
>>>
>>>>           } else if (nd_pfn->mode == PFN_MODE_RAM)
>>>>                   offset = ALIGN(start + reserve, nd_pfn->align) - start;
>>>>           else
>>>> @@ -790,7 +790,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
>>>>                   return -ENXIO;
>>>>           }
>>>>
>>>> -       npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
>>>> +       npfns = (size - offset - start_pad - end_trunc) / PAGE_SIZE;
>>>
>>> Similar comment, if the page size is variable then the superblock
>>> needs to explicitly account for it.
>>>
>>
>> PAGE_SIZE is not really variable. What we can run into is the issue you
>> mentioned above. The size of struct page can change which means the
>> reserved space for keeping vmemmap in device may not be sufficient for
>> certain kernel builds.
>>
>> I was planning to add another patch that fails namespace init if we
>> don't have enough space to keep the struct page.
>>
>> Why do you suggest we need to have PAGE_SIZE as part of pfn superblock?
> 
> So that the kernel has a chance to identify cases where the superblock
> it is handling was created on a system with different PAGE_SIZE
> assumptions.
> 

Is the reason for doing that that we may not have enough space to keep the 
struct pages backing the total number of pfns? If so, what I suggested above 
should handle that.

Or do you see any other reason why we should fail a namespace init 
with a different PAGE_SIZE value?

Another patch of mine handles the details w.r.t. devdax alignment, for the 
case where the devdax was created with PAGE_SIZE 4K but we are now trying to 
load it in a kernel with PAGE_SIZE 64K.

-aneesh


^ permalink raw reply

* Re: [PATCH v2] powerpc: Fix compile issue with force DAWR
From: Michael Neuling @ 2019-05-14  4:47 UTC (permalink / raw)
  To: Christophe Leroy, mpe; +Cc: linuxppc-dev
In-Reply-To: <f1015de7-faf1-ae6d-d1f9-9c904f19c58b@c-s.fr>

On Mon, 2019-05-13 at 11:08 +0200, Christophe Leroy wrote:
> 
> Le 13/05/2019 à 09:17, Michael Neuling a écrit :
> > If you compile with KVM but without CONFIG_HAVE_HW_BREAKPOINT you fail
> > at linking with:
> >    arch/powerpc/kvm/book3s_hv_rmhandlers.o:(.text+0x708): undefined
> > reference to `dawr_force_enable'
> > 
> > This was caused by commit c1fe190c0672 ("powerpc: Add force enable of
> > DAWR on P9 option").
> > 
> > This puts more of the dawr infrastructure in a new file.
> 
> I think you are doing a bit more than that. I think you should explain 
> that you define a new CONFIG_ option, when it is selected, etc ...
> 
> The commit you are referring to is talking about P9. It looks like your 
> patch covers a lot more, so it should be mentioned here I guess.

Not really. It looks like I'm doing a lot but I'm really just moving code around
to deal with the ugliness of a bunch of config options and dependencies. 

> > Signed-off-by: Michael Neuling <mikey@neuling.org>
> 
> You should add a Fixes: tag, ie:
> 
> Fixes: c1fe190c0672 ("powerpc: Add force enable of DAWR on P9 option")

Ok

> 
> > --
> > v2:
> >    Fixes based on Christophe Leroy's comments:
> >    - Fix commit message formatting
> >    - Move more DAWR code into dawr.c
> > ---
> >   arch/powerpc/Kconfig                     |  5 ++
> >   arch/powerpc/include/asm/hw_breakpoint.h | 20 ++++---
> >   arch/powerpc/kernel/Makefile             |  1 +
> >   arch/powerpc/kernel/dawr.c               | 75 ++++++++++++++++++++++++
> >   arch/powerpc/kernel/hw_breakpoint.c      | 56 ------------------
> >   arch/powerpc/kvm/Kconfig                 |  1 +
> >   6 files changed, 94 insertions(+), 64 deletions(-)
> >   create mode 100644 arch/powerpc/kernel/dawr.c
> > 
> > diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> > index 2711aac246..f4b19e48cc 100644
> > --- a/arch/powerpc/Kconfig
> > +++ b/arch/powerpc/Kconfig
> > @@ -242,6 +242,7 @@ config PPC
> >   	select SYSCTL_EXCEPTION_TRACE
> >   	select THREAD_INFO_IN_TASK
> >   	select VIRT_TO_BUS			if !PPC64
> > +	select PPC_DAWR_FORCE_ENABLE		if PPC64 || PERF
> 
> What's PERF ? Did you mean PERF_EVENTS ?
> 
> Then what you mean is:
> - Selected all the time for PPC64
> - Selected for PPC32 when PERF is also selected.
> 
> Is that what you want ? At first I thought it was linked to P9.

This is wrong.  I think we just want PPC64. PERF is a red herring.

> And ... did you read below statement ?

Clearly not :-)

> 
> >   	#
> >   	# Please keep this list sorted alphabetically.
> >   	#
> > @@ -369,6 +370,10 @@ config PPC_ADV_DEBUG_DAC_RANGE
> >   	depends on PPC_ADV_DEBUG_REGS && 44x
> >   	default y
> >   
> > +config PPC_DAWR_FORCE_ENABLE
> > +	bool
> > +	default y
> > +
> 
> Why defaulting it to y. Then how is it set to n ?

Good point.

> 
> >   config ZONE_DMA
> >   	bool
> >   	default y if PPC_BOOK3E_64
> > diff --git a/arch/powerpc/include/asm/hw_breakpoint.h
> > b/arch/powerpc/include/asm/hw_breakpoint.h
> > index 0fe8c1e46b..ffbc8eab41 100644
> > --- a/arch/powerpc/include/asm/hw_breakpoint.h
> > +++ b/arch/powerpc/include/asm/hw_breakpoint.h
> > @@ -47,6 +47,8 @@ struct arch_hw_breakpoint {
> >   #define HW_BRK_TYPE_PRIV_ALL	(HW_BRK_TYPE_USER | HW_BRK_TYPE_KERNEL |
> > \
> >   				 HW_BRK_TYPE_HYP)
> >   
> > +extern int set_dawr(struct arch_hw_breakpoint *brk);
> > +
> >   #ifdef CONFIG_HAVE_HW_BREAKPOINT
> >   #include <linux/kdebug.h>
> >   #include <asm/reg.h>
> > @@ -90,18 +92,20 @@ static inline void hw_breakpoint_disable(void)
> >   extern void thread_change_pc(struct task_struct *tsk, struct pt_regs
> > *regs);
> >   int hw_breakpoint_handler(struct die_args *args);
> >   
> > -extern int set_dawr(struct arch_hw_breakpoint *brk);
> > -extern bool dawr_force_enable;
> > -static inline bool dawr_enabled(void)
> > -{
> > -	return dawr_force_enable;
> > -}
> > -
> 
> That's a very simple function, why not keep it here (or in another .h) 
> as 'static inline' ?

Sure.

> >   #else	/* CONFIG_HAVE_HW_BREAKPOINT */
> >   static inline void hw_breakpoint_disable(void) { }
> >   static inline void thread_change_pc(struct task_struct *tsk,
> >   					struct pt_regs *regs) { }
> > -static inline bool dawr_enabled(void) { return false; }
> > +
> >   #endif	/* CONFIG_HAVE_HW_BREAKPOINT */
> > +
> > +extern bool dawr_force_enable;
> > +
> > +#ifdef CONFIG_PPC_DAWR_FORCE_ENABLE
> > +extern bool dawr_enabled(void);
> 
> Functions should not be 'extern'. I'm sure checkpatch --strict will tell 
> you.

That's not what's currently being done in this header file.  I'm keeping with
the style of that file.

> > +#else
> > +#define dawr_enabled() true
> 
> true by default ?
> Previously it was false by default.

Thanks, yeah that's wrong but I need to rethink the config option to make it
CONFIG_PPC_DAWR. 

This patch is far more difficult than it should be due to the mess that
CONFIG_HAVE_HW_BREAKPOINT and CONFIG_PPC_ADV_DEBUG_REGS creates in ptrace.c and
process.c. We really need to fix up 
https://github.com/linuxppc/issues/issues/128

> And why a #define ? That's better to keep a static inline.

Changed.

> 
> > +#endif
> > +
> >   #endif	/* __KERNEL__ */
> >   #endif	/* _PPC_BOOK3S_64_HW_BREAKPOINT_H */
> > diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> > index 0ea6c4aa3a..a9c497c34f 100644
> > --- a/arch/powerpc/kernel/Makefile
> > +++ b/arch/powerpc/kernel/Makefile
> > @@ -56,6 +56,7 @@ obj-$(CONFIG_PPC64)		+= setup_64.o
> > sys_ppc32.o \
> >   obj-$(CONFIG_VDSO32)		+= vdso32/
> >   obj-$(CONFIG_PPC_WATCHDOG)	+= watchdog.o
> >   obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
> > +obj-$(CONFIG_PPC_DAWR_FORCE_ENABLE)	+= dawr.o
> >   obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
> >   obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
> >   obj-$(CONFIG_PPC_BOOK3S_64)	+= mce.o mce_power.o
> > diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c
> > new file mode 100644
> > index 0000000000..cf1d02fe1e
> > --- /dev/null
> > +++ b/arch/powerpc/kernel/dawr.c
> > @@ -0,0 +1,75 @@
> > +// SPDX-License-Identifier: GPL-2.0+
> > +//
> > +// DAWR infrastructure
> > +//
> > +// Copyright 2019, Michael Neuling, IBM Corporation.
> > +
> > +#include <linux/types.h>
> > +#include <linux/export.h>
> > +#include <linux/fs.h>
> > +#include <linux/debugfs.h>
> > +#include <asm/debugfs.h>
> > +#include <asm/machdep.h>
> > +#include <asm/hvcall.h>
> > +
> > +bool dawr_force_enable;
> > +EXPORT_SYMBOL_GPL(dawr_force_enable);
> > +
> > +extern bool dawr_enabled(void)
> 
> extern ????

oops
> 
> > +{
> > +	return dawr_force_enable;
> > +}
> > +EXPORT_SYMBOL_GPL(dawr_enabled);
> 
> Since dawr_force_enable is also exported, I see no point in having such 
> a tiny function as an exported function, was better as a 'static inline'.

Yep, changed to static inline.

> > +
> > +static ssize_t dawr_write_file_bool(struct file *file,
> > +				    const char __user *user_buf,
> > +				    size_t count, loff_t *ppos)
> > +{
> > +	struct arch_hw_breakpoint null_brk = {0, 0, 0};
> > +	size_t rc;
> > +
> > +	/* Send error to user if they hypervisor won't allow us to write DAWR */
> > +	if ((!dawr_force_enable) &&
> > +	    (firmware_has_feature(FW_FEATURE_LPAR)) &&
> > +	    (set_dawr(&null_brk) != H_SUCCESS))
> 
> The above is not real clear.
> set_dabr() returns 0, H_SUCCESS is not used there.

pseries_set_dawr() will return an hcall number.

This code hasn't changed. I'm just moving it.

> 
> > +		return -1;
> > +
> > +	rc = debugfs_write_file_bool(file, user_buf, count, ppos);
> > +	if (rc)
> > +		return rc;
> > +
> > +	/* If we are clearing, make sure all CPUs have the DAWR cleared */
> > +	if (!dawr_force_enable)
> > +		smp_call_function((smp_call_func_t)set_dawr, &null_brk, 0);
> > +
> > +	return rc;
> > +}
> > +
> > +static const struct file_operations dawr_enable_fops = {
> > +	.read =		debugfs_read_file_bool,
> > +	.write =	dawr_write_file_bool,
> > +	.open =		simple_open,
> > +	.llseek =	default_llseek,
> > +};
> > +
> > +static int __init dawr_force_setup(void)
> > +{
> > +	dawr_force_enable = false;
> 
> The above is not needed, the BSS is zeroised at kernel startup.
> 
> > +
> > +	if (cpu_has_feature(CPU_FTR_DAWR)) {
> > +		/* Don't setup sysfs file for user control on P8 */
> > +		dawr_force_enable = true;
> 
> Strange comment, word "don't" doesn't really fit with a 'true'
> 
> > +		return 0;
> > +	}
> > +
> > +	if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) {
> 
> You could use pvr_version_is(PVR_POWER9) instead of open codiing.

All this code hasn't changed. I'm just moving it.

Feel free to clean it up but let's fix a real problem here.

> 
> > +		/* Turn DAWR off by default, but allow admin to turn it on */
> > +		dawr_force_enable = false;
> > +		debugfs_create_file_unsafe("dawr_enable_dangerous", 0600,
> > +					   powerpc_debugfs_root,
> > +					   &dawr_force_enable,
> > +					   &dawr_enable_fops);
> > +	}
> > +	return 0;
> > +}
> > +arch_initcall(dawr_force_setup);
> 
> Wouldn't it also make sense to move set_dawr() from process.c to here ?

Yep, done.

> 
> > diff --git a/arch/powerpc/kernel/hw_breakpoint.c
> > b/arch/powerpc/kernel/hw_breakpoint.c
> > index da307dd93e..95605a9c9a 100644
> > --- a/arch/powerpc/kernel/hw_breakpoint.c
> > +++ b/arch/powerpc/kernel/hw_breakpoint.c
> > @@ -380,59 +380,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp)
> >   {
> >   	/* TODO */
> >   }
> > -
> > -bool dawr_force_enable;
> > -EXPORT_SYMBOL_GPL(dawr_force_enable);
> > -
> > -static ssize_t dawr_write_file_bool(struct file *file,
> > -				    const char __user *user_buf,
> > -				    size_t count, loff_t *ppos)
> > -{
> > -	struct arch_hw_breakpoint null_brk = {0, 0, 0};
> > -	size_t rc;
> > -
> > -	/* Send error to user if they hypervisor won't allow us to write DAWR */
> > -	if ((!dawr_force_enable) &&
> > -	    (firmware_has_feature(FW_FEATURE_LPAR)) &&
> > -	    (set_dawr(&null_brk) != H_SUCCESS))
> > -		return -1;
> > -
> > -	rc = debugfs_write_file_bool(file, user_buf, count, ppos);
> > -	if (rc)
> > -		return rc;
> > -
> > -	/* If we are clearing, make sure all CPUs have the DAWR cleared */
> > -	if (!dawr_force_enable)
> > -		smp_call_function((smp_call_func_t)set_dawr, &null_brk, 0);
> > -
> > -	return rc;
> > -}
> > -
> > -static const struct file_operations dawr_enable_fops = {
> > -	.read =		debugfs_read_file_bool,
> > -	.write =	dawr_write_file_bool,
> > -	.open =		simple_open,
> > -	.llseek =	default_llseek,
> > -};
> > -
> > -static int __init dawr_force_setup(void)
> > -{
> > -	dawr_force_enable = false;
> > -
> > -	if (cpu_has_feature(CPU_FTR_DAWR)) {
> > -		/* Don't setup sysfs file for user control on P8 */
> > -		dawr_force_enable = true;
> > -		return 0;
> > -	}
> > -
> > -	if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) {
> > -		/* Turn DAWR off by default, but allow admin to turn it on */
> > -		dawr_force_enable = false;
> > -		debugfs_create_file_unsafe("dawr_enable_dangerous", 0600,
> > -					   powerpc_debugfs_root,
> > -					   &dawr_force_enable,
> > -					   &dawr_enable_fops);
> > -	}
> > -	return 0;
> > -}
> > -arch_initcall(dawr_force_setup);
> > diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> > index bfdde04e49..9c0d315108 100644
> > --- a/arch/powerpc/kvm/Kconfig
> > +++ b/arch/powerpc/kvm/Kconfig
> > @@ -39,6 +39,7 @@ config KVM_BOOK3S_32_HANDLER
> >   config KVM_BOOK3S_64_HANDLER
> >   	bool
> >   	select KVM_BOOK3S_HANDLER
> > +	select PPC_DAWR_FORCE_ENABLE
> >   
> >   config KVM_BOOK3S_PR_POSSIBLE
> >   	bool
> > 
> 
> Christophe
> 


^ permalink raw reply

* Re: [PATCH v1 4/8] soc/fsl/qbman: Use index when accessing device tree properties
From: Joakim Tjernlund @ 2019-05-14  4:51 UTC (permalink / raw)
  To: linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, leoyang.li@nxp.com,
	roy.pledge@nxp.com
  Cc: laurentiu.tudor@nxp.com, madalin.bucur@nxp.com
In-Reply-To: <DB6PR0402MB27278B23001A8965AE493CE3860F0@DB6PR0402MB2727.eurprd04.prod.outlook.com>

On Mon, 2019-05-13 at 17:40 +0000, Roy Pledge wrote:
> CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you recognize the sender and know the content is safe.
> 
> 
> On 5/13/2019 12:40 PM, Joakim Tjernlund wrote:
> > On Mon, 2019-05-13 at 16:09 +0000, Roy Pledge wrote:
> > > The index value should be passed to the of_parse_phandle()
> > > function to ensure the correct property is read.
> > Is this a bug fix? Maybe for stable too?
> > 
> >  Jocke
> Yes this could go to stable.  I will include stable@vger.kernel.org when
> I send the next version.

I think you need to send this patch separately then. The whole series cannot go to stable.

 Jocke

> > > Signed-off-by: Roy Pledge <roy.pledge@nxp.com>
> > > ---
> > >  drivers/soc/fsl/qbman/dpaa_sys.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/soc/fsl/qbman/dpaa_sys.c b/drivers/soc/fsl/qbman/dpaa_sys.c
> > > index 3e0a7f3..0b901a8 100644
> > > --- a/drivers/soc/fsl/qbman/dpaa_sys.c
> > > +++ b/drivers/soc/fsl/qbman/dpaa_sys.c
> > > @@ -49,7 +49,7 @@ int qbman_init_private_mem(struct device *dev, int idx, dma_addr_t *addr,
> > >                         idx, ret);
> > >                 return -ENODEV;
> > >         }
> > > -       mem_node = of_parse_phandle(dev->of_node, "memory-region", 0);
> > > +       mem_node = of_parse_phandle(dev->of_node, "memory-region", idx);
> > >         if (mem_node) {
> > >                 ret = of_property_read_u64(mem_node, "size", &size64);
> > >                 if (ret) {
> > > --
> > > 2.7.4
> > > 


^ permalink raw reply

* Re: [Bug 203597] New: kernel 4.9.175 fails to boot on a PowerMac G4 3,6 at early stage
From: Christophe Leroy @ 2019-05-14  4:56 UTC (permalink / raw)
  To: Greg KH; +Cc: erhard_f, linuxppc-dev, stable@vger.kernel.org
In-Reply-To: <bug-203597-206035@https.bugzilla.kernel.org/>

Hi Greg,

Could you please apply b45ba4a51cde29b2939365ef0c07ad34c8321789 to 4.9,
since 51c3c62b58b357e8d35e4cc32f7b4ec907426fe3 was applied although it was
marked for # 4.13+?

Thanks
Christophe

Le 13/05/2019 à 22:18, bugzilla-daemon@bugzilla.kernel.org a écrit :
> https://bugzilla.kernel.org/show_bug.cgi?id=203597
> 
>              Bug ID: 203597
>             Summary: kernel 4.9.175 fails to boot on a PowerMac G4 3,6 at
>                      early stage
>             Product: Platform Specific/Hardware
>             Version: 2.5
>      Kernel Version: 4.9.175
>            Hardware: PPC-32
>                  OS: Linux
>                Tree: Mainline
>              Status: NEW
>            Severity: normal
>            Priority: P1
>           Component: PPC-32
>            Assignee: platform_ppc-32@kernel-bugs.osdl.org
>            Reporter: erhard_f@mailbox.org
>          Regression: No
> 
> Created attachment 282743
>    --> https://bugzilla.kernel.org/attachment.cgi?id=282743&action=edit
> kernel .config (PowerMac G4 MDD)
> 
> Trying out older kernels on the G4 MDD I noticed recent 4.9.x freeze the
> machine. Only message displayed in black letters on a white screen:
> 
> done
> found display   : /pco@f0000000/ATY,AlteracParent@10/ATY,Alterac_B@1,
> opening...
> 
> 
> It's a hard freeze, RCU_CPU_STALL_TIMEOUT does not kick in.
> 
> Tried other stable kernels, which all worked:
> 4.19.37
> 4.14.114
> 4.4.179
> 
> So I suppose it's only a 4.9.x issue. Last working 4.9.x kernel I had in
> service was 4.9.161. First one I spotted freezing was 4.9.171. A bisect gave me
> the following commit:
> 
> 1c38a84d45862be06ac418618981631eddbda741 is the first bad commit
> commit 1c38a84d45862be06ac418618981631eddbda741
> Author: Michael Neuling <mikey@neuling.org>
> Date:   Thu Apr 11 21:45:59 2019 +1000
> 
>      powerpc: Avoid code patching freed init sections
> 
>      commit 51c3c62b58b357e8d35e4cc32f7b4ec907426fe3 upstream.
> 
>      This stops us from doing code patching in init sections after they've
>      been freed.
> 
>      In this chain:
>        kvm_guest_init() ->
>          kvm_use_magic_page() ->
>            fault_in_pages_readable() ->
>               __get_user() ->
>                 __get_user_nocheck() ->
>                   barrier_nospec();
> 
>      We have a code patching location at barrier_nospec() and
>      kvm_guest_init() is an init function. This whole chain gets inlined,
>      so when we free the init section (hence kvm_guest_init()), this code
>      goes away and hence should no longer be patched.
> 
>      We seen this as userspace memory corruption when using a memory
>      checker while doing partition migration testing on powervm (this
>      starts the code patching post migration via
>      /sys/kernel/mobility/migration). In theory, it could also happen when
>      using /sys/kernel/debug/powerpc/barrier_nospec.
> 
>      Cc: stable@vger.kernel.org # 4.13+
>      Signed-off-by: Michael Neuling <mikey@neuling.org>
>      Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
>      Reviewed-by: Christophe Leroy <christophe.leroy@c-s.fr>
>      Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
>      Signed-off-by: Sasha Levin <sashal@kernel.org>
> 

^ permalink raw reply

* Re: [PATCH v2] powerpc: Fix compile issue with force DAWR
From: Christophe Leroy @ 2019-05-14  5:26 UTC (permalink / raw)
  To: Michael Neuling, mpe; +Cc: linuxppc-dev
In-Reply-To: <4ae1ab46779c5724d129bbeb62859e288ff7dffa.camel@neuling.org>



Le 14/05/2019 à 06:47, Michael Neuling a écrit :
> On Mon, 2019-05-13 at 11:08 +0200, Christophe Leroy wrote:
>>
>> Le 13/05/2019 à 09:17, Michael Neuling a écrit :
>>> If you compile with KVM but without CONFIG_HAVE_HW_BREAKPOINT you fail
>>> at linking with:
>>>     arch/powerpc/kvm/book3s_hv_rmhandlers.o:(.text+0x708): undefined
>>> reference to `dawr_force_enable'
>>>
>>> This was caused by commit c1fe190c0672 ("powerpc: Add force enable of
>>> DAWR on P9 option").
>>>
>>> This puts more of the dawr infrastructure in a new file.
>>
>> I think you are doing a bit more than that. I think you should explain
>> that you define a new CONFIG_ option, when it is selected, etc ...
>>
>> The commit you are referring to is talking about P9. It looks like your
>> patch covers a lot more, so it should be mentionned her I guess.
> 
> Not really. It looks like I'm doing a lot but I'm really just moving code around
> to deal with the ugliness of a bunch of config options and dependencies.
> 
>>> Signed-off-by: Michael Neuling <mikey@neuling.org>
>>
>> You should add a Fixes: tag, ie:
>>
>> Fixes: c1fe190c0672 ("powerpc: Add force enable of DAWR on P9 option")
> 
> Ok
> 
>>
>>> --
>>> v2:
>>>     Fixes based on Christophe Leroy's comments:
>>>     - Fix commit message formatting
>>>     - Move more DAWR code into dawr.c
>>> ---
>>>    arch/powerpc/Kconfig                     |  5 ++
>>>    arch/powerpc/include/asm/hw_breakpoint.h | 20 ++++---
>>>    arch/powerpc/kernel/Makefile             |  1 +
>>>    arch/powerpc/kernel/dawr.c               | 75 ++++++++++++++++++++++++
>>>    arch/powerpc/kernel/hw_breakpoint.c      | 56 ------------------
>>>    arch/powerpc/kvm/Kconfig                 |  1 +
>>>    6 files changed, 94 insertions(+), 64 deletions(-)
>>>    create mode 100644 arch/powerpc/kernel/dawr.c
>>>
>>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>>> index 2711aac246..f4b19e48cc 100644
>>> --- a/arch/powerpc/Kconfig
>>> +++ b/arch/powerpc/Kconfig
>>> @@ -242,6 +242,7 @@ config PPC
>>>    	select SYSCTL_EXCEPTION_TRACE
>>>    	select THREAD_INFO_IN_TASK
>>>    	select VIRT_TO_BUS			if !PPC64
>>> +	select PPC_DAWR_FORCE_ENABLE		if PPC64 || PERF
>>
>> What's PERF ? Did you mean PERF_EVENTS ?
>>
>> Then what you mean is:
>> - Selected all the time for PPC64
>> - Selected for PPC32 when PERF is also selected.
>>
>> Is that what you want ? At first I thought it was linked to P9.
> 
> This is wrong.  I think we just want PPC64. PERF is a red herring.

Are you sure ? Michael suggested PERF || KVM as far as I remember.

> 
>> And ... did you read below statement ?
> 
> Clearly not :-)
> 
>>
>>>    	#
>>>    	# Please keep this list sorted alphabetically.
>>>    	#
>>> @@ -369,6 +370,10 @@ config PPC_ADV_DEBUG_DAC_RANGE
>>>    	depends on PPC_ADV_DEBUG_REGS && 44x
>>>    	default y
>>>    
>>> +config PPC_DAWR_FORCE_ENABLE
>>> +	bool
>>> +	default y
>>> +
>>
>> Why defaulting it to y. Then how is it set to n ?
> 
> Good point.
> 
>>
>>>    config ZONE_DMA
>>>    	bool
>>>    	default y if PPC_BOOK3E_64
>>> diff --git a/arch/powerpc/include/asm/hw_breakpoint.h
>>> b/arch/powerpc/include/asm/hw_breakpoint.h
>>> index 0fe8c1e46b..ffbc8eab41 100644
>>> --- a/arch/powerpc/include/asm/hw_breakpoint.h
>>> +++ b/arch/powerpc/include/asm/hw_breakpoint.h
>>> @@ -47,6 +47,8 @@ struct arch_hw_breakpoint {
>>>    #define HW_BRK_TYPE_PRIV_ALL	(HW_BRK_TYPE_USER | HW_BRK_TYPE_KERNEL |
>>> \
>>>    				 HW_BRK_TYPE_HYP)
>>>    
>>> +extern int set_dawr(struct arch_hw_breakpoint *brk);
>>> +
>>>    #ifdef CONFIG_HAVE_HW_BREAKPOINT
>>>    #include <linux/kdebug.h>
>>>    #include <asm/reg.h>
>>> @@ -90,18 +92,20 @@ static inline void hw_breakpoint_disable(void)
>>>    extern void thread_change_pc(struct task_struct *tsk, struct pt_regs
>>> *regs);
>>>    int hw_breakpoint_handler(struct die_args *args);
>>>    
>>> -extern int set_dawr(struct arch_hw_breakpoint *brk);
>>> -extern bool dawr_force_enable;
>>> -static inline bool dawr_enabled(void)
>>> -{
>>> -	return dawr_force_enable;
>>> -}
>>> -
>>
>> That's a very simple function, why not keep it here (or in another .h)
>> as 'static inline' ?
> 
> Sure.
> 
>>>    #else	/* CONFIG_HAVE_HW_BREAKPOINT */
>>>    static inline void hw_breakpoint_disable(void) { }
>>>    static inline void thread_change_pc(struct task_struct *tsk,
>>>    					struct pt_regs *regs) { }
>>> -static inline bool dawr_enabled(void) { return false; }
>>> +
>>>    #endif	/* CONFIG_HAVE_HW_BREAKPOINT */
>>> +
>>> +extern bool dawr_force_enable;
>>> +
>>> +#ifdef CONFIG_PPC_DAWR_FORCE_ENABLE
>>> +extern bool dawr_enabled(void);
>>
>> Functions should not be 'extern'. I'm sure checkpatch --strict will tell
>> you.
> 
> That's not what's currently being done in this header file.  I'm keeping with
> the style of that file.

Style is not defined on a per-file basis. There is the style from the
past and the present-day style. If you keep the old style just because the file
includes old-style statements, then the code will never improve.

All new patches should come with a clean 'checkpatch' report and follow
the new style. Having mixed styles in a file is not a problem; that's the
way to improve. See arch/powerpc/mm/mmu_decl.h as an example.

> 
>>> +#else
>>> +#define dawr_enabled() true
>>
>> true by default ?
>> Previously it was false by default.
> 
> Thanks, yeah that's wrong but I need to rethink the config option to make it
> CONFIG_PPC_DAWR.
> 
> This patch is far more difficult than it should be due to the mess that
> CONFIG_HAVE_HW_BREAKPOINT and CONFIG_PPC_ADV_DEBUG_REGS creates in ptrace.c and
> process.c. We really need to fix up
> https://github.com/linuxppc/issues/issues/128
> 
>> And why a #define ? That's better to keep a static inline.
> 
> Changed.
> 
>>
>>> +#endif
>>> +
>>>    #endif	/* __KERNEL__ */
>>>    #endif	/* _PPC_BOOK3S_64_HW_BREAKPOINT_H */
>>> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
>>> index 0ea6c4aa3a..a9c497c34f 100644
>>> --- a/arch/powerpc/kernel/Makefile
>>> +++ b/arch/powerpc/kernel/Makefile
>>> @@ -56,6 +56,7 @@ obj-$(CONFIG_PPC64)		+= setup_64.o
>>> sys_ppc32.o \
>>>    obj-$(CONFIG_VDSO32)		+= vdso32/
>>>    obj-$(CONFIG_PPC_WATCHDOG)	+= watchdog.o
>>>    obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
>>> +obj-$(CONFIG_PPC_DAWR_FORCE_ENABLE)	+= dawr.o
>>>    obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
>>>    obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
>>>    obj-$(CONFIG_PPC_BOOK3S_64)	+= mce.o mce_power.o
>>> diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c
>>> new file mode 100644
>>> index 0000000000..cf1d02fe1e
>>> --- /dev/null
>>> +++ b/arch/powerpc/kernel/dawr.c
>>> @@ -0,0 +1,75 @@
>>> +// SPDX-License-Identifier: GPL-2.0+
>>> +//
>>> +// DAWR infrastructure
>>> +//
>>> +// Copyright 2019, Michael Neuling, IBM Corporation.
>>> +
>>> +#include <linux/types.h>
>>> +#include <linux/export.h>
>>> +#include <linux/fs.h>
>>> +#include <linux/debugfs.h>
>>> +#include <asm/debugfs.h>
>>> +#include <asm/machdep.h>
>>> +#include <asm/hvcall.h>
>>> +
>>> +bool dawr_force_enable;
>>> +EXPORT_SYMBOL_GPL(dawr_force_enable);
>>> +
>>> +extern bool dawr_enabled(void)
>>
>> extern ????
> 
> oops
>>
>>> +{
>>> +	return dawr_force_enable;
>>> +}
>>> +EXPORT_SYMBOL_GPL(dawr_enabled);
>>
>> Since dawr_force_enable is also exported, I see no point in having such
>> a tiny function as an exported function, was better as a 'static inline'.
> 
> Yep, changed to static inline.
> 
>>> +
>>> +static ssize_t dawr_write_file_bool(struct file *file,
>>> +				    const char __user *user_buf,
>>> +				    size_t count, loff_t *ppos)
>>> +{
>>> +	struct arch_hw_breakpoint null_brk = {0, 0, 0};
>>> +	size_t rc;
>>> +
>>> +	/* Send error to user if they hypervisor won't allow us to write DAWR */
>>> +	if ((!dawr_force_enable) &&
>>> +	    (firmware_has_feature(FW_FEATURE_LPAR)) &&
>>> +	    (set_dawr(&null_brk) != H_SUCCESS))
>>
>> The above is not real clear.
>> set_dabr() returns 0, H_SUCCESS is not used there.
> 
> It pseries_set_dawr() will return a hcall number.

Right, then maybe it means set_dawr() should be fixed?

> 
> This code hasn't changed. I'm just moving it.

Right, but it could be an improvement for another patch.
As far as I remember, you are the one who wrote that code in the first place,
aren't you?

> 
>>
>>> +		return -1;
>>> +
>>> +	rc = debugfs_write_file_bool(file, user_buf, count, ppos);
>>> +	if (rc)
>>> +		return rc;
>>> +
>>> +	/* If we are clearing, make sure all CPUs have the DAWR cleared */
>>> +	if (!dawr_force_enable)
>>> +		smp_call_function((smp_call_func_t)set_dawr, &null_brk, 0);
>>> +
>>> +	return rc;
>>> +}
>>> +
>>> +static const struct file_operations dawr_enable_fops = {
>>> +	.read =		debugfs_read_file_bool,
>>> +	.write =	dawr_write_file_bool,
>>> +	.open =		simple_open,
>>> +	.llseek =	default_llseek,
>>> +};
>>> +
>>> +static int __init dawr_force_setup(void)
>>> +{
>>> +	dawr_force_enable = false;
>>
>> The above is not needed, the BSS is zeroised at kernel startup.
>>
>>> +
>>> +	if (cpu_has_feature(CPU_FTR_DAWR)) {
>>> +		/* Don't setup sysfs file for user control on P8 */
>>> +		dawr_force_enable = true;
>>
>> Strange comment, word "don't" doesn't really fit with a 'true'
>>
>>> +		return 0;
>>> +	}
>>> +
>>> +	if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) {
>>
>> You could use pvr_version_is(PVR_POWER9) instead of open codiing.
> 
> All this code hasn't changed. I'm just moving it.

Right, but I think the comments are worth it, although that would be
for another patch.

> 
> Feel free to clean it up but lets fix a real problem here.

Yes I can but it will conflict with your patch.


Christophe

> 
>>
>>> +		/* Turn DAWR off by default, but allow admin to turn it on */
>>> +		dawr_force_enable = false;
>>> +		debugfs_create_file_unsafe("dawr_enable_dangerous", 0600,
>>> +					   powerpc_debugfs_root,
>>> +					   &dawr_force_enable,
>>> +					   &dawr_enable_fops);
>>> +	}
>>> +	return 0;
>>> +}
>>> +arch_initcall(dawr_force_setup);
>>
>> Wouldn't it also make sense to move set_dawr() from process.c to here ?
> 
> Yep, done.
> 
>>
>>> diff --git a/arch/powerpc/kernel/hw_breakpoint.c
>>> b/arch/powerpc/kernel/hw_breakpoint.c
>>> index da307dd93e..95605a9c9a 100644
>>> --- a/arch/powerpc/kernel/hw_breakpoint.c
>>> +++ b/arch/powerpc/kernel/hw_breakpoint.c
>>> @@ -380,59 +380,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp)
>>>    {
>>>    	/* TODO */
>>>    }
>>> -
>>> -bool dawr_force_enable;
>>> -EXPORT_SYMBOL_GPL(dawr_force_enable);
>>> -
>>> -static ssize_t dawr_write_file_bool(struct file *file,
>>> -				    const char __user *user_buf,
>>> -				    size_t count, loff_t *ppos)
>>> -{
>>> -	struct arch_hw_breakpoint null_brk = {0, 0, 0};
>>> -	size_t rc;
>>> -
>>> -	/* Send error to user if they hypervisor won't allow us to write DAWR */
>>> -	if ((!dawr_force_enable) &&
>>> -	    (firmware_has_feature(FW_FEATURE_LPAR)) &&
>>> -	    (set_dawr(&null_brk) != H_SUCCESS))
>>> -		return -1;
>>> -
>>> -	rc = debugfs_write_file_bool(file, user_buf, count, ppos);
>>> -	if (rc)
>>> -		return rc;
>>> -
>>> -	/* If we are clearing, make sure all CPUs have the DAWR cleared */
>>> -	if (!dawr_force_enable)
>>> -		smp_call_function((smp_call_func_t)set_dawr, &null_brk, 0);
>>> -
>>> -	return rc;
>>> -}
>>> -
>>> -static const struct file_operations dawr_enable_fops = {
>>> -	.read =		debugfs_read_file_bool,
>>> -	.write =	dawr_write_file_bool,
>>> -	.open =		simple_open,
>>> -	.llseek =	default_llseek,
>>> -};
>>> -
>>> -static int __init dawr_force_setup(void)
>>> -{
>>> -	dawr_force_enable = false;
>>> -
>>> -	if (cpu_has_feature(CPU_FTR_DAWR)) {
>>> -		/* Don't setup sysfs file for user control on P8 */
>>> -		dawr_force_enable = true;
>>> -		return 0;
>>> -	}
>>> -
>>> -	if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) {
>>> -		/* Turn DAWR off by default, but allow admin to turn it on */
>>> -		dawr_force_enable = false;
>>> -		debugfs_create_file_unsafe("dawr_enable_dangerous", 0600,
>>> -					   powerpc_debugfs_root,
>>> -					   &dawr_force_enable,
>>> -					   &dawr_enable_fops);
>>> -	}
>>> -	return 0;
>>> -}
>>> -arch_initcall(dawr_force_setup);
>>> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
>>> index bfdde04e49..9c0d315108 100644
>>> --- a/arch/powerpc/kvm/Kconfig
>>> +++ b/arch/powerpc/kvm/Kconfig
>>> @@ -39,6 +39,7 @@ config KVM_BOOK3S_32_HANDLER
>>>    config KVM_BOOK3S_64_HANDLER
>>>    	bool
>>>    	select KVM_BOOK3S_HANDLER
>>> +	select PPC_DAWR_FORCE_ENABLE
>>>    
>>>    config KVM_BOOK3S_PR_POSSIBLE
>>>    	bool
>>>
>>
>> Christophe
>>

^ permalink raw reply

* Re: [PATCH 1/2] [PowerPC] Add simd.h implementation
From: Benjamin Herrenschmidt @ 2019-05-14  5:43 UTC (permalink / raw)
  To: Shawn Landden; +Cc: Paul Mackerras, linuxppc-dev
In-Reply-To: <20190514014412.25373-1-shawn@git.icu>

On Mon, 2019-05-13 at 22:44 -0300, Shawn Landden wrote:
> +
> +/*
> + * Were we in user mode when we were
> + * interrupted?
> + *
> + * Doing kernel_altivec/vsx_begin/end() is ok if we are running
> + * in an interrupt context from user mode - we'll just
> + * save the FPU state as required.
> + */
> +static bool interrupted_user_mode(void)
> +{
> +       struct pt_regs *regs = get_irq_regs();
> +
> +       return regs && user_mode(regs);
> +}
> +

That's interesting .... it *could* work but we'll have to carefully audit
the code to make sure that's ok.

We probably also want to handle the case where the CPU is in the idle
loop.

Do we always save the user state when switching out these days ? If
yes, then there's no "live" state to worry about...

Cheers,
Ben.

^ permalink raw reply

* [PATCH 1/3] powerpc/book3s: Use config independent helpers for page table walk
From: Aneesh Kumar K.V @ 2019-05-14  6:03 UTC (permalink / raw)
  To: npiggin, paulus, mpe; +Cc: Aneesh Kumar K.V, linuxppc-dev

Even when we have HugeTLB and THP disabled, the kernel linear map can still be
mapped with hugepages. This is only an issue with radix translation because the hash
MMU doesn't map the kernel linear range in the Linux page table and other kernel
map areas are not mapped using hugepages.

Add config independent helpers and put WARN_ON() when we don't expect things
to be mapped via hugepages.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 21 +++++++++++++++
 arch/powerpc/include/asm/pgtable.h           | 24 +++++++++++++++++
 arch/powerpc/include/asm/pte-walk.h          | 28 ++++++++++++++++++--
 arch/powerpc/kvm/book3s_64_mmu_radix.c       | 12 +++------
 arch/powerpc/mm/book3s64/radix_pgtable.c     | 10 +++----
 arch/powerpc/mm/pgtable.c                    | 16 +++++------
 arch/powerpc/mm/pgtable_64.c                 | 12 ++++++---
 arch/powerpc/mm/ptdump/ptdump.c              |  6 ++---
 arch/powerpc/xmon/xmon.c                     |  6 ++---
 9 files changed, 102 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 7dede2e34b70..f402b2a96ef3 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1313,5 +1313,26 @@ static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_va
 	return false;
 }
 
+/*
+ * Like pmd_huge() and pmd_large(), but works regardless of config options
+ */
+#define pmd_is_leaf pmd_is_leaf
+static inline bool pmd_is_leaf(pmd_t pmd)
+{
+	return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
+}
+
+#define pud_is_leaf pud_is_leaf
+static inline bool pud_is_leaf(pud_t pud)
+{
+	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
+}
+
+#define pgd_is_leaf pgd_is_leaf
+static inline bool pgd_is_leaf(pgd_t pgd)
+{
+	return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE));
+}
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 3f53be60fb01..bf7d771f342e 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -140,6 +140,30 @@ static inline void pte_frag_set(mm_context_t *ctx, void *p)
 }
 #endif
 
+#ifndef pmd_is_leaf
+#define pmd_is_leaf pmd_is_leaf
+static inline bool pmd_is_leaf(pmd_t pmd)
+{
+	return false;
+}
+#endif
+
+#ifndef pud_is_leaf
+#define pud_is_leaf pud_is_leaf
+static inline bool pud_is_leaf(pud_t pud)
+{
+	return false;
+}
+#endif
+
+#ifndef pgd_is_leaf
+#define pgd_is_leaf pgd_is_leaf
+static inline bool pgd_is_leaf(pgd_t pgd)
+{
+	return false;
+}
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_H */
diff --git a/arch/powerpc/include/asm/pte-walk.h b/arch/powerpc/include/asm/pte-walk.h
index 2d633e9d686c..33fa5dd8ee6a 100644
--- a/arch/powerpc/include/asm/pte-walk.h
+++ b/arch/powerpc/include/asm/pte-walk.h
@@ -10,8 +10,20 @@ extern pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea,
 				    bool *is_thp, unsigned *hshift)
 {
+	pte_t *pte;
+
 	VM_WARN(!arch_irqs_disabled(), "%s called with irq enabled\n", __func__);
-	return __find_linux_pte(pgdir, ea, is_thp, hshift);
+	pte = __find_linux_pte(pgdir, ea, is_thp, hshift);
+
+#if defined(CONFIG_DEBUG_VM) &&						\
+	!(defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE))
+	/*
+	 * We should not find huge page if these configs are not enabled.
+	 */
+	if (hshift)
+		WARN_ON(*hshift);
+#endif
+	return pte;
 }
 
 static inline pte_t *find_init_mm_pte(unsigned long ea, unsigned *hshift)
@@ -26,10 +38,22 @@ static inline pte_t *find_init_mm_pte(unsigned long ea, unsigned *hshift)
 static inline pte_t *find_current_mm_pte(pgd_t *pgdir, unsigned long ea,
 					 bool *is_thp, unsigned *hshift)
 {
+	pte_t *pte;
+
 	VM_WARN(!arch_irqs_disabled(), "%s called with irq enabled\n", __func__);
 	VM_WARN(pgdir != current->mm->pgd,
 		"%s lock less page table lookup called on wrong mm\n", __func__);
-	return __find_linux_pte(pgdir, ea, is_thp, hshift);
+	pte = __find_linux_pte(pgdir, ea, is_thp, hshift);
+
+#if defined(CONFIG_DEBUG_VM) &&						\
+	!(defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE))
+	/*
+	 * We should not find huge page if these configs are not enabled.
+	 */
+	if (hshift)
+		WARN_ON(*hshift);
+#endif
+	return pte;
 }
 
 #endif /* _ASM_POWERPC_PTE_WALK_H */
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index f55ef071883f..91efee7f0329 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -363,12 +363,6 @@ static void kvmppc_pte_free(pte_t *ptep)
 	kmem_cache_free(kvm_pte_cache, ptep);
 }
 
-/* Like pmd_huge() and pmd_large(), but works regardless of config options */
-static inline int pmd_is_leaf(pmd_t pmd)
-{
-	return !!(pmd_val(pmd) & _PAGE_PTE);
-}
-
 static pmd_t *kvmppc_pmd_alloc(void)
 {
 	return kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
@@ -489,7 +483,7 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
 	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
 		if (!pud_present(*p))
 			continue;
-		if (pud_huge(*p)) {
+		if (pud_is_leaf(*p)) {
 			pud_clear(p);
 		} else {
 			pmd_t *pmd;
@@ -588,7 +582,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 		new_pud = pud_alloc_one(kvm->mm, gpa);
 
 	pmd = NULL;
-	if (pud && pud_present(*pud) && !pud_huge(*pud))
+	if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
 		pmd = pmd_offset(pud, gpa);
 	else if (level <= 1)
 		new_pmd = kvmppc_pmd_alloc();
@@ -611,7 +605,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 		new_pud = NULL;
 	}
 	pud = pud_offset(pgd, gpa);
-	if (pud_huge(*pud)) {
+	if (pud_is_leaf(*pud)) {
 		unsigned long hgpa = gpa & PUD_MASK;
 
 		/* Check if we raced and someone else has set the same thing */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 247a38a3f3a6..b78f6fbdd307 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -202,14 +202,14 @@ void radix__change_memory_range(unsigned long start, unsigned long end,
 		pudp = pud_alloc(&init_mm, pgdp, idx);
 		if (!pudp)
 			continue;
-		if (pud_huge(*pudp)) {
+		if (pud_is_leaf(*pudp)) {
 			ptep = (pte_t *)pudp;
 			goto update_the_pte;
 		}
 		pmdp = pmd_alloc(&init_mm, pudp, idx);
 		if (!pmdp)
 			continue;
-		if (pmd_huge(*pmdp)) {
+		if (pmd_is_leaf(*pmdp)) {
 			ptep = pmdp_ptep(pmdp);
 			goto update_the_pte;
 		}
@@ -837,7 +837,7 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
 		if (!pmd_present(*pmd))
 			continue;
 
-		if (pmd_huge(*pmd)) {
+		if (pmd_is_leaf(*pmd)) {
 			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
 			continue;
 		}
@@ -862,7 +862,7 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr,
 		if (!pud_present(*pud))
 			continue;
 
-		if (pud_huge(*pud)) {
+		if (pud_is_leaf(*pud)) {
 			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
 			continue;
 		}
@@ -888,7 +888,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
 		if (!pgd_present(*pgd))
 			continue;
 
-		if (pgd_huge(*pgd)) {
+		if (pgd_is_leaf(*pgd)) {
 			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
 			continue;
 			/* This should fall through? */
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index db4a6253df92..b97aee03924f 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -340,10 +340,11 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 	if (pgd_none(pgd))
 		return NULL;
 
-	if (pgd_huge(pgd)) {
+	if (pgd_is_leaf(pgd)) {
 		ret_pte = (pte_t *)pgdp;
 		goto out;
 	}
+
 	if (is_hugepd(__hugepd(pgd_val(pgd)))) {
 		hpdp = (hugepd_t *)&pgd;
 		goto out_huge;
@@ -361,14 +362,16 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 	if (pud_none(pud))
 		return NULL;
 
-	if (pud_huge(pud)) {
+	if (pud_is_leaf(pud)) {
 		ret_pte = (pte_t *)pudp;
 		goto out;
 	}
+
 	if (is_hugepd(__hugepd(pud_val(pud)))) {
 		hpdp = (hugepd_t *)&pud;
 		goto out_huge;
 	}
+
 	pdshift = PMD_SHIFT;
 	pmdp = pmd_offset(&pud, ea);
 	pmd  = READ_ONCE(*pmdp);
@@ -385,15 +388,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 		ret_pte = (pte_t *)pmdp;
 		goto out;
 	}
-	/*
-	 * pmd_large check below will handle the swap pmd pte
-	 * we need to do both the check because they are config
-	 * dependent.
-	 */
-	if (pmd_huge(pmd) || pmd_large(pmd)) {
+
+	if (pmd_is_leaf(pmd)) {
 		ret_pte = (pte_t *)pmdp;
 		goto out;
 	}
+
 	if (is_hugepd(__hugepd(pmd_val(pmd)))) {
 		hpdp = (hugepd_t *)&pmd;
 		goto out_huge;
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index d2d976ff8a0e..bfb47b037a2f 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -296,16 +296,20 @@ EXPORT_SYMBOL(__iounmap_at);
 /* 4 level page table */
 struct page *pgd_page(pgd_t pgd)
 {
-	if (pgd_huge(pgd))
+	if (pgd_is_leaf(pgd)) {
+		VM_WARN_ON(!pgd_huge(pgd));
 		return pte_page(pgd_pte(pgd));
+	}
 	return virt_to_page(pgd_page_vaddr(pgd));
 }
 #endif
 
 struct page *pud_page(pud_t pud)
 {
-	if (pud_huge(pud))
+	if (pud_is_leaf(pud)) {
+		VM_WARN_ON(!pud_huge(pud));
 		return pte_page(pud_pte(pud));
+	}
 	return virt_to_page(pud_page_vaddr(pud));
 }
 
@@ -315,8 +319,10 @@ struct page *pud_page(pud_t pud)
  */
 struct page *pmd_page(pmd_t pmd)
 {
-	if (pmd_large(pmd) || pmd_huge(pmd) || pmd_devmap(pmd))
+	if (pmd_is_leaf(pmd)) {
+		VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd) || pmd_devmap(pmd)));
 		return pte_page(pmd_pte(pmd));
+	}
 	return virt_to_page(pmd_page_vaddr(pmd));
 }
 
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 646876d9da64..abe60d25b4e6 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -277,7 +277,7 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
 
 	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
 		addr = start + i * PMD_SIZE;
-		if (!pmd_none(*pmd) && !pmd_huge(*pmd))
+		if (!pmd_none(*pmd) && !pmd_is_leaf(*pmd))
 			/* pmd exists */
 			walk_pte(st, pmd, addr);
 		else
@@ -293,7 +293,7 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
 
 	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
 		addr = start + i * PUD_SIZE;
-		if (!pud_none(*pud) && !pud_huge(*pud))
+		if (!pud_none(*pud) && !pud_is_leaf(*pud))
 			/* pud exists */
 			walk_pmd(st, pud, addr);
 		else
@@ -314,7 +314,7 @@ static void walk_pagetables(struct pg_state *st)
 	 * the hash pagetable.
 	 */
 	for (i = 0; i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
-		if (!pgd_none(*pgd) && !pgd_huge(*pgd))
+		if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd))
 			/* pgd exists */
 			walk_pud(st, pgd, addr);
 		else
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 1b0149b2bb6c..49353e01edfa 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -3094,7 +3094,7 @@ static void show_pte(unsigned long addr)
 
 	printf("pgd  @ 0x%px\n", pgdir);
 
-	if (pgd_huge(*pgdp)) {
+	if (pgd_is_leaf(*pgdp)) {
 		format_pte(pgdp, pgd_val(*pgdp));
 		return;
 	}
@@ -3107,7 +3107,7 @@ static void show_pte(unsigned long addr)
 		return;
 	}
 
-	if (pud_huge(*pudp)) {
+	if (pud_is_leaf(*pudp)) {
 		format_pte(pudp, pud_val(*pudp));
 		return;
 	}
@@ -3121,7 +3121,7 @@ static void show_pte(unsigned long addr)
 		return;
 	}
 
-	if (pmd_huge(*pmdp)) {
+	if (pmd_is_leaf(*pmdp)) {
 		format_pte(pmdp, pmd_val(*pmdp));
 		return;
 	}
-- 
2.21.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox