LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/4] powerpc/drmem: Make lmb_size 64 bit
From: Aneesh Kumar K.V @ 2020-08-06 16:23 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Nathan Lynch, Aneesh Kumar K.V, stable

Similar to commit 89c140bbaeee ("pseries: Fix 64 bit logical memory block panic")
make sure different variables tracking lmb_size are updated to be 64 bit.

This was found by code audit.

Cc: stable@vger.kernel.org
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/drmem.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index 17ccc6474ab6..d719cbac34b2 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -21,7 +21,7 @@ struct drmem_lmb {
 struct drmem_lmb_info {
 	struct drmem_lmb        *lmbs;
 	int                     n_lmbs;
-	u32                     lmb_size;
+	u64                     lmb_size;
 };
 
 extern struct drmem_lmb_info *drmem_info;
@@ -67,7 +67,7 @@ struct of_drconf_cell_v2 {
 #define DRCONF_MEM_RESERVED	0x00000080
 #define DRCONF_MEM_HOTREMOVABLE	0x00000100
 
-static inline u32 drmem_lmb_size(void)
+static inline u64 drmem_lmb_size(void)
 {
 	return drmem_info->lmb_size;
 }
-- 
2.26.2


^ permalink raw reply related

* [PATCH v2 2/4] powerpc/mem: Store the dt_root_size/addr cell values for later usage
From: Aneesh Kumar K.V @ 2020-08-06 16:23 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Nathan Lynch, Aneesh Kumar K.V
In-Reply-To: <20200806162329.276534-1-aneesh.kumar@linux.ibm.com>

dt_root_addr_cells and dt_root_size_cells are __initdata variables.
So make a copy of the same which can be used post init.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/drmem.h | 2 ++
 arch/powerpc/kernel/prom.c       | 7 +++++++
 arch/powerpc/mm/numa.c           | 1 +
 3 files changed, 10 insertions(+)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index d719cbac34b2..ffb59caa88ee 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -123,4 +123,6 @@ static inline void lmb_clear_nid(struct drmem_lmb *lmb)
 }
 #endif
 
+extern int mem_addr_cells, mem_size_cells;
+
 #endif /* _ASM_POWERPC_LMB_H */
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index d8a2fb87ba0c..9a1701e85747 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -73,6 +73,7 @@ u64 ppc64_rma_size;
 #endif
 static phys_addr_t first_memblock_size;
 static int __initdata boot_cpu_count;
+int mem_addr_cells, mem_size_cells;
 
 static int __init early_parse_mem(char *p)
 {
@@ -536,6 +537,12 @@ static int __init early_init_dt_scan_memory_ppc(unsigned long node,
 						const char *uname,
 						int depth, void *data)
 {
+	/*
+	 * Make a copy from __initdata variable
+	 */
+	mem_addr_cells = dt_root_addr_cells;
+	mem_size_cells = dt_root_size_cells;
+
 #ifdef CONFIG_PPC_PSERIES
 	if (depth == 1 &&
 	    strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) {
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 058fee9a0835..77d41d9775d2 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -368,6 +368,7 @@ static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
 	of_node_put(memory);
 }
 
+/*  dt_mem_next_cell is __init  */
 static unsigned long read_n_cells(int n, const __be32 **buf)
 {
 	unsigned long result = 0;
-- 
2.26.2


^ permalink raw reply related

* [PATCH v2 3/4] powerpc/memhotplug: Make lmb size 64bit
From: Aneesh Kumar K.V @ 2020-08-06 16:23 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Nathan Lynch, Aneesh Kumar K.V, stable
In-Reply-To: <20200806162329.276534-1-aneesh.kumar@linux.ibm.com>

Similar to commit 89c140bbaeee ("pseries: Fix 64 bit logical memory block panic")
make sure different variables tracking lmb_size are updated to be 64 bit.

This was found by code audit.

Cc: stable@vger.kernel.org
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 .../platforms/pseries/hotplug-memory.c        | 37 +++++++++++--------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 5d545b78111f..1fe3204c843a 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -277,7 +277,7 @@ static int dlpar_offline_lmb(struct drmem_lmb *lmb)
 	return dlpar_change_lmb_state(lmb, false);
 }
 
-static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size)
+static int pseries_remove_memblock(unsigned long base, unsigned long memblock_size)
 {
 	unsigned long block_sz, start_pfn;
 	int sections_per_block;
@@ -308,9 +308,9 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz
 
 static int pseries_remove_mem_node(struct device_node *np)
 {
-	const __be32 *regs;
+	const __be32 *prop;
 	unsigned long base;
-	unsigned int lmb_size;
+	unsigned long lmb_size;
 	int ret = -EINVAL;
 
 	/*
@@ -322,12 +322,16 @@ static int pseries_remove_mem_node(struct device_node *np)
 	/*
 	 * Find the base address and size of the memblock
 	 */
-	regs = of_get_property(np, "reg", NULL);
-	if (!regs)
+	prop = of_get_property(np, "reg", NULL);
+	if (!prop)
 		return ret;
 
-	base = be64_to_cpu(*(unsigned long *)regs);
-	lmb_size = be32_to_cpu(regs[3]);
+	/*
+	 * "reg" property represents (addr,size) tuple.
+	 */
+	base = of_read_number(prop, mem_addr_cells);
+	prop += mem_addr_cells;
+	lmb_size = of_read_number(prop, mem_size_cells);
 
 	pseries_remove_memblock(base, lmb_size);
 	return 0;
@@ -557,7 +561,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
 
 #else
 static inline int pseries_remove_memblock(unsigned long base,
-					  unsigned int memblock_size)
+					  unsigned long memblock_size)
 {
 	return -EOPNOTSUPP;
 }
@@ -878,9 +882,9 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
 
 static int pseries_add_mem_node(struct device_node *np)
 {
-	const __be32 *regs;
+	const __be32 *prop;
 	unsigned long base;
-	unsigned int lmb_size;
+	unsigned long lmb_size;
 	int ret = -EINVAL;
 
 	/*
@@ -892,12 +896,15 @@ static int pseries_add_mem_node(struct device_node *np)
 	/*
 	 * Find the base and size of the memblock
 	 */
-	regs = of_get_property(np, "reg", NULL);
-	if (!regs)
+	prop = of_get_property(np, "reg", NULL);
+	if (!prop)
 		return ret;
-
-	base = be64_to_cpu(*(unsigned long *)regs);
-	lmb_size = be32_to_cpu(regs[3]);
+	/*
+	 * "reg" property represents (addr,size) tuple.
+	 */
+	base = of_read_number(prop, mem_addr_cells);
+	prop += mem_addr_cells;
+	lmb_size = of_read_number(prop, mem_size_cells);
 
 	/*
 	 * Update memory region to represent the memory add
-- 
2.26.2


^ permalink raw reply related

* [PATCH v2 4/4] powerpc/book3s64/radix: Make radix_mem_block_size 64bit
From: Aneesh Kumar K.V @ 2020-08-06 16:23 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Nathan Lynch, Aneesh Kumar K.V
In-Reply-To: <20200806162329.276534-1-aneesh.kumar@linux.ibm.com>

Similar to commit 89c140bbaeee ("pseries: Fix 64 bit logical memory block panic")
make sure different variables tracking lmb_size are updated to be 64 bit.

Fixes: af9d00e93a4f ("powerpc/mm/radix: Create separate mappings for hot-plugged memory")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/mmu.h | 2 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 55442d45c597..1a0c9d09950f 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -85,7 +85,7 @@ extern unsigned int mmu_base_pid;
 /*
  * memory block size used with radix translation.
  */
-extern unsigned int __ro_after_init radix_mem_block_size;
+extern unsigned long __ro_after_init radix_mem_block_size;
 
 #define PRTB_SIZE_SHIFT	(mmu_pid_bits + 4)
 #define PRTB_ENTRIES	(1ul << mmu_pid_bits)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 28c784976bed..ca76d9d6372a 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -34,7 +34,7 @@
 
 unsigned int mmu_pid_bits;
 unsigned int mmu_base_pid;
-unsigned int radix_mem_block_size __ro_after_init;
+unsigned long radix_mem_block_size __ro_after_init;
 
 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
 			unsigned long region_start, unsigned long region_end)
-- 
2.26.2


^ permalink raw reply related

* [PATCH v1 2/5] powerpc/fault: Unnest definition of page_fault_is_write() and page_fault_is_bad()
From: Christophe Leroy @ 2020-08-06 17:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <7baae4086cbb9ffb08c933b065ff7d29dbc03dd6.1596734104.git.christophe.leroy@csgroup.eu>

To make it more readable, separate page_fault_is_write() and page_fault_is_bad()
to avoir several levels of #ifdefs

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/fault.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 2efa34d7e644..9ef9ee244f72 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -363,17 +363,19 @@ static void sanity_check_fault(bool is_write, bool is_user,
  */
 #if (defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
 #define page_fault_is_write(__err)	((__err) & ESR_DST)
-#define page_fault_is_bad(__err)	(0)
 #else
 #define page_fault_is_write(__err)	((__err) & DSISR_ISSTORE)
-#if defined(CONFIG_PPC_8xx)
+#endif
+
+#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+#define page_fault_is_bad(__err)	(0)
+#elif defined(CONFIG_PPC_8xx)
 #define page_fault_is_bad(__err)	((__err) & DSISR_NOEXEC_OR_G)
 #elif defined(CONFIG_PPC64)
 #define page_fault_is_bad(__err)	((__err) & DSISR_BAD_FAULT_64S)
 #else
 #define page_fault_is_bad(__err)	((__err) & DSISR_BAD_FAULT_32S)
 #endif
-#endif
 
 /*
  * For 600- and 800-family processors, the error_code parameter is DSISR
-- 
2.25.0


^ permalink raw reply related

* [PATCH v1 1/5] powerpc/mm: sanity_check_fault() should work for all,  not only BOOK3S
From: Christophe Leroy @ 2020-08-06 17:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linuxppc-dev, linux-kernel

The verification and message introduced by commit 374f3f5979f9
("powerpc/mm/hash: Handle user access of kernel address gracefully")
applies to all platforms, it should not be limited to BOOK3S.

Make the BOOK3S version of sanity_check_fault() the one for all,
and bail out earlier if not BOOK3S.

Fixes: 374f3f5979f9 ("powerpc/mm/hash: Handle user access of kernel address gracefully")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/fault.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 925a7231abb3..2efa34d7e644 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -303,7 +303,6 @@ static inline void cmo_account_page_fault(void)
 static inline void cmo_account_page_fault(void) { }
 #endif /* CONFIG_PPC_SMLPAR */
 
-#ifdef CONFIG_PPC_BOOK3S
 static void sanity_check_fault(bool is_write, bool is_user,
 			       unsigned long error_code, unsigned long address)
 {
@@ -320,6 +319,9 @@ static void sanity_check_fault(bool is_write, bool is_user,
 		return;
 	}
 
+	if (!IS_ENABLED(CONFIG_PPC_BOOK3S))
+		return;
+
 	/*
 	 * For hash translation mode, we should never get a
 	 * PROTFAULT. Any update to pte to reduce access will result in us
@@ -354,10 +356,6 @@ static void sanity_check_fault(bool is_write, bool is_user,
 
 	WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
 }
-#else
-static void sanity_check_fault(bool is_write, bool is_user,
-			       unsigned long error_code, unsigned long address) { }
-#endif /* CONFIG_PPC_BOOK3S */
 
 /*
  * Define the correct "is_write" bit in error_code based
-- 
2.25.0


^ permalink raw reply related

* [PATCH v1 4/5] powerpc/fault: Avoid heavy search_exception_tables() verification
From: Christophe Leroy @ 2020-08-06 17:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <7baae4086cbb9ffb08c933b065ff7d29dbc03dd6.1596734104.git.christophe.leroy@csgroup.eu>

search_exception_tables() is an heavy operation, we have to avoid it.
When KUAP is selected, we'll know the fault has been blocked by KUAP.
Otherwise, it behaves just as if the address was already in the TLBs
and no fault was generated.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/fault.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 525e0c2b5406..edde169ba3a6 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -214,24 +214,14 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
 	if (address >= TASK_SIZE)
 		return true;
 
-	if (!is_exec && (error_code & DSISR_PROTFAULT) &&
-	    !search_exception_tables(regs->nip)) {
+	// Read/write fault blocked by KUAP is bad, it can never succeed.
+	if (bad_kuap_fault(regs, address, is_write)) {
 		pr_crit_ratelimited("Kernel attempted to access user page (%lx) - exploit attempt? (uid: %d)\n",
-				    address,
-				    from_kuid(&init_user_ns, current_uid()));
-	}
-
-	// Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad
-	if (!search_exception_tables(regs->nip))
-		return true;
-
-	// Read/write fault in a valid region (the exception table search passed
-	// above), but blocked by KUAP is bad, it can never succeed.
-	if (bad_kuap_fault(regs, address, is_write))
+				    address, from_kuid(&init_user_ns, current_uid()));
 		return true;
+	}
 
-	// What's left? Kernel fault on user in well defined regions (extable
-	// matched), and allowed by KUAP in the faulting context.
+	// What's left? Kernel fault on user and allowed by KUAP in the faulting context.
 	return false;
 }
 
-- 
2.25.0


^ permalink raw reply related

* [PATCH v1 3/5] powerpc/fault: Reorder tests in bad_kernel_fault()
From: Christophe Leroy @ 2020-08-06 17:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <7baae4086cbb9ffb08c933b065ff7d29dbc03dd6.1596734104.git.christophe.leroy@csgroup.eu>

Check address earlier to simplify the following test.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/fault.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 9ef9ee244f72..525e0c2b5406 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -210,17 +210,17 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
 		return true;
 	}
 
-	if (!is_exec && address < TASK_SIZE && (error_code & DSISR_PROTFAULT) &&
+	// Kernel fault on kernel address is bad
+	if (address >= TASK_SIZE)
+		return true;
+
+	if (!is_exec && (error_code & DSISR_PROTFAULT) &&
 	    !search_exception_tables(regs->nip)) {
 		pr_crit_ratelimited("Kernel attempted to access user page (%lx) - exploit attempt? (uid: %d)\n",
 				    address,
 				    from_kuid(&init_user_ns, current_uid()));
 	}
 
-	// Kernel fault on kernel address is bad
-	if (address >= TASK_SIZE)
-		return true;
-
 	// Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad
 	if (!search_exception_tables(regs->nip))
 		return true;
-- 
2.25.0


^ permalink raw reply related

* [PATCH v1 5/5] powerpc/fault: Perform exception fixup in do_page_fault()
From: Christophe Leroy @ 2020-08-06 17:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <7baae4086cbb9ffb08c933b065ff7d29dbc03dd6.1596734104.git.christophe.leroy@csgroup.eu>

Exception fixup doesn't require the heady full regs saving,
do it from do_page_fault() directly.

For that, split bad_page_fault() in two parts.

As bad_page_fault() can also be called from other places than
handle_page_fault(), it will still perform exception fixup and
fallback on __bad_page_fault().

handle_page_fault() directly calls __bad_page_fault() as the
exception fixup will now be done by do_page_fault()

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/entry_32.S       |  2 +-
 arch/powerpc/kernel/exceptions-64e.S |  2 +-
 arch/powerpc/kernel/exceptions-64s.S |  2 +-
 arch/powerpc/mm/fault.c              | 33 ++++++++++++++++++++--------
 4 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index f4d0af8e1136..c198786591f9 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -678,7 +678,7 @@ handle_page_fault:
 	mr	r5,r3
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	lwz	r4,_DAR(r1)
-	bl	bad_page_fault
+	bl	__bad_page_fault
 	b	ret_from_except_full
 
 #ifdef CONFIG_PPC_BOOK3S_32
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index d9ed79415100..dd9161ea5da8 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -1024,7 +1024,7 @@ storage_fault_common:
 	mr	r5,r3
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	ld	r4,_DAR(r1)
-	bl	bad_page_fault
+	bl	__bad_page_fault
 	b	ret_from_except
 
 /*
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index f7d748b88705..2cb3bcfb896d 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -3254,7 +3254,7 @@ handle_page_fault:
 	mr	r5,r3
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	ld	r4,_DAR(r1)
-	bl	bad_page_fault
+	bl	__bad_page_fault
 	b	interrupt_return
 
 /* We have a data breakpoint exception - handle it */
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index edde169ba3a6..bd6e397eb84a 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -542,10 +542,20 @@ NOKPROBE_SYMBOL(__do_page_fault);
 int do_page_fault(struct pt_regs *regs, unsigned long address,
 		  unsigned long error_code)
 {
+	const struct exception_table_entry *entry;
 	enum ctx_state prev_state = exception_enter();
 	int rc = __do_page_fault(regs, address, error_code);
 	exception_exit(prev_state);
-	return rc;
+	if (likely(!rc))
+		return 0;
+
+	entry = search_exception_tables(regs->nip);
+	if (unlikely(!entry))
+		return rc;
+
+	instruction_pointer_set(regs, extable_fixup(entry));
+
+	return 0;
 }
 NOKPROBE_SYMBOL(do_page_fault);
 
@@ -554,17 +564,10 @@ NOKPROBE_SYMBOL(do_page_fault);
  * It is called from the DSI and ISI handlers in head.S and from some
  * of the procedures in traps.c.
  */
-void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
+void __bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 {
-	const struct exception_table_entry *entry;
 	int is_write = page_fault_is_write(regs->dsisr);
 
-	/* Are we prepared to handle this fault?  */
-	if ((entry = search_exception_tables(regs->nip)) != NULL) {
-		regs->nip = extable_fixup(entry);
-		return;
-	}
-
 	/* kernel has accessed a bad area */
 
 	switch (TRAP(regs)) {
@@ -598,3 +601,15 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 
 	die("Kernel access of bad area", regs, sig);
 }
+
+void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
+{
+	const struct exception_table_entry *entry;
+
+	/* Are we prepared to handle this fault?  */
+	entry = search_exception_tables(instruction_pointer(regs));
+	if (entry)
+		instruction_pointer_set(regs, extable_fixup(entry));
+	else
+		__bad_page_fault(regs, address, sig);
+}
-- 
2.25.0


^ permalink raw reply related

* [Bug 207359] MegaRAID SAS 9361 controller hang/reset
From: bugzilla-daemon @ 2020-08-06 17:56 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-207359-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=207359

--- Comment #4 from Cameron (cam@neo-zeon.de) ---
I converted the box's filesystems from BTRFS to XFS, and switched the page size
from 4k to 64k. The problem appears to be entirely gone now. I am able to
conclusively run 5.7.13 without issue, which I verified as having the
megaraid_sas controller hang problem while still running my previous BTRFS+4k
page configuration.

Unfortunately, it took a great deal of time to perform this conversion, and I
wasn't able to keep the box down even longer to test if converting to XFS and
64k pages individually resolved the issue. All I can say for certain is that
either switching to XFS, to a 64k page size, or both has fixed the problem for
me.

The backup volume is a single SATA disk that is still using BTRFS (for
snapshotting), and is not giving me any trouble. But if this has any relation
to https://bugzilla.kernel.org/show_bug.cgi?id=206123, then this may not be
conclusive due to being that SATA disks potentially may not trigger the issue.
The single disk also can't push as much IO as the RAID10 volume so that may be
another reason.

My quasi educated non-kernel-dev guess is that this is probably a bug relating
to the 4k page size. Whether or not the regular behavior of BTRFS exacerbates
this (making it easier to reproduce), is possible, but unknown.

Hopefully someone else encountering this issue will find this helpful.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: [PATCH v8 5/8] powerpc/vdso: Prepare for switching VDSO to generic C implementation.
From: Segher Boessenkool @ 2020-08-06 18:33 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Christophe Leroy, nathanl, arnd, linux-kernel,
	Tulio Magno Quites Machado Filho, Paul Mackerras, luto,
	linux-arch, tglx, vincenzo.frascino, linuxppc-dev
In-Reply-To: <87r1sky1hm.fsf@mpe.ellerman.id.au>

Hi!

On Thu, Aug 06, 2020 at 12:03:33PM +1000, Michael Ellerman wrote:
> Segher Boessenkool <segher@kernel.crashing.org> writes:
> > On Wed, Aug 05, 2020 at 04:24:16PM +1000, Michael Ellerman wrote:
> >> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> >> > Indeed, 32-bit doesn't have a redzone, so I believe it needs a stack 
> >> > frame whenever it has anything to same.

^^^

> >> >     fbb60:	94 21 ff e0 	stwu    r1,-32(r1)
> >
> > This is the *only* place where you can use a negative offset from r1:
> > in the stwu to extend the stack (set up a new stack frame, or make the
> > current one bigger).
> 
> (You're talking about 32-bit code here right?)

The "SYSV" ELF binding, yeah, which is used for 32-bit on Linux (give or
take, ho hum).

The ABIs that have a red zone are much nicer here (but less simple) :-)

> >> At the same time it's much safer for us to just save/restore r2, and
> >> probably in the noise performance wise.
> >
> > If you want a function to be able to work with ABI-compliant code safely
> > (in all cases), you'll have to make it itself ABI-compliant as well,
> > yes :-)
> 
> True. Except this is the VDSO which has previously been a bit wild west
> as far as ABI goes :)

It could get away with many things because it was guaranteed to be a
leaf function.  Some of those things even violate the ABIs, but you can
get away with it easily, much reduced scope.  Now if this is generated
code, violating the rules will catch up with you sooner rather than
later ;-)


Segher

^ permalink raw reply

* Re: [PATCH v1 5/5] powerpc/fault: Perform exception fixup in do_page_fault()
From: kernel test robot @ 2020-08-06 21:07 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, npiggin
  Cc: linuxppc-dev, kbuild-all, linux-kernel
In-Reply-To: <5748c8f5cf0a9b3686169e2c7709107e6aaec408.1596734105.git.christophe.leroy@csgroup.eu>

[-- Attachment #1: Type: text/plain, Size: 3059 bytes --]

Hi Christophe,

I love your patch! Perhaps something to improve:

[auto build test WARNING on powerpc/next]
[also build test WARNING on v5.8 next-20200806]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Christophe-Leroy/powerpc-mm-sanity_check_fault-should-work-for-all-not-only-BOOK3S/20200807-012433
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> arch/powerpc/mm/fault.c:567:6: warning: no previous prototype for '__bad_page_fault' [-Wmissing-prototypes]
     567 | void __bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
         |      ^~~~~~~~~~~~~~~~

vim +/__bad_page_fault +567 arch/powerpc/mm/fault.c

   561	
   562	/*
   563	 * bad_page_fault is called when we have a bad access from the kernel.
   564	 * It is called from the DSI and ISI handlers in head.S and from some
   565	 * of the procedures in traps.c.
   566	 */
 > 567	void __bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
   568	{
   569		int is_write = page_fault_is_write(regs->dsisr);
   570	
   571		/* kernel has accessed a bad area */
   572	
   573		switch (TRAP(regs)) {
   574		case 0x300:
   575		case 0x380:
   576		case 0xe00:
   577			pr_alert("BUG: %s on %s at 0x%08lx\n",
   578				 regs->dar < PAGE_SIZE ? "Kernel NULL pointer dereference" :
   579				 "Unable to handle kernel data access",
   580				 is_write ? "write" : "read", regs->dar);
   581			break;
   582		case 0x400:
   583		case 0x480:
   584			pr_alert("BUG: Unable to handle kernel instruction fetch%s",
   585				 regs->nip < PAGE_SIZE ? " (NULL pointer?)\n" : "\n");
   586			break;
   587		case 0x600:
   588			pr_alert("BUG: Unable to handle kernel unaligned access at 0x%08lx\n",
   589				 regs->dar);
   590			break;
   591		default:
   592			pr_alert("BUG: Unable to handle unknown paging fault at 0x%08lx\n",
   593				 regs->dar);
   594			break;
   595		}
   596		printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
   597			regs->nip);
   598	
   599		if (task_stack_end_corrupted(current))
   600			printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
   601	
   602		die("Kernel access of bad area", regs, sig);
   603	}
   604	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 69763 bytes --]

^ permalink raw reply

* Re: [PATCH] powerpc/perf: Account for interrupts during PMC overflow for an invalid SIAR check
From: Alexey Kardashevskiy @ 2020-08-06 23:51 UTC (permalink / raw)
  To: Athira Rajeev, mpe; +Cc: maddy, linuxppc-dev
In-Reply-To: <1596717992-7321-1-git-send-email-atrajeev@linux.vnet.ibm.com>



On 06/08/2020 22:46, Athira Rajeev wrote:
> Performance monitor interrupt handler checks if any counter has overflown
> and calls `record_and_restart` in core-book3s which invokes
> `perf_event_overflow` to record the sample information.
> Apart from creating sample, perf_event_overflow also does the interrupt
> and period checks via perf_event_account_interrupt.
> 
> Currently we record information only if the SIAR valid bit is set
> ( using `siar_valid` check ) and hence the interrupt check.
> But it is possible that we do sampling for some events that are not
> generating valid SIAR and hence there is no chance to disable the event
> if interrupts is more than max_samples_per_tick. This leads to soft lockup.
> 
> Fix this by adding perf_event_account_interrupt in the invalid siar
> code path for a sampling event. ie if siar is invalid, just do interrupt
> check and don't record the sample information.
> 
> Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
> Reported-by: Alexey Kardashevskiy <aik@ozlabs.ru>



Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>


> ---
>  arch/powerpc/perf/core-book3s.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
> index 01d7028..626e587 100644
> --- a/arch/powerpc/perf/core-book3s.c
> +++ b/arch/powerpc/perf/core-book3s.c
> @@ -2101,6 +2101,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
>  
>  		if (perf_event_overflow(event, &data, regs))
>  			power_pmu_stop(event, 0);
> +	} else if (period) {
> +		/* Account for interrupt incase of invalid siar */
> +		if (perf_event_account_interrupt(event))
> +			power_pmu_stop(event, 0);
>  	}
>  }
>  
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH] powerpc/book3s64/radix: Make radix_mem_block_size 64bit
From: Michael Ellerman @ 2020-08-07  1:11 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linuxppc-dev
In-Reply-To: <878sesq6yl.fsf@linux.ibm.com>

"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> Michael Ellerman <mpe@ellerman.id.au> writes:
>
>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>> Similar to commit: 89c140bbaeee ("pseries: Fix 64 bit logical memory block panic")
>>> make sure we update different variables tracking lmb_size are updated
>>> to be 64 bit.
>>
>> That commit went to all stable releases, should this one also?
>>
>
> radix_mem_block_size got added recently and it is not yet upstram. But
> the drmem_lmb_info change can be a stable candidate. We also need this
>
> I will split this as two patches?

Yes, sounds good.

cheers

> modified   arch/powerpc/include/asm/drmem.h
> @@ -67,7 +67,7 @@ struct of_drconf_cell_v2 {
>  #define DRCONF_MEM_RESERVED	0x00000080
>  #define DRCONF_MEM_HOTREMOVABLE	0x00000100
>  
> -static inline u32 drmem_lmb_size(void)
> +static inline u64 drmem_lmb_size(void)
>  {
>  	return drmem_info->lmb_size;
>  }
>
> -aneesh

^ permalink raw reply

* Re: [PATCH V5 0/4] powerpc/perf: Add support for perf extended regs in powerpc
From: Athira Rajeev @ 2020-08-07  2:11 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Ravi Bangoria, Michael Neuling, maddy, kajoljain, linuxppc-dev,
	Jiri Olsa, Jiri Olsa
In-Reply-To: <20200806122052.GC71359@kernel.org>



> On 06-Aug-2020, at 5:50 PM, Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> 
> Em Fri, Jul 31, 2020 at 11:04:14PM +0530, Athira Rajeev escreveu:
>> 
>> 
>>> On 31-Jul-2020, at 1:20 AM, Jiri Olsa <jolsa@redhat.com> wrote:
>>> 
>>> On Thu, Jul 30, 2020 at 01:24:40PM +0530, Athira Rajeev wrote:
>>>> 
>>>> 
>>>>> On 27-Jul-2020, at 10:46 PM, Athira Rajeev <atrajeev@linux.vnet.ibm.com> wrote:
>>>>> 
>>>>> Patch set to add support for perf extended register capability in
>>>>> powerpc. The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to
>>>>> indicate the PMU which support extended registers. The generic code
>>>>> define the mask of extended registers as 0 for non supported architectures.
>>>>> 
>>>>> Patches 1 and 2 are the kernel side changes needed to include
>>>>> base support for extended regs in powerpc and in power10.
>>>>> Patches 3 and 4 are the perf tools side changes needed to support the
>>>>> extended registers.
>>>>> 
>>>> 
>>>> Hi Arnaldo, Jiri
>>>> 
>>>> please let me know if you have any comments/suggestions on this patch series to add support for perf extended regs.
>>> 
>>> hi,
>>> can't really tell for powerpc, but in general
>>> perf tool changes look ok
>>> 
>> 
>> Hi Jiri,
>> Thanks for checking the patchset.
> 
> So I'dd say you submit a v6, split into the kernel part, that probably
> should go via the PPC arch tree, and I can pick the tooling part, ok?
> 
> - Arnaldo

Sure Arnaldo, I will send a v6.

Thanks,
Athira

^ permalink raw reply

* Re: [PATCH v8 5/8] powerpc/vdso: Prepare for switching VDSO to generic C implementation.
From: Michael Ellerman @ 2020-08-07  2:44 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Christophe Leroy, nathanl, arnd, linux-kernel,
	Tulio Magno Quites Machado Filho, Paul Mackerras, luto,
	linux-arch, tglx, vincenzo.frascino, linuxppc-dev
In-Reply-To: <20200806183316.GV6753@gate.crashing.org>

Segher Boessenkool <segher@kernel.crashing.org> writes:
> On Thu, Aug 06, 2020 at 12:03:33PM +1000, Michael Ellerman wrote:
>> Segher Boessenkool <segher@kernel.crashing.org> writes:
>> > On Wed, Aug 05, 2020 at 04:24:16PM +1000, Michael Ellerman wrote:
>> >> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>> >> > Indeed, 32-bit doesn't have a redzone, so I believe it needs a stack 
>> >> > frame whenever it has anything to same.
>
> ^^^
>
>> >> >     fbb60:	94 21 ff e0 	stwu    r1,-32(r1)
>> >
>> > This is the *only* place where you can use a negative offset from r1:
>> > in the stwu to extend the stack (set up a new stack frame, or make the
>> > current one bigger).
>> 
>> (You're talking about 32-bit code here right?)
>
> The "SYSV" ELF binding, yeah, which is used for 32-bit on Linux (give or
> take, ho hum).
>
> The ABIs that have a red zone are much nicer here (but less simple) :-)

Yep, just checking I wasn't misunderstanding your comment about negative
offsets.

>> >> At the same time it's much safer for us to just save/restore r2, and
>> >> probably in the noise performance wise.
>> >
>> > If you want a function to be able to work with ABI-compliant code safely
>> > (in all cases), you'll have to make it itself ABI-compliant as well,
>> > yes :-)
>> 
>> True. Except this is the VDSO which has previously been a bit wild west
>> as far as ABI goes :)
>
> It could get away with many things because it was guaranteed to be a
> leaf function.  Some of those things even violate the ABIs, but you can
> get away with it easily, much reduced scope.  Now if this is generated
> code, violating the rules will catch up with you sooner rather than
> later ;-)

Agreed.

cheers

^ permalink raw reply

* Re: [PATCH] powerpc/signal: Move and simplify get_clean_sp()
From: Michael Ellerman @ 2020-08-07  2:48 UTC (permalink / raw)
  To: Christoph Hellwig, Christophe Leroy
  Cc: Paul Mackerras, linuxppc-dev, linux-kernel
In-Reply-To: <20200806092547.GA2544@infradead.org>

Christoph Hellwig <hch@infradead.org> writes:
> On Thu, Aug 06, 2020 at 08:50:20AM +0000, Christophe Leroy wrote:
>> get_clean_sp() is only used in kernel/signal.c . Move it there.
>> 
>> And GCC is smart enough to reduce the function when on PPC32, no
>> need of a special PPC32 simple version.
>
> What about just open coding it in the only caller, which would seem even
> cleaner?

+1

cheers

^ permalink raw reply

* Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
From: Nathan Lynch @ 2020-08-07  4:24 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, Srikar Dronamraju
In-Reply-To: <20200731111916.243569-1-aneesh.kumar@linux.ibm.com>

"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index e437a9ac4956..6c659aada55b 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
>  	}
>  }
>  
> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};

It's odd to me to use MAX_NUMNODES for this array when it's going to be
indexed not by Linux's logical node IDs but by the platform-provided
domain number, which has no relation to MAX_NUMNODES.

> +
> +int firmware_group_id_to_nid(int firmware_gid)
> +{
> +	static int last_nid = 0;
> +
> +	/*
> +	 * For PowerNV we don't change the node id. This helps to avoid
> +	 * confusion w.r.t the expected node ids. On pseries, node numbers
> +	 * are virtualized. Hence do logical node id for pseries.
> +	 */
> +	if (!firmware_has_feature(FW_FEATURE_LPAR))
> +		return firmware_gid;
> +
> +	if (firmware_gid ==  -1)
> +		return NUMA_NO_NODE;
> +
> +	if (nid_map[firmware_gid] == NUMA_NO_NODE)
> +		nid_map[firmware_gid] = last_nid++;

This should at least be bounds-checked in case of domain numbering in
excess of MAX_NUMNODES. Or a different data structure should be used?
Not sure.

I'd prefer Linux's logical node type not be easily interchangeable with
the firmware node/group id type. The firmware type could be something
like:

struct affinity_domain {
	u32 val;
};
typedef struct affinity_domain affinity_domain_t;

with appropriate accessors/APIs.

This can prevent a whole class of errors that is currently possible with
CPUs, since the logical and "hardware" ID types are both simple
integers.

^ permalink raw reply

* Re: [PATCH v5 3/3] mm/page_alloc: Keep memoryless cpuless node 0 offline
From: Andrew Morton @ 2020-08-07  4:32 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Gautham R Shenoy, Andi Kleen, David Hildenbrand, linuxppc-dev,
	linux-kernel, Michal Hocko, linux-mm, Satheesh Rajendran,
	Mel Gorman, Kirill A. Shutemov, Christopher Lameter,
	Michal Such?nek, Linus Torvalds, Vlastimil Babka
In-Reply-To: <20200703125823.GA26243@linux.vnet.ibm.com>

On Fri, 3 Jul 2020 18:28:23 +0530 Srikar Dronamraju <srikar@linux.vnet.ibm.com> wrote:

> > The memory hotplug changes that somehow because you can hotremove numa
> > nodes and therefore make the nodemask sparse but that is not a common
> > case. I am not sure what would happen if a completely new node was added
> > and its corresponding node was already used by the renumbered one
> > though. It would likely conflate the two I am afraid. But I am not sure
> > this is really possible with x86 and a lack of a bug report would
> > suggest that nobody is doing that at least.
> > 
> 
> JFYI,
> Satheesh copied in this mailchain had opened a bug a year on crash with vcpu
> hotplug on memoryless node. 
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=202187

So...  do we merge this patch or not?  Seems that the overall view is
"risky but nobody is likely to do anything better any time soon"?

^ permalink raw reply

* Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
From: Aneesh Kumar K.V @ 2020-08-07  5:02 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: linuxppc-dev, Srikar Dronamraju
In-Reply-To: <87pn83ytet.fsf@linux.ibm.com>

On 8/7/20 9:54 AM, Nathan Lynch wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index e437a9ac4956..6c659aada55b 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
>>   	}
>>   }
>>   
>> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};
> 
> It's odd to me to use MAX_NUMNODES for this array when it's going to be
> indexed not by Linux's logical node IDs but by the platform-provided
> domain number, which has no relation to MAX_NUMNODES.


I didn't want to dynamically allocate this. We could fetch 
"ibm,max-associativity-domains" to find the size for that. The current 
code do assume  firmware group id to not exceed MAX_NUMNODES. Hence kept 
the array size to be MAX_NUMNODEs. I do agree that it is confusing. May 
be we can do #define MAX_AFFINITY_DOMAIN MAX_NUMNODES?



> 
>> +
>> +int firmware_group_id_to_nid(int firmware_gid)
>> +{
>> +	static int last_nid = 0;
>> +
>> +	/*
>> +	 * For PowerNV we don't change the node id. This helps to avoid
>> +	 * confusion w.r.t the expected node ids. On pseries, node numbers
>> +	 * are virtualized. Hence do logical node id for pseries.
>> +	 */
>> +	if (!firmware_has_feature(FW_FEATURE_LPAR))
>> +		return firmware_gid;
>> +
>> +	if (firmware_gid ==  -1)
>> +		return NUMA_NO_NODE;
>> +
>> +	if (nid_map[firmware_gid] == NUMA_NO_NODE)
>> +		nid_map[firmware_gid] = last_nid++;
> 
> This should at least be bounds-checked in case of domain numbering in
> excess of MAX_NUMNODES. Or a different data structure should be used?
> Not sure.
> 
> I'd prefer Linux's logical node type not be easily interchangeable with
> the firmware node/group id type. The firmware type could be something
> like:
> 
> struct affinity_domain {
> 	u32 val;
> };
> typedef struct affinity_domain affinity_domain_t;
> 
> with appropriate accessors/APIs.
> 

That is a good idea. Will use this.

-aneesh


^ permalink raw reply

* Re: [PATCH v2 2/2] powerpc/pci: unmap all interrupts when a PHB is removed
From: Alexey Kardashevskiy @ 2020-08-07  6:01 UTC (permalink / raw)
  To: Cédric Le Goater, Michael Ellerman
  Cc: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200617162938.743439-3-clg@kaod.org>



On 18/06/2020 02:29, Cédric Le Goater wrote:
> Some PCI adapters, like GPUs, use the "interrupt-map" property to
> describe interrupt mappings other than the legacy INTx interrupts.
> There can be more than 4 mappings.
> 
> To clear all interrupts when a PHB is removed, we need to increase the
> 'irq_map' array in which mappings are recorded. Compute the number of
> interrupt mappings from the "interrupt-map" property and allocate a
> bigger 'irq_map' array.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  arch/powerpc/kernel/pci-common.c | 49 +++++++++++++++++++++++++++++++-
>  1 file changed, 48 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
> index 515480a4bac6..deb831f0ae13 100644
> --- a/arch/powerpc/kernel/pci-common.c
> +++ b/arch/powerpc/kernel/pci-common.c
> @@ -353,9 +353,56 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr)
>  	return NULL;
>  }
>  
> +/*
> + * Assumption is made on the interrupt parent. All interrupt-map
> + * entries are considered to have the same parent.
> + */
> +static int pcibios_irq_map_count(struct pci_controller *phb)

I wonder if
int of_irq_count(struct device_node *dev)
could work here too. If it does not, then never mind.


Other than that, the only other comment is - merge this one into 1/2 as
1/2 alone won't properly fix the problem but it may look like that it does:

for phyp, the test machine just happens to have 4 entries in the map but
this is the phyp implementation detail;

for qemu, there are more but we only unregister 4 but kvm does not care
in general so it is ok which is also implementation detail;

and 2/2 just makes these details not matter. Thanks,



> +{
> +	const __be32 *imap;
> +	int imaplen;
> +	struct device_node *parent;
> +	u32 intsize, addrsize, parintsize, paraddrsize;
> +
> +	if (of_property_read_u32(phb->dn, "#interrupt-cells", &intsize))
> +		return 0;
> +	if (of_property_read_u32(phb->dn, "#address-cells", &addrsize))
> +		return 0;
> +
> +	imap = of_get_property(phb->dn, "interrupt-map", &imaplen);
> +	if (!imap) {
> +		pr_debug("%pOF : no interrupt-map\n", phb->dn);
> +		return 0;
> +	}
> +	imaplen /= sizeof(u32);
> +	pr_debug("%pOF : imaplen=%d\n", phb->dn, imaplen);
> +
> +	if (imaplen < (addrsize + intsize + 1))
> +		return 0;
> +
> +	imap += intsize + addrsize;
> +	parent = of_find_node_by_phandle(be32_to_cpup(imap));
> +	if (!parent) {
> +		pr_debug("%pOF : no imap parent found !\n", phb->dn);
> +		return 0;
> +	}
> +
> +	if (of_property_read_u32(parent, "#interrupt-cells", &parintsize)) {
> +		pr_debug("%pOF : parent lacks #interrupt-cells!\n", phb->dn);
> +		return 0;
> +	}
> +
> +	if (of_property_read_u32(parent, "#address-cells", &paraddrsize))
> +		paraddrsize = 0;
> +
> +	return imaplen / (addrsize + intsize + 1 + paraddrsize + parintsize);
> +}
> +
>  static void pcibios_irq_map_init(struct pci_controller *phb)
>  {
> -	phb->irq_count = PCI_NUM_INTX;
> +	phb->irq_count = pcibios_irq_map_count(phb);
> +	if (phb->irq_count < PCI_NUM_INTX)
> +		phb->irq_count = PCI_NUM_INTX;
>  
>  	pr_debug("%pOF : interrupt map #%d\n", phb->dn, phb->irq_count);
>  
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH v5 3/3] mm/page_alloc: Keep memoryless cpuless node 0 offline
From: David Hildenbrand @ 2020-08-07  6:58 UTC (permalink / raw)
  To: Andrew Morton, Srikar Dronamraju
  Cc: Gautham R Shenoy, Andi Kleen, linuxppc-dev, linux-kernel,
	Michal Hocko, linux-mm, Satheesh Rajendran, Mel Gorman,
	Kirill A. Shutemov, Christopher Lameter, Michal Such?nek,
	Linus Torvalds, Vlastimil Babka
In-Reply-To: <20200806213211.6a6a56037fe771836e5abbe9@linux-foundation.org>

On 07.08.20 06:32, Andrew Morton wrote:
> On Fri, 3 Jul 2020 18:28:23 +0530 Srikar Dronamraju <srikar@linux.vnet.ibm.com> wrote:
> 
>>> The memory hotplug changes that somehow because you can hotremove numa
>>> nodes and therefore make the nodemask sparse but that is not a common
>>> case. I am not sure what would happen if a completely new node was added
>>> and its corresponding node was already used by the renumbered one
>>> though. It would likely conflate the two I am afraid. But I am not sure
>>> this is really possible with x86 and a lack of a bug report would
>>> suggest that nobody is doing that at least.
>>>
>>
>> JFYI,
>> Satheesh copied in this mailchain had opened a bug a year on crash with vcpu
>> hotplug on memoryless node. 
>>
>> https://bugzilla.kernel.org/show_bug.cgi?id=202187
> 
> So...  do we merge this patch or not?  Seems that the overall view is
> "risky but nobody is likely to do anything better any time soon"?

I recall the issue Michal saw was "fix powerpc" vs. "break other
architectures". @Michal how should we proceed? At least x86-64 won't be
affected IIUC.

-- 
Thanks,

David / dhildenb


^ permalink raw reply

* Re: [PATCH] powerpc/pseries/hotplug-cpu: increase wait time for vCPU death
From: Nathan Lynch @ 2020-08-07  7:05 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Greg Kurz, linuxppc-dev, Michael Roth, Thiago Jung Bauermann,
	Cedric Le Goater
In-Reply-To: <87zh79yen7.fsf@mpe.ellerman.id.au>

Hi everyone,

Michael Ellerman <mpe@ellerman.id.au> writes:
> Greg Kurz <groug@kaod.org> writes:
>> On Tue, 04 Aug 2020 23:35:10 +1000
>> Michael Ellerman <mpe@ellerman.id.au> wrote:
>>> Spinning forever seems like a bad idea, but as has been demonstrated at
>>> least twice now, continuing when we don't know the state of the other
>>> CPU can lead to straight up crashes.
>>> 
>>> So I think I'm persuaded that it's preferable to have the kernel stuck
>>> spinning rather than oopsing.
>>> 
>>
>> +1
>>
>>> I'm 50/50 on whether we should have a cond_resched() in the loop. My
>>> first instinct is no, if we're stuck here for 20s a stack trace would be
>>> good. But then we will probably hit that on some big and/or heavily
>>> loaded machine.
>>> 
>>> So possibly we should call cond_resched() but have some custom logic in
>>> the loop to print a warning if we are stuck for more than some
>>> sufficiently long amount of time.
>>
>> How long should that be ?
>
> Yeah good question.
>
> I guess step one would be seeing how long it can take on the 384 vcpu
> machine. And we can probably test on some other big machines.
>
> Hopefully Nathan can give us some idea of how long he's seen it take on
> large systems? I know he was concerned about the 20s timeout of the
> softlockup detector.

Maybe I'm not quite clear what this is referring to, but I don't think
stop-self/query-stopped-state latency increases with processor count, at
least not on PowerVM. And IIRC I was concerned with the earlier patch's
potential for causing the softlockup watchdog to rightly complain by
polling the stopped state without ever scheduling away.

The fact that smp_query_cpu_stopped() kind of collapses the two distinct
results from the query-cpu-stopped-state RTAS call into one return value
may make it harder than necessary to reason about the questions around
cond_resched() and whether to warn.

Sorry to pull this stunt but I have had some code sitting in a neglected
branch that I think gets the logic around this right.

What we should have is a simple C wrapper for the RTAS call that reflects the
architected inputs and outputs:

================================================================
(-- rtas.c --)

/**
 * rtas_query_cpu_stopped_state() - Call RTAS query-cpu-stopped-state.
 * @hwcpu: Identifies the processor thread to be queried.
 * @status: Pointer to status, valid only on success.
 *
 * Determine whether the given processor thread is in the stopped
 * state.  If successful and @status is non-NULL, the thread's status
 * is stored to @status.
 *
 * Return:
 * * 0   - Success
 * * -1  - Hardware error
 * * -2  - Busy, try again later
 */
int rtas_query_cpu_stopped_state(unsigned int hwcpu, unsigned int *status)
{
       unsigned int cpu_status;
       int token;
       int fwrc;

       token = rtas_token("query-cpu-stopped-state");

       fwrc = rtas_call(token, 1, 2, &cpu_status, hwcpu);
       if (fwrc != 0)
               goto out;

       if (status != NULL)
               *status = cpu_status;
out:
       return fwrc;
}
================================================================


And then a utility function that waits for the remote thread to enter
stopped state, with higher-level logic for rescheduling and warning. The
fact that smp_query_cpu_stopped() currently does not handle a -2/busy
status is a bug, fixed below by using rtas_busy_delay(). Note the
justification for the explicit cond_resched() in the outer loop:

================================================================
(-- rtas.h --)

/* query-cpu-stopped-state CPU_status */
#define RTAS_QCSS_STATUS_STOPPED     0
#define RTAS_QCSS_STATUS_IN_PROGRESS 1
#define RTAS_QCSS_STATUS_NOT_STOPPED 2

(-- pseries/hotplug-cpu.c --)

/**
 * wait_for_cpu_stopped() - Wait for a cpu to enter RTAS stopped state.
 */
static void wait_for_cpu_stopped(unsigned int cpu)
{
       unsigned int status;
       unsigned int hwcpu;

       hwcpu = get_hard_smp_processor_id(cpu);

       do {
               int fwrc;

               /*
                * rtas_busy_delay() will yield only if RTAS returns a
                * busy status. Since query-cpu-stopped-state can
                * yield RTAS_QCSS_STATUS_IN_PROGRESS or
                * RTAS_QCSS_STATUS_NOT_STOPPED for an unbounded
                * period before the target thread stops, we must take
                * care to explicitly reschedule while polling.
                */
               cond_resched();

               do {
                       fwrc = rtas_query_cpu_stopped_state(hwcpu, &status);
               } while (rtas_busy_delay(fwrc));

               if (fwrc == 0)
                       continue;

               pr_err_ratelimited("query-cpu-stopped-state for "
                                  "thread 0x%x returned %d\n",
                                  hwcpu, fwrc);
               goto out;

       } while (status == RTAS_QCSS_STATUS_NOT_STOPPED ||
                status == RTAS_QCSS_STATUS_IN_PROGRESS);

       if (status != RTAS_QCSS_STATUS_STOPPED) {
               pr_err_ratelimited("query-cpu-stopped-state yielded unknown "
                                  "status %d for thread 0x%x\n",
                                  status, hwcpu);
       }
out:
       return;
}

[...]

static void pseries_cpu_die(unsigned int cpu)
{
       wait_for_cpu_stopped(cpu);
       paca_ptrs[cpu]->cpu_start = 0;
}
================================================================

wait_for_cpu_stopped() should be able to accommodate a time-based
warning if necessary, but speaking as a likely recipient of any bug
reports that would arise here, I'm not convinced of the need and I
don't know what a good value would be. It's relatively easy to sample
the stack of a task that's apparently failing to make progress, plus I
probably would use 'perf probe' or similar to report the inputs and
outputs for the RTAS call.

I'm happy to make this a proper submission after I can clean it up and
retest it, or Michael R. is welcome to appropriate it, assuming it's
acceptable.


^ permalink raw reply

* Re: [PATCH] ASoC: fsl-asoc-card: Get "extal" clock rate by clk_get_rate
From: Shengjiu Wang @ 2020-08-07  7:17 UTC (permalink / raw)
  To: Mark Brown
  Cc: Linux-ALSA, Timur Tabi, Xiubo Li, Fabio Estevam, Shengjiu Wang,
	Takashi Iwai, Liam Girdwood, Nicolin Chen, linuxppc-dev,
	linux-kernel
In-Reply-To: <20200806123721.GC6442@sirena.org.uk>

On Thu, Aug 6, 2020 at 8:39 PM Mark Brown <broonie@kernel.org> wrote:
>
> On Thu, Aug 06, 2020 at 03:39:45PM +0800, Shengjiu Wang wrote:
>
> >       } else if (of_node_name_eq(cpu_np, "esai")) {
> > +             struct clk *esai_clk = clk_get(&cpu_pdev->dev, "extal");
> > +
> > +             if (!IS_ERR(esai_clk)) {
> > +                     priv->cpu_priv.sysclk_freq[TX] = clk_get_rate(esai_clk);
> > +                     priv->cpu_priv.sysclk_freq[RX] = clk_get_rate(esai_clk);
> > +                     clk_put(esai_clk);
> > +             }
>
> This should handle probe deferral.  Also if this clock is in use
> shouldn't we be enabling it?  It looks like it's intended to be a
> crystal so it's probably forced on all the time but sometimes there's
> power control for crystals, or perhaps someone might do something
> unusual with the hardware.

Ok, will add handler for probe deferral.

This clock is not a crystal, "extal" clock is for cpu dai, it is from
soc internal PLL. which is enabled by cpu dai, here is just to
get the clock rate.

best regards
wang shengjiu

^ permalink raw reply

* [PATCH v2 1/2] sched/topology: Allow archs to override cpu_smt_mask
From: Srikar Dronamraju @ 2020-08-07  7:45 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Gautham R Shenoy, Michael Neuling, Vincent Guittot,
	Srikar Dronamraju, Peter Zijlstra, Mel Gorman, LKML,
	Valentin Schneider, linuxppc-dev, Ingo Molnar, Dietmar Eggemann

cpu_smt_mask tracks topology_sibling_cpumask. This would be good for
most architectures. One of the users of cpu_smt_mask(), would be to
identify idle-cores. On Power9, a pair of SMT4 cores can be presented by
the firmware as a SMT8 core for backward compatibility reasons.

Powerpc allows LPARs to be live migrated from Power8 to Power9. Do note
Power8 had only SMT8 cores. Existing software which has been
developed/configured for Power8 would expect to see SMT8 core.
Maintaining the illusion of SMT8 core is a requirement to make that
work.

In order to maintain above userspace backward compatibility with
previous versions of processor, Power9 onwards there is option to the
firmware to advertise a pair of SMT4 cores as a fused cores aka SMT8
core. On Power9 this pair shares the L2 cache as well. However, from the
scheduler's point of view, a core should be determined by SMT4, since
its a completely independent unit of compute. Hence allow PowerPc
architecture to override the default cpu_smt_mask() to point to the SMT4
cores in a SMT8 mode.

This will ensure the scheduler is always given the right information.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: LKML <linux-kernel@vger.kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Acked-by; Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
Changelog v1->v2:
	Update the commit msg based on the discussion in community esp
	with Peter Zijlstra and Michael Ellerman.

 include/linux/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 608fa4aadf0e..ad03df1cc266 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -198,7 +198,7 @@ static inline int cpu_to_mem(int cpu)
 #define topology_die_cpumask(cpu)		cpumask_of(cpu)
 #endif
 
-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
 static inline const struct cpumask *cpu_smt_mask(int cpu)
 {
 	return topology_sibling_cpumask(cpu);
-- 
2.18.2


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox