LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v14 3/9] powerpc/time: Move timebase functions into new asm/vdso/timebase.h
From: Michael Ellerman @ 2020-11-26 13:10 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <20201126131006.2431205-1-mpe@ellerman.id.au>

From: Christophe Leroy <christophe.leroy@csgroup.eu>

In order to easily use get_tb() from C VDSO, move timebase
functions into a new header named asm/vdso/timebase.h

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
[mpe: Move into asm/vdso, drop dubious historical authorship comments,
      move mftb() etc. also]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/2d03f4b466156c0a0bfe5494c8874dcac952445c.1604426550.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/reg.h           | 31 -----------
 arch/powerpc/include/asm/time.h          | 30 +---------
 arch/powerpc/include/asm/timex.h         |  2 +-
 arch/powerpc/include/asm/vdso/timebase.h | 71 ++++++++++++++++++++++++
 4 files changed, 73 insertions(+), 61 deletions(-)
 create mode 100644 arch/powerpc/include/asm/vdso/timebase.h

v14: mpe: Move into asm/vdso, drop dubious historical authorship comments, move mftb() etc. also.

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index f877a576b338..602236e223c4 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1419,37 +1419,6 @@ static inline void msr_check_and_clear(unsigned long bits)
 		__msr_check_and_clear(bits);
 }
 
-#if defined(CONFIG_PPC_CELL) || defined(CONFIG_E500)
-#define mftb()		({unsigned long rval;				\
-			asm volatile(					\
-				"90:	mfspr %0, %2;\n"		\
-				ASM_FTR_IFSET(				\
-					"97:	cmpwi %0,0;\n"		\
-					"	beq- 90b;\n", "", %1)	\
-			: "=r" (rval) \
-			: "i" (CPU_FTR_CELL_TB_BUG), "i" (SPRN_TBRL) : "cr0"); \
-			rval;})
-#elif defined(CONFIG_PPC_8xx)
-#define mftb()		({unsigned long rval;	\
-			asm volatile("mftbl %0" : "=r" (rval)); rval;})
-#else
-#define mftb()		({unsigned long rval;	\
-			asm volatile("mfspr %0, %1" : \
-				     "=r" (rval) : "i" (SPRN_TBRL)); rval;})
-#endif /* !CONFIG_PPC_CELL */
-
-#if defined(CONFIG_PPC_8xx)
-#define mftbu()		({unsigned long rval;	\
-			asm volatile("mftbu %0" : "=r" (rval)); rval;})
-#else
-#define mftbu()		({unsigned long rval;	\
-			asm volatile("mfspr %0, %1" : "=r" (rval) : \
-				"i" (SPRN_TBRU)); rval;})
-#endif
-
-#define mttbl(v)	asm volatile("mttbl %0":: "r"(v))
-#define mttbu(v)	asm volatile("mttbu %0":: "r"(v))
-
 #ifdef CONFIG_PPC32
 #define mfsrin(v)	({unsigned int rval; \
 			asm volatile("mfsrin %0,%1" : "=r" (rval) : "r" (v)); \
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 2f566c1a754c..a59f8030f020 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -15,6 +15,7 @@
 
 #include <asm/processor.h>
 #include <asm/cpu_has_feature.h>
+#include <asm/vdso/timebase.h>
 
 /* time.c */
 extern unsigned long tb_ticks_per_jiffy;
@@ -38,12 +39,6 @@ struct div_result {
 	u64 result_low;
 };
 
-/* For compatibility, get_tbl() is defined as get_tb() on ppc64 */
-static inline unsigned long get_tbl(void)
-{
-	return mftb();
-}
-
 static inline u64 get_vtb(void)
 {
 #ifdef CONFIG_PPC_BOOK3S_64
@@ -53,29 +48,6 @@ static inline u64 get_vtb(void)
 	return 0;
 }
 
-static inline u64 get_tb(void)
-{
-	unsigned int tbhi, tblo, tbhi2;
-
-	if (IS_ENABLED(CONFIG_PPC64))
-		return mftb();
-
-	do {
-		tbhi = mftbu();
-		tblo = mftb();
-		tbhi2 = mftbu();
-	} while (tbhi != tbhi2);
-
-	return ((u64)tbhi << 32) | tblo;
-}
-
-static inline void set_tb(unsigned int upper, unsigned int lower)
-{
-	mtspr(SPRN_TBWL, 0);
-	mtspr(SPRN_TBWU, upper);
-	mtspr(SPRN_TBWL, lower);
-}
-
 /* Accessor functions for the decrementer register.
  * The 4xx doesn't even have a decrementer.  I tried to use the
  * generic timer interrupt code, which seems OK, with the 4xx PIT
diff --git a/arch/powerpc/include/asm/timex.h b/arch/powerpc/include/asm/timex.h
index 95988870a57b..fa2e76e4093a 100644
--- a/arch/powerpc/include/asm/timex.h
+++ b/arch/powerpc/include/asm/timex.h
@@ -9,7 +9,7 @@
  */
 
 #include <asm/cputable.h>
-#include <asm/reg.h>
+#include <asm/vdso/timebase.h>
 
 #define CLOCK_TICK_RATE	1024000 /* Underlying HZ */
 
diff --git a/arch/powerpc/include/asm/vdso/timebase.h b/arch/powerpc/include/asm/vdso/timebase.h
new file mode 100644
index 000000000000..ac6769b348c6
--- /dev/null
+++ b/arch/powerpc/include/asm/vdso/timebase.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Common timebase prototypes and such for all ppc machines.
+ */
+
+#ifndef _ASM_POWERPC_VDSO_TIMEBASE_H
+#define _ASM_POWERPC_VDSO_TIMEBASE_H
+
+#include <asm/reg.h>
+
+#if defined(CONFIG_PPC_CELL) || defined(CONFIG_E500)
+#define mftb()		({unsigned long rval;				\
+			asm volatile(					\
+				"90:	mfspr %0, %2;\n"		\
+				ASM_FTR_IFSET(				\
+					"97:	cmpwi %0,0;\n"		\
+					"	beq- 90b;\n", "", %1)	\
+			: "=r" (rval) \
+			: "i" (CPU_FTR_CELL_TB_BUG), "i" (SPRN_TBRL) : "cr0"); \
+			rval;})
+#elif defined(CONFIG_PPC_8xx)
+#define mftb()		({unsigned long rval;	\
+			asm volatile("mftbl %0" : "=r" (rval)); rval;})
+#else
+#define mftb()		({unsigned long rval;	\
+			asm volatile("mfspr %0, %1" : \
+				     "=r" (rval) : "i" (SPRN_TBRL)); rval;})
+#endif /* !CONFIG_PPC_CELL */
+
+#if defined(CONFIG_PPC_8xx)
+#define mftbu()		({unsigned long rval;	\
+			asm volatile("mftbu %0" : "=r" (rval)); rval;})
+#else
+#define mftbu()		({unsigned long rval;	\
+			asm volatile("mfspr %0, %1" : "=r" (rval) : \
+				"i" (SPRN_TBRU)); rval;})
+#endif
+
+#define mttbl(v)	asm volatile("mttbl %0":: "r"(v))
+#define mttbu(v)	asm volatile("mttbu %0":: "r"(v))
+
+/* For compatibility, get_tbl() is defined as get_tb() on ppc64 */
+static inline unsigned long get_tbl(void)
+{
+	return mftb();
+}
+
+static inline u64 get_tb(void)
+{
+	unsigned int tbhi, tblo, tbhi2;
+
+	if (IS_ENABLED(CONFIG_PPC64))
+		return mftb();
+
+	do {
+		tbhi = mftbu();
+		tblo = mftb();
+		tbhi2 = mftbu();
+	} while (tbhi != tbhi2);
+
+	return ((u64)tbhi << 32) | tblo;
+}
+
+static inline void set_tb(unsigned int upper, unsigned int lower)
+{
+	mtspr(SPRN_TBWL, 0);
+	mtspr(SPRN_TBWU, upper);
+	mtspr(SPRN_TBWL, lower);
+}
+
+#endif /* _ASM_POWERPC_VDSO_TIMEBASE_H */
-- 
2.25.1


^ permalink raw reply related

* [PATCH v14 2/9] powerpc/processor: Move cpu_relax() into asm/vdso/processor.h
From: Michael Ellerman @ 2020-11-26 13:09 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <20201126131006.2431205-1-mpe@ellerman.id.au>

From: Christophe Leroy <christophe.leroy@csgroup.eu>

cpu_relax() need to be in asm/vdso/processor.h to be used by
the C VDSO generic library.

Move it there.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
[mpe: Tweak include guards to match our usual formatting]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/3747db2c50eb98a86088871bef7ed7c593e336bb.1604426550.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/processor.h      | 13 ++-----------
 arch/powerpc/include/asm/vdso/processor.h | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+), 11 deletions(-)
 create mode 100644 arch/powerpc/include/asm/vdso/processor.h

v14: mpe: Tweak include guards to match our usual formatting

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index c61c859b51a8..333e3b6c76fb 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -6,6 +6,8 @@
  * Copyright (C) 2001 PPC 64 Team, IBM Corp
  */
 
+#include <vdso/processor.h>
+
 #include <asm/reg.h>
 
 #ifdef CONFIG_VSX
@@ -63,14 +65,6 @@ extern int _chrp_type;
 
 #endif /* defined(__KERNEL__) && defined(CONFIG_PPC32) */
 
-/* Macros for adjusting thread priority (hardware multi-threading) */
-#define HMT_very_low()   asm volatile("or 31,31,31   # very low priority")
-#define HMT_low()	 asm volatile("or 1,1,1	     # low priority")
-#define HMT_medium_low() asm volatile("or 6,6,6      # medium low priority")
-#define HMT_medium()	 asm volatile("or 2,2,2	     # medium priority")
-#define HMT_medium_high() asm volatile("or 5,5,5      # medium high priority")
-#define HMT_high()	 asm volatile("or 3,3,3	     # high priority")
-
 #ifdef __KERNEL__
 
 #ifdef CONFIG_PPC64
@@ -344,7 +338,6 @@ static inline unsigned long __pack_fe01(unsigned int fpmode)
 }
 
 #ifdef CONFIG_PPC64
-#define cpu_relax()	do { HMT_low(); HMT_medium(); barrier(); } while (0)
 
 #define spin_begin()	HMT_low()
 
@@ -363,8 +356,6 @@ do {								\
 	}							\
 } while (0)
 
-#else
-#define cpu_relax()	barrier()
 #endif
 
 /* Check that a certain kernel stack pointer is valid in task_struct p */
diff --git a/arch/powerpc/include/asm/vdso/processor.h b/arch/powerpc/include/asm/vdso/processor.h
new file mode 100644
index 000000000000..e072577bc7c0
--- /dev/null
+++ b/arch/powerpc/include/asm/vdso/processor.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_POWERPC_VDSO_PROCESSOR_H
+#define _ASM_POWERPC_VDSO_PROCESSOR_H
+
+#ifndef __ASSEMBLY__
+
+/* Macros for adjusting thread priority (hardware multi-threading) */
+#define HMT_very_low()		asm volatile("or 31, 31, 31	# very low priority")
+#define HMT_low()		asm volatile("or 1, 1, 1	# low priority")
+#define HMT_medium_low()	asm volatile("or 6, 6, 6	# medium low priority")
+#define HMT_medium()		asm volatile("or 2, 2, 2	# medium priority")
+#define HMT_medium_high()	asm volatile("or 5, 5, 5	# medium high priority")
+#define HMT_high()		asm volatile("or 3, 3, 3	# high priority")
+
+#ifdef CONFIG_PPC64
+#define cpu_relax()	do { HMT_low(); HMT_medium(); barrier(); } while (0)
+#else
+#define cpu_relax()	barrier()
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_POWERPC_VDSO_PROCESSOR_H */
-- 
2.25.1


^ permalink raw reply related

* [PATCH v14 1/9] powerpc/feature: Use CONFIG_PPC64 instead of __powerpc64__ to define possible features
From: Michael Ellerman @ 2020-11-26 13:09 UTC (permalink / raw)
  To: linuxppc-dev

From: Christophe Leroy <christophe.leroy@csgroup.eu>

In order to build VDSO32 for PPC64, we need to have CPU_FTRS_POSSIBLE
and CPU_FTRS_ALWAYS independant of whether we are building the
32 bits VDSO or the 64 bits VDSO.

Use #ifdef CONFIG_PPC64 instead of #ifdef __powerpc64__

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/08ffecd31403b3c692f25f97b07a378ba784873e.1604426550.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/cputable.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

v14: unchanged

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 5e31960a56a9..e069a2d9f7c1 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -488,7 +488,7 @@ static inline void cpu_feature_keys_init(void) { }
 	    CPU_FTR_PURR | CPU_FTR_REAL_LE | CPU_FTR_DABRX)
 #define CPU_FTRS_COMPATIBLE	(CPU_FTR_PPCAS_ARCH_V2)
 
-#ifdef __powerpc64__
+#ifdef CONFIG_PPC64
 #ifdef CONFIG_PPC_BOOK3E
 #define CPU_FTRS_POSSIBLE	(CPU_FTRS_E6500 | CPU_FTRS_E5500)
 #else
@@ -545,7 +545,7 @@ enum {
 };
 #endif /* __powerpc64__ */
 
-#ifdef __powerpc64__
+#ifdef CONFIG_PPC64
 #ifdef CONFIG_PPC_BOOK3E
 #define CPU_FTRS_ALWAYS		(CPU_FTRS_E6500 & CPU_FTRS_E5500)
 #else
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH v2 3/6] perf/core: Fix arch_perf_get_page_size()
From: Peter Zijlstra @ 2020-11-26 13:06 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
	alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
	davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
	kirill.shutemov, kan.liang
In-Reply-To: <20201126125606.GR4327@casper.infradead.org>

On Thu, Nov 26, 2020 at 12:56:06PM +0000, Matthew Wilcox wrote:
> On Thu, Nov 26, 2020 at 01:42:07PM +0100, Peter Zijlstra wrote:
> > +	pgdp = pgd_offset(mm, addr);
> > +	pgd = READ_ONCE(*pgdp);
> 
> I forget how x86-32-PAE maps to Linux's PGD/P4D/PUD/PMD scheme, but
> according to volume 3, section 4.4.2, PAE paging uses a 64-bit PDE, so
> whether a PDE is a PGD or a PMD, we're only reading it with READ_ONCE
> rather than the lockless-retry method used by ptep_get_lockless().
> So it's potentially racy?  Do we need a pmdp_get_lockless() or
> pgdp_get_lockless()?

Oh gawd... this isn't new here though, right? Current gup_fast also gets
that wrong, if it is in deed wrong.

I suppose it's a race far more likely today, with THP and all, than it
ever was back then.

^ permalink raw reply

* Re: [PATCH v2 1/6] mm/gup: Provide gup_get_pte() more generic
From: Peter Zijlstra @ 2020-11-26 13:02 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
	alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
	davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
	kirill.shutemov, kan.liang
In-Reply-To: <20201126124300.GP4327@casper.infradead.org>

On Thu, Nov 26, 2020 at 12:43:00PM +0000, Matthew Wilcox wrote:
> On Thu, Nov 26, 2020 at 01:01:15PM +0100, Peter Zijlstra wrote:
> > +#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
> > +/*
> > + * WARNING: only to be used in the get_user_pages_fast() implementation.
> > + * With get_user_pages_fast(), we walk down the pagetables without taking any
> > + * locks.  For this we would like to load the pointers atomically, but sometimes
> > + * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE).  What
> > + * we do have is the guarantee that a PTE will only either go from not present
> > + * to present, or present to not present or both -- it will not switch to a
> > + * completely different present page without a TLB flush in between; something
> > + * that we are blocking by holding interrupts off.
> 
> I feel like this comment needs some love.  How about:
> 
>  * For walking the pagetables without holding any locks.  Some architectures
>  * (eg x86-32 PAE) cannot load the entries atomically without using
>  * expensive instructions.  We are guaranteed that a PTE will only either go
>  * from not present to present, or present to not present -- it will not
>  * switch to a completely different present page without a TLB flush
>  * inbetween; which we are blocking by holding interrupts off.
> 
> And it would be nice to have an assertion that interrupts are disabled
> in the code.  Because comments are nice, but nobody reads them.

Quite agreed, I'll stick a separate patch on with the updated comment
and a lockdep_assert_irqs_disabled() in. I'm afraid that latter will make
for header soup though :/

We'll see, let the robots have it.

^ permalink raw reply

* Re: [PATCH v2 4/6] arm64/mm: Implement pXX_leaf_size() support
From: Peter Zijlstra @ 2020-11-26 12:57 UTC (permalink / raw)
  To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	eranian
  Cc: linux-arch, ak, catalin.marinas, linuxppc-dev, willy,
	linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
	will, davem, kirill.shutemov
In-Reply-To: <20201126121121.226210959@infradead.org>


Now with pmd_cont() defined...

---
Subject: arm64/mm: Implement pXX_leaf_size() support
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri Nov 13 11:46:06 CET 2020

ARM64 has non-pagetable aligned large page support with PTE_CONT, when
this bit is set the page is part of a super-page. Match the hugetlb
code and support these super pages for PTE and PMD levels.

This enables PERF_SAMPLE_{DATA,CODE}_PAGE_SIZE to report accurate
pagetable leaf sizes.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/arm64/include/asm/pgtable.h |    4 ++++
 1 file changed, 4 insertions(+)

--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -407,6 +407,7 @@ static inline int pmd_trans_huge(pmd_t p
 #define pmd_dirty(pmd)		pte_dirty(pmd_pte(pmd))
 #define pmd_young(pmd)		pte_young(pmd_pte(pmd))
 #define pmd_valid(pmd)		pte_valid(pmd_pte(pmd))
+#define pmd_cont(pmd)		pte_cont(pmd_pte(pmd))
 #define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
 #define pmd_mkold(pmd)		pte_pmd(pte_mkold(pmd_pte(pmd)))
 #define pmd_mkwrite(pmd)	pte_pmd(pte_mkwrite(pmd_pte(pmd)))
@@ -503,6 +504,9 @@ extern pgprot_t phys_mem_access_prot(str
 				 PMD_TYPE_SECT)
 #define pmd_leaf(pmd)		pmd_sect(pmd)
 
+#define pmd_leaf_size(pmd)	(pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
+#define pte_leaf_size(pte)	(pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE)
+
 #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
 static inline bool pud_sect(pud_t pud) { return false; }
 static inline bool pud_table(pud_t pud) { return true; }

^ permalink raw reply

* Re: [PATCH v2 3/6] perf/core: Fix arch_perf_get_page_size()
From: Matthew Wilcox @ 2020-11-26 12:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
	alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
	davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
	kirill.shutemov, kan.liang
In-Reply-To: <20201126124207.GM3040@hirez.programming.kicks-ass.net>

On Thu, Nov 26, 2020 at 01:42:07PM +0100, Peter Zijlstra wrote:
> +	pgdp = pgd_offset(mm, addr);
> +	pgd = READ_ONCE(*pgdp);

I forget how x86-32-PAE maps to Linux's PGD/P4D/PUD/PMD scheme, but
according to volume 3, section 4.4.2, PAE paging uses a 64-bit PDE, so
whether a PDE is a PGD or a PMD, we're only reading it with READ_ONCE
rather than the lockless-retry method used by ptep_get_lockless().
So it's potentially racy?  Do we need a pmdp_get_lockless() or
pgdp_get_lockless()?

[...]
> +	pmdp = pmd_offset_lockless(pudp, pud, addr);
> +	pmd = READ_ONCE(*pmdp);
> +	if (!pmd_present(pmd))
>  		return 0;
>  
> +	if (pmd_leaf(pmd))
> +		return pmd_leaf_size(pmd);
>  

^ permalink raw reply

* Re: [PATCH v2 2/6] mm: Introduce pXX_leaf_size()
From: Matthew Wilcox @ 2020-11-26 12:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
	alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
	davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
	kirill.shutemov, kan.liang
In-Reply-To: <20201126121121.102580109@infradead.org>

On Thu, Nov 26, 2020 at 01:01:16PM +0100, Peter Zijlstra wrote:
> A number of architectures have non-pagetable aligned huge/large pages.
> For such architectures a leaf can actually be part of a larger entry.
> 
> Provide generic helpers to determine the size of a page-table leaf.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>

^ permalink raw reply

* Re: [PATCH v2 1/6] mm/gup: Provide gup_get_pte() more generic
From: Matthew Wilcox @ 2020-11-26 12:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
	alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
	davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
	kirill.shutemov, kan.liang
In-Reply-To: <20201126121121.036370527@infradead.org>

On Thu, Nov 26, 2020 at 01:01:15PM +0100, Peter Zijlstra wrote:
> +#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
> +/*
> + * WARNING: only to be used in the get_user_pages_fast() implementation.
> + * With get_user_pages_fast(), we walk down the pagetables without taking any
> + * locks.  For this we would like to load the pointers atomically, but sometimes
> + * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE).  What
> + * we do have is the guarantee that a PTE will only either go from not present
> + * to present, or present to not present or both -- it will not switch to a
> + * completely different present page without a TLB flush in between; something
> + * that we are blocking by holding interrupts off.

I feel like this comment needs some love.  How about:

 * For walking the pagetables without holding any locks.  Some architectures
 * (eg x86-32 PAE) cannot load the entries atomically without using
 * expensive instructions.  We are guaranteed that a PTE will only either go
 * from not present to present, or present to not present -- it will not
 * switch to a completely different present page without a TLB flush
 * inbetween; which we are blocking by holding interrupts off.

And it would be nice to have an assertion that interrupts are disabled
in the code.  Because comments are nice, but nobody reads them.

> +static inline pte_t ptep_get_lockless(pte_t *ptep)
> +{
> +	pte_t pte;
> +
> +	do {
> +		pte.pte_low = ptep->pte_low;
> +		smp_rmb();
> +		pte.pte_high = ptep->pte_high;
> +		smp_rmb();
> +	} while (unlikely(pte.pte_low != ptep->pte_low));
> +
> +	return pte;
> +}

^ permalink raw reply

* Re: [PATCH v2 3/6] perf/core: Fix arch_perf_get_page_size()
From: Peter Zijlstra @ 2020-11-26 12:42 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
	alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
	davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
	kirill.shutemov, kan.liang
In-Reply-To: <20201126123458.GO4327@casper.infradead.org>

On Thu, Nov 26, 2020 at 12:34:58PM +0000, Matthew Wilcox wrote:
> On Thu, Nov 26, 2020 at 01:01:17PM +0100, Peter Zijlstra wrote:
> > The (new) page-table walker in arch_perf_get_page_size() is broken in
> > various ways. Specifically while it is used in a lockless manner, it
> > doesn't depend on CONFIG_HAVE_FAST_GUP nor uses the proper _lockless
> > offset methods, nor is careful to only read each entry only once.
> > 
> > Also the hugetlb support is broken due to calling pte_page() without
> > first checking pte_special().
> > 
> > Rewrite the whole thing to be a proper lockless page-table walker and
> > employ the new pXX_leaf_size() pgtable functions to determine the
> > pagetable size without looking at the page-frames.
> > 
> > Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs")
> > Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE")
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > Tested-by: Kan Liang <kan.liang@linux.intel.com>
> > ---
> >  arch/arm64/include/asm/pgtable.h    |    3 +
> >  arch/sparc/include/asm/pgtable_64.h |   13 ++++
> >  arch/sparc/mm/hugetlbpage.c         |   19 ++++--
> >  include/linux/pgtable.h             |   16 +++++
> >  kernel/events/core.c                |  102 +++++++++++++-----------------------
> >  5 files changed, 82 insertions(+), 71 deletions(-)
> 
> This diffstat doesn't match the patch in this email ...

Urgh, no idea how I did that... I must've edited the diff and not done a
quilt-refresh. Updated below.

---
Subject: perf/core: Fix arch_perf_get_page_size()
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 Nov 2020 13:43:57 +0100

The (new) page-table walker in arch_perf_get_page_size() is broken in
various ways. Specifically while it is used in a lockless manner, it
doesn't depend on CONFIG_HAVE_FAST_GUP nor uses the proper _lockless
offset methods, nor is careful to only read each entry only once.

Also the hugetlb support is broken due to calling pte_page() without
first checking pte_special().

Rewrite the whole thing to be a proper lockless page-table walker and
employ the new pXX_leaf_size() pgtable functions to determine the
pagetable size without looking at the page-frames.

Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs")
Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Kan Liang <kan.liang@linux.intel.com>
---
 kernel/events/core.c |  105 ++++++++++++++++++---------------------------------
 1 file changed, 39 insertions(+), 66 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -52,6 +52,7 @@
 #include <linux/mount.h>
 #include <linux/min_heap.h>
 #include <linux/highmem.h>
+#include <linux/pgtable.h>
 
 #include "internal.h"
 
@@ -7001,90 +7002,62 @@ static u64 perf_virt_to_phys(u64 virt)
 	return phys_addr;
 }
 
-#ifdef CONFIG_MMU
-
 /*
- * Return the MMU page size of a given virtual address.
- *
- * This generic implementation handles page-table aligned huge pages, as well
- * as non-page-table aligned hugetlbfs compound pages.
- *
- * If an architecture supports and uses non-page-table aligned pages in their
- * kernel mapping it will need to provide it's own implementation of this
- * function.
+ * Return the pagetable size of a given virtual address.
  */
-__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
 {
-	struct page *page;
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pgd = pgd_offset(mm, addr);
-	if (pgd_none(*pgd))
-		return 0;
+	u64 size = 0;
 
-	p4d = p4d_offset(pgd, addr);
-	if (!p4d_present(*p4d))
+#ifdef CONFIG_HAVE_FAST_GUP
+	pgd_t *pgdp, pgd;
+	p4d_t *p4dp, p4d;
+	pud_t *pudp, pud;
+	pmd_t *pmdp, pmd;
+	pte_t *ptep, pte;
+
+	pgdp = pgd_offset(mm, addr);
+	pgd = READ_ONCE(*pgdp);
+	if (pgd_none(pgd))
 		return 0;
 
-	if (p4d_leaf(*p4d))
-		return 1ULL << P4D_SHIFT;
+	if (pgd_leaf(pgd))
+		return pgd_leaf_size(pgd);
 
-	pud = pud_offset(p4d, addr);
-	if (!pud_present(*pud))
+	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+	p4d = READ_ONCE(*p4dp);
+	if (!p4d_present(p4d))
 		return 0;
 
-	if (pud_leaf(*pud)) {
-#ifdef pud_page
-		page = pud_page(*pud);
-		if (PageHuge(page))
-			return page_size(compound_head(page));
-#endif
-		return 1ULL << PUD_SHIFT;
-	}
+	if (p4d_leaf(p4d))
+		return p4d_leaf_size(p4d);
 
-	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd))
+	pudp = pud_offset_lockless(p4dp, p4d, addr);
+	pud = READ_ONCE(*pudp);
+	if (!pud_present(pud))
 		return 0;
 
-	if (pmd_leaf(*pmd)) {
-#ifdef pmd_page
-		page = pmd_page(*pmd);
-		if (PageHuge(page))
-			return page_size(compound_head(page));
-#endif
-		return 1ULL << PMD_SHIFT;
-	}
+	if (pud_leaf(pud))
+		return pud_leaf_size(pud);
 
-	pte = pte_offset_map(pmd, addr);
-	if (!pte_present(*pte)) {
-		pte_unmap(pte);
+	pmdp = pmd_offset_lockless(pudp, pud, addr);
+	pmd = READ_ONCE(*pmdp);
+	if (!pmd_present(pmd))
 		return 0;
-	}
 
-	page = pte_page(*pte);
-	if (PageHuge(page)) {
-		u64 size = page_size(compound_head(page));
-		pte_unmap(pte);
-		return size;
-	}
+	if (pmd_leaf(pmd))
+		return pmd_leaf_size(pmd);
 
-	pte_unmap(pte);
-	return PAGE_SIZE;
-}
-
-#else
+	ptep = pte_offset_map(&pmd, addr);
+	pte = ptep_get_lockless(ptep);
+	if (pte_present(pte))
+		size = pte_leaf_size(pte);
+	pte_unmap(ptep);
+#endif /* CONFIG_HAVE_FAST_GUP */
 
-static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
-{
-	return 0;
+	return size;
 }
 
-#endif
-
 static u64 perf_get_page_size(unsigned long addr)
 {
 	struct mm_struct *mm;
@@ -7109,7 +7082,7 @@ static u64 perf_get_page_size(unsigned l
 		mm = &init_mm;
 	}
 
-	size = arch_perf_get_page_size(mm, addr);
+	size = perf_get_pgtable_size(mm, addr);
 
 	local_irq_restore(flags);
 

^ permalink raw reply

* Re: [PATCH v2 3/6] perf/core: Fix arch_perf_get_page_size()
From: Matthew Wilcox @ 2020-11-26 12:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
	alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
	davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
	kirill.shutemov, kan.liang
In-Reply-To: <20201126121121.164675154@infradead.org>

On Thu, Nov 26, 2020 at 01:01:17PM +0100, Peter Zijlstra wrote:
> The (new) page-table walker in arch_perf_get_page_size() is broken in
> various ways. Specifically while it is used in a lockless manner, it
> doesn't depend on CONFIG_HAVE_FAST_GUP nor uses the proper _lockless
> offset methods, nor is careful to only read each entry only once.
> 
> Also the hugetlb support is broken due to calling pte_page() without
> first checking pte_special().
> 
> Rewrite the whole thing to be a proper lockless page-table walker and
> employ the new pXX_leaf_size() pgtable functions to determine the
> pagetable size without looking at the page-frames.
> 
> Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs")
> Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE")
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: Kan Liang <kan.liang@linux.intel.com>
> ---
>  arch/arm64/include/asm/pgtable.h    |    3 +
>  arch/sparc/include/asm/pgtable_64.h |   13 ++++
>  arch/sparc/mm/hugetlbpage.c         |   19 ++++--
>  include/linux/pgtable.h             |   16 +++++
>  kernel/events/core.c                |  102 +++++++++++++-----------------------
>  5 files changed, 82 insertions(+), 71 deletions(-)

This diffstat doesn't match the patch in this email ...

> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -52,6 +52,7 @@
>  #include <linux/mount.h>
>  #include <linux/min_heap.h>
>  #include <linux/highmem.h>
> +#include <linux/pgtable.h>
>  
>  #include "internal.h"
>  
> @@ -7001,90 +7001,62 @@ static u64 perf_virt_to_phys(u64 virt)
>  	return phys_addr;
>  }
>  
> -#ifdef CONFIG_MMU
> -
>  /*
> - * Return the MMU page size of a given virtual address.
> - *
> - * This generic implementation handles page-table aligned huge pages, as well
> - * as non-page-table aligned hugetlbfs compound pages.
> - *
> - * If an architecture supports and uses non-page-table aligned pages in their
> - * kernel mapping it will need to provide it's own implementation of this
> - * function.
> + * Return the pagetable size of a given virtual address.
>   */
> -__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> +static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
>  {
> -	struct page *page;
> -	pgd_t *pgd;
> -	p4d_t *p4d;
> -	pud_t *pud;
> -	pmd_t *pmd;
> -	pte_t *pte;
> +	u64 size = 0;
>  
> -	pgd = pgd_offset(mm, addr);
> -	if (pgd_none(*pgd))
> -		return 0;
> +#ifdef CONFIG_HAVE_FAST_GUP
> +	pgd_t *pgdp, pgd;
> +	p4d_t *p4dp, p4d;
> +	pud_t *pudp, pud;
> +	pmd_t *pmdp, pmd;
> +	pte_t *ptep, pte;
>  
> -	p4d = p4d_offset(pgd, addr);
> -	if (!p4d_present(*p4d))
> +	pgdp = pgd_offset(mm, addr);
> +	pgd = READ_ONCE(*pgdp);
> +	if (pgd_none(pgd))
>  		return 0;
>  
> -	if (p4d_leaf(*p4d))
> -		return 1ULL << P4D_SHIFT;
> +	if (pgd_leaf(pgd))
> +		return pgd_leaf_size(pgd);
>  
> -	pud = pud_offset(p4d, addr);
> -	if (!pud_present(*pud))
> +	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
> +	p4d = READ_ONCE(*p4dp);
> +	if (!p4d_present(p4d))
>  		return 0;
>  
> -	if (pud_leaf(*pud)) {
> -#ifdef pud_page
> -		page = pud_page(*pud);
> -		if (PageHuge(page))
> -			return page_size(compound_head(page));
> -#endif
> -		return 1ULL << PUD_SHIFT;
> -	}
> +	if (p4d_leaf(p4d))
> +		return p4d_leaf_size(p4d);
>  
> -	pmd = pmd_offset(pud, addr);
> -	if (!pmd_present(*pmd))
> +	pudp = pud_offset_lockless(p4dp, p4d, addr);
> +	pud = READ_ONCE(*pudp);
> +	if (!pud_present(pud))
>  		return 0;
>  
> -	if (pmd_leaf(*pmd)) {
> -#ifdef pmd_page
> -		page = pmd_page(*pmd);
> -		if (PageHuge(page))
> -			return page_size(compound_head(page));
> -#endif
> -		return 1ULL << PMD_SHIFT;
> -	}
> +	if (pud_leaf(pud))
> +		return pud_leaf_size(pud);
>  
> -	pte = pte_offset_map(pmd, addr);
> -	if (!pte_present(*pte)) {
> -		pte_unmap(pte);
> +	pmdp = pmd_offset_lockless(pudp, pud, addr);
> +	pmd = READ_ONCE(*pmdp);
> +	if (!pmd_present(pmd))
>  		return 0;
> -	}
>  
> -	page = pte_page(*pte);
> -	if (PageHuge(page)) {
> -		u64 size = page_size(compound_head(page));
> -		pte_unmap(pte);
> -		return size;
> -	}
> -
> -	pte_unmap(pte);
> -	return PAGE_SIZE;
> -}
> +	if (pmd_leaf(pmd))
> +		return pmd_leaf_size(pmd);
>  
> -#else
> +	ptep = pte_offset_map(&pmd, addr);
> +	pte = ptep_get_lockless(ptep);
> +	if (pte_present(pte))
> +		size = pte_leaf_size(pte);
> +	pte_unmap(ptep);
> +#endif /* CONFIG_HAVE_FAST_GUP */
>  
> -static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> -{
> -	return 0;
> +	return size;
>  }
>  
> -#endif
> -
>  static u64 perf_get_page_size(unsigned long addr)
>  {
>  	struct mm_struct *mm;
> @@ -7109,7 +7081,7 @@ static u64 perf_get_page_size(unsigned l
>  		mm = &init_mm;
>  	}
>  
> -	size = arch_perf_get_page_size(mm, addr);
> +	size = perf_get_pgtable_size(mm, addr);
>  
>  	local_irq_restore(flags);
>  
> 
> 

^ permalink raw reply

* [PATCH v2 1/6] mm/gup: Provide gup_get_pte() more generic
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
  To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	eranian
  Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
	linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
	will, davem, kirill.shutemov
In-Reply-To: <20201126120114.071913521@infradead.org>

In order to write another lockless page-table walker, we need
gup_get_pte() exposed. While doing that, rename it to
ptep_get_lockless() to match the existing ptep_get() naming.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/pgtable.h |   55 +++++++++++++++++++++++++++++++++++++++++++++
 mm/gup.c                |   58 ------------------------------------------------
 2 files changed, 56 insertions(+), 57 deletions(-)

--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -258,6 +258,61 @@ static inline pte_t ptep_get(pte_t *ptep
 }
 #endif
 
+#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
+/*
+ * WARNING: only to be used in the get_user_pages_fast() implementation.
+ *
+ * With get_user_pages_fast(), we walk down the pagetables without taking any
+ * locks.  For this we would like to load the pointers atomically, but sometimes
+ * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE).  What
+ * we do have is the guarantee that a PTE will only either go from not present
+ * to present, or present to not present or both -- it will not switch to a
+ * completely different present page without a TLB flush in between; something
+ * that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ *
+ *   ptep->pte_high = h;
+ *   smp_wmb();
+ *   ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ *
+ *   ptep->pte_low = 0;
+ *   smp_wmb();
+ *   ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
+ * We load pte_high *after* loading pte_low, which ensures we don't see an older
+ * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
+ * picked up a changed pte high. We might have gotten rubbish values from
+ * pte_low and pte_high, but we are guaranteed that pte_low will not have the
+ * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
+ * operates on present ptes we're safe.
+ */
+static inline pte_t ptep_get_lockless(pte_t *ptep)
+{
+	pte_t pte;
+
+	do {
+		pte.pte_low = ptep->pte_low;
+		smp_rmb();
+		pte.pte_high = ptep->pte_high;
+		smp_rmb();
+	} while (unlikely(pte.pte_low != ptep->pte_low));
+
+	return pte;
+}
+#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
+/*
+ * We require that the PTE can be read atomically.
+ */
+static inline pte_t ptep_get_lockless(pte_t *ptep)
+{
+	return ptep_get(ptep);
+}
+#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2079,62 +2079,6 @@ static void put_compound_head(struct pag
 	put_page(page);
 }
 
-#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
-
-/*
- * WARNING: only to be used in the get_user_pages_fast() implementation.
- *
- * With get_user_pages_fast(), we walk down the pagetables without taking any
- * locks.  For this we would like to load the pointers atomically, but sometimes
- * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE).  What
- * we do have is the guarantee that a PTE will only either go from not present
- * to present, or present to not present or both -- it will not switch to a
- * completely different present page without a TLB flush in between; something
- * that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- *
- *   ptep->pte_high = h;
- *   smp_wmb();
- *   ptep->pte_low = l;
- *
- * And present to not present goes:
- *
- *   ptep->pte_low = 0;
- *   smp_wmb();
- *   ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
- * We load pte_high *after* loading pte_low, which ensures we don't see an older
- * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
- * picked up a changed pte high. We might have gotten rubbish values from
- * pte_low and pte_high, but we are guaranteed that pte_low will not have the
- * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
- * operates on present ptes we're safe.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-	pte_t pte;
-
-	do {
-		pte.pte_low = ptep->pte_low;
-		smp_rmb();
-		pte.pte_high = ptep->pte_high;
-		smp_rmb();
-	} while (unlikely(pte.pte_low != ptep->pte_low));
-
-	return pte;
-}
-#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
-/*
- * We require that the PTE can be read atomically.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-	return ptep_get(ptep);
-}
-#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
-
 static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
 					    unsigned int flags,
 					    struct page **pages)
@@ -2160,7 +2104,7 @@ static int gup_pte_range(pmd_t pmd, unsi
 
 	ptem = ptep = pte_offset_map(&pmd, addr);
 	do {
-		pte_t pte = gup_get_pte(ptep);
+		pte_t pte = ptep_get_lockless(ptep);
 		struct page *head, *page;
 
 		/*



^ permalink raw reply

* [PATCH v2 0/6] perf/mm: Fix PERF_SAMPLE_*_PAGE_SIZE
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
  To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	eranian
  Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
	linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
	will, davem, kirill.shutemov

Hi,

These patches provide generic infrastructure to determine TLB page size from
page table entries alone. Perf will use this (for either data or code address)
to aid in profiling TLB issues.

While most architectures only have page table aligned large pages, some
(notably ARM64, Sparc64 and Power) provide non page table aligned large pages
and need to provide their own implementation of these functions.

I've provided (completely untested) implementations for ARM64, Sparc64 and
Power/8xxx (it looks like I'm still missing Power/Book3s64/hash support).

Changes since -v1:

 - Changed wording to reflect these are page-table sizes; actual TLB sizes
   might vary.
 - added Power/8xx

Barring any objections I'll queue these in tip/perf/core, as these patches fix
the code that's currently in there.



^ permalink raw reply

* [PATCH v2 4/6] arm64/mm: Implement pXX_leaf_size() support
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
  To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	eranian
  Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
	linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
	will, davem, kirill.shutemov
In-Reply-To: <20201126120114.071913521@infradead.org>

ARM64 has non-pagetable aligned large page support with PTE_CONT, when
this bit is set the page is part of a super-page. Match the hugetlb
code and support these super pages for PTE and PMD levels.

This enables PERF_SAMPLE_{DATA,CODE}_PAGE_SIZE to report accurate
pagetable leaf sizes.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/arm64/include/asm/pgtable.h |    3 +++
 1 file changed, 3 insertions(+)

--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -503,6 +503,9 @@ extern pgprot_t phys_mem_access_prot(str
 				 PMD_TYPE_SECT)
 #define pmd_leaf(pmd)		pmd_sect(pmd)
 
+#define pmd_leaf_size(pmd)	(pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
+#define pte_leaf_size(pte)	(pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE)
+
 #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
 static inline bool pud_sect(pud_t pud) { return false; }
 static inline bool pud_table(pud_t pud) { return true; }



^ permalink raw reply

* [PATCH v2 3/6] perf/core: Fix arch_perf_get_page_size()
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
  To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	eranian
  Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
	linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
	will, davem, kirill.shutemov
In-Reply-To: <20201126120114.071913521@infradead.org>

The (new) page-table walker in arch_perf_get_page_size() is broken in
various ways. Specifically while it is used in a lockless manner, it
doesn't depend on CONFIG_HAVE_FAST_GUP nor uses the proper _lockless
offset methods, nor is careful to only read each entry only once.

Also the hugetlb support is broken due to calling pte_page() without
first checking pte_special().

Rewrite the whole thing to be a proper lockless page-table walker and
employ the new pXX_leaf_size() pgtable functions to determine the
pagetable size without looking at the page-frames.

Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs")
Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Kan Liang <kan.liang@linux.intel.com>
---
 arch/arm64/include/asm/pgtable.h    |    3 +
 arch/sparc/include/asm/pgtable_64.h |   13 ++++
 arch/sparc/mm/hugetlbpage.c         |   19 ++++--
 include/linux/pgtable.h             |   16 +++++
 kernel/events/core.c                |  102 +++++++++++++-----------------------
 5 files changed, 82 insertions(+), 71 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -52,6 +52,7 @@
 #include <linux/mount.h>
 #include <linux/min_heap.h>
 #include <linux/highmem.h>
+#include <linux/pgtable.h>
 
 #include "internal.h"
 
@@ -7001,90 +7001,62 @@ static u64 perf_virt_to_phys(u64 virt)
 	return phys_addr;
 }
 
-#ifdef CONFIG_MMU
-
 /*
- * Return the MMU page size of a given virtual address.
- *
- * This generic implementation handles page-table aligned huge pages, as well
- * as non-page-table aligned hugetlbfs compound pages.
- *
- * If an architecture supports and uses non-page-table aligned pages in their
- * kernel mapping it will need to provide it's own implementation of this
- * function.
+ * Return the pagetable size of a given virtual address.
  */
-__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
 {
-	struct page *page;
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
+	u64 size = 0;
 
-	pgd = pgd_offset(mm, addr);
-	if (pgd_none(*pgd))
-		return 0;
+#ifdef CONFIG_HAVE_FAST_GUP
+	pgd_t *pgdp, pgd;
+	p4d_t *p4dp, p4d;
+	pud_t *pudp, pud;
+	pmd_t *pmdp, pmd;
+	pte_t *ptep, pte;
 
-	p4d = p4d_offset(pgd, addr);
-	if (!p4d_present(*p4d))
+	pgdp = pgd_offset(mm, addr);
+	pgd = READ_ONCE(*pgdp);
+	if (pgd_none(pgd))
 		return 0;
 
-	if (p4d_leaf(*p4d))
-		return 1ULL << P4D_SHIFT;
+	if (pgd_leaf(pgd))
+		return pgd_leaf_size(pgd);
 
-	pud = pud_offset(p4d, addr);
-	if (!pud_present(*pud))
+	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+	p4d = READ_ONCE(*p4dp);
+	if (!p4d_present(p4d))
 		return 0;
 
-	if (pud_leaf(*pud)) {
-#ifdef pud_page
-		page = pud_page(*pud);
-		if (PageHuge(page))
-			return page_size(compound_head(page));
-#endif
-		return 1ULL << PUD_SHIFT;
-	}
+	if (p4d_leaf(p4d))
+		return p4d_leaf_size(p4d);
 
-	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd))
+	pudp = pud_offset_lockless(p4dp, p4d, addr);
+	pud = READ_ONCE(*pudp);
+	if (!pud_present(pud))
 		return 0;
 
-	if (pmd_leaf(*pmd)) {
-#ifdef pmd_page
-		page = pmd_page(*pmd);
-		if (PageHuge(page))
-			return page_size(compound_head(page));
-#endif
-		return 1ULL << PMD_SHIFT;
-	}
+	if (pud_leaf(pud))
+		return pud_leaf_size(pud);
 
-	pte = pte_offset_map(pmd, addr);
-	if (!pte_present(*pte)) {
-		pte_unmap(pte);
+	pmdp = pmd_offset_lockless(pudp, pud, addr);
+	pmd = READ_ONCE(*pmdp);
+	if (!pmd_present(pmd))
 		return 0;
-	}
 
-	page = pte_page(*pte);
-	if (PageHuge(page)) {
-		u64 size = page_size(compound_head(page));
-		pte_unmap(pte);
-		return size;
-	}
-
-	pte_unmap(pte);
-	return PAGE_SIZE;
-}
+	if (pmd_leaf(pmd))
+		return pmd_leaf_size(pmd);
 
-#else
+	ptep = pte_offset_map(&pmd, addr);
+	pte = ptep_get_lockless(ptep);
+	if (pte_present(pte))
+		size = pte_leaf_size(pte);
+	pte_unmap(ptep);
+#endif /* CONFIG_HAVE_FAST_GUP */
 
-static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
-{
-	return 0;
+	return size;
 }
 
-#endif
-
 static u64 perf_get_page_size(unsigned long addr)
 {
 	struct mm_struct *mm;
@@ -7109,7 +7081,7 @@ static u64 perf_get_page_size(unsigned l
 		mm = &init_mm;
 	}
 
-	size = arch_perf_get_page_size(mm, addr);
+	size = perf_get_pgtable_size(mm, addr);
 
 	local_irq_restore(flags);
 



^ permalink raw reply

* [PATCH v2 5/6] sparc64/mm: Implement pXX_leaf_size() support
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
  To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	eranian
  Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
	linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
	will, davem, kirill.shutemov
In-Reply-To: <20201126120114.071913521@infradead.org>

Sparc64 has non-pagetable aligned large page support; wire up the
pXX_leaf_size() functions to report the correct pagetable page size.

This enables PERF_SAMPLE_{DATA,CODE}_PAGE_SIZE to report accurate
pagetable leaf sizes.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/sparc/include/asm/pgtable_64.h |   13 +++++++++++++
 arch/sparc/mm/hugetlbpage.c         |   19 +++++++++++++------
 2 files changed, 26 insertions(+), 6 deletions(-)

--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -1121,6 +1121,19 @@ extern unsigned long cmdline_memory_size
 
 asmlinkage void do_sparc64_fault(struct pt_regs *regs);
 
+#ifdef CONFIG_HUGETLB_PAGE
+
+#define pud_leaf_size pud_leaf_size
+extern unsigned long pud_leaf_size(pud_t pud);
+
+#define pmd_leaf_size pmd_leaf_size
+extern unsigned long pmd_leaf_size(pmd_t pmd);
+
+#define pte_leaf_size pte_leaf_size
+extern unsigned long pte_leaf_size(pte_t pte);
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
 #endif /* !(__ASSEMBLY__) */
 
 #endif /* !(_SPARC64_PGTABLE_H) */
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -247,14 +247,17 @@ static unsigned int sun4u_huge_tte_to_sh
 	return shift;
 }
 
-static unsigned int huge_tte_to_shift(pte_t entry)
+static unsigned long tte_to_shift(pte_t entry)
 {
-	unsigned long shift;
-
 	if (tlb_type == hypervisor)
-		shift = sun4v_huge_tte_to_shift(entry);
-	else
-		shift = sun4u_huge_tte_to_shift(entry);
+		return sun4v_huge_tte_to_shift(entry);
+
+	return sun4u_huge_tte_to_shift(entry);
+}
+
+static unsigned int huge_tte_to_shift(pte_t entry)
+{
+	unsigned long shift = tte_to_shift(entry);
 
 	if (shift == PAGE_SHIFT)
 		WARN_ONCE(1, "tto_to_shift: invalid hugepage tte=0x%lx\n",
@@ -272,6 +275,10 @@ static unsigned long huge_tte_to_size(pt
 	return size;
 }
 
+unsigned long pud_leaf_size(pud_t pud) { return 1UL << tte_to_shift((pte_t)pud); }
+unsigned long pmd_leaf_size(pmd_t pmd) { return 1UL << tte_to_shift((pte_t)pmd); }
+unsigned long pte_leaf_size(pte_t pte) { return 1UL << tte_to_shift((pte_t)pte); }
+
 pte_t *huge_pte_alloc(struct mm_struct *mm,
 			unsigned long addr, unsigned long sz)
 {



^ permalink raw reply

* [PATCH v2 6/6] powerpc/8xx: Implement pXX_leaf_size() support
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
  To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	eranian
  Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
	linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
	will, davem, kirill.shutemov
In-Reply-To: <20201126120114.071913521@infradead.org>

Christophe Leroy wrote:

> I can help with powerpc 8xx. It is a 32 bits powerpc. The PGD has 1024
> entries, that means each entry maps 4M.
>
> Page sizes are 4k, 16k, 512k and 8M.
>
> For the 8M pages we use hugepd with a single entry. The two related PGD
> entries point to the same hugepd.
>
> For the other sizes, they are in standard page tables. 16k pages appear
> 4 times in the page table. 512k entries appear 128 times in the page
> table.
>
> When the PGD entry has _PMD_PAGE_8M bits, the PMD entry points to a
> hugepd with holds the single 8M entry.
>
> In the PTE, we have two bits: _PAGE_SPS and _PAGE_HUGE
>
> _PAGE_HUGE means it is a 512k page
> _PAGE_SPS means it is not a 4k page
>
> The kernel can by build either with 4k pages as standard page size, or
> 16k pages. It doesn't change the page table layout though.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/powerpc/include/asm/nohash/32/pte-8xx.h |   23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -135,6 +135,29 @@ static inline pte_t pte_mkhuge(pte_t pte
 }
 
 #define pte_mkhuge pte_mkhuge
+
+static inline unsigned long pgd_leaf_size(pgd_t pgd)
+{
+	if (pgd_val(pgd) & _PMD_PAGE_8M)
+		return SZ_8M;
+	return SZ_4M;
+}
+
+#define pgd_leaf_size pgd_leaf_size
+
+static inline unsigned long pte_leaf_size(pte_t pte)
+{
+	pte_basic_t val = pte_val(pte);
+
+	if (val & _PAGE_HUGE)
+		return SZ_512K;
+	if (val & _PAGE_SPS)
+		return SZ_16K;
+	return SZ_4K;
+}
+
+#define pte_leaf_size pte_leaf_size
+
 #endif
 
 #endif /* __KERNEL__ */



^ permalink raw reply

* [PATCH v2 2/6] mm: Introduce pXX_leaf_size()
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
  To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	eranian
  Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
	linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
	will, davem, kirill.shutemov
In-Reply-To: <20201126120114.071913521@infradead.org>

A number of architectures have non-pagetable aligned huge/large pages.
For such architectures a leaf can actually be part of a larger entry.

Provide generic helpers to determine the size of a page-table leaf.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/pgtable.h |   16 ++++++++++++++++
 1 file changed, 16 insertions(+)

--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1536,4 +1536,20 @@ typedef unsigned int pgtbl_mod_mask;
 #define pmd_leaf(x)	0
 #endif
 
+#ifndef pgd_leaf_size
+#define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT)
+#endif
+#ifndef p4d_leaf_size
+#define p4d_leaf_size(x) P4D_SIZE
+#endif
+#ifndef pud_leaf_size
+#define pud_leaf_size(x) PUD_SIZE
+#endif
+#ifndef pmd_leaf_size
+#define pmd_leaf_size(x) PMD_SIZE
+#endif
+#ifndef pte_leaf_size
+#define pte_leaf_size(x) PAGE_SIZE
+#endif
+
 #endif /* _LINUX_PGTABLE_H */



^ permalink raw reply

* Re: [PATCH 0/5] perf/mm: Fix PERF_SAMPLE_*_PAGE_SIZE
From: Peter Zijlstra @ 2020-11-26 10:46 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: mark.rutland, aneesh.kumar, willy, catalin.marinas, will,
	alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
	davem, dave.hansen, ak, eranian, sparclinux, linux-arch, jolsa,
	mingo, kirill.shutemov, kan.liang
In-Reply-To: <20201120122004.GG3021@hirez.programming.kicks-ass.net>

On Fri, Nov 20, 2020 at 01:20:04PM +0100, Peter Zijlstra wrote:

> > > I can help with powerpc 8xx. It is a 32 bits powerpc. The PGD has 1024
> > > entries, that means each entry maps 4M.
> > > 
> > > Page sizes are 4k, 16k, 512k and 8M.
> > > 
> > > For the 8M pages we use hugepd with a single entry. The two related PGD
> > > entries point to the same hugepd.
> > > 
> > > For the other sizes, they are in standard page tables. 16k pages appear
> > > 4 times in the page table. 512k entries appear 128 times in the page
> > > table.
> > > 
> > > When the PGD entry has _PMD_PAGE_8M bits, the PMD entry points to a
> > > hugepd with holds the single 8M entry.
> > > 
> > > In the PTE, we have two bits: _PAGE_SPS and _PAGE_HUGE
> > > 
> > > _PAGE_HUGE means it is a 512k page
> > > _PAGE_SPS means it is not a 4k page
> > > 
> > > The kernel can by build either with 4k pages as standard page size, or
> > > 16k pages. It doesn't change the page table layout though.
> > > 
> > > Hope this is clear. Now I don't really know to wire that up to your series.

Does the below accurately reflect things?

Let me go find a suitable cross-compiler ..

diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 1581204467e1..fcc48d590d88 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -135,6 +135,29 @@ static inline pte_t pte_mkhuge(pte_t pte)
 }
 
 #define pte_mkhuge pte_mkhuge
+
+static inline unsigned long pgd_leaf_size(pgd_t pgd)
+{
+	if (pgd_val(pgd) & _PMD_PAGE_8M)
+		return SZ_8M;
+	return SZ_4M;
+}
+
+#define pgd_leaf_size pgd_leaf_size
+
+static inline unsigned long pte_leaf_size(pte_t pte)
+{
+	pte_basic_t val = pte_val(pte);
+
+	if (val & _PAGE_HUGE)
+		return SZ_512K;
+	if (val & _PAGE_SPS)
+		return SZ_16K;
+	return SZ_4K;
+}
+
+#define pte_leaf_size pte_leaf_size
+
 #endif
 
 #endif /* __KERNEL__ */

^ permalink raw reply related

* Re: [PATCH v6 16/22] powerpc/book3s64/kuap: Improve error reporting with KUAP
From: Christophe Leroy @ 2020-11-26 10:39 UTC (permalink / raw)
  To: Michael Ellerman, Aneesh Kumar K.V, linuxppc-dev
In-Reply-To: <87r1ogxy8j.fsf@mpe.ellerman.id.au>



Le 26/11/2020 à 10:29, Michael Ellerman a écrit :
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>
>>> Le 25/11/2020 à 06:16, Aneesh Kumar K.V a écrit :
>>>> With hash translation use DSISR_KEYFAULT to identify a wrong access.
>>>> With Radix we look at the AMR value and type of fault.
>>>>
>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>> ---
>>>>    arch/powerpc/include/asm/book3s/32/kup.h     |  4 +--
>>>>    arch/powerpc/include/asm/book3s/64/kup.h     | 27 ++++++++++++++++----
>>>>    arch/powerpc/include/asm/kup.h               |  4 +--
>>>>    arch/powerpc/include/asm/nohash/32/kup-8xx.h |  4 +--
>>>>    arch/powerpc/mm/fault.c                      |  2 +-
>>>>    5 files changed, 29 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
>>>> index 32fd4452e960..b18cd931e325 100644
>>>> --- a/arch/powerpc/include/asm/book3s/32/kup.h
>>>> +++ b/arch/powerpc/include/asm/book3s/32/kup.h
>>>> @@ -177,8 +177,8 @@ static inline void restore_user_access(unsigned long flags)
>>>>    		allow_user_access(to, to, end - addr, KUAP_READ_WRITE);
>>>>    }
>>>>    
>>>> -static inline bool
>>>> -bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
>>>> +static inline bool bad_kuap_fault(struct pt_regs *regs, unsigned long address,
>>>> +				  bool is_write, unsigned long error_code)
>>>>    {
>>>>    	unsigned long begin = regs->kuap & 0xf0000000;
>>>>    	unsigned long end = regs->kuap << 28;
>>>> diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
>>>> index 4a3d0d601745..2922c442a218 100644
>>>> --- a/arch/powerpc/include/asm/book3s/64/kup.h
>>>> +++ b/arch/powerpc/include/asm/book3s/64/kup.h
>>>> @@ -301,12 +301,29 @@ static inline void set_kuap(unsigned long value)
>>>>    	isync();
>>>>    }
>>>>    
>>>> -static inline bool
>>>> -bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
>>>> +#define RADIX_KUAP_BLOCK_READ	UL(0x4000000000000000)
>>>> +#define RADIX_KUAP_BLOCK_WRITE	UL(0x8000000000000000)
>>>> +
>>>> +static inline bool bad_kuap_fault(struct pt_regs *regs, unsigned long address,
>>>> +				  bool is_write, unsigned long error_code)
>>>>    {
>>>> -	return WARN(mmu_has_feature(MMU_FTR_KUAP) &&
>>>> -		    (regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : AMR_KUAP_BLOCK_READ)),
>>>> -		    "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read");
>>>> +	if (!mmu_has_feature(MMU_FTR_KUAP))
>>>> +		return false;
>>>> +
>>>> +	if (radix_enabled()) {
>>>> +		/*
>>>> +		 * Will be a storage protection fault.
>>>> +		 * Only check the details of AMR[0]
>>>> +		 */
>>>> +		return WARN((regs->kuap & (is_write ? RADIX_KUAP_BLOCK_WRITE : RADIX_KUAP_BLOCK_READ)),
>>>> +			    "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read");
>>>
>>> I think it is pointless to keep the WARN() here.
>>>
>>> I have a series aiming at removing them. See
>>> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/cc9129bdda1dbc2f0a09cf45fece7d0b0e690784.1605541983.git.christophe.leroy@csgroup.eu/
>>
>> Can we do this as a spearate patch as you posted above? We can drop the
>> WARN in that while keeping the hash branch to look at DSISR value.
> 
> Yeah we can reconcile Christophe's series with yours later.
> 
> I'm still not 100% convinced I want to drop that WARN.

Ok, you can still take the rest of the series as that patch is the last one.

But, I really can't see the point with the WARN. When I hip a kuap bad fault, I get a double dump 
(see below). The second one is the interesting one, it tells me everything about the fault. But the 
WARN provides internals of do_page_fault() function. What interesting information do I get from there ?

[   37.842509] lkdtm: attempting bad write at b7bae000
[   37.842526] ------------[ cut here ]------------
[   37.842536] Bug: write fault blocked by segment registers !
[   37.842598] WARNING: CPU: 0 PID: 434 at arch/powerpc/include/asm/book3s/32/kup.h:189 
do_page_fault+0x3c8/0x5f0
[   37.842630] CPU: 0 PID: 434 Comm: busybox Not tainted 5.10.0-rc5-s3k-dev-01343-g8bec80f73baa #4165
[   37.842650] NIP:  c00155e4 LR: c00155e4 CTR: 00000000
[   37.842670] REGS: e6719c78 TRAP: 0700   Not tainted  (5.10.0-rc5-s3k-dev-01343-g8bec80f73baa)
[   37.842683] MSR:  00021032 <ME,IR,DR,RI>  CR: 22002224  XER: 20000000
[   37.842750]
[   37.842750] GPR00: c00155e4 e6719d30 c113c660 0000002f c097adf8 c097af10 00000027 00000027
[   37.842750] GPR08: c0b0afbc 00000000 00000023 00000001 24002224 100d166a 100a0920 00000000
[   37.842750] GPR16: 100cac0c 100b0000 10169444 1016a685 100d0000 100d0000 00000000 100a0900
[   37.842750] GPR24: ffffffef ffffffff c1392220 00000300 c076f424 02000000 b7bae000 e6719d70
[   37.843049] NIP [c00155e4] do_page_fault+0x3c8/0x5f0
[   37.843074] LR [c00155e4] do_page_fault+0x3c8/0x5f0
[   37.843087] Call Trace:
[   37.843114] [e6719d30] [c00155e4] do_page_fault+0x3c8/0x5f0 (unreliable)
[   37.843154] [e6719d60] [c0014384] handle_page_fault+0x10/0x3c
[   37.843211] --- interrupt: 301 at lkdtm_ACCESS_USERSPACE+0xdc/0xe4
[   37.843211]     LR = lkdtm_ACCESS_USERSPACE+0xd0/0xe4
[   37.843238] [e6719e48] [c039d76c] direct_entry+0xe0/0x164
[   37.843281] [e6719e68] [c0286730] full_proxy_write+0x78/0xbc
[   37.843325] [e6719e88] [c01657a8] vfs_write+0xdc/0x458
[   37.843359] [e6719f08] [c0165cb0] ksys_write+0x6c/0x11c
[   37.843397] [e6719f38] [c0014164] ret_from_syscall+0x0/0x34
[   37.843426] --- interrupt: c01 at 0xfd55784
[   37.843426]     LR = 0xfe16244
[   37.843438] Instruction dump:
[   37.843459] 38600007 4bff7a19 3bc00000 4bfffdbc 419e0110 813f00b0 55280006 7c1e4040
[   37.843529] 408000f4 3c60c080 3863e148 4801552d <0fe00000> 3c80c072 3c60c097 38840d84
[   37.843602] ---[ end trace 29c115c8ef352681 ]---
[   37.843627] Kernel attempted to write user page (b7bae000) - exploit attempt? (uid: 0)
[   37.851531] BUG: Unable to handle kernel data access on write at 0xb7bae000
[   37.858472] Faulting instruction address: 0xc039e550
[   37.863432] Oops: Kernel access of bad area, sig: 11 [#1]
[   37.868822] BE PAGE_SIZE=4K PREEMPT CMPCPRO
[   37.873029] SAF3000 DIE NOTIFICATION
[   37.876624] CPU: 0 PID: 434 Comm: busybox Tainted: G        W 
5.10.0-rc5-s3k-dev-01343-g8bec80f73baa #4165
[   37.886940] NIP:  c039e550 LR: c039e544 CTR: 00000000
[   37.891988] REGS: e6719d70 TRAP: 0300   Tainted: G        W 
(5.10.0-rc5-s3k-dev-01343-g8bec80f73baa)
[   37.901866] MSR:  00009032 <EE,ME,IR,DR,RI>  CR: 24002224  XER: 00000000
[   37.908617] DAR: b7bae000 DSISR: 0a000000
[   37.908617] GPR00: c039e544 e6719e28 c113c660 c083aad8 c097adf8 c097af10 00000027 00000027
[   37.908617] GPR08: c0b0afbc c0dec0de 00000023 00000001 28002224 100d166a 100a0920 00000000
[   37.908617] GPR16: 100cac0c 100b0000 10169444 1016a685 100d0000 100d0000 00000000 100a0900
[   37.908617] GPR24: ffffffef ffffffff e6719f10 00000011 c076f424 c1cb1000 c0839e24 b7bae000
[   37.946267] NIP [c039e550] lkdtm_ACCESS_USERSPACE+0xdc/0xe4
[   37.951842] LR [c039e544] lkdtm_ACCESS_USERSPACE+0xd0/0xe4
[   37.957316] Call Trace:
[   37.959782] [e6719e28] [c039e544] lkdtm_ACCESS_USERSPACE+0xd0/0xe4 (unreliable)
[   37.967102] [e6719e48] [c039d76c] direct_entry+0xe0/0x164
[   37.972524] [e6719e68] [c0286730] full_proxy_write+0x78/0xbc
[   37.978204] [e6719e88] [c01657a8] vfs_write+0xdc/0x458
[   37.983358] [e6719f08] [c0165cb0] ksys_write+0x6c/0x11c
[   37.988605] [e6719f38] [c0014164] ret_from_syscall+0x0/0x34
[   37.994185] --- interrupt: c01 at 0xfd55784
[   37.994185]     LR = 0xfe16244
[   38.001385] Instruction dump:
[   38.004360] 3863ac00 3d29c0df 3929c0de 91210008 4bcd04c9 3c60c084 3863ac24 7fe4fb78
[   38.012149] 4bcd04b9 3c60c084 81210008 3863aad8 <913f0000> 4bffff80 3c60c084 3863aa00
[   38.020120] ---[ end trace 29c115c8ef352682 ]---

Christophe

^ permalink raw reply

* Re: [PATCH 0/4] powerpc/64s: Fix for radix TLB invalidation bug
From: Aneesh Kumar K.V @ 2020-11-26 10:36 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev
  Cc: Milton Miller, Paul Mackerras, Nicholas Piggin
In-Reply-To: <20201126102530.691335-1-npiggin@gmail.com>

Nicholas Piggin <npiggin@gmail.com> writes:

> This fixes a tricky bug that was noticed by TLB multi-hits in a guest
> stress testing CPU hotplug, but TLB invalidation means any kind of
> data corruption is possible.
>
> Thanks,
> Nick
>
> Nicholas Piggin (4):
>   powerpc/64s: Fix hash ISA v3.0 TLBIEL instruction generation
>   powerpc/64s/pseries: Fix hash tlbiel_all_isa300 for guest kernels
>   kernel/cpu: add arch override for clear_tasks_mm_cpumask() mm handling
>   powerpc/64s: Trim offlined CPUs from mm_cpumasks
>
>  arch/powerpc/include/asm/book3s/64/mmu.h     | 12 ++++++++++
>  arch/powerpc/mm/book3s64/hash_native.c       | 23 +++++++++++++-------
>  arch/powerpc/mm/book3s64/mmu_context.c       | 20 +++++++++++++++++
>  arch/powerpc/platforms/powermac/smp.c        |  2 ++
>  arch/powerpc/platforms/powernv/smp.c         |  3 +++
>  arch/powerpc/platforms/pseries/hotplug-cpu.c |  3 +++
>  kernel/cpu.c                                 |  6 ++++-
>  7 files changed, 60 insertions(+), 9 deletions(-)
>
You can add for the series

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

-aneesh

^ permalink raw reply

* Re: [PATCH 1/2] powerpc: sstep: Fix load and update instructions
From: Ravi Bangoria @ 2020-11-26 10:36 UTC (permalink / raw)
  To: Sandipan Das, mpe
  Cc: Ravi Bangoria, jniethe5, paulus, naveen.n.rao, linuxppc-dev, dja
In-Reply-To: <20201119054139.244083-1-sandipan@linux.ibm.com>



On 11/19/20 11:11 AM, Sandipan Das wrote:
> The Power ISA says that the fixed-point load and update
> instructions must neither use R0 for the base address (RA)
> nor have the destination (RT) and the base address (RA) as
> the same register. In these cases, the instruction is
> invalid. This applies to the following instructions.
>    * Load Byte and Zero with Update (lbzu)
>    * Load Byte and Zero with Update Indexed (lbzux)
>    * Load Halfword and Zero with Update (lhzu)
>    * Load Halfword and Zero with Update Indexed (lhzux)
>    * Load Halfword Algebraic with Update (lhau)
>    * Load Halfword Algebraic with Update Indexed (lhaux)
>    * Load Word and Zero with Update (lwzu)
>    * Load Word and Zero with Update Indexed (lwzux)
>    * Load Word Algebraic with Update Indexed (lwaux)
>    * Load Doubleword with Update (ldu)
>    * Load Doubleword with Update Indexed (ldux)
> 
> However, the following behaviour is observed using some
> invalid opcodes where RA = RT.
> 
> An userspace program using an invalid instruction word like
> 0xe9ce0001, i.e. "ldu r14, 0(r14)", runs and exits without
> getting terminated abruptly. The instruction performs the
> load operation but does not write the effective address to
> the base address register. Attaching an uprobe at that
> instruction's address results in emulation which writes the
> effective address to the base register. Thus, the final value
> of the base address register is different.
> 
> To remove any inconsistencies, this adds an additional check
> for the aforementioned instructions to make sure that they
> are treated as unknown by the emulation infrastructure when
> RA = 0 or RA = RT. The kernel will then fallback to executing
> the instruction on hardware.
> 
> Signed-off-by: Sandipan Das <sandipan@linux.ibm.com>

For the series:
Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>

^ permalink raw reply

* [PATCH 4/4] powerpc/64s: Trim offlined CPUs from mm_cpumasks
From: Nicholas Piggin @ 2020-11-26 10:25 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Aneesh Kumar K.V, Paul Mackerras, Nicholas Piggin, Milton Miller
In-Reply-To: <20201126102530.691335-1-npiggin@gmail.com>

When offlining a CPU, powerpc/64s does not flush TLBs, rather it just
leaves the CPU set in mm_cpumasks, so it continues to receive TLBIEs
to manage its TLBs.

However the exit_flush_lazy_tlbs() function expects that after
returning, all CPUs (except self) have flushed TLBs for that mm, in
which case TLBIEL can be used for this flush. This breaks for offline
CPUs because they don't get the IPI to flush their TLB. This can lead
to stale translations.

Fix this by clearing the CPU from mm_cpumasks, then flushing all TLBs
before going offline.

These offlined CPU bits stuck in the cpumask also prevents the cpumask
from being trimmed back to local mode, which means continual broadcast
IPIs or TLBIEs are needed for TLB flushing. This patch prevents that
situation too.

A cast of many were involved in working this out, but in particular
Milton, Aneesh, Paul made key discoveries.

Fixes: 0cef77c7798a7 ("powerpc/64s/radix: flush remote CPUs out of single-threaded mm_cpumask")
Debugged-by: Milton Miller <miltonm@us.ibm.com>
Debugged-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Debugged-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/book3s/64/mmu.h     | 12 ++++++++++++
 arch/powerpc/mm/book3s64/mmu_context.c       | 20 ++++++++++++++++++++
 arch/powerpc/platforms/powermac/smp.c        |  2 ++
 arch/powerpc/platforms/powernv/smp.c         |  3 +++
 arch/powerpc/platforms/pseries/hotplug-cpu.c |  3 +++
 5 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index e0b52940e43c..750918451dd2 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -242,6 +242,18 @@ extern void radix_init_pseries(void);
 static inline void radix_init_pseries(void) { };
 #endif
 
+#ifdef CONFIG_HOTPLUG_CPU
+#define arch_clear_mm_cpumask_cpu(cpu, mm)				\
+	do {								\
+		if (cpumask_test_cpu(cpu, mm_cpumask(mm))) {		\
+			atomic_dec(&(mm)->context.active_cpus);		\
+			cpumask_clear_cpu(cpu, mm_cpumask(mm));		\
+		}							\
+	} while (0)
+
+void cleanup_cpu_mmu_context(void);
+#endif
+
 static inline int get_user_context(mm_context_t *ctx, unsigned long ea)
 {
 	int index = ea >> MAX_EA_BITS_PER_CONTEXT;
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
index 1c54821de7bf..0c8557220ae2 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -17,6 +17,7 @@
 #include <linux/export.h>
 #include <linux/gfp.h>
 #include <linux/slab.h>
+#include <linux/cpu.h>
 
 #include <asm/mmu_context.h>
 #include <asm/pgalloc.h>
@@ -307,3 +308,22 @@ void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 	isync();
 }
 #endif
+
+/**
+ * cleanup_cpu_mmu_context - Clean up MMU details for this CPU (newly offlined)
+ *
+ * This clears the CPU from mm_cpumask for all processes, and then flushes the
+ * local TLB to ensure TLB coherency in case the CPU is onlined again.
+ *
+ * KVM guest translations are not necessarily flushed here. If KVM started
+ * using mm_cpumask or the Linux APIs which do, this would have to be resolved.
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+void cleanup_cpu_mmu_context(void)
+{
+	int cpu = smp_processor_id();
+
+	clear_tasks_mm_cpumask(cpu);
+	tlbiel_all();
+}
+#endif
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index 74ebe664b016..adae2a6712e1 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -911,6 +911,8 @@ static int smp_core99_cpu_disable(void)
 
 	mpic_cpu_set_priority(0xf);
 
+	cleanup_cpu_mmu_context();
+
 	return 0;
 }
 
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 54c4ba45c7ce..cbb67813cd5d 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -143,6 +143,9 @@ static int pnv_smp_cpu_disable(void)
 		xive_smp_disable_cpu();
 	else
 		xics_migrate_irqs_away();
+
+	cleanup_cpu_mmu_context();
+
 	return 0;
 }
 
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index f2837e33bf5d..a02012f1b04a 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -90,6 +90,9 @@ static int pseries_cpu_disable(void)
 		xive_smp_disable_cpu();
 	else
 		xics_migrate_irqs_away();
+
+	cleanup_cpu_mmu_context();
+
 	return 0;
 }
 
-- 
2.23.0


^ permalink raw reply related

* [PATCH 3/4] kernel/cpu: add arch override for clear_tasks_mm_cpumask() mm handling
From: Nicholas Piggin @ 2020-11-26 10:25 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Peter Zijlstra, Aneesh Kumar K.V, Paul Mackerras, Nicholas Piggin,
	Milton Miller
In-Reply-To: <20201126102530.691335-1-npiggin@gmail.com>

powerpc/64s keeps a counter in the mm which counts bits set in
mm_cpumask as well as other things. This means it can't use generic code
to clear bits out of the mask and doesn't adjust the arch specific
counter.

Add an arch override that allows powerpc/64s to use
clear_tasks_mm_cpumask().

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 kernel/cpu.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ff2578ecf17..2b8d7a5db383 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -815,6 +815,10 @@ void __init cpuhp_threads_init(void)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+#ifndef arch_clear_mm_cpumask_cpu
+#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
+#endif
+
 /**
  * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
  * @cpu: a CPU id
@@ -850,7 +854,7 @@ void clear_tasks_mm_cpumask(int cpu)
 		t = find_lock_task_mm(p);
 		if (!t)
 			continue;
-		cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
+		arch_clear_mm_cpumask_cpu(cpu, t->mm);
 		task_unlock(t);
 	}
 	rcu_read_unlock();
-- 
2.23.0


^ permalink raw reply related

* [PATCH 2/4] powerpc/64s/pseries: Fix hash tlbiel_all_isa300 for guest kernels
From: Nicholas Piggin @ 2020-11-26 10:25 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Aneesh Kumar K.V, Paul Mackerras, Nicholas Piggin, Milton Miller
In-Reply-To: <20201126102530.691335-1-npiggin@gmail.com>

tlbiel_all() can not be usable in !HVMODE when running hash presently,
remove HV privileged flushes when running in guest to make it usable.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/book3s64/hash_native.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
index 97fa42d7027e..52e170bd95ae 100644
--- a/arch/powerpc/mm/book3s64/hash_native.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -92,16 +92,15 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
 	asm volatile("ptesync": : :"memory");
 
 	/*
-	 * Flush the first set of the TLB, and any caching of partition table
-	 * entries. Then flush the remaining sets of the TLB. Hash mode uses
-	 * partition scoped TLB translations.
+	 * Flush the partition table cache if this is HV mode.
 	 */
-	tlbiel_hash_set_isa300(0, is, 0, 2, 0);
-	for (set = 1; set < num_sets; set++)
-		tlbiel_hash_set_isa300(set, is, 0, 0, 0);
+	if (early_cpu_has_feature(CPU_FTR_HVMODE))
+		tlbiel_hash_set_isa300(0, is, 0, 2, 0);
 
 	/*
-	 * Now invalidate the process table cache.
+	 * Now invalidate the process table cache. UPRT=0 HPT modes (what
+	 * current hardware implements) do not use the process table, but
+	 * add the flushes anyway.
 	 *
 	 * From ISA v3.0B p. 1078:
 	 *     The following forms are invalid.
@@ -110,6 +109,14 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
 	 */
 	tlbiel_hash_set_isa300(0, is, 0, 2, 1);
 
+	/*
+	 * Then flush the sets of the TLB proper. Hash mode uses
+	 * partition scoped TLB translations, which may be flushed
+	 * in !HV mode.
+	 */
+	for (set = 0; set < num_sets; set++)
+		tlbiel_hash_set_isa300(set, is, 0, 0, 0);
+
 	ppc_after_tlbiel_barrier();
 
 	asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
-- 
2.23.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox