LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 1/2] powerpc/ptdump: Refactor update of st->last_pa
From: Christophe Leroy @ 2020-06-29 11:17 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

st->last_pa is always updated in note_page() so it can
be done outside the if/elseif/else block.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/ptdump/ptdump.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index b2ed1ca4f254..20a039867934 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -211,7 +211,6 @@ static void note_page(struct pg_state *st, unsigned long addr,
 		st->current_flags = flag;
 		st->start_address = addr;
 		st->start_pa = pa;
-		st->last_pa = pa;
 		st->page_size = page_size;
 		pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
 	/*
@@ -251,13 +250,11 @@ static void note_page(struct pg_state *st, unsigned long addr,
 		}
 		st->start_address = addr;
 		st->start_pa = pa;
-		st->last_pa = pa;
 		st->page_size = page_size;
 		st->current_flags = flag;
 		st->level = level;
-	} else {
-		st->last_pa = pa;
 	}
+	st->last_pa = pa;
 }
 
 static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 5/6] powerpc/32s: Kernel space starts at TASK_SIZE
From: Christophe Leroy @ 2020-06-29 11:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1593428200.git.christophe.leroy@csgroup.eu>

Kernel space starts at TASK_SIZE. Select kernel page table
when address is over TASK_SIZE.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/head_32.S       | 12 ++++++------
 arch/powerpc/mm/book3s32/hash_low.S |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 705c042309d8..bbef6ce8322b 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -474,7 +474,7 @@ InstructionTLBMiss:
 	/* Get PTE (linux-style) and check access */
 	mfspr	r3,SPRN_IMISS
 #if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC)
-	lis	r1,PAGE_OFFSET@h		/* check if kernel address */
+	lis	r1, TASK_SIZE@h		/* check if kernel address */
 	cmplw	0,r1,r3
 #endif
 	mfspr	r2, SPRN_SPRG_PGDIR
@@ -484,7 +484,7 @@ InstructionTLBMiss:
 	li	r1,_PAGE_PRESENT | _PAGE_EXEC
 #endif
 #if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC)
-	bge-	112f
+	bgt-	112f
 	lis	r2, (swapper_pg_dir - PAGE_OFFSET)@ha	/* if kernel address, use */
 	addi	r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l	/* kernel page table */
 #endif
@@ -541,7 +541,7 @@ DataLoadTLBMiss:
  */
 	/* Get PTE (linux-style) and check access */
 	mfspr	r3,SPRN_DMISS
-	lis	r1,PAGE_OFFSET@h		/* check if kernel address */
+	lis	r1, TASK_SIZE@h		/* check if kernel address */
 	cmplw	0,r1,r3
 	mfspr	r2, SPRN_SPRG_PGDIR
 #ifdef CONFIG_SWAP
@@ -549,7 +549,7 @@ DataLoadTLBMiss:
 #else
 	li	r1, _PAGE_PRESENT
 #endif
-	bge-	112f
+	bgt-	112f
 	lis	r2, (swapper_pg_dir - PAGE_OFFSET)@ha	/* if kernel address, use */
 	addi	r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l	/* kernel page table */
 112:	rlwimi	r2,r3,12,20,29		/* insert top 10 bits of address */
@@ -621,7 +621,7 @@ DataStoreTLBMiss:
  */
 	/* Get PTE (linux-style) and check access */
 	mfspr	r3,SPRN_DMISS
-	lis	r1,PAGE_OFFSET@h		/* check if kernel address */
+	lis	r1, TASK_SIZE@h		/* check if kernel address */
 	cmplw	0,r1,r3
 	mfspr	r2, SPRN_SPRG_PGDIR
 #ifdef CONFIG_SWAP
@@ -629,7 +629,7 @@ DataStoreTLBMiss:
 #else
 	li	r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT
 #endif
-	bge-	112f
+	bgt-	112f
 	lis	r2, (swapper_pg_dir - PAGE_OFFSET)@ha	/* if kernel address, use */
 	addi	r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l	/* kernel page table */
 112:	rlwimi	r2,r3,12,20,29		/* insert top 10 bits of address */
diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S
index 923ad8f374eb..1690d369688b 100644
--- a/arch/powerpc/mm/book3s32/hash_low.S
+++ b/arch/powerpc/mm/book3s32/hash_low.S
@@ -62,7 +62,7 @@ _GLOBAL(hash_page)
 	isync
 #endif
 	/* Get PTE (linux-style) and check access */
-	lis	r0,KERNELBASE@h		/* check if kernel address */
+	lis	r0, TASK_SIZE@h		/* check if kernel address */
 	cmplw	0,r4,r0
 	ori	r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
 	mfspr	r5, SPRN_SPRG_PGDIR	/* phys page-table root */
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 6/6] powerpc/32s: Use dedicated segment for modules with STRICT_KERNEL_RWX
From: Christophe Leroy @ 2020-06-29 11:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1593428200.git.christophe.leroy@csgroup.eu>

When STRICT_KERNEL_RWX is set, we want to set NX bit on vmalloc
segments. But modules require exec.

Use a dedicated segment for modules. There is not much space
above kernel, and we don't waste vmalloc space to do alignment.
Therefore, we take the segment before PAGE_OFFSET for modules.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/Kconfig                         |  1 +
 arch/powerpc/include/asm/book3s/32/pgtable.h | 15 +++++----------
 arch/powerpc/mm/ptdump/ptdump.c              |  8 ++++++++
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 51abc59c3334..963b3bc7d969 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -1194,6 +1194,7 @@ config TASK_SIZE_BOOL
 config TASK_SIZE
 	hex "Size of user task space" if TASK_SIZE_BOOL
 	default "0x80000000" if PPC_8xx
+	default "0xb0000000" if PPC_BOOK3S_32 && STRICT_KERNEL_RWX
 	default "0xc0000000"
 endmenu
 
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 224912432821..36443cda8dcf 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -184,17 +184,7 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
  */
 #define VMALLOC_OFFSET (0x1000000) /* 16M */
 
-/*
- * With CONFIG_STRICT_KERNEL_RWX, kernel segments are set NX. But when modules
- * are used, NX cannot be set on VMALLOC space. So vmalloc VM space and linear
- * memory shall not share segments.
- */
-#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_MODULES)
-#define VMALLOC_START ((ALIGN((long)high_memory, 256L << 20) + VMALLOC_OFFSET) & \
-		       ~(VMALLOC_OFFSET - 1))
-#else
 #define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)))
-#endif
 
 #ifdef CONFIG_KASAN_VMALLOC
 #define VMALLOC_END	ALIGN_DOWN(ioremap_bot, PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
@@ -202,6 +192,11 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
 #define VMALLOC_END	ioremap_bot
 #endif
 
+#ifdef CONFIG_STRICT_KERNEL_RWX
+#define MODULES_END	ALIGN_DOWN(PAGE_OFFSET, SZ_256M)
+#define MODULES_VADDR	(MODULES_END - SZ_256M)
+#endif
+
 #ifndef __ASSEMBLY__
 #include <linux/sched.h>
 #include <linux/threads.h>
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 9d942136c7be..b2ed1ca4f254 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -74,6 +74,10 @@ struct addr_marker {
 
 static struct addr_marker address_markers[] = {
 	{ 0,	"Start of kernel VM" },
+#ifdef MODULES_VADDR
+	{ 0,	"modules start" },
+	{ 0,	"modules end" },
+#endif
 	{ 0,	"vmalloc() Area" },
 	{ 0,	"vmalloc() End" },
 #ifdef CONFIG_PPC64
@@ -352,6 +356,10 @@ static void populate_markers(void)
 	address_markers[i++].start_address = PAGE_OFFSET;
 #else
 	address_markers[i++].start_address = TASK_SIZE;
+#endif
+#ifdef MODULES_VADDR
+	address_markers[i++].start_address = MODULES_VADDR;
+	address_markers[i++].start_address = MODULES_END;
 #endif
 	address_markers[i++].start_address = VMALLOC_START;
 	address_markers[i++].start_address = VMALLOC_END;
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 1/6] powerpc/lib: Prepare code-patching for modules allocated outside vmalloc space
From: Christophe Leroy @ 2020-06-29 11:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1593428200.git.christophe.leroy@csgroup.eu>

Use is_vmalloc_or_module_addr() instead of is_vmalloc_addr()

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/lib/code-patching.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 0a051dfeb177..8c3934ea6220 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -93,7 +93,7 @@ static int map_patch_area(void *addr, unsigned long text_poke_addr)
 	unsigned long pfn;
 	int err;
 
-	if (is_vmalloc_addr(addr))
+	if (is_vmalloc_or_module_addr(addr))
 		pfn = vmalloc_to_pfn(addr);
 	else
 		pfn = __pa_symbol(addr) >> PAGE_SHIFT;
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 2/6] powerpc: Use MODULES_VADDR if defined
From: Christophe Leroy @ 2020-06-29 11:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1593428200.git.christophe.leroy@csgroup.eu>

In order to allow allocation of modules outside of vmalloc space,
use MODULES_VADDR and MODULES_END when MODULES_VADDR is defined.

Redefine module_alloc() when MODULES_VADDR defined.
Unmap corresponding KASAN shadow memory.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/module.c          | 11 +++++++++++
 arch/powerpc/mm/kasan/kasan_init_32.c |  6 ++++++
 2 files changed, 17 insertions(+)

diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index df649acb5631..a211b0253cdb 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -86,3 +86,14 @@ int module_finalize(const Elf_Ehdr *hdr,
 
 	return 0;
 }
+
+#ifdef MODULES_VADDR
+void *module_alloc(unsigned long size)
+{
+	BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
+
+	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, GFP_KERNEL,
+				    PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
+				    __builtin_return_address(0));
+}
+#endif
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 0760e1e754e4..f1bc267d42af 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -115,6 +115,12 @@ static void __init kasan_unmap_early_shadow_vmalloc(void)
 	unsigned long k_end = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_END);
 
 	kasan_update_early_region(k_start, k_end, __pte(0));
+
+#ifdef MODULES_VADDR
+	k_start = (unsigned long)kasan_mem_to_shadow((void *)MODULES_VADDR);
+	k_end = (unsigned long)kasan_mem_to_shadow((void *)MODULES_END);
+	kasan_update_early_region(k_start, k_end, __pte(0));
+#endif
 }
 
 static void __init kasan_mmu_init(void)
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 3/6] powerpc/32s: Only leave NX unset on segments used for modules
From: Christophe Leroy @ 2020-06-29 11:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1593428200.git.christophe.leroy@csgroup.eu>

Instead of leaving NX unset on all segments above the start
of vmalloc space, only leave NX unset on segments used for
modules.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/book3s32/mmu.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 03b6ba54460e..c0162911f6cb 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -187,6 +187,17 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
 	return __mmu_mapin_ram(border, top);
 }
 
+static bool is_module_segment(unsigned long addr)
+{
+	if (!IS_ENABLED(CONFIG_MODULES))
+		return false;
+	if (addr < ALIGN_DOWN(VMALLOC_START, SZ_256M))
+		return false;
+	if (addr >= ALIGN(VMALLOC_END, SZ_256M))
+		return false;
+	return true;
+}
+
 void mmu_mark_initmem_nx(void)
 {
 	int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
@@ -223,9 +234,9 @@ void mmu_mark_initmem_nx(void)
 
 	for (i = TASK_SIZE >> 28; i < 16; i++) {
 		/* Do not set NX on VM space for modules */
-		if (IS_ENABLED(CONFIG_MODULES) &&
-		    (VMALLOC_START & 0xf0000000) == i << 28)
-			break;
+		if (is_module_segment(i << 28))
+			continue;
+
 		mtsrin(mfsrin(i << 28) | 0x10000000, i << 28);
 	}
 }
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 4/6] powerpc/32: Set user/kernel boundary at TASK_SIZE instead of PAGE_OFFSET
From: Christophe Leroy @ 2020-06-29 11:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1593428200.git.christophe.leroy@csgroup.eu>

User space stops at TASK_SIZE. At the moment, kernel space starts
at PAGE_OFFSET.

In order to use space between TASK_SIZE and PAGE_OFFSET for modules,
make TASK_SIZE the limit between user and kernel space.

Note that fault.c already considers TASK_SIZE as the boundary between
user and kernel space.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/page.h | 4 +++-
 arch/powerpc/mm/ptdump/ptdump.c | 8 ++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index a63fe6f3a0ff..254687258f42 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -255,8 +255,10 @@ static inline bool pfn_valid(unsigned long pfn)
  */
 #ifdef CONFIG_PPC_BOOK3E_64
 #define is_kernel_addr(x)	((x) >= 0x8000000000000000ul)
-#else
+#elif defined(CONFIG_PPC_BOOK3S_64)
 #define is_kernel_addr(x)	((x) >= PAGE_OFFSET)
+#else
+#define is_kernel_addr(x)	((x) >= TASK_SIZE)
 #endif
 
 #ifndef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index de6e05ef871c..9d942136c7be 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -348,7 +348,11 @@ static void populate_markers(void)
 {
 	int i = 0;
 
+#ifdef CONFIG_PPC64
 	address_markers[i++].start_address = PAGE_OFFSET;
+#else
+	address_markers[i++].start_address = TASK_SIZE;
+#endif
 	address_markers[i++].start_address = VMALLOC_START;
 	address_markers[i++].start_address = VMALLOC_END;
 #ifdef CONFIG_PPC64
@@ -385,7 +389,7 @@ static int ptdump_show(struct seq_file *m, void *v)
 	struct pg_state st = {
 		.seq = m,
 		.marker = address_markers,
-		.start_address = PAGE_OFFSET,
+		.start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE,
 	};
 
 #ifdef CONFIG_PPC64
@@ -429,7 +433,7 @@ void ptdump_check_wx(void)
 		.seq = NULL,
 		.marker = address_markers,
 		.check_wx = true,
-		.start_address = PAGE_OFFSET,
+		.start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE,
 	};
 
 #ifdef CONFIG_PPC64
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 0/6] powerpc/32s: Allocate modules outside of vmalloc space for STRICT_KERNEL_RWX
From: Christophe Leroy @ 2020-06-29 11:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

On book3s32 (hash), exec protection is set per 256Mb segments with NX bit.
Instead of clearing NX bit on vmalloc space when CONFIG_MODULES is selected,
allocate modules in a dedicated segment (0xb0000000-0xbfffffff by default).
This allows to keep exec protection on vmalloc space while allowing exec
on modules.

v2:
- Removed the two patches that fix ptdump. Will submitted independently
- Only changing the user/kernel boundary for PPC32 now.
- Reordered the patches inside the series.

Christophe Leroy (6):
  powerpc/lib: Prepare code-patching for modules allocated outside
    vmalloc space
  powerpc: Use MODULES_VADDR if defined
  powerpc/32s: Only leave NX unset on segments used for modules
  powerpc/32: Set user/kernel boundary at TASK_SIZE instead of
    PAGE_OFFSET
  powerpc/32s: Kernel space starts at TASK_SIZE
  powerpc/32s: Use dedicated segment for modules with STRICT_KERNEL_RWX

 arch/powerpc/Kconfig                         |  1 +
 arch/powerpc/include/asm/book3s/32/pgtable.h | 15 +++++----------
 arch/powerpc/include/asm/page.h              |  4 +++-
 arch/powerpc/kernel/head_32.S                | 12 ++++++------
 arch/powerpc/kernel/module.c                 | 11 +++++++++++
 arch/powerpc/lib/code-patching.c             |  2 +-
 arch/powerpc/mm/book3s32/hash_low.S          |  2 +-
 arch/powerpc/mm/book3s32/mmu.c               | 17 ++++++++++++++---
 arch/powerpc/mm/kasan/kasan_init_32.c        |  6 ++++++
 arch/powerpc/mm/ptdump/ptdump.c              | 16 ++++++++++++++--
 10 files changed, 62 insertions(+), 24 deletions(-)

-- 
2.25.0


^ permalink raw reply

* Re: [PATCH 13/13] powerpc/dma: Remove dev->archdata.iommu_domain
From: Michael Ellerman @ 2020-06-29 10:57 UTC (permalink / raw)
  To: Joerg Roedel, iommu
  Cc: linux-ia64, Heiko Stuebner, David Airlie, Catalin Marinas,
	Joonas Lahtinen, Thierry Reding, Paul Mackerras, Will Deacon,
	Marek Szyprowski, Joerg Roedel, x86, Russell King, Fenghua Yu,
	Joerg Roedel, intel-gfx, Jani Nikula, Rodrigo Vivi,
	Matthias Brugger, linux-arm-kernel, Tony Luck, linuxppc-dev,
	linux-kernel, Daniel Vetter, David Woodhouse, Lu Baolu
In-Reply-To: <20200625130836.1916-14-joro@8bytes.org>

Joerg Roedel <joro@8bytes.org> writes:
> From: Joerg Roedel <jroedel@suse.de>
>
> There are no users left, so remove the pointer and save some memory.
>
> Signed-off-by: Joerg Roedel <jroedel@suse.de>
> ---
>  arch/powerpc/include/asm/device.h | 3 ---
>  1 file changed, 3 deletions(-)

It's a little hard to confirm there are no users left just with grep,
but I think you've got them all, and the compiler should tell us if
you've missed any.

Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)

cheers

> diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
> index 266542769e4b..1bc595213338 100644
> --- a/arch/powerpc/include/asm/device.h
> +++ b/arch/powerpc/include/asm/device.h
> @@ -34,9 +34,6 @@ struct dev_archdata {
>  	struct iommu_table	*iommu_table_base;
>  #endif
>  
> -#ifdef CONFIG_IOMMU_API
> -	void			*iommu_domain;
> -#endif
>  #ifdef CONFIG_PPC64
>  	struct pci_dn		*pci_data;
>  #endif
> -- 
> 2.27.0

^ permalink raw reply

* Re: [PATCH] powerpc: Warn about use of smt_snooze_delay
From: Gautham R Shenoy @ 2020-06-29 10:42 UTC (permalink / raw)
  To: Joel Stanley; +Cc: linuxppc-dev, ego
In-Reply-To: <20200625100349.2408899-1-joel@jms.id.au>

On Thu, Jun 25, 2020 at 07:33:49PM +0930, Joel Stanley wrote:
> It's not done anything for a long time. Save the percpu variable, and
> emit a warning to remind users to not expect it to do anything.
> 
> Signed-off-by: Joel Stanley <joel@jms.id.au>

The only known user of "smt_snooze_delay" is the "ppc64_cpu" which
uses the presence of this file to assume that the system is SMT
capable.

Since we have "/sys/devices/system/cpu/smt/" these days, perhaps the
userspace utility can use that and we can get rid of the file
altogether ?

FWIW,
Acked-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
> ---
>  arch/powerpc/kernel/sysfs.c | 41 +++++++++++++------------------------
>  1 file changed, 14 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
> index 571b3259697e..530ae92bc46d 100644
> --- a/arch/powerpc/kernel/sysfs.c
> +++ b/arch/powerpc/kernel/sysfs.c
> @@ -32,29 +32,25 @@
>  
>  static DEFINE_PER_CPU(struct cpu, cpu_devices);
>  
> -/*
> - * SMT snooze delay stuff, 64-bit only for now
> - */
> -
>  #ifdef CONFIG_PPC64
>  
> -/* Time in microseconds we delay before sleeping in the idle loop */
> -static DEFINE_PER_CPU(long, smt_snooze_delay) = { 100 };
> +/*
> + * Snooze delay has not been hooked up since 3fa8cad82b94 ("powerpc/pseries/cpuidle:
> + * smt-snooze-delay cleanup.") and has been broken even longer. As was foretold in
> + * 2014:
> + *
> + *  "ppc64_util currently utilises it. Once we fix ppc64_util, propose to clean
> + *  up the kernel code."
> + *
> + * At some point in the future this code should be removed.
> + */
>  
>  static ssize_t store_smt_snooze_delay(struct device *dev,
>  				      struct device_attribute *attr,
>  				      const char *buf,
>  				      size_t count)
>  {
> -	struct cpu *cpu = container_of(dev, struct cpu, dev);
> -	ssize_t ret;
> -	long snooze;
> -
> -	ret = sscanf(buf, "%ld", &snooze);
> -	if (ret != 1)
> -		return -EINVAL;
> -
> -	per_cpu(smt_snooze_delay, cpu->dev.id) = snooze;
> +	WARN_ON_ONCE("smt_snooze_delay sysfs file has no effect\n");
>  	return count;
>  }
>  
> @@ -62,9 +58,9 @@ static ssize_t show_smt_snooze_delay(struct device *dev,
>  				     struct device_attribute *attr,
>  				     char *buf)
>  {
> -	struct cpu *cpu = container_of(dev, struct cpu, dev);
> +	WARN_ON_ONCE("smt_snooze_delay sysfs file has no effect\n");
>  
> -	return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->dev.id));
> +	return sprintf(buf, "100\n");
>  }
>  
>  static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
> @@ -72,16 +68,7 @@ static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
>  
>  static int __init setup_smt_snooze_delay(char *str)
>  {
> -	unsigned int cpu;
> -	long snooze;
> -
> -	if (!cpu_has_feature(CPU_FTR_SMT))
> -		return 1;
> -
> -	snooze = simple_strtol(str, NULL, 10);
> -	for_each_possible_cpu(cpu)
> -		per_cpu(smt_snooze_delay, cpu) = snooze;
> -
> +	WARN_ON_ONCE("smt-snooze-delay command line option has no effect\n");
>  	return 1;
>  }
>  __setup("smt-snooze-delay=", setup_smt_snooze_delay);
> -- 
> 2.27.0
> 

^ permalink raw reply

* [PATCH 1/3] powerpc/cacheinfo: Use cpumap_print to print cpumap
From: Srikar Dronamraju @ 2020-06-29 10:37 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Nathan Lynch, Tejun Heo, linuxppc-dev, Srikar Dronamraju
In-Reply-To: <20200629103703.4538-1-srikar@linux.vnet.ibm.com>

Tejun Heo had modified shared_cpu_map_show to use scnprintf instead of
cpumap_print during support for *pb[l] format.
Refer commit 0c118b7bd09a ("powerpc: use %*pb[l] to print bitmaps including
cpumasks and nodemasks")

cpumap_print_to_pagebuf is a standard function to print cpumap.  With
commit 9cf79d115f0d ("bitmap: remove explicit newline handling using
scnprintf format string"), there is no need to print explicit newline and
trailing null character. cpumap_print_to_pagebuf internally uses
scnprintf. Hence replace scnprintf with cpumap_print_to_pagebuf.

Note: shared_cpu_map_show in drivers/base/cacheinfo.c already uses
cpumap_print_to_pagebuf.

Before this patch
# cat /sys/devices/system/cpu0/cache/index1/shared_cpu_map
00ff

#
		(Notice the extra blank line).

After this patch
# cat /sys/devices/system/cpu0/cache/index1/shared_cpu_map
00ff
#

Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/cacheinfo.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 470336277c67..0d3c45e2fccd 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -652,7 +652,7 @@ static ssize_t shared_cpu_map_show(struct kobject *k, struct kobj_attribute *att
 	struct cache_index_dir *index;
 	struct cache *cache;
 	const struct cpumask *mask;
-	int ret, cpu;
+	int cpu;
 
 	index = kobj_to_cache_index_dir(k);
 	cache = index->cache;
@@ -664,11 +664,7 @@ static ssize_t shared_cpu_map_show(struct kobject *k, struct kobj_attribute *att
 		mask  = &cache->shared_cpu_map;
 	}
 
-	ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb\n",
-			cpumask_pr_args(mask));
-	buf[ret++] = '\n';
-	buf[ret] = '\0';
-	return ret;
+	return cpumap_print_to_pagebuf(false, buf, mask);
 }
 
 static struct kobj_attribute cache_shared_cpu_map_attr =
-- 
2.17.1


^ permalink raw reply related

* [PATCH 3/3] powerpc/cacheinfo: Add per cpu per index shared_cpu_list
From: Srikar Dronamraju @ 2020-06-29 10:37 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Nathan Lynch, linuxppc-dev, Srikar Dronamraju
In-Reply-To: <20200629103703.4538-1-srikar@linux.vnet.ibm.com>

Unlike drivers/base/cacheinfo, powerpc cacheinfo code is not exposing
shared_cpu_list under /sys/devices/system/cpu/cpu<n>/cache/index<m>

Add shared_cpu_list to per cpu per index directory to maintain parity
with x86.  Some scripts (example: mmtests
https://github.com/gormanm/mmtests) seem to be looking for
shared_cpu_list instead of shared_cpu_map.

Before this patch
# ls /sys/devices/system/cpu0/cache/index1
coherency_line_size  number_of_sets  size  ways_of_associativity
level                shared_cpu_map  type
# cat /sys/devices/system/cpu0/cache/index1/shared_cpu_map
00ff
#

After this patch
# ls /sys/devices/system/cpu0/cache/index1
coherency_line_size  number_of_sets   shared_cpu_map  type
level                shared_cpu_list  size            ways_of_associativity
# cat /sys/devices/system/cpu0/cache/index1/shared_cpu_map
00ff
# cat /sys/devices/system/cpu0/cache/index1/shared_cpu_list
0-7
#

Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/cacheinfo.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 5be870f99623..d8d4552af30a 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -670,12 +670,20 @@ show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bo
 
 static ssize_t shared_cpu_map_show(struct kobject *k, struct kobj_attribute *attr, char *buf)
 {
-	return show_shared_cpumap(k, attr, buf, false)
+	return show_shared_cpumap(k, attr, buf, false);
+}
+
+static ssize_t shared_cpu_list_show(struct kobject *k, struct kobj_attribute *attr, char *buf)
+{
+	return show_shared_cpumap(k, attr, buf, true);
 }
 
 static struct kobj_attribute cache_shared_cpu_map_attr =
 	__ATTR(shared_cpu_map, 0444, shared_cpu_map_show, NULL);
 
+static struct kobj_attribute cache_shared_cpu_list_attr =
+	__ATTR(shared_cpu_list, 0444, shared_cpu_list_show, NULL);
+
 /* Attributes which should always be created -- the kobject/sysfs core
  * does this automatically via kobj_type->default_attrs.  This is the
  * minimum data required to uniquely identify a cache.
@@ -684,6 +692,7 @@ static struct attribute *cache_index_default_attrs[] = {
 	&cache_type_attr.attr,
 	&cache_level_attr.attr,
 	&cache_shared_cpu_map_attr.attr,
+	&cache_shared_cpu_list_attr.attr,
 	NULL,
 };
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH 2/3] powerpc/cacheinfo: Make cpumap_show code reusable
From: Srikar Dronamraju @ 2020-06-29 10:37 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Nathan Lynch, linuxppc-dev, Srikar Dronamraju
In-Reply-To: <20200629103703.4538-1-srikar@linux.vnet.ibm.com>

In anticipation of implementing shared_cpu_list, move code under
shared_cpu_map_show to a common function.

No functional changes.

Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/cacheinfo.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 0d3c45e2fccd..5be870f99623 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -647,7 +647,8 @@ static const struct cpumask *get_big_core_shared_cpu_map(int cpu, struct cache *
 	return &cache->shared_cpu_map;
 }
 
-static ssize_t shared_cpu_map_show(struct kobject *k, struct kobj_attribute *attr, char *buf)
+static ssize_t
+show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bool list)
 {
 	struct cache_index_dir *index;
 	struct cache *cache;
@@ -664,7 +665,12 @@ static ssize_t shared_cpu_map_show(struct kobject *k, struct kobj_attribute *att
 		mask  = &cache->shared_cpu_map;
 	}
 
-	return cpumap_print_to_pagebuf(false, buf, mask);
+	return cpumap_print_to_pagebuf(list, buf, mask);
+}
+
+static ssize_t shared_cpu_map_show(struct kobject *k, struct kobj_attribute *attr, char *buf)
+{
+	return show_shared_cpumap(k, attr, buf, false)
 }
 
 static struct kobj_attribute cache_shared_cpu_map_attr =
-- 
2.17.1


^ permalink raw reply related

* [PATCH 0/3] Implement shared_cpu_list for powerpc
From: Srikar Dronamraju @ 2020-06-29 10:37 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Nathan Lynch, linuxppc-dev, Srikar Dronamraju

shared_cpu_list sysfs file is missing in powerpc and shared_cpu_map gives an
extra newline character.

Before this patchset
# ls /sys/devices/system/cpu0/cache/index1
coherency_line_size  number_of_sets  size  ways_of_associativity
level                shared_cpu_map  type
# cat /sys/devices/system/cpu0/cache/index1/shared_cpu_map
00ff

#
		(Notice the extra blank line).

After this patchset
# ls /sys/devices/system/cpu0/cache/index1
coherency_line_size  number_of_sets   shared_cpu_map  type
level                shared_cpu_list  size            ways_of_associativity
# cat /sys/devices/system/cpu0/cache/index1/shared_cpu_map
00ff
# cat /sys/devices/system/cpu0/cache/index1/shared_cpu_list
0-7
#

Srikar Dronamraju (3):
  powerpc/cacheinfo: Use cpumap_print to print cpumap
  powerpc/cacheinfo: Make cpumap_show code reusable
  powerpc/cacheinfo: Add per cpu per index shared_cpu_list

 arch/powerpc/kernel/cacheinfo.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linuxppc-dev@lists.ozlabs.org
-- 
2.17.1


^ permalink raw reply

* Re: [PATCH v3 3/4] KVM: PPC: Book3S HV: migrate remaining normal-GFNs to secure-GFNs in H_SVM_INIT_DONE
From: Laurent Dufour @ 2020-06-29  8:48 UTC (permalink / raw)
  To: bharata, Ram Pai
  Cc: cclaudio, kvm-ppc, sathnaga, aneesh.kumar, sukadev, linuxppc-dev,
	bauerman, david
In-Reply-To: <20200628162047.GB27215@in.ibm.com>

Le 28/06/2020 à 18:20, Bharata B Rao a écrit :
> On Fri, Jun 19, 2020 at 03:43:41PM -0700, Ram Pai wrote:
>> H_SVM_INIT_DONE incorrectly assumes that the Ultravisor has explicitly
> 
> As noted in the last iteration, can you reword the above please?
> I don't see it as an incorrect assumption, but see it as extension of
> scope now :-)
> 
>> called H_SVM_PAGE_IN for all secure pages. These GFNs continue to be
>> normal GFNs associated with normal PFNs; when infact, these GFNs should
>> have been secure GFNs, associated with device PFNs.
>>
>> Move all the PFNs associated with the SVM's GFNs, to secure-PFNs, in
>> H_SVM_INIT_DONE. Skip the GFNs that are already Paged-in or Shared
>> through H_SVM_PAGE_IN, or Paged-in followed by a Paged-out through
>> UV_PAGE_OUT.
>>
>> Cc: Paul Mackerras <paulus@ozlabs.org>
>> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>> Cc: Michael Ellerman <mpe@ellerman.id.au>
>> Cc: Bharata B Rao <bharata@linux.ibm.com>
>> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
>> Cc: Laurent Dufour <ldufour@linux.ibm.com>
>> Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
>> Cc: David Gibson <david@gibson.dropbear.id.au>
>> Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
>> Cc: kvm-ppc@vger.kernel.org
>> Cc: linuxppc-dev@lists.ozlabs.org
>> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
>> ---
>>   Documentation/powerpc/ultravisor.rst        |   2 +
>>   arch/powerpc/include/asm/kvm_book3s_uvmem.h |   2 +
>>   arch/powerpc/kvm/book3s_hv_uvmem.c          | 154 +++++++++++++++++++++++-----
>>   3 files changed, 132 insertions(+), 26 deletions(-)
>>
>> diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
>> index 363736d..3bc8957 100644
>> --- a/Documentation/powerpc/ultravisor.rst
>> +++ b/Documentation/powerpc/ultravisor.rst
>> @@ -933,6 +933,8 @@ Return values
>>   	* H_UNSUPPORTED		if called from the wrong context (e.g.
>>   				from an SVM or before an H_SVM_INIT_START
>>   				hypercall).
>> +	* H_STATE		if the hypervisor could not successfully
>> +                                transition the VM to Secure VM.
>>   
>>   Description
>>   ~~~~~~~~~~~
>> diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
>> index 5a9834e..b9cd7eb 100644
>> --- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h
>> +++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
>> @@ -22,6 +22,8 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
>>   unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm);
>>   void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
>>   			     struct kvm *kvm, bool skip_page_out);
>> +int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
>> +			const struct kvm_memory_slot *memslot);
>>   #else
>>   static inline int kvmppc_uvmem_init(void)
>>   {
>> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
>> index c8c0290..449e8a7 100644
>> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
>> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
>> @@ -93,6 +93,7 @@
>>   #include <asm/ultravisor.h>
>>   #include <asm/mman.h>
>>   #include <asm/kvm_ppc.h>
>> +#include <asm/kvm_book3s_uvmem.h>
>>   
>>   static struct dev_pagemap kvmppc_uvmem_pgmap;
>>   static unsigned long *kvmppc_uvmem_bitmap;
>> @@ -339,6 +340,21 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
>>   	return false;
>>   }
>>   
>> +/* return true, if the GFN is a shared-GFN, or a secure-GFN */
>> +bool kvmppc_gfn_has_transitioned(unsigned long gfn, struct kvm *kvm)
>> +{
>> +	struct kvmppc_uvmem_slot *p;
>> +
>> +	list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) {
>> +		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
>> +			unsigned long index = gfn - p->base_pfn;
>> +
>> +			return (p->pfns[index] & KVMPPC_GFN_FLAG_MASK);
>> +		}
>> +	}
>> +	return false;
>> +}
>> +
>>   unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
>>   {
>>   	struct kvm_memslots *slots;
>> @@ -379,12 +395,31 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
>>   
>>   unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
>>   {
>> +	struct kvm_memslots *slots;
>> +	struct kvm_memory_slot *memslot;
>> +	int srcu_idx;
>> +	long ret = H_SUCCESS;
>> +
>>   	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
>>   		return H_UNSUPPORTED;
>>   
>> +	/* migrate any unmoved normal pfn to device pfns*/
>> +	srcu_idx = srcu_read_lock(&kvm->srcu);
>> +	slots = kvm_memslots(kvm);
>> +	kvm_for_each_memslot(memslot, slots) {
>> +		ret = kvmppc_uv_migrate_mem_slot(kvm, memslot);
>> +		if (ret) {
>> +			ret = H_STATE;
>> +			goto out;
>> +		}
>> +	}
>> +
>>   	kvm->arch.secure_guest |= KVMPPC_SECURE_INIT_DONE;
>>   	pr_info("LPID %d went secure\n", kvm->arch.lpid);
>> -	return H_SUCCESS;
>> +
>> +out:
>> +	srcu_read_unlock(&kvm->srcu, srcu_idx);
>> +	return ret;
>>   }
>>   
>>   /*
>> @@ -505,12 +540,14 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
>>   }
>>   
>>   /*
>> - * Alloc a PFN from private device memory pool and copy page from normal
>> - * memory to secure memory using UV_PAGE_IN uvcall.
>> + * Alloc a PFN from private device memory pool. If @pagein is true,
>> + * copy page from normal memory to secure memory using UV_PAGE_IN uvcall.
>>    */
>> -static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
>> -		   unsigned long end, unsigned long gpa, struct kvm *kvm,
>> -		   unsigned long page_shift, bool *downgrade)
>> +static int kvmppc_svm_migrate_page(struct vm_area_struct *vma,
>> +		unsigned long start,
>> +		unsigned long end, unsigned long gpa, struct kvm *kvm,
>> +		unsigned long page_shift,
>> +		bool pagein)
>>   {
>>   	unsigned long src_pfn, dst_pfn = 0;
>>   	struct migrate_vma mig;
>> @@ -526,18 +563,6 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
>>   	mig.src = &src_pfn;
>>   	mig.dst = &dst_pfn;
>>   
>> -	/*
>> -	 * We come here with mmap_sem write lock held just for
>> -	 * ksm_madvise(), otherwise we only need read mmap_sem.
>> -	 * Hence downgrade to read lock once ksm_madvise() is done.
>> -	 */
> 
> Can you please retain this comment? This explains why we take write lock
> and then downgrade.
> 
>> -	ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
>> -			  MADV_UNMERGEABLE, &vma->vm_flags);
>> -	downgrade_write(&kvm->mm->mmap_sem);
>> -	*downgrade = true;
> 
> When I introduced this variable, there was a suggestion to rename it
> to "downgraded", but we were a bit late then. When you are touching
> this now, can you rename it appropriately?
> 
>> -	if (ret)
>> -		return ret;
>> -
>>   	ret = migrate_vma_setup(&mig);
>>   	if (ret)
>>   		return ret;
>> @@ -553,11 +578,16 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
>>   		goto out_finalize;
>>   	}
>>   
>> -	pfn = *mig.src >> MIGRATE_PFN_SHIFT;
>> -	spage = migrate_pfn_to_page(*mig.src);
>> -	if (spage)
>> -		uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0,
>> -			   page_shift);
>> +	if (pagein) {
>> +		pfn = *mig.src >> MIGRATE_PFN_SHIFT;
>> +		spage = migrate_pfn_to_page(*mig.src);
>> +		if (spage) {
>> +			ret = uv_page_in(kvm->arch.lpid, pfn << page_shift,
>> +					gpa, 0, page_shift);
>> +			if (ret)
>> +				goto out_finalize;
>> +		}
>> +	}
>>   
>>   	*mig.dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
>>   	migrate_vma_pages(&mig);
>> @@ -566,6 +596,66 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
>>   	return ret;
>>   }
>>   
>> +int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
>> +		const struct kvm_memory_slot *memslot)
>> +{
>> +	unsigned long gfn = memslot->base_gfn;
>> +	unsigned long end;
>> +	bool downgrade = false;
>> +	struct vm_area_struct *vma;
>> +	int i, ret = 0;
>> +	unsigned long start = gfn_to_hva(kvm, gfn);
>> +
>> +	if (kvm_is_error_hva(start))
>> +		return H_STATE;
>> +
>> +	end = start + (memslot->npages << PAGE_SHIFT);
>> +
>> +	down_write(&kvm->mm->mmap_sem);
>> +
>> +	mutex_lock(&kvm->arch.uvmem_lock);
>> +	vma = find_vma_intersection(kvm->mm, start, end);
>> +	if (!vma || vma->vm_start > start || vma->vm_end < end) {
>> +		ret = H_STATE;
>> +		goto out_unlock;
>> +	}
>> +
>> +	ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
>> +			  MADV_UNMERGEABLE, &vma->vm_flags);
>> +	downgrade_write(&kvm->mm->mmap_sem);
>> +	downgrade = true;
>> +	if (ret) {
>> +		ret = H_STATE;
>> +		goto out_unlock;
>> +	}
>> +
>> +	for (i = 0; i < memslot->npages; i++, ++gfn) {
>> +		/*
>> +		 * skip GFNs that have already tranistioned.
>> +		 * paged-in GFNs, shared GFNs, paged-in GFNs
>> +		 * that were later paged-out.
>> +		 */
>> +		if (kvmppc_gfn_has_transitioned(gfn, kvm))
>> +			continue;
>> +
>> +		start = gfn_to_hva(kvm, gfn);
>> +		end = start + (1UL << PAGE_SHIFT);
>> +		ret = kvmppc_svm_migrate_page(vma, start, end,
>> +			(gfn << PAGE_SHIFT), kvm, PAGE_SHIFT, false);
>> +
> 
> As I said last time, you are assuming that the vma that you obtained
> in the beginning actually spans the entire memslot range. This might
> be true as you haven't found any issues during testing, but I feel it
> is better if there is no such implicit assumption in the code here.

I agree that assumptions are sometimes not good for future work, but here the 
mmap_sem is held, and the VMA's boundaries have already been checked, so how 
could the VMA not spans over the memslot's range?

Am I missing something?

Cheers,
Laurent.

^ permalink raw reply

* Re: [PATCH V3 0/4] mm/debug_vm_pgtable: Add some more tests
From: Anshuman Khandual @ 2020-06-29  8:32 UTC (permalink / raw)
  To: Alexander Gordeev, Gerald Schaefer
  Cc: linux-doc, Heiko Carstens, linux-mm, Paul Mackerras,
	H. Peter Anvin, linux-riscv, Will Deacon, linux-arch, linux-s390,
	Jonathan Corbet, x86, Mike Rapoport, Christian Borntraeger,
	Ingo Molnar, ziy, Catalin Marinas, linux-snps-arc, Vasily Gorbik,
	Borislav Petkov, Paul Walmsley, Kirill A . Shutemov,
	Thomas Gleixner, linux-arm-kernel, christophe.leroy, Vineet Gupta,
	linux-kernel, Palmer Dabbelt, Andrew Morton, linuxppc-dev
In-Reply-To: <20200624144015.GD24934@oc3871087118.ibm.com>



On 06/24/2020 08:10 PM, Alexander Gordeev wrote:
> On Wed, Jun 24, 2020 at 01:48:08PM +0200, Gerald Schaefer wrote:
>> On Wed, 24 Jun 2020 13:05:39 +0200
>> Alexander Gordeev <agordeev@linux.ibm.com> wrote:
>>
>>> On Wed, Jun 24, 2020 at 08:43:10AM +0530, Anshuman Khandual wrote:
>>>
>>> [...]
>>>
>>>> Hello Gerald/Christophe/Vineet,
>>>>
>>>> It would be really great if you could give this series a quick test
>>>> on s390/ppc/arc platforms respectively. Thank you.
>>>
>>> That worked for me with the default and debug s390 configurations.
>>> Would you like to try with some particular options or combinations
>>> of the options?
>>
>> It will be enabled automatically on all archs that set
>> ARCH_HAS_DEBUG_VM_PGTABLE, which we do for s390 unconditionally.
>> Also, DEBUG_VM has to be set, which we have only in the debug config.
>> So only the s390 debug config will have it enabled, you can check
>> dmesg for "debug_vm_pgtable" to see when / where it was run, and if it
>> triggered any warnings.
> 
> Yes, that is what I did ;)
> 
> I should have been more clear. I wonder whether Anshuman has in
> mind other options which possibly makes sense to set or unset
> and check how it goes with non-standard configurations.

After enabling CONFIG_DEBUG_VM either explicitly or via DEBUG_VM, ideally
any memory config combination on s390 which can change platform page table
helpers (validated with CONFIG_DEBUG_VM) should also get tested. Recently,
there was a kernel crash on ppc64 [1] and a build failure on ppc32 [2] for
some particular configs. Hence it will be great if you could run this test
on multiple s390 configurations.

[1] 787d563b8642f35c5 ("mm/debug_vm_pgtable: fix kernel crash by checking for THP support")
[2] 9449c9cb420b249eb ("mm/debug_vm_pgtable: fix build failure with powerpc 8xx")

- Anshuman

^ permalink raw reply

* [PATCH V4 2/3] cpufreq: Register governors at core_initcall
From: Viresh Kumar @ 2020-06-29  8:24 UTC (permalink / raw)
  To: Rafael Wysocki, Arnd Bergmann, Viresh Kumar, Ingo Molnar,
	Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman
  Cc: Quentin Perret, kernel-team, adharmap, linux-pm, linux-kernel,
	linuxppc-dev, tkjos
In-Reply-To: <cover.1593418662.git.viresh.kumar@linaro.org>

From: Quentin Perret <qperret@google.com>

Currently, most CPUFreq governors are registered at core_initcall time
when used as default, and module_init otherwise. In preparation for
letting users specify the default governor on the kernel command line,
change all of them to use core_initcall unconditionally, as is already
the case for schedutil and performance. This will enable us to assume
builtin governors have been registered before the builtin CPUFreq
drivers probe.

And since all governors now have similar init/exit patterns, introduce
two new macros cpufreq_governor_{init,exit}() to factorize the code.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Quentin Perret <qperret@google.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 .../platforms/cell/cpufreq_spudemand.c        | 26 ++-----------------
 drivers/cpufreq/cpufreq_conservative.c        | 22 ++++------------
 drivers/cpufreq/cpufreq_ondemand.c            | 24 +++++------------
 drivers/cpufreq/cpufreq_performance.c         | 14 ++--------
 drivers/cpufreq/cpufreq_powersave.c           | 18 +++----------
 drivers/cpufreq/cpufreq_userspace.c           | 18 +++----------
 include/linux/cpufreq.h                       | 14 ++++++++++
 kernel/sched/cpufreq_schedutil.c              |  6 +----
 8 files changed, 36 insertions(+), 106 deletions(-)

diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c
index 55b31eadb3c8..ca7849e113d7 100644
--- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c
+++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c
@@ -126,30 +126,8 @@ static struct cpufreq_governor spu_governor = {
 	.stop = spu_gov_stop,
 	.owner = THIS_MODULE,
 };
-
-/*
- * module init and destoy
- */
-
-static int __init spu_gov_init(void)
-{
-	int ret;
-
-	ret = cpufreq_register_governor(&spu_governor);
-	if (ret)
-		printk(KERN_ERR "registration of governor failed\n");
-	return ret;
-}
-
-static void __exit spu_gov_exit(void)
-{
-	cpufreq_unregister_governor(&spu_governor);
-}
-
-
-module_init(spu_gov_init);
-module_exit(spu_gov_exit);
+cpufreq_governor_init(spu_governor);
+cpufreq_governor_exit(spu_governor);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Christian Krafft <krafft@de.ibm.com>");
-
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 737ff3b9c2c0..aa39ff31ec9f 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -322,17 +322,7 @@ static struct dbs_governor cs_governor = {
 	.start = cs_start,
 };
 
-#define CPU_FREQ_GOV_CONSERVATIVE	(&cs_governor.gov)
-
-static int __init cpufreq_gov_dbs_init(void)
-{
-	return cpufreq_register_governor(CPU_FREQ_GOV_CONSERVATIVE);
-}
-
-static void __exit cpufreq_gov_dbs_exit(void)
-{
-	cpufreq_unregister_governor(CPU_FREQ_GOV_CONSERVATIVE);
-}
+#define CPU_FREQ_GOV_CONSERVATIVE	(cs_governor.gov)
 
 MODULE_AUTHOR("Alexander Clouter <alex@digriz.org.uk>");
 MODULE_DESCRIPTION("'cpufreq_conservative' - A dynamic cpufreq governor for "
@@ -343,11 +333,9 @@ MODULE_LICENSE("GPL");
 #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
 struct cpufreq_governor *cpufreq_default_governor(void)
 {
-	return CPU_FREQ_GOV_CONSERVATIVE;
+	return &CPU_FREQ_GOV_CONSERVATIVE;
 }
-
-core_initcall(cpufreq_gov_dbs_init);
-#else
-module_init(cpufreq_gov_dbs_init);
 #endif
-module_exit(cpufreq_gov_dbs_exit);
+
+cpufreq_governor_init(CPU_FREQ_GOV_CONSERVATIVE);
+cpufreq_governor_exit(CPU_FREQ_GOV_CONSERVATIVE);
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 82a4d37ddecb..ac361a8b1d3b 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -408,7 +408,7 @@ static struct dbs_governor od_dbs_gov = {
 	.start = od_start,
 };
 
-#define CPU_FREQ_GOV_ONDEMAND	(&od_dbs_gov.gov)
+#define CPU_FREQ_GOV_ONDEMAND	(od_dbs_gov.gov)
 
 static void od_set_powersave_bias(unsigned int powersave_bias)
 {
@@ -429,7 +429,7 @@ static void od_set_powersave_bias(unsigned int powersave_bias)
 			continue;
 
 		policy = cpufreq_cpu_get_raw(cpu);
-		if (!policy || policy->governor != CPU_FREQ_GOV_ONDEMAND)
+		if (!policy || policy->governor != &CPU_FREQ_GOV_ONDEMAND)
 			continue;
 
 		policy_dbs = policy->governor_data;
@@ -461,16 +461,6 @@ void od_unregister_powersave_bias_handler(void)
 }
 EXPORT_SYMBOL_GPL(od_unregister_powersave_bias_handler);
 
-static int __init cpufreq_gov_dbs_init(void)
-{
-	return cpufreq_register_governor(CPU_FREQ_GOV_ONDEMAND);
-}
-
-static void __exit cpufreq_gov_dbs_exit(void)
-{
-	cpufreq_unregister_governor(CPU_FREQ_GOV_ONDEMAND);
-}
-
 MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>");
 MODULE_AUTHOR("Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>");
 MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for "
@@ -480,11 +470,9 @@ MODULE_LICENSE("GPL");
 #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
 struct cpufreq_governor *cpufreq_default_governor(void)
 {
-	return CPU_FREQ_GOV_ONDEMAND;
+	return &CPU_FREQ_GOV_ONDEMAND;
 }
-
-core_initcall(cpufreq_gov_dbs_init);
-#else
-module_init(cpufreq_gov_dbs_init);
 #endif
-module_exit(cpufreq_gov_dbs_exit);
+
+cpufreq_governor_init(CPU_FREQ_GOV_ONDEMAND);
+cpufreq_governor_exit(CPU_FREQ_GOV_ONDEMAND);
diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c
index def9afe0f5b8..71c1d9aba772 100644
--- a/drivers/cpufreq/cpufreq_performance.c
+++ b/drivers/cpufreq/cpufreq_performance.c
@@ -23,16 +23,6 @@ static struct cpufreq_governor cpufreq_gov_performance = {
 	.limits		= cpufreq_gov_performance_limits,
 };
 
-static int __init cpufreq_gov_performance_init(void)
-{
-	return cpufreq_register_governor(&cpufreq_gov_performance);
-}
-
-static void __exit cpufreq_gov_performance_exit(void)
-{
-	cpufreq_unregister_governor(&cpufreq_gov_performance);
-}
-
 #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE
 struct cpufreq_governor *cpufreq_default_governor(void)
 {
@@ -50,5 +40,5 @@ MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
 MODULE_DESCRIPTION("CPUfreq policy governor 'performance'");
 MODULE_LICENSE("GPL");
 
-core_initcall(cpufreq_gov_performance_init);
-module_exit(cpufreq_gov_performance_exit);
+cpufreq_governor_init(cpufreq_gov_performance);
+cpufreq_governor_exit(cpufreq_gov_performance);
diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c
index 1ae66019eb83..7749522355b5 100644
--- a/drivers/cpufreq/cpufreq_powersave.c
+++ b/drivers/cpufreq/cpufreq_powersave.c
@@ -23,16 +23,6 @@ static struct cpufreq_governor cpufreq_gov_powersave = {
 	.owner		= THIS_MODULE,
 };
 
-static int __init cpufreq_gov_powersave_init(void)
-{
-	return cpufreq_register_governor(&cpufreq_gov_powersave);
-}
-
-static void __exit cpufreq_gov_powersave_exit(void)
-{
-	cpufreq_unregister_governor(&cpufreq_gov_powersave);
-}
-
 MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
 MODULE_DESCRIPTION("CPUfreq policy governor 'powersave'");
 MODULE_LICENSE("GPL");
@@ -42,9 +32,7 @@ struct cpufreq_governor *cpufreq_default_governor(void)
 {
 	return &cpufreq_gov_powersave;
 }
-
-core_initcall(cpufreq_gov_powersave_init);
-#else
-module_init(cpufreq_gov_powersave_init);
 #endif
-module_exit(cpufreq_gov_powersave_exit);
+
+cpufreq_governor_init(cpufreq_gov_powersave);
+cpufreq_governor_exit(cpufreq_gov_powersave);
diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c
index b43e7cd502c5..50a4d7846580 100644
--- a/drivers/cpufreq/cpufreq_userspace.c
+++ b/drivers/cpufreq/cpufreq_userspace.c
@@ -126,16 +126,6 @@ static struct cpufreq_governor cpufreq_gov_userspace = {
 	.owner		= THIS_MODULE,
 };
 
-static int __init cpufreq_gov_userspace_init(void)
-{
-	return cpufreq_register_governor(&cpufreq_gov_userspace);
-}
-
-static void __exit cpufreq_gov_userspace_exit(void)
-{
-	cpufreq_unregister_governor(&cpufreq_gov_userspace);
-}
-
 MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>, "
 		"Russell King <rmk@arm.linux.org.uk>");
 MODULE_DESCRIPTION("CPUfreq policy governor 'userspace'");
@@ -146,9 +136,7 @@ struct cpufreq_governor *cpufreq_default_governor(void)
 {
 	return &cpufreq_gov_userspace;
 }
-
-core_initcall(cpufreq_gov_userspace_init);
-#else
-module_init(cpufreq_gov_userspace_init);
 #endif
-module_exit(cpufreq_gov_userspace_exit);
+
+cpufreq_governor_init(cpufreq_gov_userspace);
+cpufreq_governor_exit(cpufreq_gov_userspace);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 3494f6763597..e62b022cb07e 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -577,6 +577,20 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy);
 int cpufreq_register_governor(struct cpufreq_governor *governor);
 void cpufreq_unregister_governor(struct cpufreq_governor *governor);
 
+#define cpufreq_governor_init(__governor)			\
+static int __init __governor##_init(void)			\
+{								\
+	return cpufreq_register_governor(&__governor);	\
+}								\
+core_initcall(__governor##_init)
+
+#define cpufreq_governor_exit(__governor)			\
+static void __exit __governor##_exit(void)			\
+{								\
+	return cpufreq_unregister_governor(&__governor);	\
+}								\
+module_exit(__governor##_exit)
+
 struct cpufreq_governor *cpufreq_default_governor(void);
 struct cpufreq_governor *cpufreq_fallback_governor(void);
 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7fbaee24c824..402a09af9f43 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -909,11 +909,7 @@ struct cpufreq_governor *cpufreq_default_governor(void)
 }
 #endif
 
-static int __init sugov_register(void)
-{
-	return cpufreq_register_governor(&schedutil_gov);
-}
-core_initcall(sugov_register);
+cpufreq_governor_init(schedutil_gov);
 
 #ifdef CONFIG_ENERGY_MODEL
 extern bool sched_energy_update;
-- 
2.25.0.rc1.19.g042ed3e048af


^ permalink raw reply related

* [PATCH V4 0/3] cpufreq: Allow default governor on cmdline and fix locking issues
From: Viresh Kumar @ 2020-06-29  8:24 UTC (permalink / raw)
  To: Rafael Wysocki, Arnd Bergmann, Ben Segall, Dietmar Eggemann,
	Ingo Molnar, Jonathan Corbet, Juri Lelli, Mel Gorman,
	Peter Zijlstra, Steven Rostedt, Vincent Guittot, Viresh Kumar
  Cc: linux-doc, linuxppc-dev, adharmap, linux-pm, linux-kernel,
	Quentin Perret, kernel-team, tkjos

Hi,

I have picked Quentin's series over my patch, modified both and tested.

V3->V4:
- Do __module_get() for cpufreq_default_governor() case as well and get
  rid of an extra variable.
- Use a single character array, default_governor, instead of two of them.

V2->V3:
- default_governor is a string now and we don't set it on governor
  registration or unregistration anymore.
- Fixed locking issues in cpufreq_init_policy().

--
Viresh

Original cover letter fro Quentin:

This series enables users of prebuilt kernels (e.g. distro kernels) to
specify their CPUfreq governor of choice using the kernel command line,
instead of having to wait for the system to fully boot to userspace to
switch using the sysfs interface. This is helpful for 2 reasons:
  1. users get to choose the governor that runs during the actual boot;
  2. it simplifies the userspace boot procedure a bit (one less thing to
     worry about).

To enable this, the first patch moves all governor init calls to
core_initcall, to make sure they are registered by the time the drivers
probe. This should be relatively low impact as registering a governor
is a simple procedure (it gets added to a llist), and all governors
already load at core_initcall anyway when they're set as the default
in Kconfig. This also allows to clean-up the governors' init/exit code,
and reduces boilerplate.

The second patch introduces the new command line parameter, inspired by
its cpuidle counterpart. More details can be found in the respective
patch headers.

Changes in v2:
 - added Viresh's ack to patch 01
 - moved the assignment of 'default_governor' in patch 02 to the governor
   registration path instead of the driver registration (Viresh)

Quentin Perret (2):
  cpufreq: Register governors at core_initcall
  cpufreq: Specify default governor on command line

Viresh Kumar (1):
  cpufreq: Fix locking issues with governors

 .../admin-guide/kernel-parameters.txt         |  5 ++
 Documentation/admin-guide/pm/cpufreq.rst      |  6 +-
 .../platforms/cell/cpufreq_spudemand.c        | 26 +-----
 drivers/cpufreq/cpufreq.c                     | 87 ++++++++++++-------
 drivers/cpufreq/cpufreq_conservative.c        | 22 ++---
 drivers/cpufreq/cpufreq_ondemand.c            | 24 ++---
 drivers/cpufreq/cpufreq_performance.c         | 14 +--
 drivers/cpufreq/cpufreq_powersave.c           | 18 +---
 drivers/cpufreq/cpufreq_userspace.c           | 18 +---
 include/linux/cpufreq.h                       | 14 +++
 kernel/sched/cpufreq_schedutil.c              |  6 +-
 11 files changed, 100 insertions(+), 140 deletions(-)

-- 
2.25.0.rc1.19.g042ed3e048af


^ permalink raw reply

* Re: [PATCH V3 2/4] mm/debug_vm_pgtable: Add tests validating advanced arch page table helpers
From: Anshuman Khandual @ 2020-06-29  8:15 UTC (permalink / raw)
  To: Christophe Leroy, linux-mm
  Cc: Heiko Carstens, Paul Mackerras, H. Peter Anvin, linux-riscv,
	Will Deacon, linux-arch, linux-s390, x86, Mike Rapoport,
	Christian Borntraeger, Ingo Molnar, gerald.schaefer, ziy,
	Catalin Marinas, linux-snps-arc, Vasily Gorbik, Borislav Petkov,
	Paul Walmsley, Kirill A . Shutemov, Thomas Gleixner,
	linux-arm-kernel, Vineet Gupta, linux-kernel, Palmer Dabbelt,
	Andrew Morton, linuxppc-dev
In-Reply-To: <4da41eee-5ce0-2a5e-40eb-4424655b3489@csgroup.eu>



On 06/27/2020 12:56 PM, Christophe Leroy wrote:
> 
> 
> Le 15/06/2020 à 05:37, Anshuman Khandual a écrit :
>> This adds new tests validating for these following arch advanced page table
>> helpers. These tests create and test specific mapping types at various page
>> table levels.
>>
>> 1. pxxp_set_wrprotect()
>> 2. pxxp_get_and_clear()
>> 3. pxxp_set_access_flags()
>> 4. pxxp_get_and_clear_full()
>> 5. pxxp_test_and_clear_young()
>> 6. pxx_leaf()
>> 7. pxx_set_huge()
>> 8. pxx_(clear|mk)_savedwrite()
>> 9. huge_pxxp_xxx()
>>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Mike Rapoport <rppt@linux.ibm.com>
>> Cc: Vineet Gupta <vgupta@synopsys.com>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>> Cc: Paul Mackerras <paulus@samba.org>
>> Cc: Michael Ellerman <mpe@ellerman.id.au>
>> Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
>> Cc: Vasily Gorbik <gor@linux.ibm.com>
>> Cc: Christian Borntraeger <borntraeger@de.ibm.com>
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: Borislav Petkov <bp@alien8.de>
>> Cc: "H. Peter Anvin" <hpa@zytor.com>
>> Cc: Kirill A. Shutemov <kirill@shutemov.name>
>> Cc: Paul Walmsley <paul.walmsley@sifive.com>
>> Cc: Palmer Dabbelt <palmer@dabbelt.com>
>> Cc: linux-snps-arc@lists.infradead.org
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linuxppc-dev@lists.ozlabs.org
>> Cc: linux-s390@vger.kernel.org
>> Cc: linux-riscv@lists.infradead.org
>> Cc: x86@kernel.org
>> Cc: linux-mm@kvack.org
>> Cc: linux-arch@vger.kernel.org
>> Cc: linux-kernel@vger.kernel.org
>> Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>>   mm/debug_vm_pgtable.c | 306 ++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 306 insertions(+)
>>
>> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
>> index ffa163d4c63c..e3f9f8317a98 100644
>> --- a/mm/debug_vm_pgtable.c
>> +++ b/mm/debug_vm_pgtable.c
>> @@ -21,6 +21,7 @@
>>   #include <linux/module.h>
>>   #include <linux/pfn_t.h>
>>   #include <linux/printk.h>
>> +#include <linux/pgtable.h>
>>   #include <linux/random.h>
>>   #include <linux/spinlock.h>
>>   #include <linux/swap.h>
>> @@ -28,6 +29,7 @@
>>   #include <linux/start_kernel.h>
>>   #include <linux/sched/mm.h>
>>   #include <asm/pgalloc.h>
>> +#include <asm/tlbflush.h>
>>     #define VMFLAGS    (VM_READ|VM_WRITE|VM_EXEC)
>>   @@ -55,6 +57,54 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
>>       WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
>>   }
>>   +static void __init pte_advanced_tests(struct mm_struct *mm,
>> +            struct vm_area_struct *vma, pte_t *ptep,
>> +            unsigned long pfn, unsigned long vaddr, pgprot_t prot)
> 
> Align args properly.
> 
>> +{
>> +    pte_t pte = pfn_pte(pfn, prot);
>> +
>> +    pte = pfn_pte(pfn, prot);
>> +    set_pte_at(mm, vaddr, ptep, pte);
>> +    ptep_set_wrprotect(mm, vaddr, ptep);
>> +    pte = READ_ONCE(*ptep);
>> +    WARN_ON(pte_write(pte));
>> +
>> +    pte = pfn_pte(pfn, prot);
>> +    set_pte_at(mm, vaddr, ptep, pte);
>> +    ptep_get_and_clear(mm, vaddr, ptep);
>> +    pte = READ_ONCE(*ptep);
>> +    WARN_ON(!pte_none(pte));
>> +
>> +    pte = pfn_pte(pfn, prot);
>> +    pte = pte_wrprotect(pte);
>> +    pte = pte_mkclean(pte);
>> +    set_pte_at(mm, vaddr, ptep, pte);
>> +    pte = pte_mkwrite(pte);
>> +    pte = pte_mkdirty(pte);
>> +    ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
>> +    pte = READ_ONCE(*ptep);
>> +    WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
>> +
>> +    pte = pfn_pte(pfn, prot);
>> +    set_pte_at(mm, vaddr, ptep, pte);
>> +    ptep_get_and_clear_full(mm, vaddr, ptep, 1);
>> +    pte = READ_ONCE(*ptep);
>> +    WARN_ON(!pte_none(pte));
>> +
>> +    pte = pte_mkyoung(pte);
>> +    set_pte_at(mm, vaddr, ptep, pte);
>> +    ptep_test_and_clear_young(vma, vaddr, ptep);
>> +    pte = READ_ONCE(*ptep);
>> +    WARN_ON(pte_young(pte));
>> +}
>> +
>> +static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
>> +{
>> +    pte_t pte = pfn_pte(pfn, prot);
>> +
>> +    WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
>> +    WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
>> +}
>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>   static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
>>   {
>> @@ -77,6 +127,89 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
>>       WARN_ON(!pmd_bad(pmd_mkhuge(pmd)));
>>   }
>>   +static void __init pmd_advanced_tests(struct mm_struct *mm,
>> +        struct vm_area_struct *vma, pmd_t *pmdp,
>> +        unsigned long pfn, unsigned long vaddr, pgprot_t prot)
> 
> Align args properly
> 
>> +{
>> +    pmd_t pmd = pfn_pmd(pfn, prot);
>> +
>> +    if (!has_transparent_hugepage())
>> +        return;
>> +
>> +    /* Align the address wrt HPAGE_PMD_SIZE */
>> +    vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
>> +
>> +    pmd = pfn_pmd(pfn, prot);
>> +    set_pmd_at(mm, vaddr, pmdp, pmd);
>> +    pmdp_set_wrprotect(mm, vaddr, pmdp);
>> +    pmd = READ_ONCE(*pmdp);
>> +    WARN_ON(pmd_write(pmd));
>> +
>> +    pmd = pfn_pmd(pfn, prot);
>> +    set_pmd_at(mm, vaddr, pmdp, pmd);
>> +    pmdp_huge_get_and_clear(mm, vaddr, pmdp);
>> +    pmd = READ_ONCE(*pmdp);
>> +    WARN_ON(!pmd_none(pmd));
>> +
>> +    pmd = pfn_pmd(pfn, prot);
>> +    pmd = pmd_wrprotect(pmd);
>> +    pmd = pmd_mkclean(pmd);
>> +    set_pmd_at(mm, vaddr, pmdp, pmd);
>> +    pmd = pmd_mkwrite(pmd);
>> +    pmd = pmd_mkdirty(pmd);
>> +    pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
>> +    pmd = READ_ONCE(*pmdp);
>> +    WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
>> +
>> +    pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
>> +    set_pmd_at(mm, vaddr, pmdp, pmd);
>> +    pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
>> +    pmd = READ_ONCE(*pmdp);
>> +    WARN_ON(!pmd_none(pmd));
>> +
>> +    pmd = pmd_mkyoung(pmd);
>> +    set_pmd_at(mm, vaddr, pmdp, pmd);
>> +    pmdp_test_and_clear_young(vma, vaddr, pmdp);
>> +    pmd = READ_ONCE(*pmdp);
>> +    WARN_ON(pmd_young(pmd));
>> +}
>> +
>> +static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
>> +{
>> +    pmd_t pmd = pfn_pmd(pfn, prot);
>> +
>> +    /*
>> +     * PMD based THP is a leaf entry.
>> +     */
>> +    pmd = pmd_mkhuge(pmd);
>> +    WARN_ON(!pmd_leaf(pmd));
>> +}
>> +
>> +static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
>> +{
>> +    pmd_t pmd;
>> +
>> +    if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
>> +        return;
>> +    /*
>> +     * X86 defined pmd_set_huge() verifies that the given
>> +     * PMD is not a populated non-leaf entry.
>> +     */
>> +    WRITE_ONCE(*pmdp, __pmd(0));
>> +    WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
>> +    WARN_ON(!pmd_clear_huge(pmdp));
>> +    pmd = READ_ONCE(*pmdp);
>> +    WARN_ON(!pmd_none(pmd));
>> +}
>> +
>> +static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
>> +{
>> +    pmd_t pmd = pfn_pmd(pfn, prot);
>> +
>> +    WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
>> +    WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
>> +}
>> +
>>   #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>>   static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
>>   {
>> @@ -100,12 +233,115 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
>>        */
>>       WARN_ON(!pud_bad(pud_mkhuge(pud)));
>>   }
>> +
>> +static void pud_advanced_tests(struct mm_struct *mm,
>> +        struct vm_area_struct *vma, pud_t *pudp,
>> +        unsigned long pfn, unsigned long vaddr, pgprot_t prot)
> 
> Align args properly
> 
>> +{
>> +    pud_t pud = pfn_pud(pfn, prot);
>> +
>> +    if (!has_transparent_hugepage())
>> +        return;
>> +
>> +    /* Align the address wrt HPAGE_PUD_SIZE */
>> +    vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
>> +
>> +    set_pud_at(mm, vaddr, pudp, pud);
>> +    pudp_set_wrprotect(mm, vaddr, pudp);
>> +    pud = READ_ONCE(*pudp);
>> +    WARN_ON(pud_write(pud));
>> +
>> +#ifndef __PAGETABLE_PMD_FOLDED
>> +    pud = pfn_pud(pfn, prot);
>> +    set_pud_at(mm, vaddr, pudp, pud);
>> +    pudp_huge_get_and_clear(mm, vaddr, pudp);
>> +    pud = READ_ONCE(*pudp);
>> +    WARN_ON(!pud_none(pud));
>> +
>> +    pud = pfn_pud(pfn, prot);
>> +    set_pud_at(mm, vaddr, pudp, pud);
>> +    pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
>> +    pud = READ_ONCE(*pudp);
>> +    WARN_ON(!pud_none(pud));
>> +#endif /* __PAGETABLE_PMD_FOLDED */
>> +    pud = pfn_pud(pfn, prot);
>> +    pud = pud_wrprotect(pud);
>> +    pud = pud_mkclean(pud);
>> +    set_pud_at(mm, vaddr, pudp, pud);
>> +    pud = pud_mkwrite(pud);
>> +    pud = pud_mkdirty(pud);
>> +    pudp_set_access_flags(vma, vaddr, pudp, pud, 1);
>> +    pud = READ_ONCE(*pudp);
>> +    WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
>> +
>> +    pud = pud_mkyoung(pud);
>> +    set_pud_at(mm, vaddr, pudp, pud);
>> +    pudp_test_and_clear_young(vma, vaddr, pudp);
>> +    pud = READ_ONCE(*pudp);
>> +    WARN_ON(pud_young(pud));
>> +}
>> +
>> +static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
>> +{
>> +    pud_t pud = pfn_pud(pfn, prot);
>> +
>> +    /*
>> +     * PUD based THP is a leaf entry.
>> +     */
>> +    pud = pud_mkhuge(pud);
>> +    WARN_ON(!pud_leaf(pud));
>> +}
>> +
>> +static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
>> +{
>> +    pud_t pud;
>> +
>> +    if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
>> +        return;
>> +    /*
>> +     * X86 defined pud_set_huge() verifies that the given
>> +     * PUD is not a populated non-leaf entry.
>> +     */
>> +    WRITE_ONCE(*pudp, __pud(0));
>> +    WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
>> +    WARN_ON(!pud_clear_huge(pudp));
>> +    pud = READ_ONCE(*pudp);
>> +    WARN_ON(!pud_none(pud));
>> +}
>>   #else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>>   static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
>> +static void pud_advanced_tests(struct mm_struct *mm,
>> +        struct vm_area_struct *vma, pud_t *pudp,
>> +        unsigned long pfn, unsigned long vaddr, pgprot_t prot)
> 
> Align args properly
> 
>> +{
>> +}
>> +static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
>> +static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
>> +{
>> +}
>>   #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>>   #else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
>>   static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { }
>>   static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
>> +static void __init pmd_advanced_tests(struct mm_struct *mm,
>> +        struct vm_area_struct *vma, pmd_t *pmdp,
>> +        unsigned long pfn, unsigned long vaddr, pgprot_t prot)
> 
> Align args properly
> 
>> +{
>> +}
>> +static void __init pud_advanced_tests(struct mm_struct *mm,
>> +        struct vm_area_struct *vma, pud_t *pudp,
>> +        unsigned long pfn, unsigned long vaddr, pgprot_t prot)
> 
> Align args properly
> 

Sure, will fix the arguments alignment in the above mentioned places.

^ permalink raw reply

* Re: [PATCH RFC 1/1] powerpc/eeh: Synchronization for safety
From: Oliver O'Halloran @ 2020-06-29  8:11 UTC (permalink / raw)
  To: Sam Bobroff, linuxppc-dev
In-Reply-To: <049418257caa232ca3e27ee8cbab9b4ff38b11aa.1585550388.git.sbobroff@linux.ibm.com>

On Mon, 2020-03-30 at 17:39 +1100, Sam Bobroff wrote:
> There is currently little synchronization between EEH error detection
> (eeh_dev_check_failure()), EEH error recovery
> (eeh_handle_{normal,special}_event()) and the PCI subsystem (device
> addition and removal), and so there are race conditions that lead to
> crashes (often access to free'd memory or LIST_POISON).
> 
> However, a solution must consider:
> - EEH error detection can occur in interrupt context, which prevents
> the use of a mutex.
> - EEH recovery may need to sleep, which prevents the use of a spinlock.
> - EEH recovery uses PCI operations that may require the PCI
> rescan/remove lock and/or device lock to be held
> - PCI operations may hold the rescan/remove and/or device lock when
> calling into EEH functions.
> - Device driver callbacks may perform arbitrary PCI operations
> during recovery, including device removal.
> 
> In this patch the existing mutex and spinlock are combined with the
> EEH_PE_RECOVERING flag to provide some assurances that are then used
> to reduce the race conditions.

> The fields to be protected are the ones that provide the structure
> of the trees of struct eeh_pe that are held for each PHB: the parent
> pointer and child lists and the list of struct eeh_dev, as well as
> the pe and pdev pointers within struct eeh_dev.

This needs to be explicitly documented in the source tree. Preferably
next to the fields they protect in eeh_pe and eeh_dev.


> - I'm not aware of any outstanding problems with the set, but I've kept it RFC
>   for now becuase I'm looking for comments on the general strategy and
>   direction -- is this a good way to achieve some safety?

The two-locks idea seems like the sort of thing that's easy to screw up
and it's not really obvious what it's doing or why. Half the problem
with EEH is that it doesn't really make sense unless you spend far too
much time gazing into that abyss and this isn't really improving
matters IMO.

The main driver seems to be the desire to dereference edev->pe from
eeh_dev_check_failure(), but... why? The way it works is technically
broken as-is since it relies on eeh_ops->get_state() correctly
reporting the PE state. That's the same function that we have that 300s
wait hack for in the recovery thread (makes you wonder what platforms
actually need that...)

We've spoken in the past about having eeh_dev_check_failure() punt the
work of checking PE states into the event handler thread. I think we'd
get more milage out of that idea rather than doing the two-locks thing.
Is there some other situation besides eeh_dev_check_failure() being
called from interrupt context that makes the spinlock necessary?


> - Good places to review carefully would be eeh_pe_report_pdev() and
>   eeh_reset_device().

> - The mutex and spinlock need better names. Suggestions welcome.

eeh_dev_mutex should just be eeh_pe_tree_mutex, or similar. If you
really think the two locks idea is the way to go then
eeh_pe_tree_spinlock would make sense too. If we're going to have two
locks covering the same things then their names should be similar.

> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 12c248a16527..30acae0d10e5 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -108,11 +108,26 @@ bool eeh_debugfs_no_recover;
>  /* Platform dependent EEH operations */
>  struct eeh_ops *eeh_ops = NULL;
>  
> -/* Lock to avoid races due to multiple reports of an error */
> +/*
> + * confirm_error_lock and eeh_dev_mutex are used together to provide
> + * safety during EEH operations.
> + *
> + * Generally, the spinlock is used in error detection where it's not possible
> + * to use a mutex or where there is potential to deadlock with the mutex, and
> + * the mutex is used during recovery and other PCI related operations. One must
> + * be held when reading and both must be held when making changes to the
> + * protected fields: eeh_pe.parent, child_list, child, edevs and eeh_dev.pe,
> + * .pdev.
> + *
> + * Lock ordering:
> + * - the PCI rescan/remove mutex (see pci_lock_rescan_remove())
> + * - the struct device lock (see device_lock())
> + * - the eeh_dev_mutex mutex (see eeh_recovery_lock())
> + * - the confirm_error_lock spinlock (see eeh_serialize_lock())
> + * - the eeh_eventlist_lock spinlock
> + */

I can't make sense of this list. I probably don't need to take the pci
rescan lock before taking the eeh event list lock, or vis-a-vis, but
that's what it's suggesting.

I'm guessing there's two seperate orderings here?


> @@ -1323,7 +1368,11 @@ void eeh_remove_device(struct pci_dev *dev)
>  	 * for the VF EEH device.
>  	 */
>  	edev->in_error = false;
> +	/* Both locks are required to make changes */
> +	eeh_recovery_must_be_locked();
> +	eeh_serialize_lock(&flags);
>  	dev->dev.archdata.edev = NULL;
> +	eeh_serialize_unlock(flags);

So pci_dev->dev.archdata.edev is also covered by eeh_serialize_lock()?


> +struct pci_dev **pdev_cache_list_create(struct eeh_pe *root)
>  {
>  	struct eeh_pe *pe;
>  	struct eeh_dev *edev, *tmp;
> +	struct pci_dev **pdevs;
> +	int i, n;
> +
> +	n = 0;
> +	eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp) {
> +		if (edev->pdev)
> +			n++;
> +	}
> +	pdevs = kmalloc(sizeof(*pdevs) * (n + 1), GFP_KERNEL);
> +	if (WARN_ON_ONCE(!pdevs))
> +		return NULL;
> +	i = 0;
> +	eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp) {
> +		if (i < n) {
> +			get_device(&edev->pdev->dev);
> +			pdevs[i++] = edev->pdev;
> +		}
> +	}
> +	if (WARN_ON_ONCE(i < n))
> +		n = i;
> +	pdevs[n] = NULL; /* terminator */
> +	return pdevs;
> +}
> +
> +static void pdev_cache_list_destroy(struct pci_dev **pdevs)
> +{
> +	struct pci_dev **pdevp;
> +
> +	for (pdevp = pdevs; pdevp && *pdevp; pdevp++)
> +		put_device(&(*pdevp)->dev);
> +	kfree(pdevs);
> +}
> +
> +static void eeh_pe_report(const char *name, struct eeh_pe *root,
> +			  eeh_report_fn fn, enum pci_ers_result *result)
> +{
> +	struct pci_dev **pdevs, **pdevp;
>  
>  	pr_info("EEH: Beginning: '%s'\n", name);
> -	eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp)
> -		eeh_pe_report_edev(edev, fn, result);
> +	/*
> +	 * It would be convenient to continue to hold the recovery lock here
> +	 * but driver callbacks can take a very long time or never return at
> +	 * all.
> +	 */
> +	pdevs = pdev_cache_list_create(root);
> +	for (pdevp = pdevs; pdevp && *pdevp; pdevp++) {
> +		/*
> +		 * NOTE! eeh_recovery_lock() is released briefly
> +		 * in eeh_pe_report_pdev()
> +		 */
> +		eeh_pe_report_pdev(*pdevp, fn, result);
> +	}
> +	pdev_cache_list_destroy(pdevs);

So the cache stuff is taking a ref to all the pci_devs under that PE. I
assume that's to keep the eeh_dev->pdev pointer valid for the whole
time we don't have the recovery_lock held? I don't really see why we
need to care though since we're not holding a reference to the pdev
itself and the removal should happen while we're in the driver
callback. Is this a hold over from when the eeh_dev would be removed
with the pci_dev was released?

> @@ -477,17 +554,44 @@ static void *eeh_add_virt_device(struct eeh_dev *edev)
>  	}
>  
>  #ifdef CONFIG_PCI_IOV
> -	pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index);
> +	{
> +		struct pci_dev *physfn = edev->physfn;
> +		int vf_index = eeh_dev_to_pdn(edev)->vf_index;
> +
> +		get_device(&physfn->dev);
> +		eeh_recovery_unlock();
> +		/*
> +		 * This PCI operation will call back into EEH code where the
> +		 * recovery lock will be acquired, so it must be released here,
> +		 * first:
> +		 */
> +		pci_iov_add_virtfn(physfn, vf_index);
> +		put_device(&physfn->dev);
> +		eeh_recovery_lock();
> +	}
>  #endif
>  	return NULL;
>  }
>  
> -static void eeh_rmv_device(struct eeh_dev *edev, void *userdata)
> +/*
> + * It's important that this function take a pdev as a parameter, because they
> + * are protected by a reference count. An edev could be lost when the recovery
> + * lock is dropped (which it must be in order to take the PCI rescan/remove
> + * lock without risking a deadlock).

Same comments as above. Even if an edev goes away does it matter? The
remove will be handled by code that properly handles the locking. Also
with the change that moved eeh_remove_device() from the
pcibios_release_device() into the bus removal notifier this won't stop
the eeh_dev from being removed.


> @@ -1118,6 +1273,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
>   * Called when an EEH event is detected but can't be narrowed down to a
>   * specific PE.  Iterates through possible failures and handles them as
>   * necessary.
> + * XXX TODO Needs to be checked sync work

stale comment or is it fine?


> @@ -445,6 +474,9 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
>  	}
>  	pe->parent = parent;
>  
> +	if (parent->state & EEH_PE_RECOVERING)
> +		pe->state |= EEH_PE_RECOVERING;
> +

What's actually required to hit this case? All I can think of is
discovering a new bridge during a post-reset rescan. A new brige would
mean a new PE would be allocated for the downstream bus.

> @@ -141,13 +144,21 @@ static ssize_t pnv_eeh_ei_write(struct file *filp,
>  	if (ret != 5)
>  		return -EINVAL;
>  
> +	/*
> +	 * Use the spinlock rather than the mutex so that errors can be
> +	 * injected during slow recovery operations (for testing).
> +	 */
> +	eeh_serialize_lock(&flags);
>  	/* Retrieve PE */
>  	pe = eeh_pe_get(hose, pe_no, 0);
> -	if (!pe)
> +	if (!pe) {
> +		eeh_serialize_unlock(flags);
>  		return -ENODEV;
> +	}
>  
>  	/* Do error injection */
>  	ret = eeh_ops->err_inject(pe, type, func, addr, mask);
> +	eeh_serialize_unlock(flags);
>  	return ret < 0 ? ret : count;
>  }

We could probably make this just look at the powernv phb state directly
and avoid touching the eeh_pe at all. The only bit of the eeh_pe that
we care about is the pe number which the user already provided.

I suppose we care about validating that the PE is in use, but it might
be useful to allow unused PEs to be injected into. That would help with
testing errors during device probe, etc.


^ permalink raw reply

* Re: [PATCH V3 2/4] mm/debug_vm_pgtable: Add tests validating advanced arch page table helpers
From: Anshuman Khandual @ 2020-06-29  8:09 UTC (permalink / raw)
  To: Christophe Leroy, linux-mm
  Cc: Heiko Carstens, Paul Mackerras, H. Peter Anvin, linux-riscv,
	Will Deacon, linux-arch, linux-s390, x86, Mike Rapoport,
	Christian Borntraeger, Ingo Molnar, linux-arm-kernel, ziy,
	Catalin Marinas, linux-snps-arc, Vasily Gorbik, Borislav Petkov,
	Paul Walmsley, Kirill A . Shutemov, Thomas Gleixner,
	gerald.schaefer, christophe.leroy, Vineet Gupta, linux-kernel,
	Palmer Dabbelt, Andrew Morton, linuxppc-dev
In-Reply-To: <6da177e6-9219-9ccf-a402-f4293c7564f7@csgroup.eu>


On 06/27/2020 12:48 PM, Christophe Leroy wrote:
> Le 15/06/2020 à 05:37, Anshuman Khandual a écrit :
>> This adds new tests validating for these following arch advanced page table
>> helpers. These tests create and test specific mapping types at various page
>> table levels.
>>
>> 1. pxxp_set_wrprotect()
>> 2. pxxp_get_and_clear()
>> 3. pxxp_set_access_flags()
>> 4. pxxp_get_and_clear_full()
>> 5. pxxp_test_and_clear_young()
>> 6. pxx_leaf()
>> 7. pxx_set_huge()
>> 8. pxx_(clear|mk)_savedwrite()
>> 9. huge_pxxp_xxx()
>>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Mike Rapoport <rppt@linux.ibm.com>
>> Cc: Vineet Gupta <vgupta@synopsys.com>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>> Cc: Paul Mackerras <paulus@samba.org>
>> Cc: Michael Ellerman <mpe@ellerman.id.au>
>> Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
>> Cc: Vasily Gorbik <gor@linux.ibm.com>
>> Cc: Christian Borntraeger <borntraeger@de.ibm.com>
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: Borislav Petkov <bp@alien8.de>
>> Cc: "H. Peter Anvin" <hpa@zytor.com>
>> Cc: Kirill A. Shutemov <kirill@shutemov.name>
>> Cc: Paul Walmsley <paul.walmsley@sifive.com>
>> Cc: Palmer Dabbelt <palmer@dabbelt.com>
>> Cc: linux-snps-arc@lists.infradead.org
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linuxppc-dev@lists.ozlabs.org
>> Cc: linux-s390@vger.kernel.org
>> Cc: linux-riscv@lists.infradead.org
>> Cc: x86@kernel.org
>> Cc: linux-mm@kvack.org
>> Cc: linux-arch@vger.kernel.org
>> Cc: linux-kernel@vger.kernel.org
>> Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>>   mm/debug_vm_pgtable.c | 306 ++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 306 insertions(+)
>>
>> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
>> index ffa163d4c63c..e3f9f8317a98 100644
>> --- a/mm/debug_vm_pgtable.c
>> +++ b/mm/debug_vm_pgtable.c
>> @@ -21,6 +21,7 @@
>>   #include <linux/module.h>
>>   #include <linux/pfn_t.h>
>>   #include <linux/printk.h>
>> +#include <linux/pgtable.h>
>>   #include <linux/random.h>
>>   #include <linux/spinlock.h>
>>   #include <linux/swap.h>
>> @@ -28,6 +29,7 @@
>>   #include <linux/start_kernel.h>
>>   #include <linux/sched/mm.h>
>>   #include <asm/pgalloc.h>
>> +#include <asm/tlbflush.h>
>>     #define VMFLAGS    (VM_READ|VM_WRITE|VM_EXEC)
>>   @@ -55,6 +57,54 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
>>       WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
>>   }
>>   +static void __init pte_advanced_tests(struct mm_struct *mm,
>> +            struct vm_area_struct *vma, pte_t *ptep,
>> +            unsigned long pfn, unsigned long vaddr, pgprot_t prot)
>> +{
>> +    pte_t pte = pfn_pte(pfn, prot);
>> +
>> +    pte = pfn_pte(pfn, prot);
>> +    set_pte_at(mm, vaddr, ptep, pte);
>> +    ptep_set_wrprotect(mm, vaddr, ptep);
>> +    pte = READ_ONCE(*ptep);
> 
> same
> 
>> +    WARN_ON(pte_write(pte));
>> +
>> +    pte = pfn_pte(pfn, prot);
>> +    set_pte_at(mm, vaddr, ptep, pte);
>> +    ptep_get_and_clear(mm, vaddr, ptep);
>> +    pte = READ_ONCE(*ptep);
> 
> same
> 
>> +    WARN_ON(!pte_none(pte));
>> +
>> +    pte = pfn_pte(pfn, prot);
>> +    pte = pte_wrprotect(pte);
>> +    pte = pte_mkclean(pte);
>> +    set_pte_at(mm, vaddr, ptep, pte);
>> +    pte = pte_mkwrite(pte);
>> +    pte = pte_mkdirty(pte);
>> +    ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
>> +    pte = READ_ONCE(*ptep);
> 
> same
> 
>> +    WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
>> +
>> +    pte = pfn_pte(pfn, prot);
>> +    set_pte_at(mm, vaddr, ptep, pte);
>> +    ptep_get_and_clear_full(mm, vaddr, ptep, 1);
>> +    pte = READ_ONCE(*ptep);
> 
> same
> 
>> +    WARN_ON(!pte_none(pte));
>> +
>> +    pte = pte_mkyoung(pte);
>> +    set_pte_at(mm, vaddr, ptep, pte);
>> +    ptep_test_and_clear_young(vma, vaddr, ptep);
>> +    pte = READ_ONCE(*ptep);
> 
> same
> 
>> +    WARN_ON(pte_young(pte));
>> +}
>> +
>> +static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
>> +{
>> +    pte_t pte = pfn_pte(pfn, prot);
>> +
>> +    WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
>> +    WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
>> +}
>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>   static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
>>   {
>> @@ -77,6 +127,89 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
>>       WARN_ON(!pmd_bad(pmd_mkhuge(pmd)));
>>   }
>>   +static void __init pmd_advanced_tests(struct mm_struct *mm,
>> +        struct vm_area_struct *vma, pmd_t *pmdp,
>> +        unsigned long pfn, unsigned long vaddr, pgprot_t prot)
>> +{
>> +    pmd_t pmd = pfn_pmd(pfn, prot);
>> +
>> +    if (!has_transparent_hugepage())
>> +        return;
>> +
>> +    /* Align the address wrt HPAGE_PMD_SIZE */
>> +    vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
>> +
>> +    pmd = pfn_pmd(pfn, prot);
>> +    set_pmd_at(mm, vaddr, pmdp, pmd);
>> +    pmdp_set_wrprotect(mm, vaddr, pmdp);
>> +    pmd = READ_ONCE(*pmdp);
>> +    WARN_ON(pmd_write(pmd));
>> +
>> +    pmd = pfn_pmd(pfn, prot);
>> +    set_pmd_at(mm, vaddr, pmdp, pmd);
>> +    pmdp_huge_get_and_clear(mm, vaddr, pmdp);
>> +    pmd = READ_ONCE(*pmdp);
>> +    WARN_ON(!pmd_none(pmd));
>> +
>> +    pmd = pfn_pmd(pfn, prot);
>> +    pmd = pmd_wrprotect(pmd);
>> +    pmd = pmd_mkclean(pmd);
>> +    set_pmd_at(mm, vaddr, pmdp, pmd);
>> +    pmd = pmd_mkwrite(pmd);
>> +    pmd = pmd_mkdirty(pmd);
>> +    pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
>> +    pmd = READ_ONCE(*pmdp);
>> +    WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
>> +
>> +    pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
>> +    set_pmd_at(mm, vaddr, pmdp, pmd);
>> +    pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
>> +    pmd = READ_ONCE(*pmdp);
>> +    WARN_ON(!pmd_none(pmd));
>> +
>> +    pmd = pmd_mkyoung(pmd);
>> +    set_pmd_at(mm, vaddr, pmdp, pmd);
>> +    pmdp_test_and_clear_young(vma, vaddr, pmdp);
>> +    pmd = READ_ONCE(*pmdp);
>> +    WARN_ON(pmd_young(pmd));
>> +}
>> +
>> +static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
>> +{
>> +    pmd_t pmd = pfn_pmd(pfn, prot);
>> +
>> +    /*
>> +     * PMD based THP is a leaf entry.
>> +     */
>> +    pmd = pmd_mkhuge(pmd);
>> +    WARN_ON(!pmd_leaf(pmd));
>> +}
>> +
>> +static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
>> +{
>> +    pmd_t pmd;
>> +
>> +    if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
>> +        return;
>> +    /*
>> +     * X86 defined pmd_set_huge() verifies that the given
>> +     * PMD is not a populated non-leaf entry.
>> +     */
>> +    WRITE_ONCE(*pmdp, __pmd(0));
>> +    WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
>> +    WARN_ON(!pmd_clear_huge(pmdp));
>> +    pmd = READ_ONCE(*pmdp);
>> +    WARN_ON(!pmd_none(pmd));
>> +}
>> +
>> +static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
>> +{
>> +    pmd_t pmd = pfn_pmd(pfn, prot);
>> +
>> +    WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
>> +    WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
>> +}
>> +
>>   #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>>   static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
>>   {
>> @@ -100,12 +233,115 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
>>        */
>>       WARN_ON(!pud_bad(pud_mkhuge(pud)));
>>   }
>> +
>> +static void pud_advanced_tests(struct mm_struct *mm,
>> +        struct vm_area_struct *vma, pud_t *pudp,
>> +        unsigned long pfn, unsigned long vaddr, pgprot_t prot)
>> +{
>> +    pud_t pud = pfn_pud(pfn, prot);
>> +
>> +    if (!has_transparent_hugepage())
>> +        return;
>> +
>> +    /* Align the address wrt HPAGE_PUD_SIZE */
>> +    vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
>> +
>> +    set_pud_at(mm, vaddr, pudp, pud);
>> +    pudp_set_wrprotect(mm, vaddr, pudp);
>> +    pud = READ_ONCE(*pudp);
>> +    WARN_ON(pud_write(pud));
>> +
>> +#ifndef __PAGETABLE_PMD_FOLDED
>> +    pud = pfn_pud(pfn, prot);
>> +    set_pud_at(mm, vaddr, pudp, pud);
>> +    pudp_huge_get_and_clear(mm, vaddr, pudp);
>> +    pud = READ_ONCE(*pudp);
>> +    WARN_ON(!pud_none(pud));
>> +
>> +    pud = pfn_pud(pfn, prot);
>> +    set_pud_at(mm, vaddr, pudp, pud);
>> +    pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
>> +    pud = READ_ONCE(*pudp);
>> +    WARN_ON(!pud_none(pud));
>> +#endif /* __PAGETABLE_PMD_FOLDED */
>> +    pud = pfn_pud(pfn, prot);
>> +    pud = pud_wrprotect(pud);
>> +    pud = pud_mkclean(pud);
>> +    set_pud_at(mm, vaddr, pudp, pud);
>> +    pud = pud_mkwrite(pud);
>> +    pud = pud_mkdirty(pud);
>> +    pudp_set_access_flags(vma, vaddr, pudp, pud, 1);
>> +    pud = READ_ONCE(*pudp);
>> +    WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
>> +
>> +    pud = pud_mkyoung(pud);
>> +    set_pud_at(mm, vaddr, pudp, pud);
>> +    pudp_test_and_clear_young(vma, vaddr, pudp);
>> +    pud = READ_ONCE(*pudp);
>> +    WARN_ON(pud_young(pud));
>> +}
>> +
>> +static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
>> +{
>> +    pud_t pud = pfn_pud(pfn, prot);
>> +
>> +    /*
>> +     * PUD based THP is a leaf entry.
>> +     */
>> +    pud = pud_mkhuge(pud);
>> +    WARN_ON(!pud_leaf(pud));
>> +}
>> +
>> +static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
>> +{
>> +    pud_t pud;
>> +
>> +    if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
>> +        return;
>> +    /*
>> +     * X86 defined pud_set_huge() verifies that the given
>> +     * PUD is not a populated non-leaf entry.
>> +     */
>> +    WRITE_ONCE(*pudp, __pud(0));
>> +    WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
>> +    WARN_ON(!pud_clear_huge(pudp));
>> +    pud = READ_ONCE(*pudp);
>> +    WARN_ON(!pud_none(pud));
>> +}
>>   #else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>>   static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
>> +static void pud_advanced_tests(struct mm_struct *mm,
>> +        struct vm_area_struct *vma, pud_t *pudp,
>> +        unsigned long pfn, unsigned long vaddr, pgprot_t prot)
>> +{
>> +}
>> +static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
>> +static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
>> +{
>> +}
>>   #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>>   #else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
>>   static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { }
>>   static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
>> +static void __init pmd_advanced_tests(struct mm_struct *mm,
>> +        struct vm_area_struct *vma, pmd_t *pmdp,
>> +        unsigned long pfn, unsigned long vaddr, pgprot_t prot)
>> +{
>> +}
>> +static void __init pud_advanced_tests(struct mm_struct *mm,
>> +        struct vm_area_struct *vma, pud_t *pudp,
>> +        unsigned long pfn, unsigned long vaddr, pgprot_t prot)
>> +{
>> +}
>> +static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
>> +static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
>> +static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
>> +{
>> +}
>> +static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
>> +{
>> +}
>> +static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
>>   #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>>     static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
>> @@ -495,8 +731,56 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
>>       WARN_ON(!pte_huge(pte_mkhuge(pte)));
>>   #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
>>   }
>> +
>> +static void __init hugetlb_advanced_tests(struct mm_struct *mm,
>> +                      struct vm_area_struct *vma,
>> +                      pte_t *ptep, unsigned long pfn,
>> +                      unsigned long vaddr, pgprot_t prot)
>> +{
>> +    struct page *page = pfn_to_page(pfn);
>> +    pte_t pte = READ_ONCE(*ptep);
> 
> Remplace with ptep_get() to avoid build failure on powerpc 8xx.

Sure, will replace all open PTE pointer accesses with ptep_get().

^ permalink raw reply

* Re: [PATCH 01/13] iommu/exynos: Use dev_iommu_priv_get/set()
From: Marek Szyprowski @ 2020-06-29  7:51 UTC (permalink / raw)
  To: Joerg Roedel, iommu
  Cc: linux-ia64, Heiko Stuebner, David Airlie, Joonas Lahtinen,
	Thierry Reding, Paul Mackerras, Will Deacon, x86, Russell King,
	Catalin Marinas, Fenghua Yu, Joerg Roedel, intel-gfx, Jani Nikula,
	Rodrigo Vivi, Matthias Brugger, linux-arm-kernel, Tony Luck,
	linuxppc-dev, linux-kernel, Daniel Vetter, David Woodhouse,
	Lu Baolu
In-Reply-To: <20200625130836.1916-2-joro@8bytes.org>

On 25.06.2020 15:08, Joerg Roedel wrote:
> From: Joerg Roedel <jroedel@suse.de>
>
> Remove the use of dev->archdata.iommu and use the private per-device
> pointer provided by IOMMU core code instead.
>
> Signed-off-by: Joerg Roedel <jroedel@suse.de>
Acked-by: Marek Szyprowski <m.szyprowski@samsung.com>
> ---
>   drivers/iommu/exynos-iommu.c                  | 20 +++++++++----------
>   .../media/platform/s5p-mfc/s5p_mfc_iommu.h    |  4 +++-
>   2 files changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
> index 60c8a56e4a3f..6a9b67302369 100644
> --- a/drivers/iommu/exynos-iommu.c
> +++ b/drivers/iommu/exynos-iommu.c
> @@ -173,7 +173,7 @@ static u32 lv2ent_offset(sysmmu_iova_t iova)
>   #define REG_V5_FAULT_AR_VA	0x070
>   #define REG_V5_FAULT_AW_VA	0x080
>   
> -#define has_sysmmu(dev)		(dev->archdata.iommu != NULL)
> +#define has_sysmmu(dev)		(dev_iommu_priv_get(dev) != NULL)
>   
>   static struct device *dma_dev;
>   static struct kmem_cache *lv2table_kmem_cache;
> @@ -226,7 +226,7 @@ static const struct sysmmu_fault_info sysmmu_v5_faults[] = {
>   };
>   
>   /*
> - * This structure is attached to dev.archdata.iommu of the master device
> + * This structure is attached to dev->iommu->priv of the master device
>    * on device add, contains a list of SYSMMU controllers defined by device tree,
>    * which are bound to given master device. It is usually referenced by 'owner'
>    * pointer.
> @@ -670,7 +670,7 @@ static int __maybe_unused exynos_sysmmu_suspend(struct device *dev)
>   	struct device *master = data->master;
>   
>   	if (master) {
> -		struct exynos_iommu_owner *owner = master->archdata.iommu;
> +		struct exynos_iommu_owner *owner = dev_iommu_priv_get(master);
>   
>   		mutex_lock(&owner->rpm_lock);
>   		if (data->domain) {
> @@ -688,7 +688,7 @@ static int __maybe_unused exynos_sysmmu_resume(struct device *dev)
>   	struct device *master = data->master;
>   
>   	if (master) {
> -		struct exynos_iommu_owner *owner = master->archdata.iommu;
> +		struct exynos_iommu_owner *owner = dev_iommu_priv_get(master);
>   
>   		mutex_lock(&owner->rpm_lock);
>   		if (data->domain) {
> @@ -837,8 +837,8 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain)
>   static void exynos_iommu_detach_device(struct iommu_domain *iommu_domain,
>   				    struct device *dev)
>   {
> -	struct exynos_iommu_owner *owner = dev->archdata.iommu;
>   	struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
> +	struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
>   	phys_addr_t pagetable = virt_to_phys(domain->pgtable);
>   	struct sysmmu_drvdata *data, *next;
>   	unsigned long flags;
> @@ -875,8 +875,8 @@ static void exynos_iommu_detach_device(struct iommu_domain *iommu_domain,
>   static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain,
>   				   struct device *dev)
>   {
> -	struct exynos_iommu_owner *owner = dev->archdata.iommu;
>   	struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
> +	struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
>   	struct sysmmu_drvdata *data;
>   	phys_addr_t pagetable = virt_to_phys(domain->pgtable);
>   	unsigned long flags;
> @@ -1237,7 +1237,7 @@ static phys_addr_t exynos_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
>   
>   static struct iommu_device *exynos_iommu_probe_device(struct device *dev)
>   {
> -	struct exynos_iommu_owner *owner = dev->archdata.iommu;
> +	struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
>   	struct sysmmu_drvdata *data;
>   
>   	if (!has_sysmmu(dev))
> @@ -1263,7 +1263,7 @@ static struct iommu_device *exynos_iommu_probe_device(struct device *dev)
>   
>   static void exynos_iommu_release_device(struct device *dev)
>   {
> -	struct exynos_iommu_owner *owner = dev->archdata.iommu;
> +	struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
>   	struct sysmmu_drvdata *data;
>   
>   	if (!has_sysmmu(dev))
> @@ -1287,8 +1287,8 @@ static void exynos_iommu_release_device(struct device *dev)
>   static int exynos_iommu_of_xlate(struct device *dev,
>   				 struct of_phandle_args *spec)
>   {
> -	struct exynos_iommu_owner *owner = dev->archdata.iommu;
>   	struct platform_device *sysmmu = of_find_device_by_node(spec->np);
> +	struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
>   	struct sysmmu_drvdata *data, *entry;
>   
>   	if (!sysmmu)
> @@ -1305,7 +1305,7 @@ static int exynos_iommu_of_xlate(struct device *dev,
>   
>   		INIT_LIST_HEAD(&owner->controllers);
>   		mutex_init(&owner->rpm_lock);
> -		dev->archdata.iommu = owner;
> +		dev_iommu_priv_set(dev, owner);
>   	}
>   
>   	list_for_each_entry(entry, &owner->controllers, owner_node)
> diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc_iommu.h b/drivers/media/platform/s5p-mfc/s5p_mfc_iommu.h
> index 152a713fff78..1a32266b7ddc 100644
> --- a/drivers/media/platform/s5p-mfc/s5p_mfc_iommu.h
> +++ b/drivers/media/platform/s5p-mfc/s5p_mfc_iommu.h
> @@ -9,9 +9,11 @@
>   
>   #if defined(CONFIG_EXYNOS_IOMMU)
>   
> +#include <linux/iommu.h>
> +
>   static inline bool exynos_is_iommu_available(struct device *dev)
>   {
> -	return dev->archdata.iommu != NULL;
> +	return dev_iommu_priv_get(dev) != NULL;
>   }
>   
>   #else

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland


^ permalink raw reply

* [PATCH] ASoC: fsl_sai: Refine regcache usage with pm runtime
From: Shengjiu Wang @ 2020-06-29  6:42 UTC (permalink / raw)
  To: timur, nicoleotsuka, Xiubo.Lee, festevam, broonie, perex, tiwai,
	alsa-devel
  Cc: linuxppc-dev, linux-kernel

When there is dedicated power domain bound with device, after probing
the power will be disabled, then registers are not accessible in
fsl_sai_dai_probe(), so regcache only need to be enabled in end of
probe() and regcache_mark_dirty should be moved to pm runtime resume
callback function.

Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
---
 sound/soc/fsl/fsl_sai.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c
index 9d436b0c5718..a22562f2df47 100644
--- a/sound/soc/fsl/fsl_sai.c
+++ b/sound/soc/fsl/fsl_sai.c
@@ -1016,6 +1016,7 @@ static int fsl_sai_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, sai);
 
 	pm_runtime_enable(&pdev->dev);
+	regcache_cache_only(sai->regmap, true);
 
 	ret = devm_snd_soc_register_component(&pdev->dev, &fsl_component,
 			&fsl_sai_dai, 1);
@@ -1107,7 +1108,6 @@ static int fsl_sai_runtime_suspend(struct device *dev)
 	clk_disable_unprepare(sai->bus_clk);
 
 	regcache_cache_only(sai->regmap, true);
-	regcache_mark_dirty(sai->regmap);
 
 	return 0;
 }
@@ -1137,6 +1137,7 @@ static int fsl_sai_runtime_resume(struct device *dev)
 	}
 
 	regcache_cache_only(sai->regmap, false);
+	regcache_mark_dirty(sai->regmap);
 	regmap_write(sai->regmap, FSL_SAI_TCSR(ofs), FSL_SAI_CSR_SR);
 	regmap_write(sai->regmap, FSL_SAI_RCSR(ofs), FSL_SAI_CSR_SR);
 	usleep_range(1000, 2000);
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH v2] powerpc/uaccess: Use flexible addressing with __put_user()/__get_user()
From: Christophe Leroy @ 2020-06-29  6:52 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	segher
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <c2addbd9d76212242d3d8554a2f7ff849fb08b85.1587040754.git.christophe.leroy@c-s.fr>

Hi Michael,

I see this patch is marked as "defered" in patchwork, but I can't see 
any related discussion. Is it normal ?

Christophe

Le 16/04/2020 à 14:39, Christophe Leroy a écrit :
> At the time being, __put_user()/__get_user() and friends only use
> D-form addressing, with 0 offset. Ex:
> 
> 	lwz	reg1, 0(reg2)
> 
> Give the compiler the opportunity to use other adressing modes
> whenever possible, to get more optimised code.
> 
> Hereunder is a small exemple:
> 
> struct test {
> 	u32 item1;
> 	u16 item2;
> 	u8 item3;
> 	u64 item4;
> };
> 
> int set_test_user(struct test __user *from, struct test __user *to)
> {
> 	int err;
> 	u32 item1;
> 	u16 item2;
> 	u8 item3;
> 	u64 item4;
> 
> 	err = __get_user(item1, &from->item1);
> 	err |= __get_user(item2, &from->item2);
> 	err |= __get_user(item3, &from->item3);
> 	err |= __get_user(item4, &from->item4);
> 
> 	err |= __put_user(item1, &to->item1);
> 	err |= __put_user(item2, &to->item2);
> 	err |= __put_user(item3, &to->item3);
> 	err |= __put_user(item4, &to->item4);
> 
> 	return err;
> }
> 
> Before the patch:
> 
> 00000df0 <set_test_user>:
>   df0:	94 21 ff f0 	stwu    r1,-16(r1)
>   df4:	39 40 00 00 	li      r10,0
>   df8:	93 c1 00 08 	stw     r30,8(r1)
>   dfc:	93 e1 00 0c 	stw     r31,12(r1)
>   e00:	7d 49 53 78 	mr      r9,r10
>   e04:	80 a3 00 00 	lwz     r5,0(r3)
>   e08:	38 e3 00 04 	addi    r7,r3,4
>   e0c:	7d 46 53 78 	mr      r6,r10
>   e10:	a0 e7 00 00 	lhz     r7,0(r7)
>   e14:	7d 29 33 78 	or      r9,r9,r6
>   e18:	39 03 00 06 	addi    r8,r3,6
>   e1c:	7d 46 53 78 	mr      r6,r10
>   e20:	89 08 00 00 	lbz     r8,0(r8)
>   e24:	7d 29 33 78 	or      r9,r9,r6
>   e28:	38 63 00 08 	addi    r3,r3,8
>   e2c:	7d 46 53 78 	mr      r6,r10
>   e30:	83 c3 00 00 	lwz     r30,0(r3)
>   e34:	83 e3 00 04 	lwz     r31,4(r3)
>   e38:	7d 29 33 78 	or      r9,r9,r6
>   e3c:	7d 43 53 78 	mr      r3,r10
>   e40:	90 a4 00 00 	stw     r5,0(r4)
>   e44:	7d 29 1b 78 	or      r9,r9,r3
>   e48:	38 c4 00 04 	addi    r6,r4,4
>   e4c:	7d 43 53 78 	mr      r3,r10
>   e50:	b0 e6 00 00 	sth     r7,0(r6)
>   e54:	7d 29 1b 78 	or      r9,r9,r3
>   e58:	38 e4 00 06 	addi    r7,r4,6
>   e5c:	7d 43 53 78 	mr      r3,r10
>   e60:	99 07 00 00 	stb     r8,0(r7)
>   e64:	7d 23 1b 78 	or      r3,r9,r3
>   e68:	38 84 00 08 	addi    r4,r4,8
>   e6c:	93 c4 00 00 	stw     r30,0(r4)
>   e70:	93 e4 00 04 	stw     r31,4(r4)
>   e74:	7c 63 53 78 	or      r3,r3,r10
>   e78:	83 c1 00 08 	lwz     r30,8(r1)
>   e7c:	83 e1 00 0c 	lwz     r31,12(r1)
>   e80:	38 21 00 10 	addi    r1,r1,16
>   e84:	4e 80 00 20 	blr
> 
> After the patch:
> 
> 00000dbc <set_test_user>:
>   dbc:	39 40 00 00 	li      r10,0
>   dc0:	7d 49 53 78 	mr      r9,r10
>   dc4:	80 03 00 00 	lwz     r0,0(r3)
>   dc8:	7d 48 53 78 	mr      r8,r10
>   dcc:	a1 63 00 04 	lhz     r11,4(r3)
>   dd0:	7d 29 43 78 	or      r9,r9,r8
>   dd4:	7d 48 53 78 	mr      r8,r10
>   dd8:	88 a3 00 06 	lbz     r5,6(r3)
>   ddc:	7d 29 43 78 	or      r9,r9,r8
>   de0:	7d 48 53 78 	mr      r8,r10
>   de4:	80 c3 00 08 	lwz     r6,8(r3)
>   de8:	80 e3 00 0c 	lwz     r7,12(r3)
>   dec:	7d 29 43 78 	or      r9,r9,r8
>   df0:	7d 43 53 78 	mr      r3,r10
>   df4:	90 04 00 00 	stw     r0,0(r4)
>   df8:	7d 29 1b 78 	or      r9,r9,r3
>   dfc:	7d 43 53 78 	mr      r3,r10
>   e00:	b1 64 00 04 	sth     r11,4(r4)
>   e04:	7d 29 1b 78 	or      r9,r9,r3
>   e08:	7d 43 53 78 	mr      r3,r10
>   e0c:	98 a4 00 06 	stb     r5,6(r4)
>   e10:	7d 23 1b 78 	or      r3,r9,r3
>   e14:	90 c4 00 08 	stw     r6,8(r4)
>   e18:	90 e4 00 0c 	stw     r7,12(r4)
>   e1c:	7c 63 53 78 	or      r3,r3,r10
>   e20:	4e 80 00 20 	blr
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> Reviewed-by: Segher Boessenkool <segher@kernel.crashing.org>
> ---
> v2:
> - Added <> modifier in __put_user_asm() and __get_user_asm()
> - Removed %U2 in __put_user_asm2() and __get_user_asm2()
> - Reworded the commit log
> ---
>   arch/powerpc/include/asm/uaccess.h | 28 ++++++++++++++--------------
>   1 file changed, 14 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
> index 7c811442b607..9365b59495a2 100644
> --- a/arch/powerpc/include/asm/uaccess.h
> +++ b/arch/powerpc/include/asm/uaccess.h
> @@ -114,7 +114,7 @@ extern long __put_user_bad(void);
>    */
>   #define __put_user_asm(x, addr, err, op)			\
>   	__asm__ __volatile__(					\
> -		"1:	" op " %1,0(%2)	# put_user\n"		\
> +		"1:	" op "%U2%X2 %1,%2	# put_user\n"	\
>   		"2:\n"						\
>   		".section .fixup,\"ax\"\n"			\
>   		"3:	li %0,%3\n"				\
> @@ -122,7 +122,7 @@ extern long __put_user_bad(void);
>   		".previous\n"					\
>   		EX_TABLE(1b, 3b)				\
>   		: "=r" (err)					\
> -		: "r" (x), "b" (addr), "i" (-EFAULT), "0" (err))
> +		: "r" (x), "m<>" (*addr), "i" (-EFAULT), "0" (err))
>   
>   #ifdef __powerpc64__
>   #define __put_user_asm2(x, ptr, retval)				\
> @@ -130,8 +130,8 @@ extern long __put_user_bad(void);
>   #else /* __powerpc64__ */
>   #define __put_user_asm2(x, addr, err)				\
>   	__asm__ __volatile__(					\
> -		"1:	stw %1,0(%2)\n"				\
> -		"2:	stw %1+1,4(%2)\n"			\
> +		"1:	stw%X2 %1,%2\n"			\
> +		"2:	stw%X2 %L1,%L2\n"			\
>   		"3:\n"						\
>   		".section .fixup,\"ax\"\n"			\
>   		"4:	li %0,%3\n"				\
> @@ -140,7 +140,7 @@ extern long __put_user_bad(void);
>   		EX_TABLE(1b, 4b)				\
>   		EX_TABLE(2b, 4b)				\
>   		: "=r" (err)					\
> -		: "r" (x), "b" (addr), "i" (-EFAULT), "0" (err))
> +		: "r" (x), "m" (*addr), "i" (-EFAULT), "0" (err))
>   #endif /* __powerpc64__ */
>   
>   #define __put_user_size_allowed(x, ptr, size, retval)		\
> @@ -260,7 +260,7 @@ extern long __get_user_bad(void);
>   
>   #define __get_user_asm(x, addr, err, op)		\
>   	__asm__ __volatile__(				\
> -		"1:	"op" %1,0(%2)	# get_user\n"	\
> +		"1:	"op"%U2%X2 %1, %2	# get_user\n"	\
>   		"2:\n"					\
>   		".section .fixup,\"ax\"\n"		\
>   		"3:	li %0,%3\n"			\
> @@ -269,7 +269,7 @@ extern long __get_user_bad(void);
>   		".previous\n"				\
>   		EX_TABLE(1b, 3b)			\
>   		: "=r" (err), "=r" (x)			\
> -		: "b" (addr), "i" (-EFAULT), "0" (err))
> +		: "m<>" (*addr), "i" (-EFAULT), "0" (err))
>   
>   #ifdef __powerpc64__
>   #define __get_user_asm2(x, addr, err)			\
> @@ -277,8 +277,8 @@ extern long __get_user_bad(void);
>   #else /* __powerpc64__ */
>   #define __get_user_asm2(x, addr, err)			\
>   	__asm__ __volatile__(				\
> -		"1:	lwz %1,0(%2)\n"			\
> -		"2:	lwz %1+1,4(%2)\n"		\
> +		"1:	lwz%X2 %1, %2\n"			\
> +		"2:	lwz%X2 %L1, %L2\n"		\
>   		"3:\n"					\
>   		".section .fixup,\"ax\"\n"		\
>   		"4:	li %0,%3\n"			\
> @@ -289,7 +289,7 @@ extern long __get_user_bad(void);
>   		EX_TABLE(1b, 4b)			\
>   		EX_TABLE(2b, 4b)			\
>   		: "=r" (err), "=&r" (x)			\
> -		: "b" (addr), "i" (-EFAULT), "0" (err))
> +		: "m" (*addr), "i" (-EFAULT), "0" (err))
>   #endif /* __powerpc64__ */
>   
>   #define __get_user_size_allowed(x, ptr, size, retval)		\
> @@ -299,10 +299,10 @@ do {								\
>   	if (size > sizeof(x))					\
>   		(x) = __get_user_bad();				\
>   	switch (size) {						\
> -	case 1: __get_user_asm(x, ptr, retval, "lbz"); break;	\
> -	case 2: __get_user_asm(x, ptr, retval, "lhz"); break;	\
> -	case 4: __get_user_asm(x, ptr, retval, "lwz"); break;	\
> -	case 8: __get_user_asm2(x, ptr, retval);  break;	\
> +	case 1: __get_user_asm(x, (u8 __user *)ptr, retval, "lbz"); break;	\
> +	case 2: __get_user_asm(x, (u16 __user *)ptr, retval, "lhz"); break;	\
> +	case 4: __get_user_asm(x, (u32 __user *)ptr, retval, "lwz"); break;	\
> +	case 8: __get_user_asm2(x, (u64 __user *)ptr, retval);  break;	\
>   	default: (x) = __get_user_bad();			\
>   	}							\
>   } while (0)
> 

^ permalink raw reply

* Re: [PATCH 02/11] powerpc/kexec_file: mark PPC64 specific code
From: Hari Bathini @ 2020-06-29  6:23 UTC (permalink / raw)
  To: Christophe Leroy, Michael Ellerman, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Petr Tesarik, Mahesh J Salgaonkar,
	Sourabh Jain, lkml, linuxppc-dev, Mimi Zohar, Vivek Goyal,
	Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <68d59c00-da02-b362-7bd9-a9631eca0fdd@csgroup.eu>

Hi Christophe

Thanks for the review...

On 27/06/20 12:12 pm, Christophe Leroy wrote:
> 
> 
> Le 26/06/2020 à 21:04, Hari Bathini a écrit :
>> Some of the kexec_file_load code isn't PPC64 specific. Move PPC64
>> specific code from kexec/file_load.c to kexec/file_load_64.c. Also,
>> rename purgatory/trampoline.S to purgatory/trampoline_64.S in the
>> same spirit.
> 
> At the time being, CONFIG_KEXEC_FILE depends on PPC64.

Right.

> Are you planning to make it work on PPC32 as well ?

No.

> Otherwise I don't understand the purpose of this patch.

But I want to make sure the changes I am adding in this series do not
get in the way of adding PPC32 changes whenever they are submitted as there
is common code currently and some more of it in the changes I am adding
in this series...

> Also, what is being done in this patch seems to go far beyond what you describe above.> It is propably worth splitting in several patches with proper explanation.

Hmmm.. I don't see any other reason beyond what I mentioned above.
Will try to split the patch but the changelog would still be the same, afaics.

> Christophe
> 
>>
>> Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
>> ---
>>   arch/powerpc/include/asm/kexec.h       |   11 +++
>>   arch/powerpc/kexec/Makefile            |    2 -
>>   arch/powerpc/kexec/elf_64.c            |    7 +-
>>   arch/powerpc/kexec/file_load.c         |   37 ++--------
>>   arch/powerpc/kexec/file_load_64.c      |  108 ++++++++++++++++++++++++++++++
>>   arch/powerpc/purgatory/Makefile        |    4 +
>>   arch/powerpc/purgatory/trampoline.S    |  117 --------------------------------
>>   arch/powerpc/purgatory/trampoline_64.S |  117 ++++++++++++++++++++++++++++++++
>>   8 files changed, 248 insertions(+), 155 deletions(-)
>>   create mode 100644 arch/powerpc/kexec/file_load_64.c
>>   delete mode 100644 arch/powerpc/purgatory/trampoline.S
>>   create mode 100644 arch/powerpc/purgatory/trampoline_64.S

Thanks
Hari

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox