LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v10 09/12] mm: Move vmap_range from mm/ioremap.c to mm/vmalloc.c
From: Nicholas Piggin @ 2021-01-24  8:22 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Zefan Li, Jonathan Cameron, Rick Edgecombe,
	linuxppc-dev
In-Reply-To: <20210124082230.2118861-1-npiggin@gmail.com>

This is a generic kernel virtual memory mapper, not specific to ioremap.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 include/linux/vmalloc.h |   3 +
 mm/ioremap.c            | 197 ----------------------------------------
 mm/vmalloc.c            | 196 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 199 insertions(+), 197 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 00bd62bd701e..40649c4bb5a2 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -178,6 +178,9 @@ extern struct vm_struct *remove_vm_area(const void *addr);
 extern struct vm_struct *find_vm_area(const void *addr);
 
 #ifdef CONFIG_MMU
+int vmap_range(unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift);
 extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
 				    pgprot_t prot, struct page **pages);
 int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
diff --git a/mm/ioremap.c b/mm/ioremap.c
index c67f91164401..d1dcc7e744ac 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -28,203 +28,6 @@ early_param("nohugeiomap", set_nohugeiomap);
 static const bool iomap_max_page_shift = PAGE_SHIFT;
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			pgtbl_mod_mask *mask)
-{
-	pte_t *pte;
-	u64 pfn;
-
-	pfn = phys_addr >> PAGE_SHIFT;
-	pte = pte_alloc_kernel_track(pmd, addr, mask);
-	if (!pte)
-		return -ENOMEM;
-	do {
-		BUG_ON(!pte_none(*pte));
-		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
-		pfn++;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	*mask |= PGTBL_PTE_MODIFIED;
-	return 0;
-}
-
-static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift)
-{
-	if (max_page_shift < PMD_SHIFT)
-		return 0;
-
-	if (!arch_vmap_pmd_supported(prot))
-		return 0;
-
-	if ((end - addr) != PMD_SIZE)
-		return 0;
-
-	if (!IS_ALIGNED(addr, PMD_SIZE))
-		return 0;
-
-	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
-		return 0;
-
-	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
-		return 0;
-
-	return pmd_set_huge(pmd, phys_addr, prot);
-}
-
-static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift, pgtbl_mod_mask *mask)
-{
-	pmd_t *pmd;
-	unsigned long next;
-
-	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
-	if (!pmd)
-		return -ENOMEM;
-	do {
-		next = pmd_addr_end(addr, end);
-
-		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) {
-			*mask |= PGTBL_PMD_MODIFIED;
-			continue;
-		}
-
-		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
-			return -ENOMEM;
-	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
-	return 0;
-}
-
-static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift)
-{
-	if (max_page_shift < PUD_SHIFT)
-		return 0;
-
-	if (!arch_vmap_pud_supported(prot))
-		return 0;
-
-	if ((end - addr) != PUD_SIZE)
-		return 0;
-
-	if (!IS_ALIGNED(addr, PUD_SIZE))
-		return 0;
-
-	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
-		return 0;
-
-	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
-		return 0;
-
-	return pud_set_huge(pud, phys_addr, prot);
-}
-
-static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift, pgtbl_mod_mask *mask)
-{
-	pud_t *pud;
-	unsigned long next;
-
-	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
-	if (!pud)
-		return -ENOMEM;
-	do {
-		next = pud_addr_end(addr, end);
-
-		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) {
-			*mask |= PGTBL_PUD_MODIFIED;
-			continue;
-		}
-
-		if (vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask))
-			return -ENOMEM;
-	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
-	return 0;
-}
-
-static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift)
-{
-	if (max_page_shift < P4D_SHIFT)
-		return 0;
-
-	if (!arch_vmap_p4d_supported(prot))
-		return 0;
-
-	if ((end - addr) != P4D_SIZE)
-		return 0;
-
-	if (!IS_ALIGNED(addr, P4D_SIZE))
-		return 0;
-
-	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
-		return 0;
-
-	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
-		return 0;
-
-	return p4d_set_huge(p4d, phys_addr, prot);
-}
-
-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift, pgtbl_mod_mask *mask)
-{
-	p4d_t *p4d;
-	unsigned long next;
-
-	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
-	if (!p4d)
-		return -ENOMEM;
-	do {
-		next = p4d_addr_end(addr, end);
-
-		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) {
-			*mask |= PGTBL_P4D_MODIFIED;
-			continue;
-		}
-
-		if (vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask))
-			return -ENOMEM;
-	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
-	return 0;
-}
-
-static int vmap_range(unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift)
-{
-	pgd_t *pgd;
-	unsigned long start;
-	unsigned long next;
-	int err;
-	pgtbl_mod_mask mask = 0;
-
-	might_sleep();
-	BUG_ON(addr >= end);
-
-	start = addr;
-	pgd = pgd_offset_k(addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift, &mask);
-		if (err)
-			break;
-	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
-
-	flush_cache_vmap(start, end);
-
-	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
-		arch_sync_kernel_mappings(start, end);
-
-	return err;
-}
-
 int ioremap_page_range(unsigned long addr,
 		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7f2f36116980..5d79148b7fa7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -68,6 +68,202 @@ static void free_work(struct work_struct *w)
 }
 
 /*** Page table manipulation functions ***/
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			pgtbl_mod_mask *mask)
+{
+	pte_t *pte;
+	u64 pfn;
+
+	pfn = phys_addr >> PAGE_SHIFT;
+	pte = pte_alloc_kernel_track(pmd, addr, mask);
+	if (!pte)
+		return -ENOMEM;
+	do {
+		BUG_ON(!pte_none(*pte));
+		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+		pfn++;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	*mask |= PGTBL_PTE_MODIFIED;
+	return 0;
+}
+
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (max_page_shift < PMD_SHIFT)
+		return 0;
+
+	if (!arch_vmap_pmd_supported(prot))
+		return 0;
+
+	if ((end - addr) != PMD_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(addr, PMD_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+		return 0;
+
+	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+		return 0;
+
+	return pmd_set_huge(pmd, phys_addr, prot);
+}
+
+static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+	if (!pmd)
+		return -ENOMEM;
+	do {
+		next = pmd_addr_end(addr, end);
+
+		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) {
+			*mask |= PGTBL_PMD_MODIFIED;
+			continue;
+		}
+
+		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+			return -ENOMEM;
+	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (max_page_shift < PUD_SHIFT)
+		return 0;
+
+	if (!arch_vmap_pud_supported(prot))
+		return 0;
+
+	if ((end - addr) != PUD_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(addr, PUD_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+		return 0;
+
+	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+		return 0;
+
+	return pud_set_huge(pud, phys_addr, prot);
+}
+
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+	if (!pud)
+		return -ENOMEM;
+	do {
+		next = pud_addr_end(addr, end);
+
+		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) {
+			*mask |= PGTBL_PUD_MODIFIED;
+			continue;
+		}
+
+		if (vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask))
+			return -ENOMEM;
+	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (max_page_shift < P4D_SHIFT)
+		return 0;
+
+	if (!arch_vmap_p4d_supported(prot))
+		return 0;
+
+	if ((end - addr) != P4D_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(addr, P4D_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+		return 0;
+
+	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+		return 0;
+
+	return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+	if (!p4d)
+		return -ENOMEM;
+	do {
+		next = p4d_addr_end(addr, end);
+
+		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) {
+			*mask |= PGTBL_P4D_MODIFIED;
+			continue;
+		}
+
+		if (vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask))
+			return -ENOMEM;
+	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+
+int vmap_range(unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	pgd_t *pgd;
+	unsigned long start;
+	unsigned long next;
+	int err;
+	pgtbl_mod_mask mask = 0;
+
+	might_sleep();
+	BUG_ON(addr >= end);
+
+	start = addr;
+	pgd = pgd_offset_k(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift, &mask);
+		if (err)
+			break;
+	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+	flush_cache_vmap(start, end);
+
+	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+		arch_sync_kernel_mappings(start, end);
+
+	return err;
+}
 
 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			     pgtbl_mod_mask *mask)
-- 
2.23.0


^ permalink raw reply related

* [PATCH v10 08/12] x86: inline huge vmap supported functions
From: Nicholas Piggin @ 2021-01-24  8:22 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, x86, H. Peter Anvin, Ding Tianhong, linux-kernel,
	Nicholas Piggin, Christoph Hellwig, Zefan Li, Borislav Petkov,
	Jonathan Cameron, Thomas Gleixner, Rick Edgecombe, linuxppc-dev,
	Ingo Molnar
In-Reply-To: <20210124082230.2118861-1-npiggin@gmail.com>

This allows unsupported levels to be constant folded away, and so
p4d_free_pud_page can be removed because it's no longer linked to.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/x86/include/asm/vmalloc.h | 22 +++++++++++++++++++---
 arch/x86/mm/ioremap.c          | 21 ---------------------
 arch/x86/mm/pgtable.c          | 13 -------------
 3 files changed, 19 insertions(+), 37 deletions(-)

diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h
index 094ea2b565f3..e714b00fc0ca 100644
--- a/arch/x86/include/asm/vmalloc.h
+++ b/arch/x86/include/asm/vmalloc.h
@@ -1,13 +1,29 @@
 #ifndef _ASM_X86_VMALLOC_H
 #define _ASM_X86_VMALLOC_H
 
+#include <asm/cpufeature.h>
 #include <asm/page.h>
 #include <asm/pgtable_areas.h>
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-bool arch_vmap_p4d_supported(pgprot_t prot);
-bool arch_vmap_pud_supported(pgprot_t prot);
-bool arch_vmap_pmd_supported(pgprot_t prot);
+static inline bool arch_vmap_p4d_supported(pgprot_t prot)
+{
+	return false;
+}
+
+static inline bool arch_vmap_pud_supported(pgprot_t prot)
+{
+#ifdef CONFIG_X86_64
+	return boot_cpu_has(X86_FEATURE_GBPAGES);
+#else
+	return false;
+#endif
+}
+
+static inline bool arch_vmap_pmd_supported(pgprot_t prot)
+{
+	return boot_cpu_has(X86_FEATURE_PSE);
+}
 #endif
 
 #endif /* _ASM_X86_VMALLOC_H */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index fbaf0c447986..12c686c65ea9 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -481,27 +481,6 @@ void iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(iounmap);
 
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-bool arch_vmap_p4d_supported(pgprot_t prot)
-{
-	return false;
-}
-
-bool arch_vmap_pud_supported(pgprot_t prot)
-{
-#ifdef CONFIG_X86_64
-	return boot_cpu_has(X86_FEATURE_GBPAGES);
-#else
-	return false;
-#endif
-}
-
-bool arch_vmap_pmd_supported(pgprot_t prot)
-{
-	return boot_cpu_has(X86_FEATURE_PSE);
-}
-#endif
-
 /*
  * Convert a physical pointer to a virtual kernel pointer for /dev/mem
  * access
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index f6a9e2e36642..d27cf69e811d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -780,14 +780,6 @@ int pmd_clear_huge(pmd_t *pmd)
 	return 0;
 }
 
-/*
- * Until we support 512GB pages, skip them in the vmap area.
- */
-int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
-{
-	return 0;
-}
-
 #ifdef CONFIG_X86_64
 /**
  * pud_free_pmd_page - Clear pud entry and free pmd page.
@@ -861,11 +853,6 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 
 #else /* !CONFIG_X86_64 */
 
-int pud_free_pmd_page(pud_t *pud, unsigned long addr)
-{
-	return pud_none(*pud);
-}
-
 /*
  * Disable free page handling on x86-PAE. This assures that ioremap()
  * does not update sync'd pmd entries. See vmalloc_sync_one().
-- 
2.23.0


^ permalink raw reply related

* [PATCH v10 07/12] arm64: inline huge vmap supported functions
From: Nicholas Piggin @ 2021-01-24  8:22 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Will Deacon, Catalin Marinas, Ding Tianhong,
	linux-kernel, Nicholas Piggin, Christoph Hellwig, Zefan Li,
	Jonathan Cameron, Rick Edgecombe, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20210124082230.2118861-1-npiggin@gmail.com>

This allows unsupported levels to be constant folded away, and so
p4d_free_pud_page can be removed because it's no longer linked to.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/arm64/include/asm/vmalloc.h | 23 ++++++++++++++++++++---
 arch/arm64/mm/mmu.c              | 26 --------------------------
 2 files changed, 20 insertions(+), 29 deletions(-)

diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
index 597b40405319..fc9a12d6cc1a 100644
--- a/arch/arm64/include/asm/vmalloc.h
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -4,9 +4,26 @@
 #include <asm/page.h>
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-bool arch_vmap_p4d_supported(pgprot_t prot);
-bool arch_vmap_pud_supported(pgprot_t prot);
-bool arch_vmap_pmd_supported(pgprot_t prot);
+static inline bool arch_vmap_p4d_supported(pgprot_t prot)
+{
+	return false;
+}
+
+static inline bool arch_vmap_pud_supported(pgprot_t prot)
+{
+	/*
+	 * Only 4k granule supports level 1 block mappings.
+	 * SW table walks can't handle removal of intermediate entries.
+	 */
+	return IS_ENABLED(CONFIG_ARM64_4K_PAGES) &&
+	       !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
+}
+
+static inline bool arch_vmap_pmd_supported(pgprot_t prot)
+{
+	/* See arch_vmap_pud_supported() */
+	return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
+}
 #endif
 
 #endif /* _ASM_ARM64_VMALLOC_H */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index f6614c378792..ab9ba7c36dae 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1313,27 +1313,6 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
 	return dt_virt;
 }
 
-bool arch_vmap_p4d_supported(pgprot_t prot)
-{
-	return false;
-}
-
-bool arch_vmap_pud_supported(pgprot_t prot);
-{
-	/*
-	 * Only 4k granule supports level 1 block mappings.
-	 * SW table walks can't handle removal of intermediate entries.
-	 */
-	return IS_ENABLED(CONFIG_ARM64_4K_PAGES) &&
-	       !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
-}
-
-bool arch_vmap_pmd_supported(pgprot_t prot)
-{
-	/* See arch_vmap_pud_supported() */
-	return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
-}
-
 int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
 {
 	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));
@@ -1425,11 +1404,6 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
 	return 1;
 }
 
-int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
-{
-	return 0;	/* Don't attempt a block mapping */
-}
-
 #ifdef CONFIG_MEMORY_HOTPLUG
 static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
 {
-- 
2.23.0


^ permalink raw reply related

* [PATCH v10 06/12] powerpc: inline huge vmap supported functions
From: Nicholas Piggin @ 2021-01-24  8:22 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Zefan Li, Jonathan Cameron, Rick Edgecombe,
	linuxppc-dev
In-Reply-To: <20210124082230.2118861-1-npiggin@gmail.com>

This allows unsupported levels to be constant folded away, and so
p4d_free_pud_page can be removed because it's no longer linked to.

Cc: linuxppc-dev@lists.ozlabs.org
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/vmalloc.h       | 19 ++++++++++++++++---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 21 ---------------------
 2 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/vmalloc.h b/arch/powerpc/include/asm/vmalloc.h
index 105abb73f075..3f0c153befb0 100644
--- a/arch/powerpc/include/asm/vmalloc.h
+++ b/arch/powerpc/include/asm/vmalloc.h
@@ -1,12 +1,25 @@
 #ifndef _ASM_POWERPC_VMALLOC_H
 #define _ASM_POWERPC_VMALLOC_H
 
+#include <asm/mmu.h>
 #include <asm/page.h>
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-bool arch_vmap_p4d_supported(pgprot_t prot);
-bool arch_vmap_pud_supported(pgprot_t prot);
-bool arch_vmap_pmd_supported(pgprot_t prot);
+static inline bool arch_vmap_p4d_supported(pgprot_t prot)
+{
+	return false;
+}
+
+static inline bool arch_vmap_pud_supported(pgprot_t prot)
+{
+	/* HPT does not cope with large pages in the vmalloc area */
+	return radix_enabled();
+}
+
+static inline bool arch_vmap_pmd_supported(pgprot_t prot)
+{
+	return radix_enabled();
+}
 #endif
 
 #endif /* _ASM_POWERPC_VMALLOC_H */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 743807fc210f..8da62afccee5 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1082,22 +1082,6 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
 	set_pte_at(mm, addr, ptep, pte);
 }
 
-bool arch_vmap_pud_supported(pgprot_t prot)
-{
-	/* HPT does not cope with large pages in the vmalloc area */
-	return radix_enabled();
-}
-
-bool arch_vmap_pmd_supported(pgprot_t prot)
-{
-	return radix_enabled();
-}
-
-int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
-{
-	return 0;
-}
-
 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 {
 	pte_t *ptep = (pte_t *)pud;
@@ -1181,8 +1165,3 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 
 	return 1;
 }
-
-bool arch_vmap_p4d_supported(pgprot_t prot)
-{
-	return false;
-}
-- 
2.23.0


^ permalink raw reply related

* [PATCH v10 05/12] mm: HUGE_VMAP arch support cleanup
From: Nicholas Piggin @ 2021-01-24  8:22 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, x86, H. Peter Anvin, Will Deacon, Catalin Marinas,
	Ding Tianhong, linux-kernel, Nicholas Piggin, Christoph Hellwig,
	Zefan Li, Borislav Petkov, Jonathan Cameron, Thomas Gleixner,
	Rick Edgecombe, linuxppc-dev, Ingo Molnar, linux-arm-kernel
In-Reply-To: <20210124082230.2118861-1-npiggin@gmail.com>

This changes the awkward approach where architectures provide init
functions to determine which levels they can provide large mappings for,
to one where the arch is queried for each call.

This removes code and indirection, and allows constant-folding of dead
code for unsupported levels.

This also adds a prot argument to the arch query. This is unused
currently but could help with some architectures (e.g., some powerpc
processors can't map uncacheable memory with large pages).

Cc: linuxppc-dev@lists.ozlabs.org
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com> [arm64]
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/arm64/include/asm/vmalloc.h         |  8 +++
 arch/arm64/mm/mmu.c                      | 10 +--
 arch/powerpc/include/asm/vmalloc.h       |  8 +++
 arch/powerpc/mm/book3s64/radix_pgtable.c |  8 +--
 arch/x86/include/asm/vmalloc.h           |  7 ++
 arch/x86/mm/ioremap.c                    | 12 ++--
 include/linux/io.h                       |  9 ---
 include/linux/vmalloc.h                  |  6 ++
 init/main.c                              |  1 -
 mm/ioremap.c                             | 88 +++++++++---------------
 10 files changed, 79 insertions(+), 78 deletions(-)

diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
index 2ca708ab9b20..597b40405319 100644
--- a/arch/arm64/include/asm/vmalloc.h
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -1,4 +1,12 @@
 #ifndef _ASM_ARM64_VMALLOC_H
 #define _ASM_ARM64_VMALLOC_H
 
+#include <asm/page.h>
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+bool arch_vmap_p4d_supported(pgprot_t prot);
+bool arch_vmap_pud_supported(pgprot_t prot);
+bool arch_vmap_pmd_supported(pgprot_t prot);
+#endif
+
 #endif /* _ASM_ARM64_VMALLOC_H */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ae0c3d023824..f6614c378792 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1313,12 +1313,12 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
 	return dt_virt;
 }
 
-int __init arch_ioremap_p4d_supported(void)
+bool arch_vmap_p4d_supported(pgprot_t prot)
 {
-	return 0;
+	return false;
 }
 
-int __init arch_ioremap_pud_supported(void)
+bool arch_vmap_pud_supported(pgprot_t prot);
 {
 	/*
 	 * Only 4k granule supports level 1 block mappings.
@@ -1328,9 +1328,9 @@ int __init arch_ioremap_pud_supported(void)
 	       !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
 }
 
-int __init arch_ioremap_pmd_supported(void)
+bool arch_vmap_pmd_supported(pgprot_t prot)
 {
-	/* See arch_ioremap_pud_supported() */
+	/* See arch_vmap_pud_supported() */
 	return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
 }
 
diff --git a/arch/powerpc/include/asm/vmalloc.h b/arch/powerpc/include/asm/vmalloc.h
index b992dfaaa161..105abb73f075 100644
--- a/arch/powerpc/include/asm/vmalloc.h
+++ b/arch/powerpc/include/asm/vmalloc.h
@@ -1,4 +1,12 @@
 #ifndef _ASM_POWERPC_VMALLOC_H
 #define _ASM_POWERPC_VMALLOC_H
 
+#include <asm/page.h>
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+bool arch_vmap_p4d_supported(pgprot_t prot);
+bool arch_vmap_pud_supported(pgprot_t prot);
+bool arch_vmap_pmd_supported(pgprot_t prot);
+#endif
+
 #endif /* _ASM_POWERPC_VMALLOC_H */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 98f0b243c1ab..743807fc210f 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1082,13 +1082,13 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
 	set_pte_at(mm, addr, ptep, pte);
 }
 
-int __init arch_ioremap_pud_supported(void)
+bool arch_vmap_pud_supported(pgprot_t prot)
 {
 	/* HPT does not cope with large pages in the vmalloc area */
 	return radix_enabled();
 }
 
-int __init arch_ioremap_pmd_supported(void)
+bool arch_vmap_pmd_supported(pgprot_t prot)
 {
 	return radix_enabled();
 }
@@ -1182,7 +1182,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 	return 1;
 }
 
-int __init arch_ioremap_p4d_supported(void)
+bool arch_vmap_p4d_supported(pgprot_t prot)
 {
-	return 0;
+	return false;
 }
diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h
index 29837740b520..094ea2b565f3 100644
--- a/arch/x86/include/asm/vmalloc.h
+++ b/arch/x86/include/asm/vmalloc.h
@@ -1,6 +1,13 @@
 #ifndef _ASM_X86_VMALLOC_H
 #define _ASM_X86_VMALLOC_H
 
+#include <asm/page.h>
 #include <asm/pgtable_areas.h>
 
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+bool arch_vmap_p4d_supported(pgprot_t prot);
+bool arch_vmap_pud_supported(pgprot_t prot);
+bool arch_vmap_pmd_supported(pgprot_t prot);
+#endif
+
 #endif /* _ASM_X86_VMALLOC_H */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 9e5ccc56f8e0..fbaf0c447986 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -481,24 +481,26 @@ void iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(iounmap);
 
-int __init arch_ioremap_p4d_supported(void)
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+bool arch_vmap_p4d_supported(pgprot_t prot)
 {
-	return 0;
+	return false;
 }
 
-int __init arch_ioremap_pud_supported(void)
+bool arch_vmap_pud_supported(pgprot_t prot)
 {
 #ifdef CONFIG_X86_64
 	return boot_cpu_has(X86_FEATURE_GBPAGES);
 #else
-	return 0;
+	return false;
 #endif
 }
 
-int __init arch_ioremap_pmd_supported(void)
+bool arch_vmap_pmd_supported(pgprot_t prot)
 {
 	return boot_cpu_has(X86_FEATURE_PSE);
 }
+#endif
 
 /*
  * Convert a physical pointer to a virtual kernel pointer for /dev/mem
diff --git a/include/linux/io.h b/include/linux/io.h
index 8394c56babc2..f1effd4d7a3c 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -31,15 +31,6 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end,
 }
 #endif
 
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-void __init ioremap_huge_init(void);
-int arch_ioremap_p4d_supported(void);
-int arch_ioremap_pud_supported(void);
-int arch_ioremap_pmd_supported(void);
-#else
-static inline void ioremap_huge_init(void) { }
-#endif
-
 /*
  * Managed iomap interface
  */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 80c0181c411d..00bd62bd701e 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -83,6 +83,12 @@ struct vmap_area {
 	};
 };
 
+#ifndef CONFIG_HAVE_ARCH_HUGE_VMAP
+static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; }
+static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; }
+static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; }
+#endif
+
 /*
  *	Highlevel APIs for driver use
  */
diff --git a/init/main.c b/init/main.c
index c68d784376ca..bf9389e5b2e4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -834,7 +834,6 @@ static void __init mm_init(void)
 	pgtable_init();
 	debug_objects_mem_init();
 	vmalloc_init();
-	ioremap_huge_init();
 	/* Should be run before the first non-init thread is created */
 	init_espfix_bsp();
 	/* Should be run after espfix64 is set up. */
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 3f4d36f9745a..c67f91164401 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -16,49 +16,16 @@
 #include "pgalloc-track.h"
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static int __read_mostly ioremap_p4d_capable;
-static int __read_mostly ioremap_pud_capable;
-static int __read_mostly ioremap_pmd_capable;
-static int __read_mostly ioremap_huge_disabled;
+static bool __ro_after_init iomap_max_page_shift = PAGE_SHIFT;
 
 static int __init set_nohugeiomap(char *str)
 {
-	ioremap_huge_disabled = 1;
+	iomap_max_page_shift = P4D_SHIFT;
 	return 0;
 }
 early_param("nohugeiomap", set_nohugeiomap);
-
-void __init ioremap_huge_init(void)
-{
-	if (!ioremap_huge_disabled) {
-		if (arch_ioremap_p4d_supported())
-			ioremap_p4d_capable = 1;
-		if (arch_ioremap_pud_supported())
-			ioremap_pud_capable = 1;
-		if (arch_ioremap_pmd_supported())
-			ioremap_pmd_capable = 1;
-	}
-}
-
-static inline int ioremap_p4d_enabled(void)
-{
-	return ioremap_p4d_capable;
-}
-
-static inline int ioremap_pud_enabled(void)
-{
-	return ioremap_pud_capable;
-}
-
-static inline int ioremap_pmd_enabled(void)
-{
-	return ioremap_pmd_capable;
-}
-
-#else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static inline int ioremap_p4d_enabled(void) { return 0; }
-static inline int ioremap_pud_enabled(void) { return 0; }
-static inline int ioremap_pmd_enabled(void) { return 0; }
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static const bool iomap_max_page_shift = PAGE_SHIFT;
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
 static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
@@ -82,9 +49,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 }
 
 static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot)
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
 {
-	if (!ioremap_pmd_enabled())
+	if (max_page_shift < PMD_SHIFT)
+		return 0;
+
+	if (!arch_vmap_pmd_supported(prot))
 		return 0;
 
 	if ((end - addr) != PMD_SIZE)
@@ -104,7 +75,7 @@ static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 			phys_addr_t phys_addr, pgprot_t prot,
-			pgtbl_mod_mask *mask)
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -115,7 +86,7 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 	do {
 		next = pmd_addr_end(addr, end);
 
-		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
+		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) {
 			*mask |= PGTBL_PMD_MODIFIED;
 			continue;
 		}
@@ -127,9 +98,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 }
 
 static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot)
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
 {
-	if (!ioremap_pud_enabled())
+	if (max_page_shift < PUD_SHIFT)
+		return 0;
+
+	if (!arch_vmap_pud_supported(prot))
 		return 0;
 
 	if ((end - addr) != PUD_SIZE)
@@ -149,7 +124,7 @@ static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
 
 static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 			phys_addr_t phys_addr, pgprot_t prot,
-			pgtbl_mod_mask *mask)
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -160,21 +135,25 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 	do {
 		next = pud_addr_end(addr, end);
 
-		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
+		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) {
 			*mask |= PGTBL_PUD_MODIFIED;
 			continue;
 		}
 
-		if (vmap_pmd_range(pud, addr, next, phys_addr, prot, mask))
+		if (vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask))
 			return -ENOMEM;
 	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
 	return 0;
 }
 
 static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot)
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
 {
-	if (!ioremap_p4d_enabled())
+	if (max_page_shift < P4D_SHIFT)
+		return 0;
+
+	if (!arch_vmap_p4d_supported(prot))
 		return 0;
 
 	if ((end - addr) != P4D_SIZE)
@@ -194,7 +173,7 @@ static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
 
 static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 			phys_addr_t phys_addr, pgprot_t prot,
-			pgtbl_mod_mask *mask)
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
 {
 	p4d_t *p4d;
 	unsigned long next;
@@ -205,19 +184,20 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 	do {
 		next = p4d_addr_end(addr, end);
 
-		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
+		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) {
 			*mask |= PGTBL_P4D_MODIFIED;
 			continue;
 		}
 
-		if (vmap_pud_range(p4d, addr, next, phys_addr, prot, mask))
+		if (vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask))
 			return -ENOMEM;
 	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
 	return 0;
 }
 
 static int vmap_range(unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot)
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
 {
 	pgd_t *pgd;
 	unsigned long start;
@@ -232,7 +212,7 @@ static int vmap_range(unsigned long addr, unsigned long end,
 	pgd = pgd_offset_k(addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, &mask);
+		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift, &mask);
 		if (err)
 			break;
 	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
@@ -248,7 +228,7 @@ static int vmap_range(unsigned long addr, unsigned long end,
 int ioremap_page_range(unsigned long addr,
 		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
-	return vmap_range(addr, end, phys_addr, prot);
+	return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift);
 }
 
 #ifdef CONFIG_GENERIC_IOREMAP
-- 
2.23.0


^ permalink raw reply related

* [PATCH v10 04/12] mm/ioremap: rename ioremap_*_range to vmap_*_range
From: Nicholas Piggin @ 2021-01-24  8:22 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Zefan Li, Jonathan Cameron, Rick Edgecombe,
	linuxppc-dev
In-Reply-To: <20210124082230.2118861-1-npiggin@gmail.com>

This will be used as a generic kernel virtual mapping function, so
re-name it in preparation.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 mm/ioremap.c | 64 +++++++++++++++++++++++++++-------------------------
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/mm/ioremap.c b/mm/ioremap.c
index 5fa1ab41d152..3f4d36f9745a 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -61,9 +61,9 @@ static inline int ioremap_pud_enabled(void) { return 0; }
 static inline int ioremap_pmd_enabled(void) { return 0; }
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
-static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
-		unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
-		pgtbl_mod_mask *mask)
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			pgtbl_mod_mask *mask)
 {
 	pte_t *pte;
 	u64 pfn;
@@ -81,9 +81,8 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
 	return 0;
 }
 
-static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
-				unsigned long end, phys_addr_t phys_addr,
-				pgprot_t prot)
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot)
 {
 	if (!ioremap_pmd_enabled())
 		return 0;
@@ -103,9 +102,9 @@ static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
 	return pmd_set_huge(pmd, phys_addr, prot);
 }
 
-static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
-		unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
-		pgtbl_mod_mask *mask)
+static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			pgtbl_mod_mask *mask)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -116,20 +115,19 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 	do {
 		next = pmd_addr_end(addr, end);
 
-		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
+		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
 			*mask |= PGTBL_PMD_MODIFIED;
 			continue;
 		}
 
-		if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
 			return -ENOMEM;
 	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
 	return 0;
 }
 
-static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
-				unsigned long end, phys_addr_t phys_addr,
-				pgprot_t prot)
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot)
 {
 	if (!ioremap_pud_enabled())
 		return 0;
@@ -149,9 +147,9 @@ static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
 	return pud_set_huge(pud, phys_addr, prot);
 }
 
-static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
-		unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
-		pgtbl_mod_mask *mask)
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			pgtbl_mod_mask *mask)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -162,20 +160,19 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 	do {
 		next = pud_addr_end(addr, end);
 
-		if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
+		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
 			*mask |= PGTBL_PUD_MODIFIED;
 			continue;
 		}
 
-		if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask))
+		if (vmap_pmd_range(pud, addr, next, phys_addr, prot, mask))
 			return -ENOMEM;
 	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
 	return 0;
 }
 
-static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
-				unsigned long end, phys_addr_t phys_addr,
-				pgprot_t prot)
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot)
 {
 	if (!ioremap_p4d_enabled())
 		return 0;
@@ -195,9 +192,9 @@ static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
 	return p4d_set_huge(p4d, phys_addr, prot);
 }
 
-static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
-		unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
-		pgtbl_mod_mask *mask)
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			pgtbl_mod_mask *mask)
 {
 	p4d_t *p4d;
 	unsigned long next;
@@ -208,19 +205,19 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
 	do {
 		next = p4d_addr_end(addr, end);
 
-		if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
+		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
 			*mask |= PGTBL_P4D_MODIFIED;
 			continue;
 		}
 
-		if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask))
+		if (vmap_pud_range(p4d, addr, next, phys_addr, prot, mask))
 			return -ENOMEM;
 	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
 	return 0;
 }
 
-int ioremap_page_range(unsigned long addr,
-		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+static int vmap_range(unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot)
 {
 	pgd_t *pgd;
 	unsigned long start;
@@ -235,8 +232,7 @@ int ioremap_page_range(unsigned long addr,
 	pgd = pgd_offset_k(addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot,
-					&mask);
+		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, &mask);
 		if (err)
 			break;
 	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
@@ -249,6 +245,12 @@ int ioremap_page_range(unsigned long addr,
 	return err;
 }
 
+int ioremap_page_range(unsigned long addr,
+		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	return vmap_range(addr, end, phys_addr, prot);
+}
+
 #ifdef CONFIG_GENERIC_IOREMAP
 void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot)
 {
-- 
2.23.0


^ permalink raw reply related

* [PATCH v10 03/12] mm/vmalloc: rename vmap_*_range vmap_pages_*_range
From: Nicholas Piggin @ 2021-01-24  8:22 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Zefan Li, Jonathan Cameron, Rick Edgecombe,
	linuxppc-dev
In-Reply-To: <20210124082230.2118861-1-npiggin@gmail.com>

The vmalloc mapper operates on a struct page * array rather than a
linear physical address, re-name it to make this distinction clear.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 mm/vmalloc.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 62372f9e0167..7f2f36116980 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -189,7 +189,7 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
 		arch_sync_kernel_mappings(start, end);
 }
 
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
 		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 		pgtbl_mod_mask *mask)
 {
@@ -217,7 +217,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
 	return 0;
 }
 
-static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
 		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 		pgtbl_mod_mask *mask)
 {
@@ -229,13 +229,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
 		return -ENOMEM;
 	do {
 		next = pmd_addr_end(addr, end);
-		if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
+		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
 			return -ENOMEM;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
-static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
+static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
 		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 		pgtbl_mod_mask *mask)
 {
@@ -247,13 +247,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
 		return -ENOMEM;
 	do {
 		next = pud_addr_end(addr, end);
-		if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
+		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
 			return -ENOMEM;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
 		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 		pgtbl_mod_mask *mask)
 {
@@ -265,7 +265,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
 		return -ENOMEM;
 	do {
 		next = p4d_addr_end(addr, end);
-		if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
+		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
 			return -ENOMEM;
 	} while (p4d++, addr = next, addr != end);
 	return 0;
@@ -306,7 +306,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
 		next = pgd_addr_end(addr, end);
 		if (pgd_bad(*pgd))
 			mask |= PGTBL_PGD_MODIFIED;
-		err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
+		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
 		if (err)
 			return err;
 	} while (pgd++, addr = next, addr != end);
-- 
2.23.0


^ permalink raw reply related

* [PATCH v10 02/12] mm: apply_to_pte_range warn and fail if a large pte is encountered
From: Nicholas Piggin @ 2021-01-24  8:22 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Zefan Li, Jonathan Cameron, Rick Edgecombe,
	linuxppc-dev
In-Reply-To: <20210124082230.2118861-1-npiggin@gmail.com>

apply_to_pte_range might mistake a large pte for bad, or treat it as a
page table, resulting in a crash or corruption. Add a test to warn and
return error if large entries are found.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 mm/memory.c | 66 +++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 49 insertions(+), 17 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index feff48e1465a..672e39a72788 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2440,13 +2440,21 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 	}
 	do {
 		next = pmd_addr_end(addr, end);
-		if (create || !pmd_none_or_clear_bad(pmd)) {
-			err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
-						 create, mask);
-			if (err)
-				break;
+		if (pmd_none(*pmd) && !create)
+			continue;
+		if (WARN_ON_ONCE(pmd_leaf(*pmd)))
+			return -EINVAL;
+		if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
+			if (!create)
+				continue;
+			pmd_clear_bad(pmd);
 		}
+		err = apply_to_pte_range(mm, pmd, addr, next,
+					 fn, data, create, mask);
+		if (err)
+			break;
 	} while (pmd++, addr = next, addr != end);
+
 	return err;
 }
 
@@ -2468,13 +2476,21 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
 	}
 	do {
 		next = pud_addr_end(addr, end);
-		if (create || !pud_none_or_clear_bad(pud)) {
-			err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
-						 create, mask);
-			if (err)
-				break;
+		if (pud_none(*pud) && !create)
+			continue;
+		if (WARN_ON_ONCE(pud_leaf(*pud)))
+			return -EINVAL;
+		if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
+			if (!create)
+				continue;
+			pud_clear_bad(pud);
 		}
+		err = apply_to_pmd_range(mm, pud, addr, next,
+					 fn, data, create, mask);
+		if (err)
+			break;
 	} while (pud++, addr = next, addr != end);
+
 	return err;
 }
 
@@ -2496,13 +2512,21 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 	}
 	do {
 		next = p4d_addr_end(addr, end);
-		if (create || !p4d_none_or_clear_bad(p4d)) {
-			err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
-						 create, mask);
-			if (err)
-				break;
+		if (p4d_none(*p4d) && !create)
+			continue;
+		if (WARN_ON_ONCE(p4d_leaf(*p4d)))
+			return -EINVAL;
+		if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
+			if (!create)
+				continue;
+			p4d_clear_bad(p4d);
 		}
+		err = apply_to_pud_range(mm, p4d, addr, next,
+					 fn, data, create, mask);
+		if (err)
+			break;
 	} while (p4d++, addr = next, addr != end);
+
 	return err;
 }
 
@@ -2522,9 +2546,17 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		if (!create && pgd_none_or_clear_bad(pgd))
+		if (pgd_none(*pgd) && !create)
 			continue;
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
+		if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+			return -EINVAL;
+		if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
+			if (!create)
+				continue;
+			pgd_clear_bad(pgd);
+		}
+		err = apply_to_p4d_range(mm, pgd, addr, next,
+					 fn, data, create, &mask);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-- 
2.23.0


^ permalink raw reply related

* [PATCH v10 01/12] mm/vmalloc: fix vmalloc_to_page for huge vmap mappings
From: Nicholas Piggin @ 2021-01-24  8:22 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Zefan Li, Jonathan Cameron, Rick Edgecombe,
	linuxppc-dev
In-Reply-To: <20210124082230.2118861-1-npiggin@gmail.com>

vmalloc_to_page returns NULL for addresses mapped by larger pages[*].
Whether or not a vmap is huge depends on the architecture details,
alignments, boot options, etc., which the caller can not be expected
to know. Therefore HUGE_VMAP is a regression for vmalloc_to_page.

This change teaches vmalloc_to_page about larger pages, and returns
the struct page that corresponds to the offset within the large page.
This makes the API agnostic to mapping implementation details.

[*] As explained by commit 029c54b095995 ("mm/vmalloc.c: huge-vmap:
    fail gracefully on unexpected huge vmap mappings")

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 mm/vmalloc.c | 41 ++++++++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e6f352bf0498..62372f9e0167 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -34,7 +34,7 @@
 #include <linux/bitops.h>
 #include <linux/rbtree_augmented.h>
 #include <linux/overflow.h>
-
+#include <linux/pgtable.h>
 #include <linux/uaccess.h>
 #include <asm/tlbflush.h>
 #include <asm/shmparam.h>
@@ -343,7 +343,9 @@ int is_vmalloc_or_module_addr(const void *x)
 }
 
 /*
- * Walk a vmap address to the struct page it maps.
+ * Walk a vmap address to the struct page it maps. Huge vmap mappings will
+ * return the tail page that corresponds to the base page address, which
+ * matches small vmap mappings.
  */
 struct page *vmalloc_to_page(const void *vmalloc_addr)
 {
@@ -363,25 +365,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 
 	if (pgd_none(*pgd))
 		return NULL;
+	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+		return NULL; /* XXX: no allowance for huge pgd */
+	if (WARN_ON_ONCE(pgd_bad(*pgd)))
+		return NULL;
+
 	p4d = p4d_offset(pgd, addr);
 	if (p4d_none(*p4d))
 		return NULL;
-	pud = pud_offset(p4d, addr);
+	if (p4d_leaf(*p4d))
+		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
+	if (WARN_ON_ONCE(p4d_bad(*p4d)))
+		return NULL;
 
-	/*
-	 * Don't dereference bad PUD or PMD (below) entries. This will also
-	 * identify huge mappings, which we may encounter on architectures
-	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
-	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
-	 * not [unambiguously] associated with a struct page, so there is
-	 * no correct value to return for them.
-	 */
-	WARN_ON_ONCE(pud_bad(*pud));
-	if (pud_none(*pud) || pud_bad(*pud))
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud))
+		return NULL;
+	if (pud_leaf(*pud))
+		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	if (WARN_ON_ONCE(pud_bad(*pud)))
 		return NULL;
+
 	pmd = pmd_offset(pud, addr);
-	WARN_ON_ONCE(pmd_bad(*pmd));
-	if (pmd_none(*pmd) || pmd_bad(*pmd))
+	if (pmd_none(*pmd))
+		return NULL;
+	if (pmd_leaf(*pmd))
+		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	if (WARN_ON_ONCE(pmd_bad(*pmd)))
 		return NULL;
 
 	ptep = pte_offset_map(pmd, addr);
@@ -389,6 +399,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 	if (pte_present(pte))
 		page = pte_page(pte);
 	pte_unmap(ptep);
+
 	return page;
 }
 EXPORT_SYMBOL(vmalloc_to_page);
-- 
2.23.0


^ permalink raw reply related

* [PATCH v10 00/12] huge vmalloc mappings
From: Nicholas Piggin @ 2021-01-24  8:22 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Zefan Li, Jonathan Cameron, Rick Edgecombe,
	linuxppc-dev

Fixed a couple of bugs that Ding noticed in review and testing.

Thanks,
Nick

Since v9:
- Fixed intermediate build breakage on x86-32 !PAE [thanks Ding]
- Fixed small page fallback case vm_struct double-free [thanks Ding]

Since v8:
- Fixed nommu compile.
- Added Kconfig option help text
- Added VM_NOHUGE which should help archs implement it [suggested by Rick]

Since v7:
- Rebase, added some acks, compile fix
- Removed "order=" from vmallocinfo, it's a bit confusing (nr_pages
  is in small page size for compatibility).
- Added arch_vmap_pmd_supported() test before starting to allocate
  the large page, rather than only testing it when doing the map, to
  avoid unsupported configs trying to allocate huge pages for no
  reason.

Since v6:
- Fixed a false positive warning introduced in patch 2, found by
  kbuild test robot.

Since v5:
- Split arch changes out better and make the constant folding work
- Avoid most of the 80 column wrap, fix a reference to lib/ioremap.c
- Fix compile error on some archs

Since v4:
- Fixed an off-by-page-order bug in v4
- Several minor cleanups.
- Added page order to /proc/vmallocinfo
- Added hugepage to alloc_large_system_hage output.
- Made an architecture config option, powerpc only for now.

Since v3:
- Fixed an off-by-one bug in a loop
- Fix !CONFIG_HAVE_ARCH_HUGE_VMAP build fail

*** BLURB HERE ***

Nicholas Piggin (12):
  mm/vmalloc: fix vmalloc_to_page for huge vmap mappings
  mm: apply_to_pte_range warn and fail if a large pte is encountered
  mm/vmalloc: rename vmap_*_range vmap_pages_*_range
  mm/ioremap: rename ioremap_*_range to vmap_*_range
  mm: HUGE_VMAP arch support cleanup
  powerpc: inline huge vmap supported functions
  arm64: inline huge vmap supported functions
  x86: inline huge vmap supported functions
  mm: Move vmap_range from mm/ioremap.c to mm/vmalloc.c
  mm/vmalloc: add vmap_range_noflush variant
  mm/vmalloc: Hugepage vmalloc mappings
  powerpc/64s/radix: Enable huge vmalloc mappings

 .../admin-guide/kernel-parameters.txt         |   2 +
 arch/Kconfig                                  |  10 +
 arch/arm64/include/asm/vmalloc.h              |  25 +
 arch/arm64/mm/mmu.c                           |  26 -
 arch/powerpc/Kconfig                          |   1 +
 arch/powerpc/include/asm/vmalloc.h            |  21 +
 arch/powerpc/kernel/module.c                  |  13 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c      |  21 -
 arch/x86/include/asm/vmalloc.h                |  23 +
 arch/x86/mm/ioremap.c                         |  19 -
 arch/x86/mm/pgtable.c                         |  13 -
 include/linux/io.h                            |   9 -
 include/linux/vmalloc.h                       |  27 ++
 init/main.c                                   |   1 -
 mm/ioremap.c                                  | 225 +--------
 mm/memory.c                                   |  66 ++-
 mm/page_alloc.c                               |   5 +-
 mm/vmalloc.c                                  | 455 +++++++++++++++---
 18 files changed, 563 insertions(+), 399 deletions(-)

-- 
2.23.0


^ permalink raw reply

* Re: [PATCH v9 05/12] mm: HUGE_VMAP arch support cleanup
From: Nicholas Piggin @ 2021-01-24  7:43 UTC (permalink / raw)
  To: Andrew Morton, Ding Tianhong, linux-mm
  Cc: linux-arch, x86, Will Deacon, Catalin Marinas, H. Peter Anvin,
	linux-kernel, Christoph Hellwig, Zefan Li, Borislav Petkov,
	Jonathan,  Cameron, Thomas Gleixner, Rick,  Edgecombe,
	linuxppc-dev, Ingo Molnar, linux-arm-kernel
In-Reply-To: <c7eb5ba6-1187-d82f-d74c-0ca2c8ae8faf@huawei.com>

Excerpts from Ding Tianhong's message of January 4, 2021 10:33 pm:
> On 2020/12/5 14:57, Nicholas Piggin wrote:
>> This changes the awkward approach where architectures provide init
>> functions to determine which levels they can provide large mappings for,
>> to one where the arch is queried for each call.
>> 
>> This removes code and indirection, and allows constant-folding of dead
>> code for unsupported levels.
>> 
>> This also adds a prot argument to the arch query. This is unused
>> currently but could help with some architectures (e.g., some powerpc
>> processors can't map uncacheable memory with large pages).
>> 
>> Cc: linuxppc-dev@lists.ozlabs.org
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: Borislav Petkov <bp@alien8.de>
>> Cc: x86@kernel.org
>> Cc: "H. Peter Anvin" <hpa@zytor.com>
>> Acked-by: Catalin Marinas <catalin.marinas@arm.com> [arm64]
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  arch/arm64/include/asm/vmalloc.h         |  8 +++
>>  arch/arm64/mm/mmu.c                      | 10 +--
>>  arch/powerpc/include/asm/vmalloc.h       |  8 +++
>>  arch/powerpc/mm/book3s64/radix_pgtable.c |  8 +--
>>  arch/x86/include/asm/vmalloc.h           |  7 ++
>>  arch/x86/mm/ioremap.c                    | 10 +--
>>  include/linux/io.h                       |  9 ---
>>  include/linux/vmalloc.h                  |  6 ++
>>  init/main.c                              |  1 -
>>  mm/ioremap.c                             | 88 +++++++++---------------
>>  10 files changed, 77 insertions(+), 78 deletions(-)
>> 
>> diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
>> index 2ca708ab9b20..597b40405319 100644
>> --- a/arch/arm64/include/asm/vmalloc.h
>> +++ b/arch/arm64/include/asm/vmalloc.h
>> @@ -1,4 +1,12 @@
>>  #ifndef _ASM_ARM64_VMALLOC_H
>>  #define _ASM_ARM64_VMALLOC_H
>>  
>> +#include <asm/page.h>
>> +
>> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>> +bool arch_vmap_p4d_supported(pgprot_t prot);
>> +bool arch_vmap_pud_supported(pgprot_t prot);
>> +bool arch_vmap_pmd_supported(pgprot_t prot);
>> +#endif
>> +
>>  #endif /* _ASM_ARM64_VMALLOC_H */
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index ca692a815731..1b60079c1cef 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -1315,12 +1315,12 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
>>  	return dt_virt;
>>  }
>>  
>> -int __init arch_ioremap_p4d_supported(void)
>> +bool arch_vmap_p4d_supported(pgprot_t prot)
>>  {
>> -	return 0;
>> +	return false;
>>  }
>>  
> 
> I think you should put this function in the CONFIG_HAVE_ARCH_HUGE_VMAP, otherwise it may break the compile when disable the CONFIG_HAVE_ARCH_HUGE_VMAP, the same
> as the x86 and ppc.

Ah, good catch. arm64 is okay because it always selects 
HAVE_ARCH_HUGE_VMAP, powerpc is okay because it places
them in a file that's only compiled for configs that select
huge vmap, but x86-32 without PAE build breaks. I'll fix that.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v2 2/2] memblock: do not start bottom-up allocations with kernel_end
From: Mike Rapoport @ 2021-01-24  7:34 UTC (permalink / raw)
  To: Andrew Morton
  Cc: riel, kernel-team, Ram Pai, linux-kernel, mhocko, linux-mm,
	Satheesh Rajendran, Konrad Rzeszutek Wilk, iamjoonsoo.kim,
	linuxppc-dev, guro, Thiago Jung Bauermann
In-Reply-To: <20210123180911.aafa8404a3a7a30779713456@linux-foundation.org>

On Sat, Jan 23, 2021 at 06:09:11PM -0800, Andrew Morton wrote:
> On Fri, 22 Jan 2021 01:37:14 -0300 Thiago Jung Bauermann <bauerman@linux.ibm.com> wrote:
> 
> > Mike Rapoport <rppt@kernel.org> writes:
> > 
> > > > Signed-off-by: Roman Gushchin <guro@fb.com>
> > > 
> > > Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
> > 
> > I've seen a couple of spurious triggers of the WARN_ONCE() removed by this
> > patch. This happens on some ppc64le bare metal (powernv) server machines with
> > CONFIG_SWIOTLB=y and crashkernel=4G, as described in a candidate patch I posted
> > to solve this issue in a different way:
> > 
> > https://lore.kernel.org/linuxppc-dev/20201218062103.76102-1-bauerman@linux.ibm.com/
> > 
> > Since this patch solves that problem, is it possible to include it in the next
> > feasible v5.11-rcX, with the following tag?
> 
> We could do this, if we're confident that this patch doesn't depend on
> [1/2] "mm: cma: allocate cma areas bottom-up"?  I think it is...

A think it does not depend on cma bottom-up allocation, it's rather the other
way around: without this CMA bottom-up allocation could fail with KASLR
enabled.

Still, this patch may need updates to the way x86 does early reservations:

https://lore.kernel.org/lkml/20210115083255.12744-1-rppt@kernel.org
 
> > Fixes: 8fabc623238e ("powerpc: Ensure that swiotlb buffer is allocated from low memory")
> 
> I added that.
> 
> 

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH v2 2/2] memblock: do not start bottom-up allocations with kernel_end
From: Andrew Morton @ 2021-01-24  2:09 UTC (permalink / raw)
  To: Thiago Jung Bauermann
  Cc: riel, kernel-team, Ram Pai, linux-kernel, mhocko, linux-mm,
	Satheesh Rajendran, Konrad Rzeszutek Wilk, iamjoonsoo.kim,
	linuxppc-dev, guro, rppt
In-Reply-To: <20210122043714.266075-1-bauerman@linux.ibm.com>

On Fri, 22 Jan 2021 01:37:14 -0300 Thiago Jung Bauermann <bauerman@linux.ibm.com> wrote:

> Mike Rapoport <rppt@kernel.org> writes:
> 
> > > Signed-off-by: Roman Gushchin <guro@fb.com>
> > 
> > Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
> 
> I've seen a couple of spurious triggers of the WARN_ONCE() removed by this
> patch. This happens on some ppc64le bare metal (powernv) server machines with
> CONFIG_SWIOTLB=y and crashkernel=4G, as described in a candidate patch I posted
> to solve this issue in a different way:
> 
> https://lore.kernel.org/linuxppc-dev/20201218062103.76102-1-bauerman@linux.ibm.com/
> 
> Since this patch solves that problem, is it possible to include it in the next
> feasible v5.11-rcX, with the following tag?

We could do this, if we're confident that this patch doesn't depend on
[1/2] "mm: cma: allocate cma areas bottom-up"?  I think it is...

> Fixes: 8fabc623238e ("powerpc: Ensure that swiotlb buffer is allocated from low memory")

I added that.



^ permalink raw reply

* [PATCH] powerpc/64s: interrupt exit improve bounding of interrupt recursion
From: Nicholas Piggin @ 2021-01-23  6:15 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Athira Rajeev, Nicholas Piggin

When replaying pending soft-masked interrupts when an interrupt returns
to an irqs-enabled context, there is a special case required if this was
an asynchronous interrupt to avoid unbounded interrupt recursion.

This case was not tested for in the case the asynchronous interrupt hit
in user context, because a subsequent nested interrupt would by definition
hit in kernel mode, which then exits via the kernel path which does test
this case.

There is no reason to allow this for such interrupts. While recursion is
bounded at the next level, it's simpler and uses less stack to apply the
replay logic consistently.

This also expands the comment which was really pretty poor and didn't
explain the problem (I can say that because I wrote it).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
This is another little thing I found along the way, I don't think it
actually fixes a bug, but it avoids a recursion case, makes logic more
consistent, and improves the comment. So I'll submit it for next merge
window.

Thanks,
Nick

 arch/powerpc/kernel/syscall_64.c | 55 +++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c
index 7c85ed04a164..e0eb2a502db3 100644
--- a/arch/powerpc/kernel/syscall_64.c
+++ b/arch/powerpc/kernel/syscall_64.c
@@ -138,8 +138,12 @@ notrace long system_call_exception(long r3, long r4, long r5,
 /*
  * local irqs must be disabled. Returns false if the caller must re-enable
  * them, check for new work, and try again.
+ *
+ * This should be called with local irqs disabled, but if they were previously
+ * enabled when the interrupt handler returns (indicating a process-context /
+ * synchronous interrupt) then irqs_enabled should be true.
  */
-static notrace inline bool prep_irq_for_enabled_exit(bool clear_ri)
+static notrace inline bool prep_irq_for_enabled_exit(bool clear_ri, bool irqs_enabled)
 {
 	/* This must be done with RI=1 because tracing may touch vmaps */
 	trace_hardirqs_on();
@@ -156,6 +160,29 @@ static notrace inline bool prep_irq_for_enabled_exit(bool clear_ri)
 		trace_hardirqs_off();
 		local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
 
+		/*
+		 * Must replay pending soft-masked interrupts now. Don't just
+		 * local_irq_enabe(); local_irq_disable(); because if we are
+		 * returning from an asynchronous interrupt here, another one
+		 * might hit after irqs are enabled, and it would exit via this
+		 * same path allowing another to fire, and so on unbounded.
+		 *
+		 * If interrupts were enabled when this interrupt exited,
+		 * indicating a process context (synchronous) interrupt,
+		 * local_irq_enable/disable can be used, which will enable
+		 * interrupts rather than keeping them masked (unclear how
+		 * much benefit this is over just replaying for all cases,
+		 * because we immediately disable again, so all we're really
+		 * doing is allowing hard interrupts to execute directly for
+		 * a very small time, rather than being masked and replayed).
+		 */
+		if (irqs_enabled) {
+			local_irq_enable();
+			local_irq_disable();
+		} else {
+			replay_soft_interrupts();
+		}
+
 		return false;
 	}
 	local_paca->irq_happened = 0;
@@ -212,8 +239,9 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 		ret |= _TIF_RESTOREALL;
 	}
 
-again:
 	local_irq_disable();
+
+again:
 	ti_flags = READ_ONCE(*ti_flagsp);
 	while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
 		local_irq_enable();
@@ -258,10 +286,8 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 	}
 
 	/* scv need not set RI=0 because SRRs are not used */
-	if (unlikely(!prep_irq_for_enabled_exit(!scv))) {
-		local_irq_enable();
+	if (unlikely(!prep_irq_for_enabled_exit(!scv, true)))
 		goto again;
-	}
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	local_paca->tm_scratch = regs->msr;
@@ -336,11 +362,8 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
 		}
 	}
 
-	if (unlikely(!prep_irq_for_enabled_exit(true))) {
-		local_irq_enable();
-		local_irq_disable();
+	if (unlikely(!prep_irq_for_enabled_exit(true, !irqs_disabled_flags(flags))))
 		goto again;
-	}
 
 #ifdef CONFIG_PPC_BOOK3E
 	if (unlikely(ts->debug.dbcr0 & DBCR0_IDM)) {
@@ -403,20 +426,8 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
 			}
 		}
 
-		if (unlikely(!prep_irq_for_enabled_exit(true))) {
-			/*
-			 * Can't local_irq_restore to replay if we were in
-			 * interrupt context. Must replay directly.
-			 */
-			if (irqs_disabled_flags(flags)) {
-				replay_soft_interrupts();
-			} else {
-				local_irq_restore(flags);
-				local_irq_save(flags);
-			}
-			/* Took an interrupt, may have more exit work to do. */
+		if (unlikely(!prep_irq_for_enabled_exit(true, !irqs_disabled_flags(flags))))
 			goto again;
-		}
 	} else {
 		/* Returning to a kernel context with local irqs disabled. */
 		__hard_EE_RI_disable();
-- 
2.23.0


^ permalink raw reply related

* [PATCH] powerpc/64s: prevent recursive replay_soft_interrupts causing superfluous interrupt
From: Nicholas Piggin @ 2021-01-23  6:12 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Athira Rajeev, Nicholas Piggin

When an asynchronous interrupt calls irq_exit, it checks for softirqs
that may have been created, and runs them. Running softirqs enables
local irqs, which can replay pending interrupts causing recursion in
replay_soft_interrupts. This abridged trace shows how this can occur:

! NIP replay_soft_interrupts
  LR  interrupt_exit_kernel_prepare
  Call Trace:
    interrupt_exit_kernel_prepare (unreliable)
    interrupt_return
  --- interrupt: ea0 at __rb_reserve_next
  NIP __rb_reserve_next
  LR __rb_reserve_next
  Call Trace:
    ring_buffer_lock_reserve
    trace_function
    function_trace_call
    ftrace_call
    __do_softirq
    irq_exit
    timer_interrupt
!   replay_soft_interrupts
    interrupt_exit_kernel_prepare
    interrupt_return
  --- interrupt: ea0 at arch_local_irq_restore

This can not be prevented easily, because softirqs must not block hard
irqs, so it has to be dealt with.

The recursion is bounded by design in the softirq code because softirq
replay disables softirqs and loops around again to check for new
softirqs created while it ran, so that's not a problem.

However it does mess up interrupt replay state, causing superfluous
interrupts when the second replay_soft_interrupts clears a pending
interrupt, leaving it still set in the first call in the 'happened'
local variable.

Fix this by not caching a copy of irqs_happened across interrupt
handler calls.

Fixes: 3282a3da25bd ("powerpc/64: Implement soft interrupt replay in C")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
It's actually very tricky / practically impossible to prevent softirqs
from being run like my previous attempt tried to do, when you really
get into details. Certainly not for a fix. I might try to attempt that
some time in future, but it would require kernel/softirq.c changes and
what we do now isn't conceptually different from architectures without
soft-masking, it's just another complexity to the complex soft-masking
code.

Anyway this hopefully fixes the PMU bug, but as I said I couldn't
reproduce it with your test case so it would be good if you could
give it a test.

Thanks,
Nick

 arch/powerpc/kernel/irq.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 6b1eca53e36c..cc7a6271b6b4 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -180,13 +180,18 @@ void notrace restore_interrupts(void)
 
 void replay_soft_interrupts(void)
 {
+	struct pt_regs regs;
+
 	/*
-	 * We use local_paca rather than get_paca() to avoid all
-	 * the debug_smp_processor_id() business in this low level
-	 * function
+	 * Be careful here, calling these interrupt handlers can cause
+	 * softirqs to be raised, which they may run when calling irq_exit,
+	 * which will cause local_irq_enable() to be run, which can then
+	 * recurse into this function. Don't keep any state across
+	 * interrupt handler calls which may change underneath us.
+	 *
+	 * We use local_paca rather than get_paca() to avoid all the
+	 * debug_smp_processor_id() business in this low level function.
 	 */
-	unsigned char happened = local_paca->irq_happened;
-	struct pt_regs regs;
 
 	ppc_save_regs(&regs);
 	regs.softe = IRQS_ENABLED;
@@ -209,7 +214,7 @@ void replay_soft_interrupts(void)
 	 * This is a higher priority interrupt than the others, so
 	 * replay it first.
 	 */
-	if (IS_ENABLED(CONFIG_PPC_BOOK3S) && (happened & PACA_IRQ_HMI)) {
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S) && (local_paca->irq_happened & PACA_IRQ_HMI)) {
 		local_paca->irq_happened &= ~PACA_IRQ_HMI;
 		regs.trap = 0xe60;
 		handle_hmi_exception(&regs);
@@ -217,7 +222,7 @@ void replay_soft_interrupts(void)
 			hard_irq_disable();
 	}
 
-	if (happened & PACA_IRQ_DEC) {
+	if (local_paca->irq_happened & PACA_IRQ_DEC) {
 		local_paca->irq_happened &= ~PACA_IRQ_DEC;
 		regs.trap = 0x900;
 		timer_interrupt(&regs);
@@ -225,7 +230,7 @@ void replay_soft_interrupts(void)
 			hard_irq_disable();
 	}
 
-	if (happened & PACA_IRQ_EE) {
+	if (local_paca->irq_happened & PACA_IRQ_EE) {
 		local_paca->irq_happened &= ~PACA_IRQ_EE;
 		regs.trap = 0x500;
 		do_IRQ(&regs);
@@ -233,7 +238,7 @@ void replay_soft_interrupts(void)
 			hard_irq_disable();
 	}
 
-	if (IS_ENABLED(CONFIG_PPC_DOORBELL) && (happened & PACA_IRQ_DBELL)) {
+	if (IS_ENABLED(CONFIG_PPC_DOORBELL) && (local_paca->irq_happened & PACA_IRQ_DBELL)) {
 		local_paca->irq_happened &= ~PACA_IRQ_DBELL;
 		if (IS_ENABLED(CONFIG_PPC_BOOK3E))
 			regs.trap = 0x280;
@@ -245,7 +250,7 @@ void replay_soft_interrupts(void)
 	}
 
 	/* Book3E does not support soft-masking PMI interrupts */
-	if (IS_ENABLED(CONFIG_PPC_BOOK3S) && (happened & PACA_IRQ_PMI)) {
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S) && (local_paca->irq_happened & PACA_IRQ_PMI)) {
 		local_paca->irq_happened &= ~PACA_IRQ_PMI;
 		regs.trap = 0xf00;
 		performance_monitor_exception(&regs);
@@ -253,8 +258,7 @@ void replay_soft_interrupts(void)
 			hard_irq_disable();
 	}
 
-	happened = local_paca->irq_happened;
-	if (happened & ~PACA_IRQ_HARD_DIS) {
+	if (local_paca->irq_happened & ~PACA_IRQ_HARD_DIS) {
 		/*
 		 * We are responding to the next interrupt, so interrupt-off
 		 * latencies should be reset here.
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH][next] scsi: ibmvfc: Fix spelling mistake "succeded" -> "succeeded"
From: Martin K. Petersen @ 2021-01-23  4:22 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Tyrel Datwyler, Michael Ellerman,
	Paul Mackerras, James E . J . Bottomley, Colin King, linux-scsi,
	linuxppc-dev
  Cc: kernel-janitors, linux-kernel, Martin K . Petersen
In-Reply-To: <20210118111346.70798-1-colin.king@canonical.com>

On Mon, 18 Jan 2021 11:13:46 +0000, Colin King wrote:

> There is a spelling mistake in a ibmvfc_dbg debug message. Fix it.

Applied to 5.12/scsi-queue, thanks!

[1/1] scsi: ibmvfc: Fix spelling mistake "succeded" -> "succeeded"
      https://git.kernel.org/mkp/scsi/c/ff79acc49af8

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 6/6] powerpc/rtas: constrain user region allocation to RMA
From: Alexey Kardashevskiy @ 2021-01-23  1:54 UTC (permalink / raw)
  To: Nathan Lynch, Michael Ellerman, linuxppc-dev
  Cc: tyreld, brking, ajd, aneesh.kumar
In-Reply-To: <87pn1ywbs9.fsf@linux.ibm.com>



On 22/01/2021 02:27, Nathan Lynch wrote:
> Michael Ellerman <mpe@ellerman.id.au> writes:
>> Nathan Lynch <nathanl@linux.ibm.com> writes:
>>> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>>>> On 16/01/2021 02:38, Nathan Lynch wrote:
>>>>> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>>>>>> On 15/01/2021 09:00, Nathan Lynch wrote:
>>>>>>> Memory locations passed as arguments from the OS to RTAS usually need
>>>>>>> to be addressable in 32-bit mode and must reside in the Real Mode
>>>>>>> Area. On PAPR guests, the RMA starts at logical address 0 and is the
>>>>>>> first logical memory block reported in the LPAR’s device tree.
>>>>>>>
>>>>>>> On powerpc targets with RTAS, Linux makes available to user space a
>>>>>>> region of memory suitable for arguments to be passed to RTAS via
>>>>>>> sys_rtas(). This region (rtas_rmo_buf) is allocated via the memblock
>>>>>>> API during boot in order to ensure that it satisfies the requirements
>>>>>>> described above.
>>>>>>>
>>>>>>> With radix MMU, the upper limit supplied to the memblock allocation
>>>>>>> can exceed the bounds of the first logical memory block, since
>>>>>>> ppc64_rma_size is ULONG_MAX and RTAS_INSTANTIATE_MAX is 1GB. (512MB is
>>>>>>> a common size of the first memory block according to a small sample of
>>>>>>> LPARs I have checked.) This leads to failures when user space invokes
>>>>>>> an RTAS function that uses a work area, such as
>>>>>>> ibm,configure-connector.
>>>>>>>
>>>>>>> Alter the determination of the upper limit for rtas_rmo_buf's
>>>>>>> allocation to consult the device tree directly, ensuring placement
>>>>>>> within the RMA regardless of the MMU in use.
>>>>>>
>>>>>> Can we tie this with RTAS (which also needs to be in RMA) and simply add
>>>>>> extra 64K in prom_instantiate_rtas() and advertise this address
>>>>>> (ALIGH_UP(rtas-base + rtas-size, PAGE_SIZE)) to the user space? We do
>>>>>> not need this RMO area before that point.
>>>>>
>>>>> Can you explain more about what advantage that would bring? I'm not
>>>>> seeing it. It's a more significant change than what I've written
>>>>> here.
>>>>
>>>>
>>>> We already allocate space for RTAS and (like RMO) it needs to be in RMA,
>>>> and RMO is useless without RTAS. We can reuse RTAS allocation code for
>>>> RMO like this:
>>>
>>> When you say RMO I assume you are referring to rtas_rmo_buf? (I don't
>>> think it is well-named.)
>> ...
>>
>> RMO (Real mode offset) is the old term we used to use to refer to what
>> is now called the RMA (Real mode area). There are still many references
>> to RMO in Linux, but they almost certainly all refer to what we now call
>> the RMA.
> 
> Yes... but I think in this discussion Alexey was using RMO to stand in
> for rtas_rmo_buf, which was what I was trying to clarify.


Correct. Thanks for the clarification, appreciated.

> 
>>>> May be store in the FDT as "linux,rmo-base" next to "linux,rtas-base",
>>>> for clarity, as sharing symbols between prom and main kernel is a bit
>>>> tricky.
>>>>
>>>> The benefit is that we do not do the same thing   (== find 64K in RMA)
>>>> in 2 different ways and if the RMO allocated my way is broken - we'll
>>>> know it much sooner as RTAS itself will break too.
>>>
>>> Implementation details aside... I'll grant that combining the
>>> allocations into one in prom_init reduces some duplication in the sense
>>> that both are subject to the same constraints (mostly - the RTAS data
>>> area must not cross a 256MB boundary, while the user region may). But
>>> they really are distinct concerns. The RTAS private data area is
>>> specified in the platform architecture, the OS is obligated to allocate
>>> it and pass it to instantiate-rtas, etc etc. However the user region
>>> (rtas_rmo_buf) is purely a Linux construct which is there to support
>>> sys_rtas.
>>>
>>> Now, there are multiple sites in the kernel proper that must allocate
>>> memory suitable for passing to RTAS. Obviously there is value in
>>> consolidating the logic for that purpose in one place, so I'll work on
>>> adding that in v2. OK?
>>
>> I don't think we want to move any allocations into prom_init.c unless we
>> have to.
>>
>> It's best thought of as a trampoline, that runs before the kernel
>> proper, to transition from live OF to a flat DT environment. One thing
>> that must be done as part of that is instantiating RTAS, because it's
>> basically a runtime copy of the live OF. But any other allocs are for
>> Linux to handle later, IMHO.
> 
> Agreed.

Then the only comment I have left is may be use of_address_to_resource() 
+ resource_size() instead of of_n_addr_cells()/of_n_size_cells() (like 
pseries_memory_block_size()). And now I shut up :) Thanks,


-- 
Alexey

^ permalink raw reply

* Re: [PATCH] lib/sstep: Fix incorrect return from analyze_instr()
From: Michael Ellerman @ 2021-01-23  0:33 UTC (permalink / raw)
  To: Ananth N Mavinakayanahalli, linuxppc-dev
  Cc: naveen.n.rao, ravi.bangoria, paulus, sandipan
In-Reply-To: <161124771457.333703.14641179082577500423.stgit@thinktux.local>

Ananth N Mavinakayanahalli <ananth@linux.ibm.com> writes:
> We currently just percolate the return value from analyze_instr()
> to the caller of emulate_step(), especially if it is a -1.
>
> For one particular case (opcode = 4) for instructions that
> aren't currently emulated, we are returning 'should not be
> single-stepped' while we should have returned 0 which says
> 'did not emulate, may have to single-step'.
>
> Signed-off-by: Ananth N Mavinakayanahalli <ananth@linux.ibm.com>
> Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
> ---
>  arch/powerpc/lib/sstep.c |   49 +++++++++++++++++++++++++---------------------
>  1 file changed, 27 insertions(+), 22 deletions(-)
>
> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
> index 5a425a4a1d88..a3a0373843cd 100644
> --- a/arch/powerpc/lib/sstep.c
> +++ b/arch/powerpc/lib/sstep.c
> @@ -1445,34 +1445,39 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
>  
>  #ifdef __powerpc64__
>  	case 4:
> -		if (!cpu_has_feature(CPU_FTR_ARCH_300))
> -			return -1;
> -
> -		switch (word & 0x3f) {
> -		case 48:	/* maddhd */
> -			asm volatile(PPC_MADDHD(%0, %1, %2, %3) :
> -				     "=r" (op->val) : "r" (regs->gpr[ra]),
> -				     "r" (regs->gpr[rb]), "r" (regs->gpr[rc]));
> -			goto compute_done;
> +		/*
> +		 * There are very many instructions with this primary opcode
> +		 * introduced in the ISA as early as v2.03. However, the ones
> +		 * we currently emulate were all introduced with ISA 3.0
> +		 */
> +		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> +			switch (word & 0x3f) {
> +			case 48:	/* maddhd */
> +				asm volatile(PPC_MADDHD(%0, %1, %2, %3) :
> +					     "=r" (op->val) : "r" (regs->gpr[ra]),
> +					     "r" (regs->gpr[rb]), "r" (regs->gpr[rc]));
> +				goto compute_done;

Indenting everything makes this patch harder to read, and I think makes
the resulting code harder to read too. We already have two levels of
switch here, and we're inside a ~1700 line function, so keeping things
simple is important I think.

Doesn't this achieve the same result?

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index bf7a7d62ae8b..d631baaf1da2 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1443,8 +1443,10 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
 #ifdef __powerpc64__
 	case 4:
-		if (!cpu_has_feature(CPU_FTR_ARCH_300))
-			return -1;
+		if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+			op->type = UNKNOWN;
+			return 0;
+		}
 
 		switch (word & 0x3f) {
 		case 48:	/* maddhd */
@@ -1470,7 +1472,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 		 * There are other instructions from ISA 3.0 with the same
 		 * primary opcode which do not have emulation support yet.
 		 */
-		return -1;
+		op->type = UNKNOWN;
+		return 0;
 #endif
 
 	case 7:		/* mulli */


cheers


^ permalink raw reply related

* Re: [PATCH 2/2] ima: Free IMA measurement buffer after kexec syscall
From: Thiago Jung Bauermann @ 2021-01-22 22:31 UTC (permalink / raw)
  To: Lakshmi Ramasubramanian
  Cc: sashal, dmitry.kasatkin, linux-kernel, zohar, tyhicks, ebiederm,
	gregkh, linux-integrity, linuxppc-dev
In-Reply-To: <20210121173003.18324-2-nramas@linux.microsoft.com>


Lakshmi Ramasubramanian <nramas@linux.microsoft.com> writes:

> IMA allocates kernel virtual memory to carry forward the measurement
> list, from the current kernel to the next kernel on kexec system call,
> in ima_add_kexec_buffer() function.  This buffer is not freed before
> completing the kexec system call resulting in memory leak.
>
> Add ima_buffer field in "struct kimage" to store the virtual address
> of the buffer allocated for the IMA measurement list.
> Free the memory allocated for the IMA measurement list in
> kimage_file_post_load_cleanup() function.
>
> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
> Suggested-by: Tyler Hicks <tyhicks@linux.microsoft.com>
> Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")

Good catch.

Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH 1/2] ima: Free IMA measurement buffer on error
From: Thiago Jung Bauermann @ 2021-01-22 22:30 UTC (permalink / raw)
  To: Lakshmi Ramasubramanian
  Cc: sashal, dmitry.kasatkin, linux-kernel, zohar, tyhicks, ebiederm,
	gregkh, linux-integrity, linuxppc-dev
In-Reply-To: <20210121173003.18324-1-nramas@linux.microsoft.com>


Hi Lakshmi,

Lakshmi Ramasubramanian <nramas@linux.microsoft.com> writes:

> IMA allocates kernel virtual memory to carry forward the measurement
> list, from the current kernel to the next kernel on kexec system call,
> in ima_add_kexec_buffer() function.  In error code paths this memory
> is not freed resulting in memory leak.
>
> Free the memory allocated for the IMA measurement list in
> the error code paths in ima_add_kexec_buffer() function.
>
> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
> Suggested-by: Tyler Hicks <tyhicks@linux.microsoft.com>
> Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")
> ---
>  security/integrity/ima/ima_kexec.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
> index 121de3e04af2..212145008a01 100644
> --- a/security/integrity/ima/ima_kexec.c
> +++ b/security/integrity/ima/ima_kexec.c
> @@ -119,12 +119,14 @@ void ima_add_kexec_buffer(struct kimage *image)
>  	ret = kexec_add_buffer(&kbuf);
>  	if (ret) {
>  		pr_err("Error passing over kexec measurement buffer.\n");
> +		vfree(kexec_buffer);
>  		return;
>  	}

This is a good catch.

>  
>  	ret = arch_ima_add_kexec_buffer(image, kbuf.mem, kexec_segment_size);
>  	if (ret) {
>  		pr_err("Error passing over kexec measurement buffer.\n");
> +		vfree(kexec_buffer);
>  		return;
>  	}

But this would cause problems, because the buffer is still there in the
kimage and would cause kimage_load_segment() to access invalid memory.

There's no function to undo a kexec_add_buffer() to avoid this problem,
so I'd suggest just accepting the leak in this case. Fortunately, the
current implementations of arch_ima_add_kexec_buffer() are very simple
and cannot fail, so this is a theoretical problem.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [musl] Re: [PATCH v2] powerpc/64/signal: balance return predictor stack in signal trampoline
From: Raoni Fassina Firmino @ 2021-01-22 18:50 UTC (permalink / raw)
  To: Rich Felker
  Cc: Florian Weimer, libc-alpha, Alan Modra, musl, Nicholas Piggin,
	linuxppc-dev
In-Reply-To: <20210122183127.GQ23432@brightrain.aerifal.cx>

On Fri, Jan 22, 2021 at 01:31:27PM -0500, Rich Felker wrote:
> On Fri, Jan 22, 2021 at 03:19:22PM -0300, Raoni Fassina Firmino wrote:
> > On Fri, Jan 22, 2021 at 09:44:05AM -0500, Rich Felker wrote:
> > > Maybe I'm missing something but I don't see how this would break musl;
> > > we just inspect the PC in the mcontext, which I don't see any changes
> > > to and which should still point to the next instruction of the
> > > interrupted context. I don't have a test environment though so I'll
> > > have to wait for feedback from ppc users to be sure. Are there any
> > > further details on how it's breaking glibc?
> > 
> > For glibc, backtrace() compares the return-address from each stack frame
> > to the value of `__kernel_sigtramp_rt64` to identify the frame with the
> > mcontext information, but now the return-address is not the start of the
> > routine, but the middle of it, so it fails to catch this special frame.
> 
> Is there a reason it's backtracing rather than just looking at the
> interrupted context (pointed to by the third argument to the signal
> handler)?

The regression is exposed in the backtrace() routine. More precisely,
when the backtrace() is called from inside a signal handling. What I
described is the way backtrace() uses to identify this special
situation. What is failling in glibc is the test for this.


o/
Raoni Fassina

^ permalink raw reply

* Re: [musl] Re: [PATCH v2] powerpc/64/signal: balance return predictor stack in signal trampoline
From: Rich Felker @ 2021-01-22 18:31 UTC (permalink / raw)
  To: Florian Weimer, musl, libc-alpha, linuxppc-dev, Nicholas Piggin,
	Alan Modra
In-Reply-To: <20210122181922.pcxyomeg5xcf2umu@work-tp>

On Fri, Jan 22, 2021 at 03:19:22PM -0300, Raoni Fassina Firmino wrote:
> On Fri, Jan 22, 2021 at 09:44:05AM -0500, Rich Felker wrote:
> > Maybe I'm missing something but I don't see how this would break musl;
> > we just inspect the PC in the mcontext, which I don't see any changes
> > to and which should still point to the next instruction of the
> > interrupted context. I don't have a test environment though so I'll
> > have to wait for feedback from ppc users to be sure. Are there any
> > further details on how it's breaking glibc?
> 
> For glibc, backtrace() compares the return-address from each stack frame
> to the value of `__kernel_sigtramp_rt64` to identify the frame with the
> mcontext information, but now the return-address is not the start of the
> routine, but the middle of it, so it fails to catch this special frame.

Is there a reason it's backtracing rather than just looking at the
interrupted context (pointed to by the third argument to the signal
handler)?

Rich

^ permalink raw reply

* Re: [musl] Re: [PATCH v2] powerpc/64/signal: balance return predictor stack in signal trampoline
From: Raoni Fassina Firmino @ 2021-01-22 18:19 UTC (permalink / raw)
  To: Rich Felker
  Cc: Florian Weimer, libc-alpha, Alan Modra, musl, Nicholas Piggin,
	linuxppc-dev
In-Reply-To: <20210122144402.GP23432@brightrain.aerifal.cx>

On Fri, Jan 22, 2021 at 09:44:05AM -0500, Rich Felker wrote:
> Maybe I'm missing something but I don't see how this would break musl;
> we just inspect the PC in the mcontext, which I don't see any changes
> to and which should still point to the next instruction of the
> interrupted context. I don't have a test environment though so I'll
> have to wait for feedback from ppc users to be sure. Are there any
> further details on how it's breaking glibc?

For glibc, backtrace() compares the return-address from each stack frame
to the value of `__kernel_sigtramp_rt64` to identify the frame with the
mcontext information, but now the return-address is not the start of the
routine, but the middle of it, so it fails to catch this special frame.


o/
Raoni Fassina

^ permalink raw reply

* Re: [PATCH v2] powerpc/64/signal: balance return predictor stack in signal trampoline
From: Raoni Fassina Firmino @ 2021-01-22 18:13 UTC (permalink / raw)
  To: Florian Weimer
  Cc: libc-alpha, musl, linuxppc-dev, Nicholas Piggin, Alan Modra
In-Reply-To: <87im7pp5yl.fsf@oldenburg.str.redhat.com>

On Fri, Jan 22, 2021 at 12:27:14PM +0100, AL glibc-alpha wrote:
> * Nicholas Piggin:
> 
> > diff --git a/arch/powerpc/kernel/vdso64/sigtramp.S b/arch/powerpc/kernel/vdso64/sigtramp.S
> > index a8cc0409d7d2..bbf68cd01088 100644
> > --- a/arch/powerpc/kernel/vdso64/sigtramp.S
> > +++ b/arch/powerpc/kernel/vdso64/sigtramp.S
> > @@ -6,6 +6,7 @@
> >   * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
> >   * Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp.
> >   */
> > +#include <asm/cache.h>		/* IFETCH_ALIGN_BYTES */
> >  #include <asm/processor.h>
> >  #include <asm/ppc_asm.h>
> >  #include <asm/unistd.h>
> > @@ -14,21 +15,17 @@
> >  
> >  	.text
> >  
> > -/* The nop here is a hack.  The dwarf2 unwind routines subtract 1 from
> > -   the return address to get an address in the middle of the presumed
> > -   call instruction.  Since we don't have a call here, we artificially
> > -   extend the range covered by the unwind info by padding before the
> > -   real start.  */
> > -	nop
> >  	.balign 8
> > +	.balign IFETCH_ALIGN_BYTES
> >  V_FUNCTION_BEGIN(__kernel_sigtramp_rt64)
> > -.Lsigrt_start = . - 4
> > +.Lsigrt_start:
> > +	bctrl	/* call the handler */
> >  	addi	r1, r1, __SIGNAL_FRAMESIZE
> >  	li	r0,__NR_rt_sigreturn
> >  	sc
> >  .Lsigrt_end:
> >  V_FUNCTION_END(__kernel_sigtramp_rt64)
> > -/* The ".balign 8" above and the following zeros mimic the old stack
> > +/* The .balign 8 above and the following zeros mimic the old stack
> >     trampoline layout.  The last magic value is the ucontext pointer,
> >     chosen in such a way that older libgcc unwind code returns a zero
> >     for a sigcontext pointer.  */
> 
> As far as I understand it, this breaks cancellation handling on musl and
> future glibc because it is necessary to look at the signal delivery
> location to see if a system call sequence has result in an action, and
> that location is no longer in user code after this change.
> 
> We have a glibc test in preparation of our change, and it started
> failing:
> 
>   Linux 5.10 breaks sigcontext_get_pc on powerpc64
>   <https://sourceware.org/bugzilla/show_bug.cgi?id=27223>
> 
> Isn't it possible to avoid the return predictor desynchronization by
> adding the appropriate hint?

I also caught this regression, I believe it was introduced in the kernel
5.9.

I don't know enough to comment on Florian suggestion, but I am working
on some possible fixes:

On the kernel side we can keep `__kernel_sigtramp_rt64` in the original
place after `bctrl` and add a new symbol so the kernel can jump to the
right place before `bctrl`.  This would ensure backward compatibility.

On the other side, this change exposed how fragile `backtrace()` is to
any changes in the trampoline code, which the libc has no control over
in this case. So maybe there is something that can be improved in how
backtrace decides that the return-address is to the trampoline. My fist
option is to test a range, after `__kernel_sigtramp_rt64` so see if the
address is inside the routine. This would be better if we can know the
size of the function, I know that the vdso.so has this info in the elf
but I don't know if it is exposed to the glibc.

As Nicholas mentioned in his patch, GDB seems to keep working just fine,
it is seems that GDB uses some heuristics to match the surround code of
the return address to identify that it is the trampoline code. So maybe
other option is to do something similar.


o/
Raoni Fassina

^ permalink raw reply

* Re: [musl] Re: [PATCH v2] powerpc/64/signal: balance return predictor stack in signal trampoline
From: Rich Felker @ 2021-01-22 14:44 UTC (permalink / raw)
  To: Florian Weimer
  Cc: musl, libc-alpha, linuxppc-dev, Nicholas Piggin, Alan Modra
In-Reply-To: <87im7pp5yl.fsf@oldenburg.str.redhat.com>

On Fri, Jan 22, 2021 at 12:27:14PM +0100, Florian Weimer wrote:
> * Nicholas Piggin:
> 
> > diff --git a/arch/powerpc/kernel/vdso64/sigtramp.S b/arch/powerpc/kernel/vdso64/sigtramp.S
> > index a8cc0409d7d2..bbf68cd01088 100644
> > --- a/arch/powerpc/kernel/vdso64/sigtramp.S
> > +++ b/arch/powerpc/kernel/vdso64/sigtramp.S
> > @@ -6,6 +6,7 @@
> >   * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
> >   * Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp.
> >   */
> > +#include <asm/cache.h>		/* IFETCH_ALIGN_BYTES */
> >  #include <asm/processor.h>
> >  #include <asm/ppc_asm.h>
> >  #include <asm/unistd.h>
> > @@ -14,21 +15,17 @@
> >  
> >  	.text
> >  
> > -/* The nop here is a hack.  The dwarf2 unwind routines subtract 1 from
> > -   the return address to get an address in the middle of the presumed
> > -   call instruction.  Since we don't have a call here, we artificially
> > -   extend the range covered by the unwind info by padding before the
> > -   real start.  */
> > -	nop
> >  	.balign 8
> > +	.balign IFETCH_ALIGN_BYTES
> >  V_FUNCTION_BEGIN(__kernel_sigtramp_rt64)
> > -.Lsigrt_start = . - 4
> > +.Lsigrt_start:
> > +	bctrl	/* call the handler */
> >  	addi	r1, r1, __SIGNAL_FRAMESIZE
> >  	li	r0,__NR_rt_sigreturn
> >  	sc
> >  .Lsigrt_end:
> >  V_FUNCTION_END(__kernel_sigtramp_rt64)
> > -/* The ".balign 8" above and the following zeros mimic the old stack
> > +/* The .balign 8 above and the following zeros mimic the old stack
> >     trampoline layout.  The last magic value is the ucontext pointer,
> >     chosen in such a way that older libgcc unwind code returns a zero
> >     for a sigcontext pointer.  */
> 
> As far as I understand it, this breaks cancellation handling on musl and
> future glibc because it is necessary to look at the signal delivery
> location to see if a system call sequence has result in an action, and
> that location is no longer in user code after this change.
> 
> We have a glibc test in preparation of our change, and it started
> failing:
> 
>   Linux 5.10 breaks sigcontext_get_pc on powerpc64
>   <https://sourceware.org/bugzilla/show_bug.cgi?id=27223>
> 
> Isn't it possible to avoid the return predictor desynchronization by
> adding the appropriate hint?

Maybe I'm missing something but I don't see how this would break musl;
we just inspect the PC in the mcontext, which I don't see any changes
to and which should still point to the next instruction of the
interrupted context. I don't have a test environment though so I'll
have to wait for feedback from ppc users to be sure. Are there any
further details on how it's breaking glibc?

Rich

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox