LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v11 10/13] mm: Move vmap_range from mm/ioremap.c to mm/vmalloc.c
From: Nicholas Piggin @ 2021-01-26  4:45 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Jonathan Cameron, Rick Edgecombe, linuxppc-dev,
	Christoph Hellwig
In-Reply-To: <20210126044510.2491820-1-npiggin@gmail.com>

This is a generic kernel virtual memory mapper, not specific to ioremap.

Code is unchanged other than making vmap_range non-static.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 include/linux/vmalloc.h |   3 +
 mm/ioremap.c            | 203 ----------------------------------------
 mm/vmalloc.c            | 202 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 205 insertions(+), 203 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 9f7b8b00101b..99ea72d547dc 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -194,6 +194,9 @@ extern struct vm_struct *remove_vm_area(const void *addr);
 extern struct vm_struct *find_vm_area(const void *addr);
 
 #ifdef CONFIG_MMU
+int vmap_range(unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift);
 extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
 				    pgprot_t prot, struct page **pages);
 int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 3264d0203785..d1dcc7e744ac 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -28,209 +28,6 @@ early_param("nohugeiomap", set_nohugeiomap);
 static const bool iomap_max_page_shift = PAGE_SHIFT;
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			pgtbl_mod_mask *mask)
-{
-	pte_t *pte;
-	u64 pfn;
-
-	pfn = phys_addr >> PAGE_SHIFT;
-	pte = pte_alloc_kernel_track(pmd, addr, mask);
-	if (!pte)
-		return -ENOMEM;
-	do {
-		BUG_ON(!pte_none(*pte));
-		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
-		pfn++;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	*mask |= PGTBL_PTE_MODIFIED;
-	return 0;
-}
-
-static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift)
-{
-	if (max_page_shift < PMD_SHIFT)
-		return 0;
-
-	if (!arch_vmap_pmd_supported(prot))
-		return 0;
-
-	if ((end - addr) != PMD_SIZE)
-		return 0;
-
-	if (!IS_ALIGNED(addr, PMD_SIZE))
-		return 0;
-
-	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
-		return 0;
-
-	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
-		return 0;
-
-	return pmd_set_huge(pmd, phys_addr, prot);
-}
-
-static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift, pgtbl_mod_mask *mask)
-{
-	pmd_t *pmd;
-	unsigned long next;
-
-	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
-	if (!pmd)
-		return -ENOMEM;
-	do {
-		next = pmd_addr_end(addr, end);
-
-		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
-					max_page_shift)) {
-			*mask |= PGTBL_PMD_MODIFIED;
-			continue;
-		}
-
-		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
-			return -ENOMEM;
-	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
-	return 0;
-}
-
-static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift)
-{
-	if (max_page_shift < PUD_SHIFT)
-		return 0;
-
-	if (!arch_vmap_pud_supported(prot))
-		return 0;
-
-	if ((end - addr) != PUD_SIZE)
-		return 0;
-
-	if (!IS_ALIGNED(addr, PUD_SIZE))
-		return 0;
-
-	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
-		return 0;
-
-	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
-		return 0;
-
-	return pud_set_huge(pud, phys_addr, prot);
-}
-
-static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift, pgtbl_mod_mask *mask)
-{
-	pud_t *pud;
-	unsigned long next;
-
-	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
-	if (!pud)
-		return -ENOMEM;
-	do {
-		next = pud_addr_end(addr, end);
-
-		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
-					max_page_shift)) {
-			*mask |= PGTBL_PUD_MODIFIED;
-			continue;
-		}
-
-		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
-					max_page_shift, mask))
-			return -ENOMEM;
-	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
-	return 0;
-}
-
-static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift)
-{
-	if (max_page_shift < P4D_SHIFT)
-		return 0;
-
-	if (!arch_vmap_p4d_supported(prot))
-		return 0;
-
-	if ((end - addr) != P4D_SIZE)
-		return 0;
-
-	if (!IS_ALIGNED(addr, P4D_SIZE))
-		return 0;
-
-	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
-		return 0;
-
-	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
-		return 0;
-
-	return p4d_set_huge(p4d, phys_addr, prot);
-}
-
-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift, pgtbl_mod_mask *mask)
-{
-	p4d_t *p4d;
-	unsigned long next;
-
-	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
-	if (!p4d)
-		return -ENOMEM;
-	do {
-		next = p4d_addr_end(addr, end);
-
-		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
-					max_page_shift)) {
-			*mask |= PGTBL_P4D_MODIFIED;
-			continue;
-		}
-
-		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
-					max_page_shift, mask))
-			return -ENOMEM;
-	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
-	return 0;
-}
-
-static int vmap_range(unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift)
-{
-	pgd_t *pgd;
-	unsigned long start;
-	unsigned long next;
-	int err;
-	pgtbl_mod_mask mask = 0;
-
-	might_sleep();
-	BUG_ON(addr >= end);
-
-	start = addr;
-	pgd = pgd_offset_k(addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
-					max_page_shift, &mask);
-		if (err)
-			break;
-	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
-
-	flush_cache_vmap(start, end);
-
-	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
-		arch_sync_kernel_mappings(start, end);
-
-	return err;
-}
-
 int ioremap_page_range(unsigned long addr,
 		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7f2f36116980..f043386bb51d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -68,6 +68,208 @@ static void free_work(struct work_struct *w)
 }
 
 /*** Page table manipulation functions ***/
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			pgtbl_mod_mask *mask)
+{
+	pte_t *pte;
+	u64 pfn;
+
+	pfn = phys_addr >> PAGE_SHIFT;
+	pte = pte_alloc_kernel_track(pmd, addr, mask);
+	if (!pte)
+		return -ENOMEM;
+	do {
+		BUG_ON(!pte_none(*pte));
+		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+		pfn++;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	*mask |= PGTBL_PTE_MODIFIED;
+	return 0;
+}
+
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (max_page_shift < PMD_SHIFT)
+		return 0;
+
+	if (!arch_vmap_pmd_supported(prot))
+		return 0;
+
+	if ((end - addr) != PMD_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(addr, PMD_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+		return 0;
+
+	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+		return 0;
+
+	return pmd_set_huge(pmd, phys_addr, prot);
+}
+
+static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+	if (!pmd)
+		return -ENOMEM;
+	do {
+		next = pmd_addr_end(addr, end);
+
+		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
+					max_page_shift)) {
+			*mask |= PGTBL_PMD_MODIFIED;
+			continue;
+		}
+
+		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+			return -ENOMEM;
+	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (max_page_shift < PUD_SHIFT)
+		return 0;
+
+	if (!arch_vmap_pud_supported(prot))
+		return 0;
+
+	if ((end - addr) != PUD_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(addr, PUD_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+		return 0;
+
+	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+		return 0;
+
+	return pud_set_huge(pud, phys_addr, prot);
+}
+
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+	if (!pud)
+		return -ENOMEM;
+	do {
+		next = pud_addr_end(addr, end);
+
+		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
+					max_page_shift)) {
+			*mask |= PGTBL_PUD_MODIFIED;
+			continue;
+		}
+
+		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
+					max_page_shift, mask))
+			return -ENOMEM;
+	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (max_page_shift < P4D_SHIFT)
+		return 0;
+
+	if (!arch_vmap_p4d_supported(prot))
+		return 0;
+
+	if ((end - addr) != P4D_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(addr, P4D_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+		return 0;
+
+	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+		return 0;
+
+	return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+	if (!p4d)
+		return -ENOMEM;
+	do {
+		next = p4d_addr_end(addr, end);
+
+		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
+					max_page_shift)) {
+			*mask |= PGTBL_P4D_MODIFIED;
+			continue;
+		}
+
+		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
+					max_page_shift, mask))
+			return -ENOMEM;
+	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+
+int vmap_range(unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	pgd_t *pgd;
+	unsigned long start;
+	unsigned long next;
+	int err;
+	pgtbl_mod_mask mask = 0;
+
+	might_sleep();
+	BUG_ON(addr >= end);
+
+	start = addr;
+	pgd = pgd_offset_k(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
+					max_page_shift, &mask);
+		if (err)
+			break;
+	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+	flush_cache_vmap(start, end);
+
+	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+		arch_sync_kernel_mappings(start, end);
+
+	return err;
+}
 
 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			     pgtbl_mod_mask *mask)
-- 
2.23.0


^ permalink raw reply related

* [PATCH v11 11/13] mm/vmalloc: add vmap_range_noflush variant
From: Nicholas Piggin @ 2021-01-26  4:45 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Jonathan Cameron, Rick Edgecombe, linuxppc-dev,
	Christoph Hellwig
In-Reply-To: <20210126044510.2491820-1-npiggin@gmail.com>

As a side-effect, the order of flush_cache_vmap() and
arch_sync_kernel_mappings() calls are switched, but that now matches
the other callers in this file.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 mm/vmalloc.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f043386bb51d..47ab4338cfff 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -240,7 +240,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 	return 0;
 }
 
-int vmap_range(unsigned long addr, unsigned long end,
+static int vmap_range_noflush(unsigned long addr, unsigned long end,
 			phys_addr_t phys_addr, pgprot_t prot,
 			unsigned int max_page_shift)
 {
@@ -263,14 +263,24 @@ int vmap_range(unsigned long addr, unsigned long end,
 			break;
 	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
 
-	flush_cache_vmap(start, end);
-
 	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
 		arch_sync_kernel_mappings(start, end);
 
 	return err;
 }
 
+int vmap_range(unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	int err;
+
+	err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift);
+	flush_cache_vmap(addr, end);
+
+	return err;
+}
+
 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			     pgtbl_mod_mask *mask)
 {
-- 
2.23.0


^ permalink raw reply related

* [PATCH v11 12/13] mm/vmalloc: Hugepage vmalloc mappings
From: Nicholas Piggin @ 2021-01-26  4:45 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Jonathan Cameron, Rick Edgecombe, linuxppc-dev
In-Reply-To: <20210126044510.2491820-1-npiggin@gmail.com>

Support huge page vmalloc mappings. Config option HAVE_ARCH_HUGE_VMALLOC
enables support on architectures that define HAVE_ARCH_HUGE_VMAP and
supports PMD sized vmap mappings.

vmalloc will attempt to allocate PMD-sized pages if allocating PMD size
or larger, and fall back to small pages if that was unsuccessful.

Architectures must ensure that any arch specific vmalloc allocations
that require PAGE_SIZE mappings (e.g., module allocations vs strict
module rwx) use the VM_NOHUGE flag to inhibit larger mappings.

When hugepage vmalloc mappings are enabled in the next patch, this
reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node
POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%.

This can result in more internal fragmentation and memory overhead for a
given allocation, an option nohugevmalloc is added to disable at boot.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig            |  11 ++
 include/linux/vmalloc.h |  21 ++++
 mm/page_alloc.c         |   5 +-
 mm/vmalloc.c            | 215 +++++++++++++++++++++++++++++++---------
 4 files changed, 205 insertions(+), 47 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 24862d15f3a3..eef170e0c9b8 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -724,6 +724,17 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 config HAVE_ARCH_HUGE_VMAP
 	bool
 
+#
+#  Archs that select this would be capable of PMD-sized vmaps (i.e.,
+#  arch_vmap_pmd_supported() returns true), and they must make no assumptions
+#  that vmalloc memory is mapped with PAGE_SIZE ptes. The VM_NO_HUGE_VMAP flag
+#  can be used to prohibit arch-specific allocations from using hugepages to
+#  help with this (e.g., modules may require it).
+#
+config HAVE_ARCH_HUGE_VMALLOC
+	depends on HAVE_ARCH_HUGE_VMAP
+	bool
+
 config ARCH_WANT_HUGE_PMD_SHARE
 	bool
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 99ea72d547dc..93270adf5db5 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -25,6 +25,7 @@ struct notifier_block;		/* in notifier.h */
 #define VM_NO_GUARD		0x00000040      /* don't add guard page */
 #define VM_KASAN		0x00000080      /* has allocated kasan shadow memory */
 #define VM_MAP_PUT_PAGES	0x00000100	/* put pages and free array in vfree */
+#define VM_NO_HUGE_VMAP		0x00000200	/* force PAGE_SIZE pte mapping */
 
 /*
  * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
@@ -59,6 +60,9 @@ struct vm_struct {
 	unsigned long		size;
 	unsigned long		flags;
 	struct page		**pages;
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+	unsigned int		page_order;
+#endif
 	unsigned int		nr_pages;
 	phys_addr_t		phys_addr;
 	const void		*caller;
@@ -193,6 +197,22 @@ void free_vm_area(struct vm_struct *area);
 extern struct vm_struct *remove_vm_area(const void *addr);
 extern struct vm_struct *find_vm_area(const void *addr);
 
+static inline bool is_vm_area_hugepages(const void *addr)
+{
+	/*
+	 * This may not 100% tell if the area is mapped with > PAGE_SIZE
+	 * page table entries, if for some reason the architecture indicates
+	 * larger sizes are available but decides not to use them, nothing
+	 * prevents that. This only indicates the size of the physical page
+	 * allocated in the vmalloc layer.
+	 */
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+	return find_vm_area(addr)->page_order > 0;
+#else
+	return false;
+#endif
+}
+
 #ifdef CONFIG_MMU
 int vmap_range(unsigned long addr, unsigned long end,
 			phys_addr_t phys_addr, pgprot_t prot,
@@ -210,6 +230,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
 	if (vm)
 		vm->flags |= VM_FLUSH_RESET_PERMS;
 }
+
 #else
 static inline int
 map_kernel_range_noflush(unsigned long start, unsigned long size,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 027f6481ba59..b7a9661fa232 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -72,6 +72,7 @@
 #include <linux/padata.h>
 #include <linux/khugepaged.h>
 #include <linux/buffer_head.h>
+#include <linux/vmalloc.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -8238,6 +8239,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	void *table = NULL;
 	gfp_t gfp_flags;
 	bool virt;
+	bool huge;
 
 	/* allow the kernel cmdline to have a say */
 	if (!numentries) {
@@ -8305,6 +8307,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 		} else if (get_order(size) >= MAX_ORDER || hashdist) {
 			table = __vmalloc(size, gfp_flags);
 			virt = true;
+			huge = is_vm_area_hugepages(table);
 		} else {
 			/*
 			 * If bucketsize is not a power-of-two, we may free
@@ -8321,7 +8324,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 
 	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
 		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
-		virt ? "vmalloc" : "linear");
+		virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
 
 	if (_hash_shift)
 		*_hash_shift = log2qty;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 47ab4338cfff..e9a28de04182 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -42,6 +42,19 @@
 #include "internal.h"
 #include "pgalloc-track.h"
 
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+static bool __ro_after_init vmap_allow_huge = true;
+
+static int __init set_nohugevmalloc(char *str)
+{
+	vmap_allow_huge = false;
+	return 0;
+}
+early_param("nohugevmalloc", set_nohugevmalloc);
+#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+static const bool vmap_allow_huge = false;
+#endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+
 bool is_vmalloc_addr(const void *x)
 {
 	unsigned long addr = (unsigned long)x;
@@ -483,31 +496,12 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
 	return 0;
 }
 
-/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify should
- * have been allocated using get_vm_area() and its friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing.  The caller is responsible for
- * calling flush_cache_vmap() on to-be-mapped areas before calling this
- * function.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
-			     pgprot_t prot, struct page **pages)
+static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages)
 {
 	unsigned long start = addr;
-	unsigned long end = addr + size;
-	unsigned long next;
 	pgd_t *pgd;
+	unsigned long next;
 	int err = 0;
 	int nr = 0;
 	pgtbl_mod_mask mask = 0;
@@ -529,6 +523,66 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
 	return 0;
 }
 
+static int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
+
+	WARN_ON(page_shift < PAGE_SHIFT);
+
+	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
+			page_shift == PAGE_SHIFT)
+		return vmap_small_pages_range_noflush(addr, end, prot, pages);
+
+	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
+		int err;
+
+		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
+					__pa(page_address(pages[i])), prot,
+					page_shift);
+		if (err)
+			return err;
+
+		addr += 1UL << page_shift;
+	}
+
+	return 0;
+}
+
+static int vmap_pages_range(unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+	int err;
+
+	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+	flush_cache_vmap(addr, end);
+	return err;
+}
+
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
+ *
+ * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify should
+ * have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing.  The caller is responsible for
+ * calling flush_cache_vmap() on to-be-mapped areas before calling this
+ * function.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+			     pgprot_t prot, struct page **pages)
+{
+	return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT);
+}
+
 int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
 		struct page **pages)
 {
@@ -2112,6 +2166,24 @@ EXPORT_SYMBOL(vm_map_ram);
 
 static struct vm_struct *vmlist __initdata;
 
+static inline unsigned int vm_area_page_order(struct vm_struct *vm)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+	return vm->page_order;
+#else
+	return 0;
+#endif
+}
+
+static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+	vm->page_order = order;
+#else
+	BUG_ON(order != 0);
+#endif
+}
+
 /**
  * vm_area_add_early - add vmap area early during boot
  * @vm: vm_struct to add
@@ -2422,6 +2494,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
 {
 	int i;
 
+	/* HUGE_VMALLOC passes small pages to set_direct_map */
 	for (i = 0; i < area->nr_pages; i++)
 		if (page_address(area->pages[i]))
 			set_direct_map(area->pages[i]);
@@ -2431,6 +2504,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
 static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
 {
 	unsigned long start = ULONG_MAX, end = 0;
+	unsigned int page_order = vm_area_page_order(area);
 	int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
 	int flush_dmap = 0;
 	int i;
@@ -2455,11 +2529,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
 	 * map. Find the start and end range of the direct mappings to make sure
 	 * the vm_unmap_aliases() flush includes the direct map.
 	 */
-	for (i = 0; i < area->nr_pages; i++) {
+	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
 		unsigned long addr = (unsigned long)page_address(area->pages[i]);
 		if (addr) {
+			unsigned long page_size;
+
+			page_size = PAGE_SIZE << page_order;
 			start = min(addr, start);
-			end = max(addr + PAGE_SIZE, end);
+			end = max(addr + page_size, end);
 			flush_dmap = 1;
 		}
 	}
@@ -2500,13 +2577,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
 	vm_remove_mappings(area, deallocate_pages);
 
 	if (deallocate_pages) {
+		unsigned int page_order = vm_area_page_order(area);
 		int i;
 
-		for (i = 0; i < area->nr_pages; i++) {
+		for (i = 0; i < area->nr_pages; i += 1U << page_order) {
 			struct page *page = area->pages[i];
 
 			BUG_ON(!page);
-			__free_pages(page, 0);
+			__free_pages(page, page_order);
 		}
 		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
 
@@ -2697,15 +2775,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn);
 #endif /* CONFIG_VMAP_PFN */
 
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
-				 pgprot_t prot, int node)
+				 pgprot_t prot, unsigned int page_shift,
+				 int node)
 {
 	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
-	unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+	unsigned long addr = (unsigned long)area->addr;
+	unsigned long size = get_vm_area_size(area);
 	unsigned long array_size;
-	unsigned int i;
+	unsigned int nr_small_pages = size >> PAGE_SHIFT;
+	unsigned int page_order;
 	struct page **pages;
+	unsigned int i;
 
-	array_size = (unsigned long)nr_pages * sizeof(struct page *);
+	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
 	gfp_mask |= __GFP_NOWARN;
 	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
 		gfp_mask |= __GFP_HIGHMEM;
@@ -2724,30 +2806,37 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	}
 
 	area->pages = pages;
-	area->nr_pages = nr_pages;
+	area->nr_pages = nr_small_pages;
+	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
 
-	for (i = 0; i < area->nr_pages; i++) {
-		struct page *page;
+	page_order = vm_area_page_order(area);
 
-		if (node == NUMA_NO_NODE)
-			page = alloc_page(gfp_mask);
-		else
-			page = alloc_pages_node(node, gfp_mask, 0);
+	/*
+	 * Careful, we allocate and map page_order pages, but tracking is done
+	 * per PAGE_SIZE page so as to keep the vm_struct APIs independent of
+	 * the physical/mapped size.
+	 */
+	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
+		struct page *page;
+		int p;
 
+		page = alloc_pages_node(node, gfp_mask, page_order);
 		if (unlikely(!page)) {
 			/* Successfully allocated i pages, free them in __vfree() */
 			area->nr_pages = i;
 			atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
 			goto fail;
 		}
-		area->pages[i] = page;
+
+		for (p = 0; p < (1U << page_order); p++)
+			area->pages[i + p] = page + p;
+
 		if (gfpflags_allow_blocking(gfp_mask))
 			cond_resched();
 	}
 	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
 
-	if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
-			prot, pages) < 0)
+	if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0)
 		goto fail;
 
 	return area->addr;
@@ -2755,7 +2844,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 fail:
 	warn_alloc(gfp_mask, NULL,
 			  "vmalloc: allocation failure, allocated %ld of %ld bytes",
-			  (area->nr_pages*PAGE_SIZE), area->size);
+			  (area->nr_pages*PAGE_SIZE), size);
 	__vfree(area->addr);
 	return NULL;
 }
@@ -2786,19 +2875,43 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	struct vm_struct *area;
 	void *addr;
 	unsigned long real_size = size;
+	unsigned long real_align = align;
+	unsigned int shift = PAGE_SHIFT;
 
-	size = PAGE_ALIGN(size);
 	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
 		goto fail;
 
-	area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
+	if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
+			arch_vmap_pmd_supported(prot)) {
+		unsigned long size_per_node;
+
+		/*
+		 * Try huge pages. Only try for PAGE_KERNEL allocations,
+		 * others like modules don't yet expect huge pages in
+		 * their allocations due to apply_to_page_range not
+		 * supporting them.
+		 */
+
+		size_per_node = size;
+		if (node == NUMA_NO_NODE)
+			size_per_node /= num_online_nodes();
+		if (size_per_node >= PMD_SIZE) {
+			shift = PMD_SHIFT;
+			align = max(real_align, 1UL << shift);
+			size = ALIGN(real_size, 1UL << shift);
+		}
+	}
+
+again:
+	size = PAGE_ALIGN(size);
+	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
 				vm_flags, start, end, node, gfp_mask, caller);
 	if (!area)
 		goto fail;
 
-	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
+	addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
 	if (!addr)
-		return NULL;
+		goto fail;
 
 	/*
 	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -2812,8 +2925,18 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	return addr;
 
 fail:
-	warn_alloc(gfp_mask, NULL,
+	if (shift > PAGE_SHIFT) {
+		shift = PAGE_SHIFT;
+		align = real_align;
+		size = real_size;
+		goto again;
+	}
+
+	if (!area) {
+		/* Warn for area allocation, page allocations already warn */
+		warn_alloc(gfp_mask, NULL,
 			  "vmalloc: allocation failure: %lu bytes", real_size);
+	}
 	return NULL;
 }
 
-- 
2.23.0


^ permalink raw reply related

* [PATCH v11 13/13] powerpc/64s/radix: Enable huge vmalloc mappings
From: Nicholas Piggin @ 2021-01-26  4:45 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Jonathan Cameron, Rick Edgecombe, linuxppc-dev
In-Reply-To: <20210126044510.2491820-1-npiggin@gmail.com>

Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 .../admin-guide/kernel-parameters.txt         |  2 ++
 arch/powerpc/Kconfig                          |  1 +
 arch/powerpc/kernel/module.c                  | 21 +++++++++++++++----
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a10b545c2070..d62df53e5200 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3225,6 +3225,8 @@
 
 	nohugeiomap	[KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
 
+	nohugevmalloc	[PPC] Disable kernel huge vmalloc mappings.
+
 	nosmt		[KNL,S390] Disable symmetric multithreading (SMT).
 			Equivalent to smt=1.
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 107bb4319e0e..781da6829ab7 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -181,6 +181,7 @@ config PPC
 	select GENERIC_GETTIMEOFDAY
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_HUGE_VMAP		if PPC_BOOK3S_64 && PPC_RADIX_MMU
+	select HAVE_ARCH_HUGE_VMALLOC		if HAVE_ARCH_HUGE_VMAP
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_KASAN			if PPC32 && PPC_PAGE_SHIFT <= 14
 	select HAVE_ARCH_KASAN_VMALLOC		if PPC32 && PPC_PAGE_SHIFT <= 14
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index a211b0253cdb..07026335d24d 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -87,13 +87,26 @@ int module_finalize(const Elf_Ehdr *hdr,
 	return 0;
 }
 
-#ifdef MODULES_VADDR
 void *module_alloc(unsigned long size)
 {
+	unsigned long start = VMALLOC_START;
+	unsigned long end = VMALLOC_END;
+
+#ifdef MODULES_VADDR
 	BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
+	start = MODULES_VADDR;
+	end = MODULES_END;
+#endif
+
+	/*
+	 * Don't do huge page allocations for modules yet until more testing
+	 * is done. STRICT_MODULE_RWX may require extra work to support this
+	 * too.
+	 */
 
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, GFP_KERNEL,
-				    PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
+	return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL,
+				    PAGE_KERNEL_EXEC,
+				    VM_NO_HUGE_VMAP | VM_FLUSH_RESET_PERMS,
+				    NUMA_NO_NODE,
 				    __builtin_return_address(0));
 }
-#endif
-- 
2.23.0


^ permalink raw reply related

* [PATCH 3/5] powerpc/xive: remove unnecessary unmap_kernel_range
From: Nicholas Piggin @ 2021-01-26  4:54 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: Cédric Le Goater, linuxppc-dev, linux-kernel,
	Nicholas Piggin, Christoph Hellwig
In-Reply-To: <20210126045404.2492588-1-npiggin@gmail.com>

iounmap will remove ptes.

Cc: "Cédric Le Goater" <clg@kaod.org>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/sysdev/xive/common.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 595310e056f4..d6c2069cc828 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -959,16 +959,12 @@ EXPORT_SYMBOL_GPL(is_xive_irq);
 void xive_cleanup_irq_data(struct xive_irq_data *xd)
 {
 	if (xd->eoi_mmio) {
-		unmap_kernel_range((unsigned long)xd->eoi_mmio,
-				   1u << xd->esb_shift);
 		iounmap(xd->eoi_mmio);
 		if (xd->eoi_mmio == xd->trig_mmio)
 			xd->trig_mmio = NULL;
 		xd->eoi_mmio = NULL;
 	}
 	if (xd->trig_mmio) {
-		unmap_kernel_range((unsigned long)xd->trig_mmio,
-				   1u << xd->esb_shift);
 		iounmap(xd->trig_mmio);
 		xd->trig_mmio = NULL;
 	}
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH v11 05/13] mm: HUGE_VMAP arch support cleanup
From: Ding Tianhong @ 2021-01-26  6:07 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm, Andrew Morton
  Cc: linux-arch, H. Peter Anvin, Will Deacon, Catalin Marinas, x86,
	linux-kernel, Christoph Hellwig, Ingo Molnar, Borislav Petkov,
	Jonathan Cameron, Thomas Gleixner, Rick Edgecombe, linuxppc-dev,
	linux-arm-kernel
In-Reply-To: <20210126044510.2491820-6-npiggin@gmail.com>

Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>

On 2021/1/26 12:45, Nicholas Piggin wrote:
> This changes the awkward approach where architectures provide init
> functions to determine which levels they can provide large mappings for,
> to one where the arch is queried for each call.
> 
> This removes code and indirection, and allows constant-folding of dead
> code for unsupported levels.
> 
> This also adds a prot argument to the arch query. This is unused
> currently but could help with some architectures (e.g., some powerpc
> processors can't map uncacheable memory with large pages).
> 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: x86@kernel.org
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Acked-by: Catalin Marinas <catalin.marinas@arm.com> [arm64]
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/arm64/include/asm/vmalloc.h         |  8 ++
>  arch/arm64/mm/mmu.c                      | 10 +--
>  arch/powerpc/include/asm/vmalloc.h       |  8 ++
>  arch/powerpc/mm/book3s64/radix_pgtable.c |  8 +-
>  arch/x86/include/asm/vmalloc.h           |  7 ++
>  arch/x86/mm/ioremap.c                    | 12 +--
>  include/linux/io.h                       |  9 ---
>  include/linux/vmalloc.h                  |  6 ++
>  init/main.c                              |  1 -
>  mm/ioremap.c                             | 94 ++++++++++--------------
>  10 files changed, 85 insertions(+), 78 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
> index 2ca708ab9b20..597b40405319 100644
> --- a/arch/arm64/include/asm/vmalloc.h
> +++ b/arch/arm64/include/asm/vmalloc.h
> @@ -1,4 +1,12 @@
>  #ifndef _ASM_ARM64_VMALLOC_H
>  #define _ASM_ARM64_VMALLOC_H
>  
> +#include <asm/page.h>
> +
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +bool arch_vmap_p4d_supported(pgprot_t prot);
> +bool arch_vmap_pud_supported(pgprot_t prot);
> +bool arch_vmap_pmd_supported(pgprot_t prot);
> +#endif
> +
>  #endif /* _ASM_ARM64_VMALLOC_H */
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index ae0c3d023824..1613d290cbd1 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -1313,12 +1313,12 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
>  	return dt_virt;
>  }
>  
> -int __init arch_ioremap_p4d_supported(void)
> +bool arch_vmap_p4d_supported(pgprot_t prot)
>  {
> -	return 0;
> +	return false;
>  }
>  
> -int __init arch_ioremap_pud_supported(void)
> +bool arch_vmap_pud_supported(pgprot_t prot)
>  {
>  	/*
>  	 * Only 4k granule supports level 1 block mappings.
> @@ -1328,9 +1328,9 @@ int __init arch_ioremap_pud_supported(void)
>  	       !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
>  }
>  
> -int __init arch_ioremap_pmd_supported(void)
> +bool arch_vmap_pmd_supported(pgprot_t prot)
>  {
> -	/* See arch_ioremap_pud_supported() */
> +	/* See arch_vmap_pud_supported() */
>  	return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
>  }
>  
> diff --git a/arch/powerpc/include/asm/vmalloc.h b/arch/powerpc/include/asm/vmalloc.h
> index b992dfaaa161..105abb73f075 100644
> --- a/arch/powerpc/include/asm/vmalloc.h
> +++ b/arch/powerpc/include/asm/vmalloc.h
> @@ -1,4 +1,12 @@
>  #ifndef _ASM_POWERPC_VMALLOC_H
>  #define _ASM_POWERPC_VMALLOC_H
>  
> +#include <asm/page.h>
> +
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +bool arch_vmap_p4d_supported(pgprot_t prot);
> +bool arch_vmap_pud_supported(pgprot_t prot);
> +bool arch_vmap_pmd_supported(pgprot_t prot);
> +#endif
> +
>  #endif /* _ASM_POWERPC_VMALLOC_H */
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 98f0b243c1ab..743807fc210f 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -1082,13 +1082,13 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
>  	set_pte_at(mm, addr, ptep, pte);
>  }
>  
> -int __init arch_ioremap_pud_supported(void)
> +bool arch_vmap_pud_supported(pgprot_t prot)
>  {
>  	/* HPT does not cope with large pages in the vmalloc area */
>  	return radix_enabled();
>  }
>  
> -int __init arch_ioremap_pmd_supported(void)
> +bool arch_vmap_pmd_supported(pgprot_t prot)
>  {
>  	return radix_enabled();
>  }
> @@ -1182,7 +1182,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
>  	return 1;
>  }
>  
> -int __init arch_ioremap_p4d_supported(void)
> +bool arch_vmap_p4d_supported(pgprot_t prot)
>  {
> -	return 0;
> +	return false;
>  }
> diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h
> index 29837740b520..094ea2b565f3 100644
> --- a/arch/x86/include/asm/vmalloc.h
> +++ b/arch/x86/include/asm/vmalloc.h
> @@ -1,6 +1,13 @@
>  #ifndef _ASM_X86_VMALLOC_H
>  #define _ASM_X86_VMALLOC_H
>  
> +#include <asm/page.h>
>  #include <asm/pgtable_areas.h>
>  
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +bool arch_vmap_p4d_supported(pgprot_t prot);
> +bool arch_vmap_pud_supported(pgprot_t prot);
> +bool arch_vmap_pmd_supported(pgprot_t prot);
> +#endif
> +
>  #endif /* _ASM_X86_VMALLOC_H */
> diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
> index 9e5ccc56f8e0..fbaf0c447986 100644
> --- a/arch/x86/mm/ioremap.c
> +++ b/arch/x86/mm/ioremap.c
> @@ -481,24 +481,26 @@ void iounmap(volatile void __iomem *addr)
>  }
>  EXPORT_SYMBOL(iounmap);
>  
> -int __init arch_ioremap_p4d_supported(void)
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +bool arch_vmap_p4d_supported(pgprot_t prot)
>  {
> -	return 0;
> +	return false;
>  }
>  
> -int __init arch_ioremap_pud_supported(void)
> +bool arch_vmap_pud_supported(pgprot_t prot)
>  {
>  #ifdef CONFIG_X86_64
>  	return boot_cpu_has(X86_FEATURE_GBPAGES);
>  #else
> -	return 0;
> +	return false;
>  #endif
>  }
>  
> -int __init arch_ioremap_pmd_supported(void)
> +bool arch_vmap_pmd_supported(pgprot_t prot)
>  {
>  	return boot_cpu_has(X86_FEATURE_PSE);
>  }
> +#endif
>  
>  /*
>   * Convert a physical pointer to a virtual kernel pointer for /dev/mem
> diff --git a/include/linux/io.h b/include/linux/io.h
> index 8394c56babc2..f1effd4d7a3c 100644
> --- a/include/linux/io.h
> +++ b/include/linux/io.h
> @@ -31,15 +31,6 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end,
>  }
>  #endif
>  
> -#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> -void __init ioremap_huge_init(void);
> -int arch_ioremap_p4d_supported(void);
> -int arch_ioremap_pud_supported(void);
> -int arch_ioremap_pmd_supported(void);
> -#else
> -static inline void ioremap_huge_init(void) { }
> -#endif
> -
>  /*
>   * Managed iomap interface
>   */
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 80c0181c411d..00bd62bd701e 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -83,6 +83,12 @@ struct vmap_area {
>  	};
>  };
>  
> +#ifndef CONFIG_HAVE_ARCH_HUGE_VMAP
> +static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; }
> +static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; }
> +static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; }
> +#endif
> +
>  /*
>   *	Highlevel APIs for driver use
>   */
> diff --git a/init/main.c b/init/main.c
> index c68d784376ca..bf9389e5b2e4 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -834,7 +834,6 @@ static void __init mm_init(void)
>  	pgtable_init();
>  	debug_objects_mem_init();
>  	vmalloc_init();
> -	ioremap_huge_init();
>  	/* Should be run before the first non-init thread is created */
>  	init_espfix_bsp();
>  	/* Should be run after espfix64 is set up. */
> diff --git a/mm/ioremap.c b/mm/ioremap.c
> index 3f4d36f9745a..3264d0203785 100644
> --- a/mm/ioremap.c
> +++ b/mm/ioremap.c
> @@ -16,49 +16,16 @@
>  #include "pgalloc-track.h"
>  
>  #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> -static int __read_mostly ioremap_p4d_capable;
> -static int __read_mostly ioremap_pud_capable;
> -static int __read_mostly ioremap_pmd_capable;
> -static int __read_mostly ioremap_huge_disabled;
> +static bool __ro_after_init iomap_max_page_shift = PAGE_SHIFT;
>  
>  static int __init set_nohugeiomap(char *str)
>  {
> -	ioremap_huge_disabled = 1;
> +	iomap_max_page_shift = P4D_SHIFT;
>  	return 0;
>  }
>  early_param("nohugeiomap", set_nohugeiomap);
> -
> -void __init ioremap_huge_init(void)
> -{
> -	if (!ioremap_huge_disabled) {
> -		if (arch_ioremap_p4d_supported())
> -			ioremap_p4d_capable = 1;
> -		if (arch_ioremap_pud_supported())
> -			ioremap_pud_capable = 1;
> -		if (arch_ioremap_pmd_supported())
> -			ioremap_pmd_capable = 1;
> -	}
> -}
> -
> -static inline int ioremap_p4d_enabled(void)
> -{
> -	return ioremap_p4d_capable;
> -}
> -
> -static inline int ioremap_pud_enabled(void)
> -{
> -	return ioremap_pud_capable;
> -}
> -
> -static inline int ioremap_pmd_enabled(void)
> -{
> -	return ioremap_pmd_capable;
> -}
> -
> -#else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
> -static inline int ioremap_p4d_enabled(void) { return 0; }
> -static inline int ioremap_pud_enabled(void) { return 0; }
> -static inline int ioremap_pmd_enabled(void) { return 0; }
> +#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
> +static const bool iomap_max_page_shift = PAGE_SHIFT;
>  #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
>  
>  static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> @@ -82,9 +49,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  }
>  
>  static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
> -			phys_addr_t phys_addr, pgprot_t prot)
> +			phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
>  {
> -	if (!ioremap_pmd_enabled())
> +	if (max_page_shift < PMD_SHIFT)
> +		return 0;
> +
> +	if (!arch_vmap_pmd_supported(prot))
>  		return 0;
>  
>  	if ((end - addr) != PMD_SIZE)
> @@ -104,7 +75,7 @@ static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
>  
>  static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
>  			phys_addr_t phys_addr, pgprot_t prot,
> -			pgtbl_mod_mask *mask)
> +			unsigned int max_page_shift, pgtbl_mod_mask *mask)
>  {
>  	pmd_t *pmd;
>  	unsigned long next;
> @@ -115,7 +86,8 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
>  	do {
>  		next = pmd_addr_end(addr, end);
>  
> -		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
> +		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
> +					max_page_shift)) {
>  			*mask |= PGTBL_PMD_MODIFIED;
>  			continue;
>  		}
> @@ -127,9 +99,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
>  }
>  
>  static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
> -			phys_addr_t phys_addr, pgprot_t prot)
> +			phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
>  {
> -	if (!ioremap_pud_enabled())
> +	if (max_page_shift < PUD_SHIFT)
> +		return 0;
> +
> +	if (!arch_vmap_pud_supported(prot))
>  		return 0;
>  
>  	if ((end - addr) != PUD_SIZE)
> @@ -149,7 +125,7 @@ static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
>  
>  static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
>  			phys_addr_t phys_addr, pgprot_t prot,
> -			pgtbl_mod_mask *mask)
> +			unsigned int max_page_shift, pgtbl_mod_mask *mask)
>  {
>  	pud_t *pud;
>  	unsigned long next;
> @@ -160,21 +136,27 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
>  	do {
>  		next = pud_addr_end(addr, end);
>  
> -		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
> +		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
> +					max_page_shift)) {
>  			*mask |= PGTBL_PUD_MODIFIED;
>  			continue;
>  		}
>  
> -		if (vmap_pmd_range(pud, addr, next, phys_addr, prot, mask))
> +		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
> +					max_page_shift, mask))
>  			return -ENOMEM;
>  	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
>  	return 0;
>  }
>  
>  static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
> -			phys_addr_t phys_addr, pgprot_t prot)
> +			phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
>  {
> -	if (!ioremap_p4d_enabled())
> +	if (max_page_shift < P4D_SHIFT)
> +		return 0;
> +
> +	if (!arch_vmap_p4d_supported(prot))
>  		return 0;
>  
>  	if ((end - addr) != P4D_SIZE)
> @@ -194,7 +176,7 @@ static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
>  
>  static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
>  			phys_addr_t phys_addr, pgprot_t prot,
> -			pgtbl_mod_mask *mask)
> +			unsigned int max_page_shift, pgtbl_mod_mask *mask)
>  {
>  	p4d_t *p4d;
>  	unsigned long next;
> @@ -205,19 +187,22 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
>  	do {
>  		next = p4d_addr_end(addr, end);
>  
> -		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
> +		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
> +					max_page_shift)) {
>  			*mask |= PGTBL_P4D_MODIFIED;
>  			continue;
>  		}
>  
> -		if (vmap_pud_range(p4d, addr, next, phys_addr, prot, mask))
> +		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
> +					max_page_shift, mask))
>  			return -ENOMEM;
>  	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
>  	return 0;
>  }
>  
>  static int vmap_range(unsigned long addr, unsigned long end,
> -			phys_addr_t phys_addr, pgprot_t prot)
> +			phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
>  {
>  	pgd_t *pgd;
>  	unsigned long start;
> @@ -232,7 +217,8 @@ static int vmap_range(unsigned long addr, unsigned long end,
>  	pgd = pgd_offset_k(addr);
>  	do {
>  		next = pgd_addr_end(addr, end);
> -		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, &mask);
> +		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
> +					max_page_shift, &mask);
>  		if (err)
>  			break;
>  	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
> @@ -248,7 +234,7 @@ static int vmap_range(unsigned long addr, unsigned long end,
>  int ioremap_page_range(unsigned long addr,
>  		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
>  {
> -	return vmap_range(addr, end, phys_addr, prot);
> +	return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift);
>  }
>  
>  #ifdef CONFIG_GENERIC_IOREMAP
> 


^ permalink raw reply

* Re: [PATCH 3/5] powerpc/xive: remove unnecessary unmap_kernel_range
From: Christoph Hellwig @ 2021-01-26  6:38 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-kernel, linux-mm, Cédric Le Goater, Andrew Morton,
	linuxppc-dev, Christoph Hellwig
In-Reply-To: <20210126045404.2492588-4-npiggin@gmail.com>

On Tue, Jan 26, 2021 at 02:54:02PM +1000, Nicholas Piggin wrote:
> iounmap will remove ptes.

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply

* Re: [PATCH v11 04/13] mm/ioremap: rename ioremap_*_range to vmap_*_range
From: Christoph Hellwig @ 2021-01-26  6:40 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Ding Tianhong, linux-kernel, Christoph Hellwig,
	linux-mm, Jonathan Cameron, Andrew Morton, Rick Edgecombe,
	linuxppc-dev
In-Reply-To: <20210126044510.2491820-5-npiggin@gmail.com>

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply

* Re: [PATCH v11 12/13] mm/vmalloc: Hugepage vmalloc mappings
From: Ding Tianhong @ 2021-01-26  6:59 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm, Andrew Morton
  Cc: linux-arch, linux-kernel, Christoph Hellwig, Jonathan Cameron,
	Rick Edgecombe, linuxppc-dev
In-Reply-To: <20210126044510.2491820-13-npiggin@gmail.com>

On 2021/1/26 12:45, Nicholas Piggin wrote:
> Support huge page vmalloc mappings. Config option HAVE_ARCH_HUGE_VMALLOC
> enables support on architectures that define HAVE_ARCH_HUGE_VMAP and
> supports PMD sized vmap mappings.
> 
> vmalloc will attempt to allocate PMD-sized pages if allocating PMD size
> or larger, and fall back to small pages if that was unsuccessful.
> 
> Architectures must ensure that any arch specific vmalloc allocations
> that require PAGE_SIZE mappings (e.g., module allocations vs strict
> module rwx) use the VM_NOHUGE flag to inhibit larger mappings.
> 
> When hugepage vmalloc mappings are enabled in the next patch, this
> reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node
> POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%.
> 
> This can result in more internal fragmentation and memory overhead for a
> given allocation, an option nohugevmalloc is added to disable at boot.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/Kconfig            |  11 ++
>  include/linux/vmalloc.h |  21 ++++
>  mm/page_alloc.c         |   5 +-
>  mm/vmalloc.c            | 215 +++++++++++++++++++++++++++++++---------
>  4 files changed, 205 insertions(+), 47 deletions(-)
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 24862d15f3a3..eef170e0c9b8 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -724,6 +724,17 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>  config HAVE_ARCH_HUGE_VMAP
>  	bool
>  
> +#
> +#  Archs that select this would be capable of PMD-sized vmaps (i.e.,
> +#  arch_vmap_pmd_supported() returns true), and they must make no assumptions
> +#  that vmalloc memory is mapped with PAGE_SIZE ptes. The VM_NO_HUGE_VMAP flag
> +#  can be used to prohibit arch-specific allocations from using hugepages to
> +#  help with this (e.g., modules may require it).
> +#
> +config HAVE_ARCH_HUGE_VMALLOC
> +	depends on HAVE_ARCH_HUGE_VMAP
> +	bool
> +
>  config ARCH_WANT_HUGE_PMD_SHARE
>  	bool
>  
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 99ea72d547dc..93270adf5db5 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -25,6 +25,7 @@ struct notifier_block;		/* in notifier.h */
>  #define VM_NO_GUARD		0x00000040      /* don't add guard page */
>  #define VM_KASAN		0x00000080      /* has allocated kasan shadow memory */
>  #define VM_MAP_PUT_PAGES	0x00000100	/* put pages and free array in vfree */
> +#define VM_NO_HUGE_VMAP		0x00000200	/* force PAGE_SIZE pte mapping */
> 
>  /*
>   * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
> @@ -59,6 +60,9 @@ struct vm_struct {
>  	unsigned long		size;
>  	unsigned long		flags;
>  	struct page		**pages;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
> +	unsigned int		page_order;
> +#endif
>  	unsigned int		nr_pages;
>  	phys_addr_t		phys_addr;
>  	const void		*caller;
Hi Nicholas:

Give a suggestion :)

The page order was only used to indicate the huge page flag for vm area, and only valid when
size bigger than PMD_SIZE, so can we use the vm flgas to instead of that, just like define the
new flag named VM_HUGEPAGE, it would not break the vm struct, and it is easier for me to backport the serious
patches to our own branches. (Base on the lts version).

Tianhong

> @@ -193,6 +197,22 @@ void free_vm_area(struct vm_struct *area);
>  extern struct vm_struct *remove_vm_area(const void *addr);
>  extern struct vm_struct *find_vm_area(const void *addr);
>  
> +static inline bool is_vm_area_hugepages(const void *addr)
> +{
> +	/*
> +	 * This may not 100% tell if the area is mapped with > PAGE_SIZE
> +	 * page table entries, if for some reason the architecture indicates
> +	 * larger sizes are available but decides not to use them, nothing
> +	 * prevents that. This only indicates the size of the physical page
> +	 * allocated in the vmalloc layer.
> +	 */
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
> +	return find_vm_area(addr)->page_order > 0;
> +#else
> +	return false;
> +#endif
> +}
> +
>  #ifdef CONFIG_MMU
>  int vmap_range(unsigned long addr, unsigned long end,
>  			phys_addr_t phys_addr, pgprot_t prot,
> @@ -210,6 +230,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
>  	if (vm)
>  		vm->flags |= VM_FLUSH_RESET_PERMS;
>  }
> +
>  #else
>  static inline int
>  map_kernel_range_noflush(unsigned long start, unsigned long size,
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 027f6481ba59..b7a9661fa232 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -72,6 +72,7 @@
>  #include <linux/padata.h>
>  #include <linux/khugepaged.h>
>  #include <linux/buffer_head.h>
> +#include <linux/vmalloc.h>
>  
>  #include <asm/sections.h>
>  #include <asm/tlbflush.h>
> @@ -8238,6 +8239,7 @@ void *__init alloc_large_system_hash(const char *tablename,
>  	void *table = NULL;
>  	gfp_t gfp_flags;
>  	bool virt;
> +	bool huge;
>  
>  	/* allow the kernel cmdline to have a say */
>  	if (!numentries) {
> @@ -8305,6 +8307,7 @@ void *__init alloc_large_system_hash(const char *tablename,
>  		} else if (get_order(size) >= MAX_ORDER || hashdist) {
>  			table = __vmalloc(size, gfp_flags);
>  			virt = true;
> +			huge = is_vm_area_hugepages(table);
>  		} else {
>  			/*
>  			 * If bucketsize is not a power-of-two, we may free
> @@ -8321,7 +8324,7 @@ void *__init alloc_large_system_hash(const char *tablename,
>  
>  	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
>  		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
> -		virt ? "vmalloc" : "linear");
> +		virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
>  
>  	if (_hash_shift)
>  		*_hash_shift = log2qty;
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 47ab4338cfff..e9a28de04182 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -42,6 +42,19 @@
>  #include "internal.h"
>  #include "pgalloc-track.h"
>  
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
> +static bool __ro_after_init vmap_allow_huge = true;
> +
> +static int __init set_nohugevmalloc(char *str)
> +{
> +	vmap_allow_huge = false;
> +	return 0;
> +}
> +early_param("nohugevmalloc", set_nohugevmalloc);
> +#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
> +static const bool vmap_allow_huge = false;
> +#endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
> +
>  bool is_vmalloc_addr(const void *x)
>  {
>  	unsigned long addr = (unsigned long)x;
> @@ -483,31 +496,12 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
>  	return 0;
>  }
>  
> -/**
> - * map_kernel_range_noflush - map kernel VM area with the specified pages
> - * @addr: start of the VM area to map
> - * @size: size of the VM area to map
> - * @prot: page protection flags to use
> - * @pages: pages to map
> - *
> - * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify should
> - * have been allocated using get_vm_area() and its friends.
> - *
> - * NOTE:
> - * This function does NOT do any cache flushing.  The caller is responsible for
> - * calling flush_cache_vmap() on to-be-mapped areas before calling this
> - * function.
> - *
> - * RETURNS:
> - * 0 on success, -errno on failure.
> - */
> -int map_kernel_range_noflush(unsigned long addr, unsigned long size,
> -			     pgprot_t prot, struct page **pages)
> +static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
> +		pgprot_t prot, struct page **pages)
>  {
>  	unsigned long start = addr;
> -	unsigned long end = addr + size;
> -	unsigned long next;
>  	pgd_t *pgd;
> +	unsigned long next;
>  	int err = 0;
>  	int nr = 0;
>  	pgtbl_mod_mask mask = 0;
> @@ -529,6 +523,66 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
>  	return 0;
>  }
>  
> +static int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
> +		pgprot_t prot, struct page **pages, unsigned int page_shift)
> +{
> +	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
> +
> +	WARN_ON(page_shift < PAGE_SHIFT);
> +
> +	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
> +			page_shift == PAGE_SHIFT)
> +		return vmap_small_pages_range_noflush(addr, end, prot, pages);
> +
> +	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
> +		int err;
> +
> +		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
> +					__pa(page_address(pages[i])), prot,
> +					page_shift);
> +		if (err)
> +			return err;
> +
> +		addr += 1UL << page_shift;
> +	}
> +
> +	return 0;
> +}
> +
> +static int vmap_pages_range(unsigned long addr, unsigned long end,
> +		pgprot_t prot, struct page **pages, unsigned int page_shift)
> +{
> +	int err;
> +
> +	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
> +	flush_cache_vmap(addr, end);
> +	return err;
> +}
> +
> +/**
> + * map_kernel_range_noflush - map kernel VM area with the specified pages
> + * @addr: start of the VM area to map
> + * @size: size of the VM area to map
> + * @prot: page protection flags to use
> + * @pages: pages to map
> + *
> + * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify should
> + * have been allocated using get_vm_area() and its friends.
> + *
> + * NOTE:
> + * This function does NOT do any cache flushing.  The caller is responsible for
> + * calling flush_cache_vmap() on to-be-mapped areas before calling this
> + * function.
> + *
> + * RETURNS:
> + * 0 on success, -errno on failure.
> + */
> +int map_kernel_range_noflush(unsigned long addr, unsigned long size,
> +			     pgprot_t prot, struct page **pages)
> +{
> +	return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT);
> +}
> +
>  int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
>  		struct page **pages)
>  {
> @@ -2112,6 +2166,24 @@ EXPORT_SYMBOL(vm_map_ram);
>  
>  static struct vm_struct *vmlist __initdata;
>  
> +static inline unsigned int vm_area_page_order(struct vm_struct *vm)
> +{
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
> +	return vm->page_order;
> +#else
> +	return 0;
> +#endif
> +}
> +
> +static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
> +{
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
> +	vm->page_order = order;
> +#else
> +	BUG_ON(order != 0);
> +#endif
> +}
> +
>  /**
>   * vm_area_add_early - add vmap area early during boot
>   * @vm: vm_struct to add
> @@ -2422,6 +2494,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
>  {
>  	int i;
>  
> +	/* HUGE_VMALLOC passes small pages to set_direct_map */
>  	for (i = 0; i < area->nr_pages; i++)
>  		if (page_address(area->pages[i]))
>  			set_direct_map(area->pages[i]);
> @@ -2431,6 +2504,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
>  static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
>  {
>  	unsigned long start = ULONG_MAX, end = 0;
> +	unsigned int page_order = vm_area_page_order(area);
>  	int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
>  	int flush_dmap = 0;
>  	int i;
> @@ -2455,11 +2529,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
>  	 * map. Find the start and end range of the direct mappings to make sure
>  	 * the vm_unmap_aliases() flush includes the direct map.
>  	 */
> -	for (i = 0; i < area->nr_pages; i++) {
> +	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
>  		unsigned long addr = (unsigned long)page_address(area->pages[i]);
>  		if (addr) {
> +			unsigned long page_size;
> +
> +			page_size = PAGE_SIZE << page_order;
>  			start = min(addr, start);
> -			end = max(addr + PAGE_SIZE, end);
> +			end = max(addr + page_size, end);
>  			flush_dmap = 1;
>  		}
>  	}
> @@ -2500,13 +2577,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
>  	vm_remove_mappings(area, deallocate_pages);
>  
>  	if (deallocate_pages) {
> +		unsigned int page_order = vm_area_page_order(area);
>  		int i;
>  
> -		for (i = 0; i < area->nr_pages; i++) {
> +		for (i = 0; i < area->nr_pages; i += 1U << page_order) {
>  			struct page *page = area->pages[i];
>  
>  			BUG_ON(!page);
> -			__free_pages(page, 0);
> +			__free_pages(page, page_order);
>  		}
>  		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
>  
> @@ -2697,15 +2775,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn);
>  #endif /* CONFIG_VMAP_PFN */
>  
>  static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
> -				 pgprot_t prot, int node)
> +				 pgprot_t prot, unsigned int page_shift,
> +				 int node)
>  {
>  	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
> -	unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
> +	unsigned long addr = (unsigned long)area->addr;
> +	unsigned long size = get_vm_area_size(area);
>  	unsigned long array_size;
> -	unsigned int i;
> +	unsigned int nr_small_pages = size >> PAGE_SHIFT;
> +	unsigned int page_order;
>  	struct page **pages;
> +	unsigned int i;
>  
> -	array_size = (unsigned long)nr_pages * sizeof(struct page *);
> +	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
>  	gfp_mask |= __GFP_NOWARN;
>  	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
>  		gfp_mask |= __GFP_HIGHMEM;
> @@ -2724,30 +2806,37 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  	}
>  
>  	area->pages = pages;
> -	area->nr_pages = nr_pages;
> +	area->nr_pages = nr_small_pages;
> +	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
>  
> -	for (i = 0; i < area->nr_pages; i++) {
> -		struct page *page;
> +	page_order = vm_area_page_order(area);
>  
> -		if (node == NUMA_NO_NODE)
> -			page = alloc_page(gfp_mask);
> -		else
> -			page = alloc_pages_node(node, gfp_mask, 0);
> +	/*
> +	 * Careful, we allocate and map page_order pages, but tracking is done
> +	 * per PAGE_SIZE page so as to keep the vm_struct APIs independent of
> +	 * the physical/mapped size.
> +	 */
> +	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
> +		struct page *page;
> +		int p;
>  
> +		page = alloc_pages_node(node, gfp_mask, page_order);
>  		if (unlikely(!page)) {
>  			/* Successfully allocated i pages, free them in __vfree() */
>  			area->nr_pages = i;
>  			atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
>  			goto fail;
>  		}
> -		area->pages[i] = page;
> +
> +		for (p = 0; p < (1U << page_order); p++)
> +			area->pages[i + p] = page + p;
> +
>  		if (gfpflags_allow_blocking(gfp_mask))
>  			cond_resched();
>  	}
>  	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
>  
> -	if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
> -			prot, pages) < 0)
> +	if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0)
>  		goto fail;
>  
>  	return area->addr;
> @@ -2755,7 +2844,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  fail:
>  	warn_alloc(gfp_mask, NULL,
>  			  "vmalloc: allocation failure, allocated %ld of %ld bytes",
> -			  (area->nr_pages*PAGE_SIZE), area->size);
> +			  (area->nr_pages*PAGE_SIZE), size);
>  	__vfree(area->addr);
>  	return NULL;
>  }
> @@ -2786,19 +2875,43 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  	struct vm_struct *area;
>  	void *addr;
>  	unsigned long real_size = size;
> +	unsigned long real_align = align;
> +	unsigned int shift = PAGE_SHIFT;
>  
> -	size = PAGE_ALIGN(size);
>  	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
>  		goto fail;
>  
> -	area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
> +	if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
> +			arch_vmap_pmd_supported(prot)) {
> +		unsigned long size_per_node;
> +
> +		/*
> +		 * Try huge pages. Only try for PAGE_KERNEL allocations,
> +		 * others like modules don't yet expect huge pages in
> +		 * their allocations due to apply_to_page_range not
> +		 * supporting them.
> +		 */
> +
> +		size_per_node = size;
> +		if (node == NUMA_NO_NODE)
> +			size_per_node /= num_online_nodes();
> +		if (size_per_node >= PMD_SIZE) {
> +			shift = PMD_SHIFT;
> +			align = max(real_align, 1UL << shift);
> +			size = ALIGN(real_size, 1UL << shift);
> +		}
> +	}
> +
> +again:
> +	size = PAGE_ALIGN(size);
> +	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
>  				vm_flags, start, end, node, gfp_mask, caller);
>  	if (!area)
>  		goto fail;
>  
> -	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
> +	addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
>  	if (!addr)
> -		return NULL;
> +		goto fail;
>  
>  	/*
>  	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
> @@ -2812,8 +2925,18 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  	return addr;
>  
>  fail:
> -	warn_alloc(gfp_mask, NULL,
> +	if (shift > PAGE_SHIFT) {
> +		shift = PAGE_SHIFT;
> +		align = real_align;
> +		size = real_size;
> +		goto again;
> +	}
> +
> +	if (!area) {
> +		/* Warn for area allocation, page allocations already warn */
> +		warn_alloc(gfp_mask, NULL,
>  			  "vmalloc: allocation failure: %lu bytes", real_size);
> +	}
>  	return NULL;
>  }
>  
> 


^ permalink raw reply

* Re: [PATCH] PCI: dwc: layerscape: convert to builtin_platform_driver()
From: Geert Uytterhoeven @ 2021-01-26  8:50 UTC (permalink / raw)
  To: Saravana Kannan
  Cc: Roy Zang, Lorenzo Pieralisi, PCI, LKML, Minghuan Lian,
	Michael Walle, linux-arm-kernel, Greg Kroah-Hartman,
	Bjorn Helgaas, linuxppc-dev, Mingkai Hu
In-Reply-To: <CAGETcx8FO+YSM0jwCnDdnvE3NCdjZ=1FSmAZpyaOEOvCgd4SXw@mail.gmail.com>

Hi Saravana,

On Mon, Jan 25, 2021 at 11:42 PM Saravana Kannan <saravanak@google.com> wrote:
> On Mon, Jan 25, 2021 at 11:49 AM Michael Walle <michael@walle.cc> wrote:
> > Am 2021-01-21 12:01, schrieb Geert Uytterhoeven:
> > > On Thu, Jan 21, 2021 at 1:05 AM Saravana Kannan <saravanak@google.com>
> > > wrote:
> > >> On Wed, Jan 20, 2021 at 3:53 PM Michael Walle <michael@walle.cc>
> > >> wrote:
> > >> > Am 2021-01-20 20:47, schrieb Saravana Kannan:
> > >> > > On Wed, Jan 20, 2021 at 11:28 AM Michael Walle <michael@walle.cc>
> > >> > > wrote:
> > >> > >>
> > >> > >> [RESEND, fat-fingered the buttons of my mail client and converted
> > >> > >> all CCs to BCCs :(]
> > >> > >>
> > >> > >> Am 2021-01-20 20:02, schrieb Saravana Kannan:
> > >> > >> > On Wed, Jan 20, 2021 at 6:24 AM Rob Herring <robh@kernel.org> wrote:
> > >> > >> >>
> > >> > >> >> On Wed, Jan 20, 2021 at 4:53 AM Michael Walle <michael@walle.cc>
> > >> > >> >> wrote:
> > >> > >> >> >
> > >> > >> >> > fw_devlink will defer the probe until all suppliers are ready. We can't
> > >> > >> >> > use builtin_platform_driver_probe() because it doesn't retry after probe
> > >> > >> >> > deferral. Convert it to builtin_platform_driver().
> > >> > >> >>
> > >> > >> >> If builtin_platform_driver_probe() doesn't work with fw_devlink, then
> > >> > >> >> shouldn't it be fixed or removed?
> > >> > >> >
> > >> > >> > I was actually thinking about this too. The problem with fixing
> > >> > >> > builtin_platform_driver_probe() to behave like
> > >> > >> > builtin_platform_driver() is that these probe functions could be
> > >> > >> > marked with __init. But there are also only 20 instances of
> > >> > >> > builtin_platform_driver_probe() in the kernel:
> > >> > >> > $ git grep ^builtin_platform_driver_probe | wc -l
> > >> > >> > 20
> > >> > >> >
> > >> > >> > So it might be easier to just fix them to not use
> > >> > >> > builtin_platform_driver_probe().
> > >> > >> >
> > >> > >> > Michael,
> > >> > >> >
> > >> > >> > Any chance you'd be willing to help me by converting all these to
> > >> > >> > builtin_platform_driver() and delete builtin_platform_driver_probe()?
> > >> > >>
> > >> > >> If it just moving the probe function to the _driver struct and
> > >> > >> remove the __init annotations. I could look into that.
> > >> > >
> > >> > > Yup. That's pretty much it AFAICT.
> > >> > >
> > >> > > builtin_platform_driver_probe() also makes sure the driver doesn't ask
> > >> > > for async probe, etc. But I doubt anyone is actually setting async
> > >> > > flags and still using builtin_platform_driver_probe().
> > >> >
> > >> > Hasn't module_platform_driver_probe() the same problem? And there
> > >> > are ~80 drivers which uses that.
> > >>
> > >> Yeah. The biggest problem with all of these is the __init markers.
> > >> Maybe some familiar with coccinelle can help?
> > >
> > > And dropping them will increase memory usage.
> >
> > Although I do have the changes for the builtin_platform_driver_probe()
> > ready, I don't think it makes much sense to send these unless we agree
> > on the increased memory footprint. While there are just a few
> > builtin_platform_driver_probe() and memory increase _might_ be
> > negligible, there are many more module_platform_driver_probe().
>
> While it's good to drop code that'll not be used past kernel init, the
> module_platform_driver_probe() is going even more extreme. It doesn't
> even allow deferred probe (well before kernel init is done). I don't
> think that behavior is right and that's why we should delete it. Also,

This construct is typically used for builtin hardware for which the
dependencies are registered very early, and thus known to probe at
first try (if present).

> I doubt if any of these probe functions even take up 4KB of memory.

How many 4 KiB pages do you have in a system with 10 MiB of SRAM?
How many can you afford to waste?

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH v11 12/13] mm/vmalloc: Hugepage vmalloc mappings
From: Nicholas Piggin @ 2021-01-26  9:47 UTC (permalink / raw)
  To: Andrew Morton, Ding Tianhong, linux-mm
  Cc: linux-arch, linux-kernel, Christoph Hellwig, Jonathan Cameron,
	Rick Edgecombe, linuxppc-dev
In-Reply-To: <0f360e6e-6d34-19ce-6c76-a17a5f4f7fc3@huawei.com>

Excerpts from Ding Tianhong's message of January 26, 2021 4:59 pm:
> On 2021/1/26 12:45, Nicholas Piggin wrote:
>> Support huge page vmalloc mappings. Config option HAVE_ARCH_HUGE_VMALLOC
>> enables support on architectures that define HAVE_ARCH_HUGE_VMAP and
>> supports PMD sized vmap mappings.
>> 
>> vmalloc will attempt to allocate PMD-sized pages if allocating PMD size
>> or larger, and fall back to small pages if that was unsuccessful.
>> 
>> Architectures must ensure that any arch specific vmalloc allocations
>> that require PAGE_SIZE mappings (e.g., module allocations vs strict
>> module rwx) use the VM_NOHUGE flag to inhibit larger mappings.
>> 
>> When hugepage vmalloc mappings are enabled in the next patch, this
>> reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node
>> POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%.
>> 
>> This can result in more internal fragmentation and memory overhead for a
>> given allocation, an option nohugevmalloc is added to disable at boot.
>> 
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  arch/Kconfig            |  11 ++
>>  include/linux/vmalloc.h |  21 ++++
>>  mm/page_alloc.c         |   5 +-
>>  mm/vmalloc.c            | 215 +++++++++++++++++++++++++++++++---------
>>  4 files changed, 205 insertions(+), 47 deletions(-)
>> 
>> diff --git a/arch/Kconfig b/arch/Kconfig
>> index 24862d15f3a3..eef170e0c9b8 100644
>> --- a/arch/Kconfig
>> +++ b/arch/Kconfig
>> @@ -724,6 +724,17 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>>  config HAVE_ARCH_HUGE_VMAP
>>  	bool
>>  
>> +#
>> +#  Archs that select this would be capable of PMD-sized vmaps (i.e.,
>> +#  arch_vmap_pmd_supported() returns true), and they must make no assumptions
>> +#  that vmalloc memory is mapped with PAGE_SIZE ptes. The VM_NO_HUGE_VMAP flag
>> +#  can be used to prohibit arch-specific allocations from using hugepages to
>> +#  help with this (e.g., modules may require it).
>> +#
>> +config HAVE_ARCH_HUGE_VMALLOC
>> +	depends on HAVE_ARCH_HUGE_VMAP
>> +	bool
>> +
>>  config ARCH_WANT_HUGE_PMD_SHARE
>>  	bool
>>  
>> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
>> index 99ea72d547dc..93270adf5db5 100644
>> --- a/include/linux/vmalloc.h
>> +++ b/include/linux/vmalloc.h
>> @@ -25,6 +25,7 @@ struct notifier_block;		/* in notifier.h */
>>  #define VM_NO_GUARD		0x00000040      /* don't add guard page */
>>  #define VM_KASAN		0x00000080      /* has allocated kasan shadow memory */
>>  #define VM_MAP_PUT_PAGES	0x00000100	/* put pages and free array in vfree */
>> +#define VM_NO_HUGE_VMAP		0x00000200	/* force PAGE_SIZE pte mapping */
>> 
>>  /*
>>   * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
>> @@ -59,6 +60,9 @@ struct vm_struct {
>>  	unsigned long		size;
>>  	unsigned long		flags;
>>  	struct page		**pages;
>> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
>> +	unsigned int		page_order;
>> +#endif
>>  	unsigned int		nr_pages;
>>  	phys_addr_t		phys_addr;
>>  	const void		*caller;
> Hi Nicholas:
> 
> Give a suggestion :)
> 
> The page order was only used to indicate the huge page flag for vm area, and only valid when
> size bigger than PMD_SIZE, so can we use the vm flgas to instead of that, just like define the
> new flag named VM_HUGEPAGE, it would not break the vm struct, and it is easier for me to backport the serious
> patches to our own branches. (Base on the lts version).

Hmm, it might be possible. I'm not sure if 1GB vmallocs will be used any 
time soon (or maybe they will for edge case configurations? It would be 
trivial to add support for).

The other concern I have is that Christophe IIRC was asking about 
implementing a mapping for PPC which used TLB mappings that were 
different than kernel page table tree size. Although I guess we could 
deal with that when it comes.

I like the flexibility of page_order though. How hard would it be for 
you to do the backport with VM_HUGEPAGE yourself?

I should also say, thanks for all the review and testing from the Huawei 
team. Do you have an x86 patch?

Thanks,
Nick

^ permalink raw reply

* RE: [PATCH v10 11/12] mm/vmalloc: Hugepage vmalloc mappings
From: Nicholas Piggin @ 2021-01-26  9:50 UTC (permalink / raw)
  To: Andrew Morton, 'Christophe Leroy', David Laight,
	linux-mm@kvack.org
  Cc: linux-arch@vger.kernel.org, Ding Tianhong,
	linux-kernel@vger.kernel.org, Christoph Hellwig, Zefan,  Li,
	Jonathan Cameron, Rick Edgecombe, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <7749b310046c4b9baa07037af1d97d87@AcuMS.aculab.com>

Excerpts from David Laight's message of January 25, 2021 10:24 pm:
> From: Christophe Leroy
>> Sent: 25 January 2021 09:15
>> 
>> Le 24/01/2021 à 09:22, Nicholas Piggin a écrit :
>> > Support huge page vmalloc mappings. Config option HAVE_ARCH_HUGE_VMALLOC
>> > enables support on architectures that define HAVE_ARCH_HUGE_VMAP and
>> > supports PMD sized vmap mappings.
>> >
>> > vmalloc will attempt to allocate PMD-sized pages if allocating PMD size
>> > or larger, and fall back to small pages if that was unsuccessful.
>> >
>> > Architectures must ensure that any arch specific vmalloc allocations
>> > that require PAGE_SIZE mappings (e.g., module allocations vs strict
>> > module rwx) use the VM_NOHUGE flag to inhibit larger mappings.
>> >
>> > When hugepage vmalloc mappings are enabled in the next patch, this
>> > reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node
>> > POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%.
>> >
>> > This can result in more internal fragmentation and memory overhead for a
>> > given allocation, an option nohugevmalloc is added to disable at boot.
>> >
>> > Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> > ---
>> >   arch/Kconfig            |  10 +++
>> >   include/linux/vmalloc.h |  18 ++++
>> >   mm/page_alloc.c         |   5 +-
>> >   mm/vmalloc.c            | 192 ++++++++++++++++++++++++++++++----------
>> >   4 files changed, 177 insertions(+), 48 deletions(-)
>> >
>> 
>> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> > index 0377e1d059e5..eef61e0f5170 100644
>> > --- a/mm/vmalloc.c
>> > +++ b/mm/vmalloc.c
>> 
>> > @@ -2691,15 +2746,18 @@ EXPORT_SYMBOL_GPL(vmap_pfn);
>> >   #endif /* CONFIG_VMAP_PFN */
>> >
>> >   static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>> > -				 pgprot_t prot, int node)
>> > +				 pgprot_t prot, unsigned int page_shift,
>> > +				 int node)
>> >   {
>> >   	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
>> > -	unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
>> > -	unsigned long array_size;
>> > -	unsigned int i;
>> > +	unsigned int page_order = page_shift - PAGE_SHIFT;
>> > +	unsigned long addr = (unsigned long)area->addr;
>> > +	unsigned long size = get_vm_area_size(area);
>> > +	unsigned int nr_small_pages = size >> PAGE_SHIFT;
>> >   	struct page **pages;
>> > +	unsigned int i;
>> >
>> > -	array_size = (unsigned long)nr_pages * sizeof(struct page *);
>> > +	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
>> 
>> array_size() is a function in include/linux/overflow.h
>> 
>> For some reason, it breaks the build with your series.
> 
> I can't see the replacement definition for array_size.
> The old local variable is deleted.

Yeah I saw that after taking another look. Must have sent in a bad diff. 
The v11 fixed that and a couple of other compile issues.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v11 01/13] mm/vmalloc: fix HUGE_VMAP regression by enabling huge pages in vmalloc_to_page
From: Miaohe Lin @ 2021-01-26  6:40 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Ding Tianhong, linux-kernel, Christoph Hellwig,
	Linux-MM, Jonathan Cameron, Andrew Morton, Rick Edgecombe,
	linuxppc-dev, Christoph Hellwig
In-Reply-To: <20210126044510.2491820-2-npiggin@gmail.com>

Hi:
On 2021/1/26 12:44, Nicholas Piggin wrote:
> vmalloc_to_page returns NULL for addresses mapped by larger pages[*].
> Whether or not a vmap is huge depends on the architecture details,
> alignments, boot options, etc., which the caller can not be expected
> to know. Therefore HUGE_VMAP is a regression for vmalloc_to_page.
> 
> This change teaches vmalloc_to_page about larger pages, and returns
> the struct page that corresponds to the offset within the large page.
> This makes the API agnostic to mapping implementation details.
> 
> [*] As explained by commit 029c54b095995 ("mm/vmalloc.c: huge-vmap:
>     fail gracefully on unexpected huge vmap mappings")
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  mm/vmalloc.c | 41 ++++++++++++++++++++++++++---------------
>  1 file changed, 26 insertions(+), 15 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index e6f352bf0498..62372f9e0167 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -34,7 +34,7 @@
>  #include <linux/bitops.h>
>  #include <linux/rbtree_augmented.h>
>  #include <linux/overflow.h>
> -
> +#include <linux/pgtable.h>
>  #include <linux/uaccess.h>
>  #include <asm/tlbflush.h>
>  #include <asm/shmparam.h>
> @@ -343,7 +343,9 @@ int is_vmalloc_or_module_addr(const void *x)
>  }
>  
>  /*
> - * Walk a vmap address to the struct page it maps.
> + * Walk a vmap address to the struct page it maps. Huge vmap mappings will
> + * return the tail page that corresponds to the base page address, which
> + * matches small vmap mappings.
>   */
>  struct page *vmalloc_to_page(const void *vmalloc_addr)
>  {
> @@ -363,25 +365,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  
>  	if (pgd_none(*pgd))
>  		return NULL;
> +	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
> +		return NULL; /* XXX: no allowance for huge pgd */
> +	if (WARN_ON_ONCE(pgd_bad(*pgd)))
> +		return NULL;
> +
>  	p4d = p4d_offset(pgd, addr);
>  	if (p4d_none(*p4d))
>  		return NULL;
> -	pud = pud_offset(p4d, addr);
> +	if (p4d_leaf(*p4d))
> +		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
> +	if (WARN_ON_ONCE(p4d_bad(*p4d)))
> +		return NULL;
>  
> -	/*
> -	 * Don't dereference bad PUD or PMD (below) entries. This will also
> -	 * identify huge mappings, which we may encounter on architectures
> -	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
> -	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
> -	 * not [unambiguously] associated with a struct page, so there is
> -	 * no correct value to return for them.
> -	 */
> -	WARN_ON_ONCE(pud_bad(*pud));
> -	if (pud_none(*pud) || pud_bad(*pud))
> +	pud = pud_offset(p4d, addr);
> +	if (pud_none(*pud))
> +		return NULL;
> +	if (pud_leaf(*pud))
> +		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> +	if (WARN_ON_ONCE(pud_bad(*pud)))
>  		return NULL;
> +
>  	pmd = pmd_offset(pud, addr);
> -	WARN_ON_ONCE(pmd_bad(*pmd));
> -	if (pmd_none(*pmd) || pmd_bad(*pmd))
> +	if (pmd_none(*pmd))
> +		return NULL;
> +	if (pmd_leaf(*pmd))
> +		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> +	if (WARN_ON_ONCE(pmd_bad(*pmd)))
>  		return NULL;
>  
>  	ptep = pte_offset_map(pmd, addr);
> @@ -389,6 +399,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  	if (pte_present(pte))
>  		page = pte_page(pte);
>  	pte_unmap(ptep);
> +
>  	return page;
>  }
>  EXPORT_SYMBOL(vmalloc_to_page);
> 

LGTM. Thanks.

Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>

^ permalink raw reply

* Re: [PATCH v11 02/13] mm: apply_to_pte_range warn and fail if a large pte is encountered
From: Miaohe Lin @ 2021-01-26  6:49 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Ding Tianhong, linux-kernel, Christoph Hellwig,
	Linux-MM, Jonathan Cameron, Andrew Morton, Rick Edgecombe,
	linuxppc-dev, Christoph Hellwig
In-Reply-To: <20210126044510.2491820-3-npiggin@gmail.com>

Hi:
On 2021/1/26 12:44, Nicholas Piggin wrote:
> apply_to_pte_range might mistake a large pte for bad, or treat it as a
> page table, resulting in a crash or corruption. Add a test to warn and
> return error if large entries are found.
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  mm/memory.c | 66 +++++++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 49 insertions(+), 17 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index feff48e1465a..672e39a72788 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2440,13 +2440,21 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
>  	}
>  	do {
>  		next = pmd_addr_end(addr, end);
> -		if (create || !pmd_none_or_clear_bad(pmd)) {
> -			err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
> -						 create, mask);
> -			if (err)
> -				break;
> +		if (pmd_none(*pmd) && !create)
> +			continue;
> +		if (WARN_ON_ONCE(pmd_leaf(*pmd)))
> +			return -EINVAL;
> +		if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
> +			if (!create)
> +				continue;
> +			pmd_clear_bad(pmd);
>  		}
> +		err = apply_to_pte_range(mm, pmd, addr, next,
> +					 fn, data, create, mask);
> +		if (err)
> +			break;
>  	} while (pmd++, addr = next, addr != end);
> +
>  	return err;
>  }
>  
> @@ -2468,13 +2476,21 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
>  	}
>  	do {
>  		next = pud_addr_end(addr, end);
> -		if (create || !pud_none_or_clear_bad(pud)) {
> -			err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
> -						 create, mask);
> -			if (err)
> -				break;
> +		if (pud_none(*pud) && !create)
> +			continue;
> +		if (WARN_ON_ONCE(pud_leaf(*pud)))
> +			return -EINVAL;
> +		if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
> +			if (!create)
> +				continue;
> +			pud_clear_bad(pud);
>  		}
> +		err = apply_to_pmd_range(mm, pud, addr, next,
> +					 fn, data, create, mask);
> +		if (err)
> +			break;
>  	} while (pud++, addr = next, addr != end);
> +
>  	return err;
>  }
>  
> @@ -2496,13 +2512,21 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
>  	}
>  	do {
>  		next = p4d_addr_end(addr, end);
> -		if (create || !p4d_none_or_clear_bad(p4d)) {
> -			err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
> -						 create, mask);
> -			if (err)
> -				break;
> +		if (p4d_none(*p4d) && !create)
> +			continue;
> +		if (WARN_ON_ONCE(p4d_leaf(*p4d)))
> +			return -EINVAL;
> +		if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
> +			if (!create)
> +				continue;
> +			p4d_clear_bad(p4d);
>  		}
> +		err = apply_to_pud_range(mm, p4d, addr, next,
> +					 fn, data, create, mask);
> +		if (err)
> +			break;
>  	} while (p4d++, addr = next, addr != end);
> +
>  	return err;
>  }
>  
> @@ -2522,9 +2546,17 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
>  	pgd = pgd_offset(mm, addr);
>  	do {
>  		next = pgd_addr_end(addr, end);
> -		if (!create && pgd_none_or_clear_bad(pgd))
> +		if (pgd_none(*pgd) && !create)
>  			continue;
> -		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
> +		if (WARN_ON_ONCE(pgd_leaf(*pgd)))
> +			return -EINVAL;
> +		if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
> +			if (!create)
> +				continue;
> +			pgd_clear_bad(pgd);
> +		}
> +		err = apply_to_p4d_range(mm, pgd, addr, next,
> +					 fn, data, create, &mask);
>  		if (err)
>  			break;
>  	} while (pgd++, addr = next, addr != end);
> 

Looks good to me, thanks.

Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>

^ permalink raw reply

* Re: [PATCH] PCI: dwc: layerscape: convert to builtin_platform_driver()
From: Lorenzo Pieralisi @ 2021-01-26 10:02 UTC (permalink / raw)
  To: Michael Walle
  Cc: Roy Zang, Saravana Kannan, Rob Herring, linux-pci, linux-kernel,
	Minghuan Lian, linux-arm-kernel, Greg Kroah-Hartman,
	Bjorn Helgaas, linuxppc-dev, Mingkai Hu
In-Reply-To: <20210120105246.23218-1-michael@walle.cc>

On Wed, Jan 20, 2021 at 11:52:46AM +0100, Michael Walle wrote:
> fw_devlink will defer the probe until all suppliers are ready. We can't
> use builtin_platform_driver_probe() because it doesn't retry after probe
> deferral. Convert it to builtin_platform_driver().
> 
> Fixes: e590474768f1 ("driver core: Set fw_devlink=on by default")

I will have to drop this Fixes: tag if you don't mind, it is not
in the mainline.

Lorenzo

> Signed-off-by: Michael Walle <michael@walle.cc>
> ---
>  drivers/pci/controller/dwc/pci-layerscape.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/controller/dwc/pci-layerscape.c b/drivers/pci/controller/dwc/pci-layerscape.c
> index 44ad34cdc3bc..5b9c625df7b8 100644
> --- a/drivers/pci/controller/dwc/pci-layerscape.c
> +++ b/drivers/pci/controller/dwc/pci-layerscape.c
> @@ -232,7 +232,7 @@ static const struct of_device_id ls_pcie_of_match[] = {
>  	{ },
>  };
>  
> -static int __init ls_pcie_probe(struct platform_device *pdev)
> +static int ls_pcie_probe(struct platform_device *pdev)
>  {
>  	struct device *dev = &pdev->dev;
>  	struct dw_pcie *pci;
> @@ -271,10 +271,11 @@ static int __init ls_pcie_probe(struct platform_device *pdev)
>  }
>  
>  static struct platform_driver ls_pcie_driver = {
> +	.probe = ls_pcie_probe,
>  	.driver = {
>  		.name = "layerscape-pcie",
>  		.of_match_table = ls_pcie_of_match,
>  		.suppress_bind_attrs = true,
>  	},
>  };
> -builtin_platform_driver_probe(ls_pcie_driver, ls_pcie_probe);
> +builtin_platform_driver(ls_pcie_driver);
> -- 
> 2.20.1
> 

^ permalink raw reply

* Re: [PATCH v4 19/23] powerpc/syscall: Avoid stack frame in likely part of system_call_exception()
From: Nicholas Piggin @ 2021-01-26 10:14 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Christophe Leroy, Michael Ellerman,
	msuchanek, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cdaf4ac33405e9a00ab277eccc5fd240d95e65b1.1611585031.git.christophe.leroy@csgroup.eu>

Excerpts from Christophe Leroy's message of January 26, 2021 12:48 am:
> When r3 is not modified, reload it from regs->orig_r3 to free
> volatile registers. This avoids a stack frame for the likely part
> of system_call_exception()
> 
> Before the patch:
> 
> c000b4d4 <system_call_exception>:
> c000b4d4:	7c 08 02 a6 	mflr    r0
> c000b4d8:	94 21 ff e0 	stwu    r1,-32(r1)
> c000b4dc:	93 e1 00 1c 	stw     r31,28(r1)
> c000b4e0:	90 01 00 24 	stw     r0,36(r1)
> c000b4e4:	90 6a 00 88 	stw     r3,136(r10)
> c000b4e8:	81 6a 00 84 	lwz     r11,132(r10)
> c000b4ec:	69 6b 00 02 	xori    r11,r11,2
> c000b4f0:	55 6b ff fe 	rlwinm  r11,r11,31,31,31
> c000b4f4:	0f 0b 00 00 	twnei   r11,0
> c000b4f8:	81 6a 00 a0 	lwz     r11,160(r10)
> c000b4fc:	55 6b 07 fe 	clrlwi  r11,r11,31
> c000b500:	0f 0b 00 00 	twnei   r11,0
> c000b504:	7c 0c 42 e6 	mftb    r0
> c000b508:	83 e2 00 08 	lwz     r31,8(r2)
> c000b50c:	81 82 00 28 	lwz     r12,40(r2)
> c000b510:	90 02 00 24 	stw     r0,36(r2)
> c000b514:	7d 8c f8 50 	subf    r12,r12,r31
> c000b518:	7c 0c 02 14 	add     r0,r12,r0
> c000b51c:	90 02 00 08 	stw     r0,8(r2)
> c000b520:	7c 10 13 a6 	mtspr   80,r0
> c000b524:	81 62 00 70 	lwz     r11,112(r2)
> c000b528:	71 60 86 91 	andi.   r0,r11,34449
> c000b52c:	40 82 00 34 	bne     c000b560 <system_call_exception+0x8c>
> c000b530:	2b 89 01 b6 	cmplwi  cr7,r9,438
> c000b534:	41 9d 00 64 	bgt     cr7,c000b598 <system_call_exception+0xc4>
> c000b538:	3d 40 c0 5c 	lis     r10,-16292
> c000b53c:	55 29 10 3a 	rlwinm  r9,r9,2,0,29
> c000b540:	39 4a 41 e8 	addi    r10,r10,16872
> c000b544:	80 01 00 24 	lwz     r0,36(r1)
> c000b548:	7d 2a 48 2e 	lwzx    r9,r10,r9
> c000b54c:	7c 08 03 a6 	mtlr    r0
> c000b550:	7d 29 03 a6 	mtctr   r9
> c000b554:	83 e1 00 1c 	lwz     r31,28(r1)
> c000b558:	38 21 00 20 	addi    r1,r1,32
> c000b55c:	4e 80 04 20 	bctr
> 
> After the patch:
> 
> c000b4d4 <system_call_exception>:
> c000b4d4:	81 6a 00 84 	lwz     r11,132(r10)
> c000b4d8:	90 6a 00 88 	stw     r3,136(r10)
> c000b4dc:	69 6b 00 02 	xori    r11,r11,2
> c000b4e0:	55 6b ff fe 	rlwinm  r11,r11,31,31,31
> c000b4e4:	0f 0b 00 00 	twnei   r11,0
> c000b4e8:	80 6a 00 a0 	lwz     r3,160(r10)
> c000b4ec:	54 63 07 fe 	clrlwi  r3,r3,31
> c000b4f0:	0f 03 00 00 	twnei   r3,0
> c000b4f4:	7d 6c 42 e6 	mftb    r11
> c000b4f8:	81 82 00 08 	lwz     r12,8(r2)
> c000b4fc:	80 02 00 28 	lwz     r0,40(r2)
> c000b500:	91 62 00 24 	stw     r11,36(r2)
> c000b504:	7c 00 60 50 	subf    r0,r0,r12
> c000b508:	7d 60 5a 14 	add     r11,r0,r11
> c000b50c:	91 62 00 08 	stw     r11,8(r2)
> c000b510:	7c 10 13 a6 	mtspr   80,r0
> c000b514:	80 62 00 70 	lwz     r3,112(r2)
> c000b518:	70 6b 86 91 	andi.   r11,r3,34449
> c000b51c:	40 82 00 28 	bne     c000b544 <system_call_exception+0x70>
> c000b520:	2b 89 01 b6 	cmplwi  cr7,r9,438
> c000b524:	41 9d 00 84 	bgt     cr7,c000b5a8 <system_call_exception+0xd4>
> c000b528:	80 6a 00 88 	lwz     r3,136(r10)
> c000b52c:	3d 40 c0 5c 	lis     r10,-16292
> c000b530:	55 29 10 3a 	rlwinm  r9,r9,2,0,29
> c000b534:	39 4a 41 e4 	addi    r10,r10,16868
> c000b538:	7d 2a 48 2e 	lwzx    r9,r10,r9
> c000b53c:	7d 29 03 a6 	mtctr   r9
> c000b540:	4e 80 04 20 	bctr
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> ---
>  arch/powerpc/kernel/syscall.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
> index a3510fa4e641..476909b11051 100644
> --- a/arch/powerpc/kernel/syscall.c
> +++ b/arch/powerpc/kernel/syscall.c
> @@ -115,6 +115,9 @@ notrace long system_call_exception(long r3, long r4, long r5,
>  			return regs->gpr[3];
>  		}
>  		return -ENOSYS;
> +	} else {
> +		/* Restore r3 from orig_gpr3 to free up a volatile reg */
> +		r3 = regs->orig_gpr3;
>  	}
>  
>  	/* May be faster to do array_index_nospec? */
> -- 

Nice optimisation, great analysis and catch. I'll have to test it on 
ppc64.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v4 20/23] powerpc/syscall: Do not check unsupported scv vector on PPC32
From: Nicholas Piggin @ 2021-01-26 10:16 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Christophe Leroy, Michael Ellerman,
	msuchanek, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <f02af988a86f7e83b6492df7c4fa1b53bcd1919b.1611585031.git.christophe.leroy@csgroup.eu>

Excerpts from Christophe Leroy's message of January 26, 2021 12:48 am:
> Only PPC64 has scv. No need to check the 0x7ff0 trap on PPC32.
> 
> And ignore the scv parameter in syscall_exit_prepare (Save 14 cycles
> 346 => 332 cycles)
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> ---
>  arch/powerpc/kernel/entry_32.S | 1 -
>  arch/powerpc/kernel/syscall.c  | 7 +++++--
>  2 files changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
> index 9922a04650f7..6ae9c7bcb06c 100644
> --- a/arch/powerpc/kernel/entry_32.S
> +++ b/arch/powerpc/kernel/entry_32.S
> @@ -343,7 +343,6 @@ transfer_to_syscall:
>  
>  ret_from_syscall:
>  	addi    r4,r1,STACK_FRAME_OVERHEAD
> -	li	r5,0
>  	bl	syscall_exit_prepare
>  #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
>  	/* If the process has its own DBCR0 value, load it up.  The internal
> diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
> index 476909b11051..30f8a397a522 100644
> --- a/arch/powerpc/kernel/syscall.c
> +++ b/arch/powerpc/kernel/syscall.c
> @@ -86,7 +86,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
>  	local_irq_enable();
>  
>  	if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) {
> -		if (unlikely(regs->trap == 0x7ff0)) {
> +		if (IS_ENABLED(CONFIG_PPC64) && unlikely(regs->trap == 0x7ff0)) {
>  			/* Unsupported scv vector */
>  			_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
>  			return regs->gpr[3];
> @@ -109,7 +109,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
>  		r8 = regs->gpr[8];
>  
>  	} else if (unlikely(r0 >= NR_syscalls)) {
> -		if (unlikely(regs->trap == 0x7ff0)) {
> +		if (IS_ENABLED(CONFIG_PPC64) && unlikely(regs->trap == 0x7ff0)) {

Perhaps this could be hidden behind a function like trap_is_scv()?

trap_is_unsupported_scv() ?

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v4 14/23] powerpc/syscall: Save r3 in regs->orig_r3
From: Nicholas Piggin @ 2021-01-26 10:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Christophe Leroy, Michael Ellerman,
	msuchanek, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <5d375bea8f519924e110842f6b0d05e83cd04470.1611585031.git.christophe.leroy@csgroup.eu>

Excerpts from Christophe Leroy's message of January 26, 2021 12:48 am:
> Save r3 in regs->orig_r3 in system_call_exception()
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> ---
>  arch/powerpc/kernel/entry_64.S | 1 -
>  arch/powerpc/kernel/syscall.c  | 2 ++
>  2 files changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
> index aa1af139d947..a562a4240aa6 100644
> --- a/arch/powerpc/kernel/entry_64.S
> +++ b/arch/powerpc/kernel/entry_64.S
> @@ -278,7 +278,6 @@ END_BTB_FLUSH_SECTION
>  	std	r10,_LINK(r1)
>  	std	r11,_TRAP(r1)
>  	std	r12,_CCR(r1)
> -	std	r3,ORIG_GPR3(r1)
>  	addi	r10,r1,STACK_FRAME_OVERHEAD
>  	ld	r11,exception_marker@toc(r2)
>  	std	r11,-16(r10)		/* "regshere" marker */

This misses system_call_vectored.

Thanks,
Nick

> diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
> index cb415170b8f2..b66cfcbcb755 100644
> --- a/arch/powerpc/kernel/syscall.c
> +++ b/arch/powerpc/kernel/syscall.c
> @@ -29,6 +29,8 @@ notrace long system_call_exception(long r3, long r4, long r5,
>  {
>  	syscall_fn f;
>  
> +	regs->orig_gpr3 = r3;
> +
>  	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
>  		BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);
>  
> -- 
> 2.25.0
> 
> 

^ permalink raw reply

* Re: [PATCH v4 11/23] powerpc/syscall: Rename syscall_64.c into syscall.c
From: Nicholas Piggin @ 2021-01-26 10:21 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Christophe Leroy, Michael Ellerman,
	msuchanek, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <ff9dd4accdc897013594768833d54444e4823bf9.1611585031.git.christophe.leroy@csgroup.eu>

Excerpts from Christophe Leroy's message of January 26, 2021 12:48 am:
> syscall_64.c will be reused almost as is for PPC32.
> 
> Rename it syscall.c

Could you rename it to interrupt.c instead? A system call is an 
interrupt, and the file now also has code to return from other
interrupts as well, and it matches the new asm/interrupt.h from
the interrupts series.

Thanks,
Nick

^ permalink raw reply

* RE: [PATCH v4 11/23] powerpc/syscall: Rename syscall_64.c into syscall.c
From: David Laight @ 2021-01-26 10:28 UTC (permalink / raw)
  To: 'Nicholas Piggin', Benjamin Herrenschmidt,
	Christophe Leroy, Michael Ellerman, msuchanek@suse.de,
	Paul Mackerras
  Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org
In-Reply-To: <1611656343.yaxha7r2q4.astroid@bobo.none>

From: Nicholas Piggin
> Sent: 26 January 2021 10:21
> 
> Excerpts from Christophe Leroy's message of January 26, 2021 12:48 am:
> > syscall_64.c will be reused almost as is for PPC32.
> >
> > Rename it syscall.c
> 
> Could you rename it to interrupt.c instead? A system call is an
> interrupt, and the file now also has code to return from other
> interrupts as well, and it matches the new asm/interrupt.h from
> the interrupts series.

Hmmm....

That might make it harder for someone looking for the system call
entry code to find it.

In some sense interrupts are the simpler case.

Especially when comparing with other architectures which have
special instructions for syscall entry.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply

* Re: [PATCH] PCI: dwc: layerscape: convert to builtin_platform_driver()
From: Michael Walle @ 2021-01-26 10:39 UTC (permalink / raw)
  To: Lorenzo Pieralisi
  Cc: Roy Zang, Saravana Kannan, Rob Herring, linux-pci, linux-kernel,
	Minghuan Lian, linux-arm-kernel, Greg Kroah-Hartman,
	Bjorn Helgaas, linuxppc-dev, Mingkai Hu
In-Reply-To: <20210126100256.GA20547@e121166-lin.cambridge.arm.com>

Am 2021-01-26 11:02, schrieb Lorenzo Pieralisi:
> On Wed, Jan 20, 2021 at 11:52:46AM +0100, Michael Walle wrote:
>> fw_devlink will defer the probe until all suppliers are ready. We 
>> can't
>> use builtin_platform_driver_probe() because it doesn't retry after 
>> probe
>> deferral. Convert it to builtin_platform_driver().
>> 
>> Fixes: e590474768f1 ("driver core: Set fw_devlink=on by default")
> 
> I will have to drop this Fixes: tag if you don't mind, it is not
> in the mainline.

That commit is in Greg's for-next tree:
https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git/commit/?h=driver-core-next&id=e590474768f1cc04852190b61dec692411b22e2a

I was under the impression there are other commits with this
particular fixes tag, too. Either it was removed from
for-next queues or I was confused.

But I'm fine with removing the tag, assuming this will end
up together with the "driver core: Set fw_devlink=on by default"
commit in 5.11.

-michael

^ permalink raw reply

* Re: [PATCH] PCI: dwc: layerscape: convert to builtin_platform_driver()
From: Lorenzo Pieralisi @ 2021-01-26 10:55 UTC (permalink / raw)
  To: linux-kernel, Michael Walle, linux-pci, linux-arm-kernel,
	linuxppc-dev
  Cc: Rob Herring, Lorenzo Pieralisi, Saravana Kannan, Roy Zang,
	Greg Kroah-Hartman, Minghuan Lian, Bjorn Helgaas, Mingkai Hu
In-Reply-To: <20210120105246.23218-1-michael@walle.cc>

On Wed, 20 Jan 2021 11:52:46 +0100, Michael Walle wrote:
> fw_devlink will defer the probe until all suppliers are ready. We can't
> use builtin_platform_driver_probe() because it doesn't retry after probe
> deferral. Convert it to builtin_platform_driver().

Applied to pci/dwc, thanks!

[1/1] PCI: dwc: layerscape: Convert to builtin_platform_driver()
      https://git.kernel.org/lpieralisi/pci/c/538157be1e

Thanks,
Lorenzo

^ permalink raw reply

* Re: [PATCH] PCI: dwc: layerscape: convert to builtin_platform_driver()
From: Geert Uytterhoeven @ 2021-01-26 10:56 UTC (permalink / raw)
  To: Michael Walle
  Cc: Roy Zang, Lorenzo Pieralisi, Saravana Kannan, linux-pci,
	Linux Kernel Mailing List, Minghuan Lian, Mingkai Hu,
	Greg Kroah-Hartman, Bjorn Helgaas, linuxppc-dev, Linux ARM
In-Reply-To: <1a36ef741c5ab2a6e90b38c58944aa25@walle.cc>

Hi Michael,

On Tue, Jan 26, 2021 at 11:46 AM Michael Walle <michael@walle.cc> wrote:
> Am 2021-01-26 11:02, schrieb Lorenzo Pieralisi:
> > On Wed, Jan 20, 2021 at 11:52:46AM +0100, Michael Walle wrote:
> >> fw_devlink will defer the probe until all suppliers are ready. We
> >> can't
> >> use builtin_platform_driver_probe() because it doesn't retry after
> >> probe
> >> deferral. Convert it to builtin_platform_driver().
> >>
> >> Fixes: e590474768f1 ("driver core: Set fw_devlink=on by default")
> >
> > I will have to drop this Fixes: tag if you don't mind, it is not
> > in the mainline.
>
> That commit is in Greg's for-next tree:
> https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git/commit/?h=driver-core-next&id=e590474768f1cc04852190b61dec692411b22e2a
>
> I was under the impression there are other commits with this
> particular fixes tag, too. Either it was removed from
> for-next queues or I was confused.
>
> But I'm fine with removing the tag, assuming this will end
> up together with the "driver core: Set fw_devlink=on by default"
> commit in 5.11.

Definitely not v5.11.

And I sincerely doubt it will be applied for v5.12.
It's already way too late to implement all changes to existing drivers
needed, and get them accepted for v5.12.

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH 3/5] powerpc/xive: remove unnecessary unmap_kernel_range
From: Cédric Le Goater @ 2021-01-26  7:25 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm, Andrew Morton
  Cc: linuxppc-dev, linux-kernel, Christoph Hellwig
In-Reply-To: <20210126045404.2492588-4-npiggin@gmail.com>

On 1/26/21 5:54 AM, Nicholas Piggin wrote:
> iounmap will remove ptes.
> 
> Cc: "Cédric Le Goater" <clg@kaod.org>
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Looks good. 

Acked-by: Cédric Le Goater <clg@kaod.org>

Thanks,

C. 

> ---
>  arch/powerpc/sysdev/xive/common.c | 4 ----
>  1 file changed, 4 deletions(-)
> 
> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
> index 595310e056f4..d6c2069cc828 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -959,16 +959,12 @@ EXPORT_SYMBOL_GPL(is_xive_irq);
>  void xive_cleanup_irq_data(struct xive_irq_data *xd)
>  {
>  	if (xd->eoi_mmio) {
> -		unmap_kernel_range((unsigned long)xd->eoi_mmio,
> -				   1u << xd->esb_shift);
>  		iounmap(xd->eoi_mmio);
>  		if (xd->eoi_mmio == xd->trig_mmio)
>  			xd->trig_mmio = NULL;
>  		xd->eoi_mmio = NULL;
>  	}
>  	if (xd->trig_mmio) {
> -		unmap_kernel_range((unsigned long)xd->trig_mmio,
> -				   1u << xd->esb_shift);
>  		iounmap(xd->trig_mmio);
>  		xd->trig_mmio = NULL;
>  	}
> 


^ permalink raw reply

* Re: [PATCH v11 12/13] mm/vmalloc: Hugepage vmalloc mappings
From: Ding Tianhong @ 2021-01-26 11:48 UTC (permalink / raw)
  To: Nicholas Piggin, Andrew Morton, linux-mm
  Cc: linux-arch, linux-kernel, Christoph Hellwig, Jonathan Cameron,
	Rick Edgecombe, linuxppc-dev
In-Reply-To: <1611653945.t3oot63nwn.astroid@bobo.none>

On 2021/1/26 17:47, Nicholas Piggin wrote:
> Excerpts from Ding Tianhong's message of January 26, 2021 4:59 pm:
>> On 2021/1/26 12:45, Nicholas Piggin wrote:
>>> Support huge page vmalloc mappings. Config option HAVE_ARCH_HUGE_VMALLOC
>>> enables support on architectures that define HAVE_ARCH_HUGE_VMAP and
>>> supports PMD sized vmap mappings.
>>>
>>> vmalloc will attempt to allocate PMD-sized pages if allocating PMD size
>>> or larger, and fall back to small pages if that was unsuccessful.
>>>
>>> Architectures must ensure that any arch specific vmalloc allocations
>>> that require PAGE_SIZE mappings (e.g., module allocations vs strict
>>> module rwx) use the VM_NOHUGE flag to inhibit larger mappings.
>>>
>>> When hugepage vmalloc mappings are enabled in the next patch, this
>>> reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node
>>> POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%.
>>>
>>> This can result in more internal fragmentation and memory overhead for a
>>> given allocation, an option nohugevmalloc is added to disable at boot.
>>>
>>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>>> ---
>>>  arch/Kconfig            |  11 ++
>>>  include/linux/vmalloc.h |  21 ++++
>>>  mm/page_alloc.c         |   5 +-
>>>  mm/vmalloc.c            | 215 +++++++++++++++++++++++++++++++---------
>>>  4 files changed, 205 insertions(+), 47 deletions(-)
>>>
>>> diff --git a/arch/Kconfig b/arch/Kconfig
>>> index 24862d15f3a3..eef170e0c9b8 100644
>>> --- a/arch/Kconfig
>>> +++ b/arch/Kconfig
>>> @@ -724,6 +724,17 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>>>  config HAVE_ARCH_HUGE_VMAP
>>>  	bool
>>>  
>>> +#
>>> +#  Archs that select this would be capable of PMD-sized vmaps (i.e.,
>>> +#  arch_vmap_pmd_supported() returns true), and they must make no assumptions
>>> +#  that vmalloc memory is mapped with PAGE_SIZE ptes. The VM_NO_HUGE_VMAP flag
>>> +#  can be used to prohibit arch-specific allocations from using hugepages to
>>> +#  help with this (e.g., modules may require it).
>>> +#
>>> +config HAVE_ARCH_HUGE_VMALLOC
>>> +	depends on HAVE_ARCH_HUGE_VMAP
>>> +	bool
>>> +
>>>  config ARCH_WANT_HUGE_PMD_SHARE
>>>  	bool
>>>  
>>> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
>>> index 99ea72d547dc..93270adf5db5 100644
>>> --- a/include/linux/vmalloc.h
>>> +++ b/include/linux/vmalloc.h
>>> @@ -25,6 +25,7 @@ struct notifier_block;		/* in notifier.h */
>>>  #define VM_NO_GUARD		0x00000040      /* don't add guard page */
>>>  #define VM_KASAN		0x00000080      /* has allocated kasan shadow memory */
>>>  #define VM_MAP_PUT_PAGES	0x00000100	/* put pages and free array in vfree */
>>> +#define VM_NO_HUGE_VMAP		0x00000200	/* force PAGE_SIZE pte mapping */
>>>
>>>  /*
>>>   * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
>>> @@ -59,6 +60,9 @@ struct vm_struct {
>>>  	unsigned long		size;
>>>  	unsigned long		flags;
>>>  	struct page		**pages;
>>> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
>>> +	unsigned int		page_order;
>>> +#endif
>>>  	unsigned int		nr_pages;
>>>  	phys_addr_t		phys_addr;
>>>  	const void		*caller;
>> Hi Nicholas:
>>
>> Give a suggestion :)
>>
>> The page order was only used to indicate the huge page flag for vm area, and only valid when
>> size bigger than PMD_SIZE, so can we use the vm flgas to instead of that, just like define the
>> new flag named VM_HUGEPAGE, it would not break the vm struct, and it is easier for me to backport the serious
>> patches to our own branches. (Base on the lts version).
> 
> Hmm, it might be possible. I'm not sure if 1GB vmallocs will be used any 
> time soon (or maybe they will for edge case configurations? It would be 
> trivial to add support for).
> 

1GB vmallocs is really crazy, but maybe used for future. :)

> The other concern I have is that Christophe IIRC was asking about 
> implementing a mapping for PPC which used TLB mappings that were 
> different than kernel page table tree size. Although I guess we could 
> deal with that when it comes.
> 

I didn't check the PPC platform, but a agree with you.

> I like the flexibility of page_order though. How hard would it be for 
> you to do the backport with VM_HUGEPAGE yourself?
> 

Yes, i can fix it with VM_HUGEPAGE for my own branch.

> I should also say, thanks for all the review and testing from the Huawei 
> team. Do you have an x86 patch?
I only enable and use it for x86 and aarch64 platform, this serious patches is
really help us a lot. Thanks.

Ding

> Thanks,
> Nick
> .
> 


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox