LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [BISECTED REGRESSION] b43legacy broken on G4 PowerBook
From: Larry Finger @ 2019-06-10 18:44 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Aaro Koskinen, Christoph Hellwig,
	Christian Zigotzky, Michael Ellerman
  Cc: linux-wireless, linuxppc-dev, linux-kernel
In-Reply-To: <7697a9d10777b28ae79fdffdde6d0985555f6310.camel@kernel.crashing.org>

On 6/7/19 11:21 PM, Benjamin Herrenschmidt wrote:
> 
>> Please try the attached patch. I'm not really pleased with it and I will
>> continue to determine why the fallback to a 30-bit mask fails, but at least this
>> one works for me.
> 
> Your patch only makes sense if the device is indeed capable of
> addressing 31-bits.
> 
> So either the driver is buggy and asks for a too small mask in which
> case your patch is ok, or it's not and you're just going to cause all
> sort of interesting random problems including possible memory
> corruption.

Of course the driver may be buggy, but it asks for the correct mask.

This particular device is not capable of handling 32-bit DMA. The driver detects 
the 32-bit failure and falls back to 30 bits. It works on x86, and did on PPC32 
until 5.1. As Christoph said, it should always be possible to use fewer bits 
than the maximum.

Similar devices that are new enough to use b43 rather than b43legacy work with 
new kernels; however, they have and use 32-bit DMA.

Larry

^ permalink raw reply

* Re: [BISECTED REGRESSION] b43legacy broken on G4 PowerBook
From: Larry Finger @ 2019-06-10 16:09 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Aaro Koskinen, linux-wireless, linux-kernel, Christian Zigotzky,
	linuxppc-dev
In-Reply-To: <20190610081825.GA16534@lst.de>

[-- Attachment #1: Type: text/plain, Size: 2404 bytes --]

On 6/10/19 3:18 AM, Christoph Hellwig wrote:
> On Sat, Jun 08, 2019 at 04:52:24PM -0500, Larry Finger wrote:
>> On 6/7/19 12:29 PM, Christoph Hellwig wrote:
>>> I don't think we should work around this in the driver, we need to fix
>>> it in the core.  I'm curious why my previous patch didn't work.  Can
>>> you throw in a few printks what failed?  I.e. did dma_direct_supported
>>> return false?  Did the actual allocation fail?
>>
>> Routine dma_direct_supported() returns true.
>>
>> The failure is in routine dma_set_mask() in the following if test:
>>
>>          if (!dev->dma_mask || !dma_supported(dev, mask))
>>                  return -EIO;
>>
>> For b43legacy, dev->dma_mask is 0xc265684800000000.
>>      dma_supported(dev, mask) is 0xc08b000000000000, mask is 0x3fffffff, and
>> the routine returns -EIO.
>>
>> For b43,       dev->dma_mask is 0xc265684800000001,
>>      dma_supported(dev, mask) is 0xc08b000000000000, mask is 0x77777777, and
>> the routine returns 0.
> 
> I don't fully understand what values the above map to.  Can you send
> me your actual debugging patch as well?

I do not understand why the if statement returns true as neither of the values 
is zero. After seeing the x86 output shown below, I also do not understand all 
the trailing zeros.

My entire patch is attached. That output came from this section:

diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index f7afdad..ba2489d 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -317,9 +317,12 @@ int dma_supported(struct device *dev, u64 mask)

  int dma_set_mask(struct device *dev, u64 mask)
  {
+       pr_info("mask 0x%llx, dma_mask 0x%llx, dma_supported 0x%llx\n", mask, 
dev->dma_mask,
+               dma_supported(dev, mask));
         if (!dev->dma_mask || !dma_supported(dev, mask))
                 return -EIO;

+       pr_info("Continuing in dma_set_mask()\n");
         arch_dma_set_mask(dev, mask);
         dma_check_mask(dev, mask);
         *dev->dma_mask = mask;

On a 32-bit x86 computer with 1GB of RAM, that same output was

For b43legacy, dev->dma_mask is 0x01f4029044.
     dma_supported(dev, mask) is 0x1ef37f7000, mask is 0x3fffffff, and
the routine returns 0. 30-bit DMA works.

For b43,       dev->dma_mask is 0x01f4029044,
     dma_supported(dev, mask) is 0x1ef37f7000, mask is 0xffffffff, and
  the routine also returns 0. This card supports 32-bit DMA.

Larry

[-- Attachment #2: b43legacy_tests --]
[-- Type: text/plain, Size: 4133 bytes --]

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index b8286a2..7a367ce 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -319,6 +319,10 @@ extern void copy_user_page(void *to, void *from, unsigned long vaddr,
 #endif /* __ASSEMBLY__ */
 #include <asm/slice.h>
 
+#if 1 /* XXX: pmac?  dynamic discovery? */
+#define ARCH_ZONE_DMA_BITS 30
+#else
 #define ARCH_ZONE_DMA_BITS 31
+#endif
 
 #endif /* _ASM_POWERPC_PAGE_H */
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 09231ef..761d951 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -20,6 +20,8 @@
  */
 static inline bool dma_iommu_alloc_bypass(struct device *dev)
 {
+	pr_info("dev->archdata.iommu_bypass %d, !iommu_fixed_is_weak %d\n",
+		dev->archdata.iommu_bypass, !iommu_fixed_is_weak)		
 	return dev->archdata.iommu_bypass && !iommu_fixed_is_weak &&
 		dma_direct_supported(dev, dev->coherent_dma_mask);
 }
@@ -27,6 +29,8 @@ static inline bool dma_iommu_alloc_bypass(struct device *dev)
 static inline bool dma_iommu_map_bypass(struct device *dev,
 		unsigned long attrs)
 {
+	pr_info("(attrs & DMA_ATTR_WEAK_ORDERING) %d\n",
+		(attrs & DMA_ATTR_WEAK_ORDERING));
 	return dev->archdata.iommu_bypass &&
 		(!iommu_fixed_is_weak || (attrs & DMA_ATTR_WEAK_ORDERING));
 }
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index cba2913..2540d3b 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -248,7 +248,8 @@ void __init paging_init(void)
 	       (long int)((top_of_ram - total_ram) >> 20));
 
 #ifdef CONFIG_ZONE_DMA
-	max_zone_pfns[ZONE_DMA]	= min(max_low_pfn, 0x7fffffffUL >> PAGE_SHIFT);
+	max_zone_pfns[ZONE_DMA]	= min(max_low_pfn,
+			((1UL << ARCH_ZONE_DMA_BITS) - 1) >> PAGE_SHIFT);
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
diff --git a/drivers/net/wireless/broadcom/b43/dma.c b/drivers/net/wireless/broadcom/b43/dma.c
index 806406a..e0270da 100644
--- a/drivers/net/wireless/broadcom/b43/dma.c
+++ b/drivers/net/wireless/broadcom/b43/dma.c
@@ -1053,6 +1053,7 @@ static int b43_dma_set_mask(struct b43_wldev *dev, u64 mask)
 	 * lower mask, as we can always also support a lower one. */
 	while (1) {
 		err = dma_set_mask_and_coherent(dev->dev->dma_dev, mask);
+		pr_info("dma_set_mask_and_coherent %d, mask 0x%llx\n", err, mask);
 		if (!err)
 			break;
 		if (mask == DMA_BIT_MASK(64)) {
diff --git a/drivers/net/wireless/broadcom/b43legacy/dma.c b/drivers/net/wireless/broadcom/b43legacy/dma.c
index 1cc25f4..c625ffc 100644
--- a/drivers/net/wireless/broadcom/b43legacy/dma.c
+++ b/drivers/net/wireless/broadcom/b43legacy/dma.c
@@ -794,6 +794,7 @@ static int b43legacy_dma_set_mask(struct b43legacy_wldev *dev, u64 mask)
 	 * lower mask, as we can always also support a lower one. */
 	while (1) {
 		err = dma_set_mask_and_coherent(dev->dev->dma_dev, mask);
+		pr_info("dma_set_mask_and_coherent %d, mask 0x%llx\n", err, mask);
 		if (!err)
 			break;
 		if (mask == DMA_BIT_MASK(64)) {
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 2c2772e..b716e62 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -391,6 +391,8 @@ int dma_direct_supported(struct device *dev, u64 mask)
 	 * use __phys_to_dma() here so that the SME encryption mask isn't
 	 * part of the check.
 	 */
+	pr_info("min_mask 0x%x. max_pfn 0x%x, __phys_to_dma 0x%x, mask 0x%x\n", min_mask,
+		max_pfn, __phys_to_dma(dev, min_mask), mask);
 	return mask >= __phys_to_dma(dev, min_mask);
 }
 
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index f7afdad..ba2489d 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -317,9 +317,12 @@ int dma_supported(struct device *dev, u64 mask)
 
 int dma_set_mask(struct device *dev, u64 mask)
 {
+	pr_info("mask 0x%llx, dma_mask 0x%llx, dma_supported 0x%llx\n", mask, dev->dma_mask,
+		dma_supported(dev, mask));
 	if (!dev->dma_mask || !dma_supported(dev, mask))
 		return -EIO;
 
+	pr_info("Continuing in dma_set_mask()\n");
 	arch_dma_set_mask(dev, mask);
 	dma_check_mask(dev, mask);
 	*dev->dma_mask = mask;

^ permalink raw reply related

* Re: [RFC V3] mm: Generalize and rename notify_page_fault() as kprobe_page_fault()
From: Leonardo Bras @ 2019-06-10 15:27 UTC (permalink / raw)
  To: Anshuman Khandual, Christophe Leroy, linux-kernel, linux-mm
  Cc: Mark Rutland, Michal Hocko, linux-ia64, linux-sh, Peter Zijlstra,
	Catalin Marinas, Dave Hansen, Heiko Carstens, Paul Mackerras,
	sparclinux, linux-s390, Yoshinori Sato, x86, Russell King,
	Matthew Wilcox, Ingo Molnar, Andrey Konovalov, Fenghua Yu,
	Stephen Rothwell, Will Deacon, Andy Lutomirski, Thomas Gleixner,
	linux-arm-kernel, Tony Luck, Martin Schwidefsky, Andrew Morton,
	linuxppc-dev, David S. Miller
In-Reply-To: <97e9c9b3-89c8-d378-4730-841a900e6800@arm.com>

[-- Attachment #1: Type: text/plain, Size: 765 bytes --]

On Mon, 2019-06-10 at 08:09 +0530, Anshuman Khandual wrote:
> > > +    /*
> > > +     * To be potentially processing a kprobe fault and to be allowed
> > > +     * to call kprobe_running(), we have to be non-preemptible.
> > > +     */
> > > +    if (kprobes_built_in() && !preemptible() && !user_mode(regs)) {
> > > +        if (kprobe_running() && kprobe_fault_handler(regs, trap))
> > 
> > don't need an 'if A if B', can do 'if A && B'
> 
> Which will make it a very lengthy condition check.

Well, is there any problem line-breaking the if condition?

if (A && B && C &&
    D && E )

Also, if it's used only to decide the return value, maybe would be fine
to do somethink like that:

return (A && B && C &&
        D && E ); 

Regards,

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
From: Nicholas Piggin @ 2019-06-10 14:44 UTC (permalink / raw)
  To: Mark Rutland; +Cc: linux-mm, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20190610141036.GA16989@lakrids.cambridge.arm.com>

Mark Rutland's on June 11, 2019 12:10 am:
> Hi,
> 
> On Mon, Jun 10, 2019 at 02:38:38PM +1000, Nicholas Piggin wrote:
>> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
>> allocate huge pages and map them
>> 
>> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
>> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
>> (performance is in the noise, under 1% difference, page tables are likely
>> to be well cached for this workload). Similar numbers are seen on POWER9.
> 
> Do you happen to know which vmalloc mappings these get used for in the
> above case? Where do we see vmalloc mappings that large?

Large module vmalloc could be subject to huge mappings.

> I'm worried as to how this would interact with the set_memory_*()
> functions, as on arm64 those can only operate on page-granular mappings.
> Those may need fixing up to handle huge mappings; certainly if the above
> is all for modules.

Good point, that looks like it would break on arm64 at least. I'll
work on it. We may have to make this opt in beyond HUGE_VMAP.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
From: Mark Rutland @ 2019-06-10 14:10 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linux-mm, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20190610043838.27916-4-npiggin@gmail.com>

Hi,

On Mon, Jun 10, 2019 at 02:38:38PM +1000, Nicholas Piggin wrote:
> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
> allocate huge pages and map them
> 
> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
> (performance is in the noise, under 1% difference, page tables are likely
> to be well cached for this workload). Similar numbers are seen on POWER9.

Do you happen to know which vmalloc mappings these get used for in the
above case? Where do we see vmalloc mappings that large?

I'm worried as to how this would interact with the set_memory_*()
functions, as on arm64 those can only operate on page-granular mappings.
Those may need fixing up to handle huge mappings; certainly if the above
is all for modules.

Thanks,
Mark.

> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  include/asm-generic/4level-fixup.h |   1 +
>  include/asm-generic/5level-fixup.h |   1 +
>  include/linux/vmalloc.h            |   1 +
>  mm/vmalloc.c                       | 132 +++++++++++++++++++++++------
>  4 files changed, 107 insertions(+), 28 deletions(-)
> 
> diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
> index e3667c9a33a5..3cc65a4dd093 100644
> --- a/include/asm-generic/4level-fixup.h
> +++ b/include/asm-generic/4level-fixup.h
> @@ -20,6 +20,7 @@
>  #define pud_none(pud)			0
>  #define pud_bad(pud)			0
>  #define pud_present(pud)		1
> +#define pud_large(pud)			0
>  #define pud_ERROR(pud)			do { } while (0)
>  #define pud_clear(pud)			pgd_clear(pud)
>  #define pud_val(pud)			pgd_val(pud)
> diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
> index bb6cb347018c..c4377db09a4f 100644
> --- a/include/asm-generic/5level-fixup.h
> +++ b/include/asm-generic/5level-fixup.h
> @@ -22,6 +22,7 @@
>  #define p4d_none(p4d)			0
>  #define p4d_bad(p4d)			0
>  #define p4d_present(p4d)		1
> +#define p4d_large(p4d)			0
>  #define p4d_ERROR(p4d)			do { } while (0)
>  #define p4d_clear(p4d)			pgd_clear(p4d)
>  #define p4d_val(p4d)			pgd_val(p4d)
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 812bea5866d6..4c92dc608928 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -42,6 +42,7 @@ struct vm_struct {
>  	unsigned long		size;
>  	unsigned long		flags;
>  	struct page		**pages;
> +	unsigned int		page_shift;
>  	unsigned int		nr_pages;
>  	phys_addr_t		phys_addr;
>  	const void		*caller;
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index dd27cfb29b10..0cf8e861caeb 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -36,6 +36,7 @@
>  #include <linux/rbtree_augmented.h>
>  
>  #include <linux/uaccess.h>
> +#include <asm/pgtable.h>
>  #include <asm/tlbflush.h>
>  #include <asm/shmparam.h>
>  
> @@ -440,6 +441,41 @@ static int vmap_pages_range(unsigned long start, unsigned long end,
>  	return ret;
>  }
>  
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +static int vmap_hpages_range(unsigned long start, unsigned long end,
> +				   pgprot_t prot, struct page **pages,
> +				   unsigned int page_shift)
> +{
> +	unsigned long addr = start;
> +	unsigned int i, nr = (end - start) >> (PAGE_SHIFT + page_shift);
> +
> +	for (i = 0; i < nr; i++) {
> +		int err;
> +
> +		err = vmap_range_noflush(addr,
> +					addr + (PAGE_SIZE << page_shift),
> +					__pa(page_address(pages[i])), prot,
> +					PAGE_SHIFT + page_shift);
> +		if (err)
> +			return err;
> +
> +		addr += PAGE_SIZE << page_shift;
> +	}
> +	flush_cache_vmap(start, end);
> +
> +	return nr;
> +}
> +#else
> +static int vmap_hpages_range(unsigned long start, unsigned long end,
> +			   pgprot_t prot, struct page **pages,
> +			   unsigned int page_shift)
> +{
> +	BUG_ON(page_shift != PAGE_SIZE);
> +	return vmap_pages_range(start, end, prot, pages);
> +}
> +#endif
> +
> +
>  int is_vmalloc_or_module_addr(const void *x)
>  {
>  	/*
> @@ -462,7 +498,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  {
>  	unsigned long addr = (unsigned long) vmalloc_addr;
>  	struct page *page = NULL;
> -	pgd_t *pgd = pgd_offset_k(addr);
> +	pgd_t *pgd;
>  	p4d_t *p4d;
>  	pud_t *pud;
>  	pmd_t *pmd;
> @@ -474,27 +510,38 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  	 */
>  	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
>  
> +	pgd = pgd_offset_k(addr);
>  	if (pgd_none(*pgd))
>  		return NULL;
> +
>  	p4d = p4d_offset(pgd, addr);
>  	if (p4d_none(*p4d))
>  		return NULL;
> -	pud = pud_offset(p4d, addr);
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (p4d_large(*p4d))
> +		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(p4d_bad(*p4d)))
> +		return NULL;
>  
> -	/*
> -	 * Don't dereference bad PUD or PMD (below) entries. This will also
> -	 * identify huge mappings, which we may encounter on architectures
> -	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
> -	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
> -	 * not [unambiguously] associated with a struct page, so there is
> -	 * no correct value to return for them.
> -	 */
> -	WARN_ON_ONCE(pud_bad(*pud));
> -	if (pud_none(*pud) || pud_bad(*pud))
> +	pud = pud_offset(p4d, addr);
> +	if (pud_none(*pud))
> +		return NULL;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (pud_large(*pud))
> +		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(pud_bad(*pud)))
>  		return NULL;
> +
>  	pmd = pmd_offset(pud, addr);
> -	WARN_ON_ONCE(pmd_bad(*pmd));
> -	if (pmd_none(*pmd) || pmd_bad(*pmd))
> +	if (pmd_none(*pmd))
> +		return NULL;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (pmd_large(*pmd))
> +		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(pmd_bad(*pmd)))
>  		return NULL;
>  
>  	ptep = pte_offset_map(pmd, addr);
> @@ -502,6 +549,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  	if (pte_present(pte))
>  		page = pte_page(pte);
>  	pte_unmap(ptep);
> +
>  	return page;
>  }
>  EXPORT_SYMBOL(vmalloc_to_page);
> @@ -2185,8 +2233,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>  		return NULL;
>  
>  	if (flags & VM_IOREMAP)
> -		align = 1ul << clamp_t(int, get_count_order_long(size),
> -				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
> +		align = max(align,
> +				1ul << clamp_t(int, get_count_order_long(size),
> +				       PAGE_SHIFT, IOREMAP_MAX_ORDER));
>  
>  	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
>  	if (unlikely(!area))
> @@ -2398,7 +2447,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
>  			struct page *page = area->pages[i];
>  
>  			BUG_ON(!page);
> -			__free_pages(page, 0);
> +			__free_pages(page, area->page_shift);
>  		}
>  
>  		kvfree(area->pages);
> @@ -2541,14 +2590,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  				 pgprot_t prot, int node)
>  {
>  	struct page **pages;
> +	unsigned long addr = (unsigned long)area->addr;
> +	unsigned long size = get_vm_area_size(area);
> +	unsigned int page_shift = area->page_shift;
> +	unsigned int shift = page_shift + PAGE_SHIFT;
>  	unsigned int nr_pages, array_size, i;
>  	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
>  	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
>  	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
> -					0 :
> -					__GFP_HIGHMEM;
> +					0 : __GFP_HIGHMEM;
>  
> -	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
> +	nr_pages = size >> shift;
>  	array_size = (nr_pages * sizeof(struct page *));
>  
>  	area->nr_pages = nr_pages;
> @@ -2569,10 +2621,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  	for (i = 0; i < area->nr_pages; i++) {
>  		struct page *page;
>  
> -		if (node == NUMA_NO_NODE)
> -			page = alloc_page(alloc_mask|highmem_mask);
> -		else
> -			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
> +		page = alloc_pages_node(node,
> +				alloc_mask|highmem_mask, page_shift);
>  
>  		if (unlikely(!page)) {
>  			/* Successfully allocated i pages, free them in __vunmap() */
> @@ -2584,8 +2634,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  			cond_resched();
>  	}
>  
> -	if (map_vm_area(area, prot, pages))
> +	if (vmap_hpages_range(addr, addr + size, prot, pages, page_shift) < 0)
>  		goto fail;
> +
>  	return area->addr;
>  
>  fail:
> @@ -2619,22 +2670,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  			pgprot_t prot, unsigned long vm_flags, int node,
>  			const void *caller)
>  {
> -	struct vm_struct *area;
> +	struct vm_struct *area = NULL;
>  	void *addr;
>  	unsigned long real_size = size;
> +	unsigned long real_align = align;
> +	unsigned int shift = PAGE_SHIFT;
>  
>  	size = PAGE_ALIGN(size);
>  	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
>  		goto fail;
>  
> +	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
> +		unsigned long size_per_node;
> +
> +		size_per_node = size;
> +		if (node == NUMA_NO_NODE)
> +			size_per_node /= num_online_nodes();
> +		if (size_per_node >= PMD_SIZE)
> +			shift = PMD_SHIFT;
> +	}
> +again:
> +	align = max(real_align, 1UL << shift);
> +	size = ALIGN(real_size, align);
> +
>  	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
>  				vm_flags, start, end, node, gfp_mask, caller);
>  	if (!area)
>  		goto fail;
>  
> +	area->page_shift = shift - PAGE_SHIFT;
> +
>  	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
>  	if (!addr)
> -		return NULL;
> +		goto fail;
>  
>  	/*
>  	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
> @@ -2648,8 +2716,16 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  	return addr;
>  
>  fail:
> -	warn_alloc(gfp_mask, NULL,
> +	if (shift == PMD_SHIFT) {
> +		shift = PAGE_SHIFT;
> +		goto again;
> +	}
> +
> +	if (!area) {
> +		/* Warn for area allocation, page allocations already warn */
> +		warn_alloc(gfp_mask, NULL,
>  			  "vmalloc: allocation failure: %lu bytes", real_size);
> +	}
>  	return NULL;
>  }
>  
> -- 
> 2.20.1
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply

* Re: [PATCH v3 1/9] KVM: PPC: Ultravisor: Add PPC_UV config option
From: Claudio Carvalho @ 2019-06-10 12:18 UTC (permalink / raw)
  To: Leonardo Bras, linuxppc-dev
  Cc: Madhavan Srinivasan, Michael Anderson, Ram Pai, kvm-ppc,
	Bharata B Rao, Sukadev Bhattiprolu, Anshuman Khandual
In-Reply-To: <0f6d278e78b2784a77ce2cd07a84377da6f5262e.camel@linux.ibm.com>


On 6/7/19 5:11 PM, Leonardo Bras wrote:
>
> On Thu, 2019-06-06 at 14:36 -0300, Claudio Carvalho wrote:
>> From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
>>
>> CONFIG_PPC_UV adds support for ultravisor.
>>
>> Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
>> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
>> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
>> [Update config help and commit message]
>> Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
>> ---
>>  arch/powerpc/Kconfig | 20 ++++++++++++++++++++
>>  1 file changed, 20 insertions(+)
>>
>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>> index 8c1c636308c8..276c1857c335 100644
>> --- a/arch/powerpc/Kconfig
>> +++ b/arch/powerpc/Kconfig
>> @@ -439,6 +439,26 @@ config PPC_TRANSACTIONAL_MEM
>>         ---help---
>>           Support user-mode Transactional Memory on POWERPC.
>>
>> +config PPC_UV
>> +	bool "Ultravisor support"
>> +	depends on KVM_BOOK3S_HV_POSSIBLE
>> +	select HMM_MIRROR
>> +	select HMM
>> +	select ZONE_DEVICE
>> +	select MIGRATE_VMA_HELPER
>> +	select DEV_PAGEMAP_OPS
>> +	select DEVICE_PRIVATE
>> +	select MEMORY_HOTPLUG
>> +	select MEMORY_HOTREMOVE
>> +	default n
>> +	help
>> +	  This option paravirtualizes the kernel to run in POWER platforms that
>> +	  supports the Protected Execution Facility (PEF). In such platforms,
>> +	  the ultravisor firmware runs at a privilege level above the
>> +	  hypervisor.
>> +
>> +	  If unsure, say "N".
>> +
>>  config LD_HEAD_STUB_CATCH
>>  	bool "Reserve 256 bytes to cope with linker stubs in HEAD text" if EXPERT
>>  	depends on PPC64
> Maybe this patch should be the last of the series, as it may cause some
> bisect trouble to have this option enabled while missing some of the
> patches.

Thanks Leonardo. I changed that for the next version.

Claudio



^ permalink raw reply

* Re: Latest Git kernel: Section mismatch in reference from the variable start_here_multiplatform to the function .init.text:.early_setup()
From: Christian Zigotzky @ 2019-06-10  9:39 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: linuxppc-dev, Christian Zigotzky
In-Reply-To: <dbfbd2e0-eca8-8ecc-793b-a6f1471ce2ee@c-s.fr>

Hello Christophe,

Could you please add this patch to the GIT kernel because the issue still exists.

Thanks,
Christian

On 15. May 2019, at 12:15, Christophe Leroy <christophe.leroy@c-s.fr> wrote:

Hi,

Le 15/05/2019 à 12:09, Christian Zigotzky a écrit :
Hi All,
I got the following error messages with the latest Git kernel today:
GEN     .version
  CHK     include/generated/compile.h
  LD      vmlinux.o
  MODPOST vmlinux.o
WARNING: vmlinux.o(.text+0x302a): Section mismatch in reference from the variable start_here_multiplatform to the function .init.text:.early_setup()
The function start_here_multiplatform() references
the function __init .early_setup().
This is often because start_here_multiplatform lacks a __init
annotation or the annotation of .early_setup is wrong.
  MODINFO modules.builtin.modinfo
  KSYM    .tmp_kallsyms1.o
  KSYM    .tmp_kallsyms2.o
  LD      vmlinux
  SORTEX  vmlinux
  SYSMAP  System.map
  CHKHEAD vmlinux
What does it mean?

————————-

I proposed a patch for it at https://patchwork.ozlabs.org/patch/1097845/

Christophe

^ permalink raw reply

* [RESEND PATCH] Documentation/stackprotector: powerpc supports stack protector
From: Bhupesh Sharma @ 2019-06-10 10:03 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: arnd, Jonathan Corbet, bhsharma, linux-doc, linux-kernel, paulus,
	bhupesh.linux

powerpc architecture (both 64-bit and 32-bit) supports stack protector
mechanism since some time now [see commit 06ec27aea9fc ("powerpc/64:
add stack protector support")].

Update stackprotector arch support documentation to reflect the same.

Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Bhupesh Sharma <bhsharma@redhat.com>
---
Resend, this time Cc'ing Jonathan and doc-list.

 Documentation/features/debug/stackprotector/arch-support.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/features/debug/stackprotector/arch-support.txt b/Documentation/features/debug/stackprotector/arch-support.txt
index 9999ea521f3e..32bbdfc64c32 100644
--- a/Documentation/features/debug/stackprotector/arch-support.txt
+++ b/Documentation/features/debug/stackprotector/arch-support.txt
@@ -22,7 +22,7 @@
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
-    |     powerpc: | TODO |
+    |     powerpc: |  ok  |
     |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: |  ok  |
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
From: Anshuman Khandual @ 2019-06-10  8:53 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel
In-Reply-To: <20190610043838.27916-4-npiggin@gmail.com>

On 06/10/2019 10:08 AM, Nicholas Piggin wrote:
> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
> allocate huge pages and map them.

IIUC that extends HAVE_ARCH_HUGE_VMAP from iormap to vmalloc. 

> 
> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
> (performance is in the noise, under 1% difference, page tables are likely
> to be well cached for this workload). Similar numbers are seen on POWER9.

Sure will try this on arm64.

> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  include/asm-generic/4level-fixup.h |   1 +
>  include/asm-generic/5level-fixup.h |   1 +
>  include/linux/vmalloc.h            |   1 +
>  mm/vmalloc.c                       | 132 +++++++++++++++++++++++------
>  4 files changed, 107 insertions(+), 28 deletions(-)
> 
> diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
> index e3667c9a33a5..3cc65a4dd093 100644
> --- a/include/asm-generic/4level-fixup.h
> +++ b/include/asm-generic/4level-fixup.h
> @@ -20,6 +20,7 @@
>  #define pud_none(pud)			0
>  #define pud_bad(pud)			0
>  #define pud_present(pud)		1
> +#define pud_large(pud)			0
>  #define pud_ERROR(pud)			do { } while (0)
>  #define pud_clear(pud)			pgd_clear(pud)
>  #define pud_val(pud)			pgd_val(pud)
> diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
> index bb6cb347018c..c4377db09a4f 100644
> --- a/include/asm-generic/5level-fixup.h
> +++ b/include/asm-generic/5level-fixup.h
> @@ -22,6 +22,7 @@
>  #define p4d_none(p4d)			0
>  #define p4d_bad(p4d)			0
>  #define p4d_present(p4d)		1
> +#define p4d_large(p4d)			0
>  #define p4d_ERROR(p4d)			do { } while (0)
>  #define p4d_clear(p4d)			pgd_clear(p4d)
>  #define p4d_val(p4d)			pgd_val(p4d)

Both of these are required from vmalloc_to_page() which as per a later comment
should be part of a prerequisite patch before this series.

> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 812bea5866d6..4c92dc608928 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -42,6 +42,7 @@ struct vm_struct {
>  	unsigned long		size;
>  	unsigned long		flags;
>  	struct page		**pages;
> +	unsigned int		page_shift;

So the entire vm_struct will be mapped with a single page_shift. It cannot have
mix and match mappings with PAGE_SIZE, PMD_SIZE, PUD_SIZE etc in case the
allocation fails for larger ones, falling back etc what over other reasons.

>  	unsigned int		nr_pages;
>  	phys_addr_t		phys_addr;
>  	const void		*caller;
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index dd27cfb29b10..0cf8e861caeb 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -36,6 +36,7 @@
>  #include <linux/rbtree_augmented.h>
>  
>  #include <linux/uaccess.h>
> +#include <asm/pgtable.h>
>  #include <asm/tlbflush.h>
>  #include <asm/shmparam.h>
>  
> @@ -440,6 +441,41 @@ static int vmap_pages_range(unsigned long start, unsigned long end,
>  	return ret;
>  }
>  
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +static int vmap_hpages_range(unsigned long start, unsigned long end,

A small nit (if you agree) s/hpages/huge_pages/

> +				   pgprot_t prot, struct page **pages,

Re-order (prot <---> pages) just to follow the standard like before.

> +				   unsigned int page_shift)
> +{
> +	unsigned long addr = start;
> +	unsigned int i, nr = (end - start) >> (PAGE_SHIFT + page_shift);

s/nr/nr_huge_pages ?

Also should not we check for the alignment of the range [start...end] with
respect to (1UL << [PAGE_SHIFT + page_shift]).


> +
> +	for (i = 0; i < nr; i++) {
> +		int err;
> +
> +		err = vmap_range_noflush(addr,
> +					addr + (PAGE_SIZE << page_shift),
> +					__pa(page_address(pages[i])), prot,
> +					PAGE_SHIFT + page_shift);
> +		if (err)
> +			return err;
> +
> +		addr += PAGE_SIZE << page_shift;
> +	}
> +	flush_cache_vmap(start, end);
> +
> +	return nr;
> +}
> +#else
> +static int vmap_hpages_range(unsigned long start, unsigned long end,
> +			   pgprot_t prot, struct page **pages,
> +			   unsigned int page_shift)
> +{
> +	BUG_ON(page_shift != PAGE_SIZE);
> +	return vmap_pages_range(start, end, prot, pages);
> +}
> +#endif
> +
> +
>  int is_vmalloc_or_module_addr(const void *x)
>  {
>  	/*
> @@ -462,7 +498,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  {
>  	unsigned long addr = (unsigned long) vmalloc_addr;
>  	struct page *page = NULL;
> -	pgd_t *pgd = pgd_offset_k(addr);
> +	pgd_t *pgd;
>  	p4d_t *p4d;
>  	pud_t *pud;
>  	pmd_t *pmd;
> @@ -474,27 +510,38 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  	 */
>  	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
>  
> +	pgd = pgd_offset_k(addr);
>  	if (pgd_none(*pgd))
>  		return NULL;
> +

Small nit. Stray line here.

'pgd' related changes here seem to be just cleanups and should not part of this patch.

>  	p4d = p4d_offset(pgd, addr);
>  	if (p4d_none(*p4d))
>  		return NULL;
> -	pud = pud_offset(p4d, addr);
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (p4d_large(*p4d))
> +		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(p4d_bad(*p4d)))
> +		return NULL;
>  
> -	/*
> -	 * Don't dereference bad PUD or PMD (below) entries. This will also
> -	 * identify huge mappings, which we may encounter on architectures
> -	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
> -	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
> -	 * not [unambiguously] associated with a struct page, so there is
> -	 * no correct value to return for them.
> -	 */

What changed the situation so that we could return struct page for a huge
mapping now ? AFAICT even after this patch, PUD/P4D level huge pages can only
be created with ioremap_page_range() not with vmalloc() which creates PMD
sized mappings only. Hence if it's okay to dereference struct page of a huge
mapping (not withstanding the comment here) it should be part of an earlier
patch fixing it first for existing ioremap_page_range() huge mappings.

> -	WARN_ON_ONCE(pud_bad(*pud));
> -	if (pud_none(*pud) || pud_bad(*pud))
> +	pud = pud_offset(p4d, addr);
> +	if (pud_none(*pud))
> +		return NULL;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (pud_large(*pud))
> +		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(pud_bad(*pud)))
>  		return NULL;
> +
>  	pmd = pmd_offset(pud, addr);
> -	WARN_ON_ONCE(pmd_bad(*pmd));
> -	if (pmd_none(*pmd) || pmd_bad(*pmd))
> +	if (pmd_none(*pmd))
> +		return NULL;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (pmd_large(*pmd))
> +		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(pmd_bad(*pmd)))
>  		return NULL;

At each page table level, we are checking in this order

pXX_none() --> pXX_large() --> pXX_bad()

Are not these alternative orders bit better

pXX_bad() --> pXX_none() --> pXX_large()

Or

pXX_none() --> pXX_bad() --> pXX_large()

Checking for pXX_bad() at the end does not make much sense.

>  
>  	ptep = pte_offset_map(pmd, addr);
> @@ -502,6 +549,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  	if (pte_present(pte))
>  		page = pte_page(pte);
>  	pte_unmap(ptep);
> +

Small nit. Stray line here.

>  	return page;
>  }
>  EXPORT_SYMBOL(vmalloc_to_page);
> @@ -2185,8 +2233,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>  		return NULL;
>  
>  	if (flags & VM_IOREMAP)
> -		align = 1ul << clamp_t(int, get_count_order_long(size),
> -				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
> +		align = max(align,
> +				1ul << clamp_t(int, get_count_order_long(size),
> +				       PAGE_SHIFT, IOREMAP_MAX_ORDER));
>  
>  	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
>  	if (unlikely(!area))
> @@ -2398,7 +2447,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
>  			struct page *page = area->pages[i];
>  
>  			BUG_ON(!page);
> -			__free_pages(page, 0);
> +			__free_pages(page, area->page_shift);

area->page_shift' turns out to be effective page order. I think the name here is bit
misleading. s/page_shift/page_order or nr_pages should be better IMHO. page_shift is
not actual shift (1UL << area->shift to get size) nor does it sound like page 'order'.

>  		}
>  
>  		kvfree(area->pages);
> @@ -2541,14 +2590,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  				 pgprot_t prot, int node)
>  {
>  	struct page **pages;
> +	unsigned long addr = (unsigned long)area->addr;
> +	unsigned long size = get_vm_area_size(area);
> +	unsigned int page_shift = area->page_shift;
> +	unsigned int shift = page_shift + PAGE_SHIFT;
>  	unsigned int nr_pages, array_size, i;
>  	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
>  	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
>  	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
> -					0 :
> -					__GFP_HIGHMEM;
> +					0 : __GFP_HIGHMEM;
>  
> -	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
> +	nr_pages = size >> shift;
>  	array_size = (nr_pages * sizeof(struct page *));
>  
>  	area->nr_pages = nr_pages;
> @@ -2569,10 +2621,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  	for (i = 0; i < area->nr_pages; i++) {
>  		struct page *page;
>  
> -		if (node == NUMA_NO_NODE)
> -			page = alloc_page(alloc_mask|highmem_mask);
> -		else
> -			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
> +		page = alloc_pages_node(node,
> +				alloc_mask|highmem_mask, page_shift);

alloc_mask remains the exact same like before even for these high order pages.

>  
>  		if (unlikely(!page)) {
>  			/* Successfully allocated i pages, free them in __vunmap() */
> @@ -2584,8 +2634,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  			cond_resched();
>  	}
>  
> -	if (map_vm_area(area, prot, pages))
> +	if (vmap_hpages_range(addr, addr + size, prot, pages, page_shift) < 0)
>  		goto fail;
> +
>  	return area->addr;
>  
>  fail:
> @@ -2619,22 +2670,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  			pgprot_t prot, unsigned long vm_flags, int node,
>  			const void *caller)
>  {
> -	struct vm_struct *area;
> +	struct vm_struct *area = NULL;
>  	void *addr;
>  	unsigned long real_size = size;
> +	unsigned long real_align = align;
> +	unsigned int shift = PAGE_SHIFT;
>  
>  	size = PAGE_ALIGN(size);
>  	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
>  		goto fail;
>  
> +	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
> +		unsigned long size_per_node;
> +
> +		size_per_node = size;
> +		if (node == NUMA_NO_NODE)
> +			size_per_node /= num_online_nodes();
> +		if (size_per_node >= PMD_SIZE)
> +			shift = PMD_SHIFT;

There are two problems here.

1. Should not size_per_node be aligned with PMD_SIZE to avoid wasting memory later
   because of alignment upwards (making it worse for NUMA_NO_NODE)
2. What about PUD_SIZE which is not considered here at all
3. We should have similar knobs like ioremap controlling different size huge mappings

static int __read_mostly ioremap_p4d_capable;
static int __read_mostly ioremap_pud_capable;
static int __read_mostly ioremap_pmd_capable;
static int __read_mostly ioremap_huge_disabled;

while also giving arch a chance to weigh in through similar overrides like these.

arch_ioremap_[pud|pmd]_supported() ---> probably unifying it for vmalloc() 
 
> +	}
> +again:
> +	align = max(real_align, 1UL << shift);
> +	size = ALIGN(real_size, align);
> +
>  	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
>  				vm_flags, start, end, node, gfp_mask, caller);
>  	if (!area)
>  		goto fail;
>  
> +	area->page_shift = shift - PAGE_SHIFT;
> +
>  	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
>  	if (!addr)
> -		return NULL;
> +		goto fail;
>  
>  	/*
>  	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
> @@ -2648,8 +2716,16 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  	return addr;
>  
>  fail:
> -	warn_alloc(gfp_mask, NULL,
> +	if (shift == PMD_SHIFT) {
> +		shift = PAGE_SHIFT;
> +		goto again;
> +	}

PUD_SHIFT should be accommodated here as well while falling back to lower
mapping sizes in case previous allocation attempt fails.

^ permalink raw reply

* Re: [BISECTED REGRESSION] b43legacy broken on G4 PowerBook
From: Christoph Hellwig @ 2019-06-10  8:18 UTC (permalink / raw)
  To: Larry Finger
  Cc: Aaro Koskinen, linux-wireless, linux-kernel, Christian Zigotzky,
	linuxppc-dev, Christoph Hellwig
In-Reply-To: <30000803-3772-3edf-f4a9-55122d504f3f@lwfinger.net>

On Sat, Jun 08, 2019 at 04:52:24PM -0500, Larry Finger wrote:
> On 6/7/19 12:29 PM, Christoph Hellwig wrote:
>> I don't think we should work around this in the driver, we need to fix
>> it in the core.  I'm curious why my previous patch didn't work.  Can
>> you throw in a few printks what failed?  I.e. did dma_direct_supported
>> return false?  Did the actual allocation fail?
>
> Routine dma_direct_supported() returns true.
>
> The failure is in routine dma_set_mask() in the following if test:
>
>         if (!dev->dma_mask || !dma_supported(dev, mask))
>                 return -EIO;
>
> For b43legacy, dev->dma_mask is 0xc265684800000000.
>     dma_supported(dev, mask) is 0xc08b000000000000, mask is 0x3fffffff, and 
> the routine returns -EIO.
>
> For b43,       dev->dma_mask is 0xc265684800000001,
>     dma_supported(dev, mask) is 0xc08b000000000000, mask is 0x77777777, and 
> the routine returns 0.

I don't fully understand what values the above map to.  Can you send
me your actual debugging patch as well?

^ permalink raw reply

* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
From: Satheesh Rajendran @ 2019-06-10  8:08 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linux-mm, linuxppc-dev, linux-arm-kernel
In-Reply-To: <1560145722.obq2bpepl8.astroid@bobo.none>

On Mon, Jun 10, 2019 at 03:49:48PM +1000, Nicholas Piggin wrote:
> Nicholas Piggin's on June 10, 2019 2:38 pm:
> > +static int vmap_hpages_range(unsigned long start, unsigned long end,
> > +			   pgprot_t prot, struct page **pages,
> > +			   unsigned int page_shift)
> > +{
> > +	BUG_ON(page_shift != PAGE_SIZE);
> > +	return vmap_pages_range(start, end, prot, pages);
> > +}
> 
> That's a false positive BUG_ON for !HUGE_VMAP configs. I'll fix that
> and repost after a round of feedback.

Sure, Crash log for that false positive BUG_ON on Power8 Host.

[    0.001718] pid_max: default: 163840 minimum: 1280
[    0.010437] ------------[ cut here ]------------
[    0.010461] kernel BUG at mm/vmalloc.c:473!
[    0.010471] Oops: Exception in kernel mode, sig: 5 [#1]
[    0.010481] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
[    0.010491] Modules linked in:
[    0.010503] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.2.0-rc3-ga7ee9421d #1
[    0.010515] NIP:  c00000000034dbd8 LR: c00000000034dc80 CTR: 0000000000000000
[    0.010527] REGS: c0000000015bf9a0 TRAP: 0700   Not tainted  (5.2.0-rc3-ga7ee9421d)
[    0.010537] MSR:  9000000002029033 <SF,HV,VEC,EE,ME,IR,DR,RI,LE>  CR: 22022422  XER: 20000000
[    0.010559] CFAR: c00000000034dc88 IRQMASK: 0
[    0.010559] GPR00: c00000000034dc80 c0000000015bfc30 c0000000015c2f00 c00c000001fd0e00
[    0.010559] GPR04: 0000000000000000 0000000000002322 0000000000000000 0000000000000040
[    0.010559] GPR08: c000000ff9080000 0000000000000400 0000000000000400 0000000000000100
[    0.010559] GPR12: 0000000042022422 c0000000017a0000 00000001035ae7d8 0000000000000400
[    0.010559] GPR16: 0000000004000000 800000000000018e c000000000ee08c8 0000000000000000
[    0.010559] GPR20: 0000000000010000 0000000000002b22 0000000000000b20 0000000000000022
[    0.010559] GPR24: c0000007f92c7880 0000000000000b22 0000000000010000 c00a000000000000
[    0.010559] GPR28: c008000000000000 0000000004000000 ffffffffffffffff 0000000000000b20
[    0.010664] NIP [c00000000034dbd8] __vmalloc_node_range+0x1f8/0x410
[    0.010677] LR [c00000000034dc80] __vmalloc_node_range+0x2a0/0x410
[    0.010686] Call Trace:
[    0.010695] [c0000000015bfc30] [c00000000034dc80] __vmalloc_node_range+0x2a0/0x410 (unreliable)
[    0.010711] [c0000000015bfd30] [c00000000034de40] __vmalloc+0x50/0x60
[    0.010724] [c0000000015bfda0] [c00000000101e54c] alloc_large_system_hash+0x200/0x304
[    0.010738] [c0000000015bfe60] [c0000000010235bc] vfs_caches_init+0xd8/0x138
[    0.010752] [c0000000015bfee0] [c000000000fe428c] start_kernel+0x5c4/0x668
[    0.010767] [c0000000015bff90] [c00000000000b774] start_here_common+0x1c/0x528
[    0.010777] Instruction dump:
[    0.010785] 60000000 7c691b79 418200dc e9180020 79ea1f24 7d28512a 40920170 8138002c
[    0.010803] 394f0001 794f0020 7c095040 4181ffbc <0fe00000> 60000000 3f400001 4bfffedc
[    0.010826] ---[ end trace dd0217488686d653 ]---
[    0.010834]
[    1.010946] Kernel panic - not syncing: Attempted to kill the idle task!
[    1.011061] Rebooting in 10 seconds..

Regards,
-Satheesh.
> 
> Thanks,
> Nick
> 
> 


^ permalink raw reply

* [PATCH RESEND 2/2] tools/perf: Set 'trace_cycles' as defaultevent for perf kvm record in powerpc
From: Anju T Sudhakar @ 2019-06-10  6:45 UTC (permalink / raw)
  To: mpe, acme, jolsa
  Cc: ravi.bangoria, maddy, peterz, linux-kernel, alexander.shishkin,
	anju, namhyung, linuxppc-dev
In-Reply-To: <20190610064518.949-1-anju@linux.vnet.ibm.com>

Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record'
in powerpc.

Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index 66f8fe500945..b552884263df 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -177,8 +177,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 /*
  * Incase of powerpc architecture, pmu registers are programmable
  * by guest kernel. So monitoring guest via host may not provide
- * valid samples. It is better to fail the "perf kvm record"
- * with default "cycles" event to monitor guest in powerpc.
+ * valid samples with default 'cycles' event. It is better to use
+ * 'trace_imc/trace_cycles' event for guest profiling, since it
+ * can track the guest instruction pointer in the trace-record.
  *
  * Function to parse the arguments and return appropriate values.
  */
@@ -202,8 +203,14 @@ int kvm_add_default_arch_event(int *argc, const char **argv)
 
 	parse_options(j, tmp, event_options, NULL, 0);
 	if (!event) {
-		free(tmp);
-		return -EINVAL;
+		if (pmu_have_event("trace_imc", "trace_cycles")) {
+			argv[j++] = strdup("-e");
+			argv[j++] = strdup("trace_imc/trace_cycles/");
+			*argc += 2;
+		} else {
+			free(tmp);
+			return -EINVAL;
+		}
 	}
 
 	free(tmp);
-- 
2.17.2


^ permalink raw reply related

* [PATCH RESEND 1/2] tools/perf: Add arch neutral function to choose event for perf kvm record
From: Anju T Sudhakar @ 2019-06-10  6:45 UTC (permalink / raw)
  To: mpe, acme, jolsa
  Cc: ravi.bangoria, maddy, peterz, linux-kernel, alexander.shishkin,
	anju, namhyung, linuxppc-dev

'perf kvm record' uses 'cycles'(if the user did not specify any event) as
the default event to profile the guest.
This will not provide any proper samples from the guest incase of
powerpc architecture, since in powerpc the PMUs are controlled by
the guest rather than the host.

Patch adds a function to pick an arch specific event for 'perf kvm record',
instead of selecting 'cycles' as a default event for all architectures.

For powerpc this function checks for any user specified event, and if there
isn't any it returns invalid instead of proceeding with 'cycles' event.

Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 37 +++++++++++++++++++++++++
 tools/perf/builtin-kvm.c                | 12 +++++++-
 tools/perf/util/kvm-stat.h              |  2 +-
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index f9db341c47b6..66f8fe500945 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -8,6 +8,7 @@
 
 #include "book3s_hv_exits.h"
 #include "book3s_hcalls.h"
+#include <subcmd/parse-options.h>
 
 #define NR_TPS 4
 
@@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 
 	return ret;
 }
+
+/*
+ * Incase of powerpc architecture, pmu registers are programmable
+ * by guest kernel. So monitoring guest via host may not provide
+ * valid samples. It is better to fail the "perf kvm record"
+ * with default "cycles" event to monitor guest in powerpc.
+ *
+ * Function to parse the arguments and return appropriate values.
+ */
+int kvm_add_default_arch_event(int *argc, const char **argv)
+{
+	const char **tmp;
+	bool event = false;
+	int i, j = *argc;
+
+	const struct option event_options[] = {
+		OPT_BOOLEAN('e', "event", &event, NULL),
+		OPT_END()
+	};
+
+	tmp = calloc(j + 1, sizeof(char *));
+	if (!tmp)
+		return -EINVAL;
+
+	for (i = 0; i < j; i++)
+		tmp[i] = argv[i];
+
+	parse_options(j, tmp, event_options, NULL, 0);
+	if (!event) {
+		free(tmp);
+		return -EINVAL;
+	}
+
+	free(tmp);
+	return 0;
+}
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index dbb6f737a3e2..fe33b3ec55c9 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -1510,11 +1510,21 @@ static int kvm_cmd_stat(const char *file_name, int argc, const char **argv)
 }
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
+int __weak kvm_add_default_arch_event(int *argc __maybe_unused,
+					const char **argv __maybe_unused)
+{
+	return 0;
+}
+
 static int __cmd_record(const char *file_name, int argc, const char **argv)
 {
-	int rec_argc, i = 0, j;
+	int rec_argc, i = 0, j, ret;
 	const char **rec_argv;
 
+	ret = kvm_add_default_arch_event(&argc, argv);
+	if (ret)
+		return -EINVAL;
+
 	rec_argc = argc + 2;
 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
 	rec_argv[i++] = strdup("record");
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index 1403dec189b4..da38b56c46cb 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -144,5 +144,5 @@ extern const int decode_str_len;
 extern const char *kvm_exit_reason;
 extern const char *kvm_entry_trace;
 extern const char *kvm_exit_trace;
-
+extern int kvm_add_default_arch_event(int *argc, const char **argv);
 #endif /* __PERF_KVM_STAT_H */
-- 
2.17.2


^ permalink raw reply related

* [PATCH v2] powerpc/perf: Use cpumask_last() to determine the designated cpu for nest/core units.
From: Anju T Sudhakar @ 2019-06-10  6:32 UTC (permalink / raw)
  To: mpe; +Cc: ego, anju, linuxppc-dev, maddy

Nest and core imc(In-memory Collection counters) assigns a particular
cpu as the designated target for counter data collection.
During system boot, the first online cpu in a chip gets assigned as
the designated cpu for that chip(for nest-imc) and the first online cpu
in a core gets assigned as the designated cpu for that core(for core-imc).

If the designated cpu goes offline, the next online cpu from the same
chip(for nest-imc)/core(for core-imc) is assigned as the next target,
and the event context is migrated to the target cpu.
Currently, cpumask_any_but() function is used to find the target cpu.
Though this function is expected to return a `random` cpu, this always
returns the next online cpu.

If all cpus in a chip/core is offlined in a sequential manner, starting
from the first cpu, the event migration has to happen for all the cpus
which goes offline. Since the migration process involves a grace period,
the total time taken to offline all the cpus will be significantly high.

Example:
In a system which has 2 sockets, with
NUMA node0 CPU(s):     0-87
NUMA node8 CPU(s):     88-175

Time taken to offline cpu 88-175:
real    2m56.099s
user    0m0.191s
sys     0m0.000s

Use cpumask_last() to choose the target cpu, when the designated cpu
goes online, so the migration will happen only when the last_cpu in the
mask goes offline. This way the time taken to offline all cpus in a
chip/core can be reduced.

With the patch, 

Time taken  to offline cpu 88-175:
real    0m12.207s
user    0m0.171s
sys     0m0.000s


Offlining all cpus in reverse order is also taken care because,
cpumask_any_but() is used to find the designated cpu if the last cpu in
the mask goes offline. Since cpumask_any_but() always return the first
cpu in the mask, that becomes the designated cpu and migration will happen
only when the first_cpu in the mask goes offline.

Example:
With the patch,

Time taken to offline cpu from 175-88:
real    0m9.330s
user    0m0.110s
sys     0m0.000s

Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
---

Changes from v1:
	Modified the commit log with more info.
---

 arch/powerpc/perf/imc-pmu.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 31fa753..fbfd6e7 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -366,7 +366,14 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)
 	 */
 	nid = cpu_to_node(cpu);
 	l_cpumask = cpumask_of_node(nid);
-	target = cpumask_any_but(l_cpumask, cpu);
+	target = cpumask_last(l_cpumask);
+
+	/*
+	 * If this(target) is the last cpu in the cpumask for this chip,
+	 * check for any possible online cpu in the chip.
+	 */
+	if (unlikely(target == cpu))
+		target = cpumask_any_but(l_cpumask, cpu);
 
 	/*
 	 * Update the cpumask with the target cpu and
@@ -671,7 +678,10 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
 		return 0;
 
 	/* Find any online cpu in that core except the current "cpu" */
-	ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
+	ncpu = cpumask_last(cpu_sibling_mask(cpu));
+
+	if (unlikely(ncpu == cpu))
+		ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
 
 	if (ncpu >= 0 && ncpu < nr_cpu_ids) {
 		cpumask_set_cpu(ncpu, &core_imc_cpumask);
-- 
1.8.3.1


^ permalink raw reply related

* [Bug 203837] Booting kernel under KVM immediately freezes host
From: bugzilla-daemon @ 2019-06-10  6:30 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-203837-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=203837

--- Comment #3 from npiggin@gmail.com ---
bugzilla-daemon@bugzilla.kernel.org's on June 7, 2019 4:29 pm:
> https://bugzilla.kernel.org/show_bug.cgi?id=203837
> 
> --- Comment #2 from Paul Mackerras (paulus@ozlabs.org) ---
> Just tried 5.1.7 in the host and got the guest locking up during boot. In
> xmon
> I see one cpu in pmdp_invalidate and another in handle_mm_fault. It seems
> very
> possible this is the bug that Nick Piggin's recent patch series fixes
> ("powerpc/64s: Fix THP PMD collapse serialisation"):
> 
> http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=112348

It's worth a try, although the bug was introduced around 4.20 and
I wasn't able to trigger it on radix, but other timing changes
could cause it to trigger I suppose.

pdbg (https://github.com/open-power/pdbg) is a useful tool for your
BMC that can often get the CPU registers out even for bad crashes,
this might help to narrow down the problem without bisecting.

Thanks,
Nick

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: [Bug 203837] Booting kernel under KVM immediately freezes host
From: Nicholas Piggin @ 2019-06-10  6:27 UTC (permalink / raw)
  To: bugzilla-daemon, linuxppc-dev
In-Reply-To: <bug-203837-206035-1lnk8ZObWZ@https.bugzilla.kernel.org/>

bugzilla-daemon@bugzilla.kernel.org's on June 7, 2019 4:29 pm:
> https://bugzilla.kernel.org/show_bug.cgi?id=203837
> 
> --- Comment #2 from Paul Mackerras (paulus@ozlabs.org) ---
> Just tried 5.1.7 in the host and got the guest locking up during boot. In xmon
> I see one cpu in pmdp_invalidate and another in handle_mm_fault. It seems very
> possible this is the bug that Nick Piggin's recent patch series fixes
> ("powerpc/64s: Fix THP PMD collapse serialisation"):
> 
> http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=112348

It's worth a try, although the bug was introduced around 4.20 and
I wasn't able to trigger it on radix, but other timing changes
could cause it to trigger I suppose.

pdbg (https://github.com/open-power/pdbg) is a useful tool for your
BMC that can often get the CPU registers out even for bad crashes,
this might help to narrow down the problem without bisecting.

Thanks,
Nick


^ permalink raw reply

* Re: [PATCH 1/4] mm: Move ioremap page table mapping function to mm/
From: Nicholas Piggin @ 2019-06-10  6:21 UTC (permalink / raw)
  To: Anshuman Khandual, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel
In-Reply-To: <03de53e9-f1f9-1632-567e-b88aabc56764@arm.com>

Anshuman Khandual's on June 10, 2019 3:42 pm:
> 
> 
> On 06/10/2019 10:08 AM, Nicholas Piggin wrote:
>> ioremap_page_range is a generic function to create a kernel virtual
>> mapping, move it to mm/vmalloc.c and rename it vmap_range.
> 
> Absolutely. It belongs in mm/vmalloc.c as its a kernel virtual range.
> But what is the rationale of changing the name to vmap_range ?

Well it doesn't just map IO. It's for arbitrary kernel virtual mapping
(including ioremap). Last patch uses it to map regular cacheable
memory.

>> For clarity with this move, also:
>> - Rename vunmap_page_range (vmap_range's inverse) to vunmap_range.
> 
> Will be inverse for both vmap_range() and vmap_page[s]_range() ?

Yes.

> 
>> - Rename vmap_page_range (which takes a page array) to vmap_pages.
> 
> s/vmap_pages/vmap_pages_range instead here ................^^^^^^

Yes.

> This deviates from the subject of this patch that it is related to
> ioremap only. I believe what this patch intends is to create
> 
> - vunmap_range() takes [VA range]
> 
> 	This will be the common kernel virtual range tear down
> 	function for ranges created either with vmap_range() or
> 	vmap_pages_range(). Is that correct ?
> - vmap_range() takes [VA range, PA range, prot]
> - vmap_pages_range() takes [VA range, struct pages, prot] 

That's right although we already have all those functions, so I don't
create anything, only move and re-name. I'm happy to change the
subject if you have a preference.

> Can we re-order the arguments (pages <--> prot) for vmap_pages_range()
> just to make it sync with vmap_range() ?
> 
> static int vmap_pages_range(unsigned long start, unsigned long end,
>  			   pgprot_t prot, struct page **pages)
> 

Sure, makes sense.

Thanks,
Nick


^ permalink raw reply

* Re: [PATCH 2/4] arm64: support huge vmap vmalloc
From: Nicholas Piggin @ 2019-06-10  6:14 UTC (permalink / raw)
  To: Anshuman Khandual, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel
In-Reply-To: <c49a8fa7-c700-b45b-31b8-1d49afc42136@arm.com>

Anshuman Khandual's on June 10, 2019 3:47 pm:
> 
> 
> On 06/10/2019 10:08 AM, Nicholas Piggin wrote:
>> Applying huge vmap to vmalloc requires vmalloc_to_page to walk huge
>> pages. Define pud_large and pmd_large to support this.
>> 
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  arch/arm64/include/asm/pgtable.h | 2 ++
>>  1 file changed, 2 insertions(+)
>> 
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index 2c41b04708fe..30fe7b344bf7 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -428,6 +428,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>>  				 PMD_TYPE_TABLE)
>>  #define pmd_sect(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
>>  				 PMD_TYPE_SECT)
>> +#define pmd_large(pmd)		pmd_sect(pmd)
>>  
>>  #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
>>  #define pud_sect(pud)		(0)
>> @@ -438,6 +439,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>>  #define pud_table(pud)		((pud_val(pud) & PUD_TYPE_MASK) == \
>>  				 PUD_TYPE_TABLE)
>>  #endif
>> +#define pud_large(pud)		pud_sect(pud)
>>  
>>  extern pgd_t init_pg_dir[PTRS_PER_PGD];
>>  extern pgd_t init_pg_end[];
> 
> Another series (I guess not merged yet) is trying to add these wrappers
> on arm64 (https://patchwork.kernel.org/patch/10883887/).
> 

Okay good, I'll just cherry pick it for the series.

Thanks,
Nick


^ permalink raw reply

* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
From: Nicholas Piggin @ 2019-06-10  5:49 UTC (permalink / raw)
  To: linux-mm; +Cc: linuxppc-dev, linux-arm-kernel
In-Reply-To: <20190610043838.27916-4-npiggin@gmail.com>

Nicholas Piggin's on June 10, 2019 2:38 pm:
> +static int vmap_hpages_range(unsigned long start, unsigned long end,
> +			   pgprot_t prot, struct page **pages,
> +			   unsigned int page_shift)
> +{
> +	BUG_ON(page_shift != PAGE_SIZE);
> +	return vmap_pages_range(start, end, prot, pages);
> +}

That's a false positive BUG_ON for !HUGE_VMAP configs. I'll fix that
and repost after a round of feedback.

Thanks,
Nick


^ permalink raw reply

* Re: [PATCH 2/4] arm64: support huge vmap vmalloc
From: Anshuman Khandual @ 2019-06-10  5:47 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel
In-Reply-To: <20190610043838.27916-2-npiggin@gmail.com>



On 06/10/2019 10:08 AM, Nicholas Piggin wrote:
> Applying huge vmap to vmalloc requires vmalloc_to_page to walk huge
> pages. Define pud_large and pmd_large to support this.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/arm64/include/asm/pgtable.h | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 2c41b04708fe..30fe7b344bf7 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -428,6 +428,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>  				 PMD_TYPE_TABLE)
>  #define pmd_sect(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
>  				 PMD_TYPE_SECT)
> +#define pmd_large(pmd)		pmd_sect(pmd)
>  
>  #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
>  #define pud_sect(pud)		(0)
> @@ -438,6 +439,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>  #define pud_table(pud)		((pud_val(pud) & PUD_TYPE_MASK) == \
>  				 PUD_TYPE_TABLE)
>  #endif
> +#define pud_large(pud)		pud_sect(pud)
>  
>  extern pgd_t init_pg_dir[PTRS_PER_PGD];
>  extern pgd_t init_pg_end[];

Another series (I guess not merged yet) is trying to add these wrappers
on arm64 (https://patchwork.kernel.org/patch/10883887/).

^ permalink raw reply

* Re: [PATCH 1/4] mm: Move ioremap page table mapping function to mm/
From: Anshuman Khandual @ 2019-06-10  5:42 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel
In-Reply-To: <20190610043838.27916-1-npiggin@gmail.com>



On 06/10/2019 10:08 AM, Nicholas Piggin wrote:
> ioremap_page_range is a generic function to create a kernel virtual
> mapping, move it to mm/vmalloc.c and rename it vmap_range.

Absolutely. It belongs in mm/vmalloc.c as its a kernel virtual range.
But what is the rationale of changing the name to vmap_range ?
 
> 
> For clarity with this move, also:
> - Rename vunmap_page_range (vmap_range's inverse) to vunmap_range.

Will be inverse for both vmap_range() and vmap_page[s]_range() ?

> - Rename vmap_page_range (which takes a page array) to vmap_pages.

s/vmap_pages/vmap_pages_range instead here ................^^^^^^

This deviates from the subject of this patch that it is related to
ioremap only. I believe what this patch intends is to create

- vunmap_range() takes [VA range]

	This will be the common kernel virtual range tear down
	function for ranges created either with vmap_range() or
	vmap_pages_range(). Is that correct ?

- vmap_range() takes [VA range, PA range, prot]
- vmap_pages_range() takes [VA range, struct pages, prot] 

Can we re-order the arguments (pages <--> prot) for vmap_pages_range()
just to make it sync with vmap_range() ?

static int vmap_pages_range(unsigned long start, unsigned long end,
 			   pgprot_t prot, struct page **pages)

^ permalink raw reply

* Re: [PATCH kernel v3 0/3] powerpc/ioda2: Yet another attempt to allow DMA masks between 32 and 59
From: Alexey Kardashevskiy @ 2019-06-10  5:19 UTC (permalink / raw)
  To: Alistair Popple, Oliver
  Cc: Sam Bobroff, Shawn Anastasio, linuxppc-dev, David Gibson
In-Reply-To: <2377993.PuYYC6lyoP@townsend>



On 07/06/2019 11:41, Alistair Popple wrote:
> On Thursday, 6 June 2019 10:07:54 PM AEST Oliver wrote:
>> On Thu, Jun 6, 2019 at 5:17 PM Alistair Popple <alistair@popple.id.au> 
> wrote:
>>> I have been hitting EEH address errors testing this with some network
>>> cards which map/unmap DMA addresses more frequently. For example:
>>>
>>> PHB4 PHB#5 Diag-data (Version: 1)
>>> brdgCtl:    00000002
>>> RootSts:    00060020 00402000 a0220008 00100107 00000800
>>> PhbSts:     0000001c00000000 0000001c00000000
>>> Lem:        0000000100000080 0000000000000000 0000000000000080
>>> PhbErr:     0000028000000000 0000020000000000 2148000098000240
>>> a008400000000000 RxeTceErr:  2000000000000000 2000000000000000
>>> c000000000000000 0000000000000000 PblErr:     0000000000020000
>>> 0000000000020000 0000000000000000 0000000000000000 RegbErr:   
>>> 0000004000000000 0000004000000000 61000c4800000000 0000000000000000
>>> PE[000] A/B: 8300b03800000000 8000000000000000
>>>
>>> Interestingly the PE[000] A/B data is the same across different cards
>>> and drivers.
>>
>> TCE page fault due to permissions so odds are the DMA address was unmapped.
>>
>> What cards did you get this with? I tried with one of the common
>> BCM5719 NICs and generated network traffic by using rsync to copy a
>> linux git tree to the system and it worked fine.
> 
> Personally I've seen it with the BCM5719 with the driver modified to set a DMA 
> mask of 48 bits instead of 64 and using scp to copy a random 1GB file to the 
> system repeatedly until it crashes. 
> 
> I have also had reports of someone hitting the same error using a Mellanox 
> CX-5 adaptor with a similar driver modification.
> 
> - Alistair


Seems to be a race (I could see broken TCEs), this helps on my setup:


diff --git a/arch/powerpc/include/asm/iommu.h
b/arch/powerpc/include/asm/iommu.h
index f920697c03ef..215351e16ae8 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -113,6 +113,7 @@ struct iommu_table {
        int it_nid;
        unsigned long it_reserved_start; /* Start of not-DMA-able (MMIO)
area */
        unsigned long it_reserved_end;
+       spinlock_t lock;
 };

 #define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index c75ec37bf0cd..7045b6518243 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -29,6 +29,7 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
        tbl->it_size = tce_size >> 3;
        tbl->it_busno = 0;
        tbl->it_type = TCE_PCI;
+       spin_lock_init(&tbl->lock);
 }

 static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
@@ -65,14 +66,17 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool
user, long idx, bool alloc)

                        if (!alloc)
                                return NULL;
-
-                       tmp2 = pnv_alloc_tce_level(tbl->it_nid,
-                                       ilog2(tbl->it_level_size) + 3);
-                       if (!tmp2)
-                               return NULL;
-
-                       tmp[n] = cpu_to_be64(__pa(tmp2) |
-                                       TCE_PCI_READ | TCE_PCI_WRITE);
+                       }
+                       spin_lock(&tbl->lock);
+                       if (tmp[n] == 0) {
+                               tmp2 = pnv_alloc_tce_level(tbl->it_nid,
+
ilog2(tbl->it_level_size) + 3);
+                               if (!tmp2)
+                                       return NULL;
+                               tmp[n] = cpu_to_be64(__pa(tmp2) |
+                                               TCE_PCI_READ |
TCE_PCI_WRITE);
+                       }
+                       spin_unlock(&tbl->lock);
                }
                tce = be64_to_cpu(tmp[n]);




-- 
Alexey

^ permalink raw reply related

* Re: [RFC V3] mm: Generalize and rename notify_page_fault() as kprobe_page_fault()
From: Anshuman Khandual @ 2019-06-10  5:06 UTC (permalink / raw)
  To: Dave Hansen, Matthew Wilcox
  Cc: Mark Rutland, Michal Hocko, linux-ia64, linux-sh, Peter Zijlstra,
	Catalin Marinas, Dave Hansen, Will Deacon, linux-mips, linux-mm,
	Paul Mackerras, sparclinux, linux-s390, Yoshinori Sato, x86,
	Russell King, Ingo Molnar, James Hogan, linux-snps-arc,
	Fenghua Yu, Stephen Rothwell, Andrey Konovalov, Andy Lutomirski,
	Thomas Gleixner, linux-arm-kernel, Tony Luck, Heiko Carstens,
	Vineet Gupta, linux-kernel, Ralf Baechle, Paul Burton,
	Martin Schwidefsky, Andrew Morton, linuxppc-dev, David S. Miller
In-Reply-To: <33c6a1cd-5c07-e623-28e5-f31f6fe30394@intel.com>



On 06/10/2019 10:27 AM, Dave Hansen wrote:
> On 6/9/19 9:34 PM, Anshuman Khandual wrote:
>>> Do you really think this is easier to read?
>>>
>>> Why not just move the x86 version to include/linux/kprobes.h, and replace
>>> the int with bool?
>> Will just return bool directly without an additional variable here as suggested
>> before. But for the conditional statement, I guess the proposed one here is more
>> compact than the x86 one.
> 
> FWIW, I don't think "compact" is generally a good goal for code.  Being
> readable is 100x more important than being compact and being un-compact
> is only a problem when it hurts readability.
> 
> For a function like the one in question, having the individual return
> conditions clearly commented is way more important than saving 10 lines
> of code.

Fair enough. Will keep the existing code flow from x86.

^ permalink raw reply

* Re: [RFC V3] mm: Generalize and rename notify_page_fault() as kprobe_page_fault()
From: Dave Hansen @ 2019-06-10  4:57 UTC (permalink / raw)
  To: Anshuman Khandual, Matthew Wilcox
  Cc: Mark Rutland, Michal Hocko, linux-ia64, linux-sh, Peter Zijlstra,
	Catalin Marinas, Dave Hansen, Will Deacon, linux-mips, linux-mm,
	Paul Mackerras, sparclinux, linux-s390, Yoshinori Sato, x86,
	Russell King, Ingo Molnar, James Hogan, linux-snps-arc,
	Fenghua Yu, Stephen Rothwell, Andrey Konovalov, Andy Lutomirski,
	Thomas Gleixner, linux-arm-kernel, Tony Luck, Heiko Carstens,
	Vineet Gupta, linux-kernel, Ralf Baechle, Paul Burton,
	Martin Schwidefsky, Andrew Morton, linuxppc-dev, David S. Miller
In-Reply-To: <f1b109a3-ef4c-359c-a124-e219e84a6266@arm.com>

On 6/9/19 9:34 PM, Anshuman Khandual wrote:
>> Do you really think this is easier to read?
>>
>> Why not just move the x86 version to include/linux/kprobes.h, and replace
>> the int with bool?
> Will just return bool directly without an additional variable here as suggested
> before. But for the conditional statement, I guess the proposed one here is more
> compact than the x86 one.

FWIW, I don't think "compact" is generally a good goal for code.  Being
readable is 100x more important than being compact and being un-compact
is only a problem when it hurts readability.

For a function like the one in question, having the individual return
conditions clearly commented is way more important than saving 10 lines
of code.

^ permalink raw reply

* [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
From: Nicholas Piggin @ 2019-06-10  4:38 UTC (permalink / raw)
  To: linux-mm; +Cc: linuxppc-dev, linux-arm-kernel, Nicholas Piggin
In-Reply-To: <20190610043838.27916-1-npiggin@gmail.com>

For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
allocate huge pages and map them

This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
(performance is in the noise, under 1% difference, page tables are likely
to be well cached for this workload). Similar numbers are seen on POWER9.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 include/asm-generic/4level-fixup.h |   1 +
 include/asm-generic/5level-fixup.h |   1 +
 include/linux/vmalloc.h            |   1 +
 mm/vmalloc.c                       | 132 +++++++++++++++++++++++------
 4 files changed, 107 insertions(+), 28 deletions(-)

diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
index e3667c9a33a5..3cc65a4dd093 100644
--- a/include/asm-generic/4level-fixup.h
+++ b/include/asm-generic/4level-fixup.h
@@ -20,6 +20,7 @@
 #define pud_none(pud)			0
 #define pud_bad(pud)			0
 #define pud_present(pud)		1
+#define pud_large(pud)			0
 #define pud_ERROR(pud)			do { } while (0)
 #define pud_clear(pud)			pgd_clear(pud)
 #define pud_val(pud)			pgd_val(pud)
diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
index bb6cb347018c..c4377db09a4f 100644
--- a/include/asm-generic/5level-fixup.h
+++ b/include/asm-generic/5level-fixup.h
@@ -22,6 +22,7 @@
 #define p4d_none(p4d)			0
 #define p4d_bad(p4d)			0
 #define p4d_present(p4d)		1
+#define p4d_large(p4d)			0
 #define p4d_ERROR(p4d)			do { } while (0)
 #define p4d_clear(p4d)			pgd_clear(p4d)
 #define p4d_val(p4d)			pgd_val(p4d)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 812bea5866d6..4c92dc608928 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -42,6 +42,7 @@ struct vm_struct {
 	unsigned long		size;
 	unsigned long		flags;
 	struct page		**pages;
+	unsigned int		page_shift;
 	unsigned int		nr_pages;
 	phys_addr_t		phys_addr;
 	const void		*caller;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index dd27cfb29b10..0cf8e861caeb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -36,6 +36,7 @@
 #include <linux/rbtree_augmented.h>
 
 #include <linux/uaccess.h>
+#include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/shmparam.h>
 
@@ -440,6 +441,41 @@ static int vmap_pages_range(unsigned long start, unsigned long end,
 	return ret;
 }
 
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+static int vmap_hpages_range(unsigned long start, unsigned long end,
+				   pgprot_t prot, struct page **pages,
+				   unsigned int page_shift)
+{
+	unsigned long addr = start;
+	unsigned int i, nr = (end - start) >> (PAGE_SHIFT + page_shift);
+
+	for (i = 0; i < nr; i++) {
+		int err;
+
+		err = vmap_range_noflush(addr,
+					addr + (PAGE_SIZE << page_shift),
+					__pa(page_address(pages[i])), prot,
+					PAGE_SHIFT + page_shift);
+		if (err)
+			return err;
+
+		addr += PAGE_SIZE << page_shift;
+	}
+	flush_cache_vmap(start, end);
+
+	return nr;
+}
+#else
+static int vmap_hpages_range(unsigned long start, unsigned long end,
+			   pgprot_t prot, struct page **pages,
+			   unsigned int page_shift)
+{
+	BUG_ON(page_shift != PAGE_SIZE);
+	return vmap_pages_range(start, end, prot, pages);
+}
+#endif
+
+
 int is_vmalloc_or_module_addr(const void *x)
 {
 	/*
@@ -462,7 +498,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 {
 	unsigned long addr = (unsigned long) vmalloc_addr;
 	struct page *page = NULL;
-	pgd_t *pgd = pgd_offset_k(addr);
+	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
@@ -474,27 +510,38 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 	 */
 	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
 
+	pgd = pgd_offset_k(addr);
 	if (pgd_none(*pgd))
 		return NULL;
+
 	p4d = p4d_offset(pgd, addr);
 	if (p4d_none(*p4d))
 		return NULL;
-	pud = pud_offset(p4d, addr);
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+	if (p4d_large(*p4d))
+		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
+#endif
+	if (WARN_ON_ONCE(p4d_bad(*p4d)))
+		return NULL;
 
-	/*
-	 * Don't dereference bad PUD or PMD (below) entries. This will also
-	 * identify huge mappings, which we may encounter on architectures
-	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
-	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
-	 * not [unambiguously] associated with a struct page, so there is
-	 * no correct value to return for them.
-	 */
-	WARN_ON_ONCE(pud_bad(*pud));
-	if (pud_none(*pud) || pud_bad(*pud))
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud))
+		return NULL;
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+	if (pud_large(*pud))
+		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+#endif
+	if (WARN_ON_ONCE(pud_bad(*pud)))
 		return NULL;
+
 	pmd = pmd_offset(pud, addr);
-	WARN_ON_ONCE(pmd_bad(*pmd));
-	if (pmd_none(*pmd) || pmd_bad(*pmd))
+	if (pmd_none(*pmd))
+		return NULL;
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+	if (pmd_large(*pmd))
+		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+#endif
+	if (WARN_ON_ONCE(pmd_bad(*pmd)))
 		return NULL;
 
 	ptep = pte_offset_map(pmd, addr);
@@ -502,6 +549,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 	if (pte_present(pte))
 		page = pte_page(pte);
 	pte_unmap(ptep);
+
 	return page;
 }
 EXPORT_SYMBOL(vmalloc_to_page);
@@ -2185,8 +2233,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 		return NULL;
 
 	if (flags & VM_IOREMAP)
-		align = 1ul << clamp_t(int, get_count_order_long(size),
-				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
+		align = max(align,
+				1ul << clamp_t(int, get_count_order_long(size),
+				       PAGE_SHIFT, IOREMAP_MAX_ORDER));
 
 	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
 	if (unlikely(!area))
@@ -2398,7 +2447,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
 			struct page *page = area->pages[i];
 
 			BUG_ON(!page);
-			__free_pages(page, 0);
+			__free_pages(page, area->page_shift);
 		}
 
 		kvfree(area->pages);
@@ -2541,14 +2590,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 				 pgprot_t prot, int node)
 {
 	struct page **pages;
+	unsigned long addr = (unsigned long)area->addr;
+	unsigned long size = get_vm_area_size(area);
+	unsigned int page_shift = area->page_shift;
+	unsigned int shift = page_shift + PAGE_SHIFT;
 	unsigned int nr_pages, array_size, i;
 	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
 	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
 	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
-					0 :
-					__GFP_HIGHMEM;
+					0 : __GFP_HIGHMEM;
 
-	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+	nr_pages = size >> shift;
 	array_size = (nr_pages * sizeof(struct page *));
 
 	area->nr_pages = nr_pages;
@@ -2569,10 +2621,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	for (i = 0; i < area->nr_pages; i++) {
 		struct page *page;
 
-		if (node == NUMA_NO_NODE)
-			page = alloc_page(alloc_mask|highmem_mask);
-		else
-			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
+		page = alloc_pages_node(node,
+				alloc_mask|highmem_mask, page_shift);
 
 		if (unlikely(!page)) {
 			/* Successfully allocated i pages, free them in __vunmap() */
@@ -2584,8 +2634,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 			cond_resched();
 	}
 
-	if (map_vm_area(area, prot, pages))
+	if (vmap_hpages_range(addr, addr + size, prot, pages, page_shift) < 0)
 		goto fail;
+
 	return area->addr;
 
 fail:
@@ -2619,22 +2670,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			pgprot_t prot, unsigned long vm_flags, int node,
 			const void *caller)
 {
-	struct vm_struct *area;
+	struct vm_struct *area = NULL;
 	void *addr;
 	unsigned long real_size = size;
+	unsigned long real_align = align;
+	unsigned int shift = PAGE_SHIFT;
 
 	size = PAGE_ALIGN(size);
 	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
 		goto fail;
 
+	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
+		unsigned long size_per_node;
+
+		size_per_node = size;
+		if (node == NUMA_NO_NODE)
+			size_per_node /= num_online_nodes();
+		if (size_per_node >= PMD_SIZE)
+			shift = PMD_SHIFT;
+	}
+again:
+	align = max(real_align, 1UL << shift);
+	size = ALIGN(real_size, align);
+
 	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
 				vm_flags, start, end, node, gfp_mask, caller);
 	if (!area)
 		goto fail;
 
+	area->page_shift = shift - PAGE_SHIFT;
+
 	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
 	if (!addr)
-		return NULL;
+		goto fail;
 
 	/*
 	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -2648,8 +2716,16 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	return addr;
 
 fail:
-	warn_alloc(gfp_mask, NULL,
+	if (shift == PMD_SHIFT) {
+		shift = PAGE_SHIFT;
+		goto again;
+	}
+
+	if (!area) {
+		/* Warn for area allocation, page allocations already warn */
+		warn_alloc(gfp_mask, NULL,
 			  "vmalloc: allocation failure: %lu bytes", real_size);
+	}
 	return NULL;
 }
 
-- 
2.20.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox