LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH v4 3/7] KVM: PPC: Remove redundant kvm_run from vcpu_arch
From: Tianjia Zhang @ 2020-05-27  5:23 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: wanpengli, kvm, david, heiko.carstens, peterx, linux-mips, hpa,
	kvmarm, linux-s390, frankja, chenhuacai, maz, joro, x86,
	borntraeger, mingo, julien.thierry.kdev, thuth, gor,
	suzuki.poulose, kvm-ppc, bp, tglx, linux-arm-kernel, jmattson,
	tsbogend, cohuck, christoffer.dall, sean.j.christopherson,
	linux-kernel, james.morse, pbonzini, vkuznets, linuxppc-dev
In-Reply-To: <20200527042055.GG293451@thinks.paulus.ozlabs.org>



On 2020/5/27 12:20, Paul Mackerras wrote:
> On Mon, Apr 27, 2020 at 12:35:10PM +0800, Tianjia Zhang wrote:
>> The 'kvm_run' field already exists in the 'vcpu' structure, which
>> is the same structure as the 'kvm_run' in the 'vcpu_arch' and
>> should be deleted.
>>
>> Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
> 
> Thanks, patches 3 and 4 of this series applied to my kvm-ppc-next branch.
> 
> Paul.
> 

Thanks for your suggestion, for 5/7, I will submit a new version patch.

Thanks,
Tianjia

^ permalink raw reply

* Re: [PATCH v3 1/3] riscv: Move kernel mapping to vmalloc zone
From: Zong Li @ 2020-05-27  6:05 UTC (permalink / raw)
  To: Alex Ghiti
  Cc: Albert Ou, Anup Patel, linux-kernel@vger.kernel.org List,
	Atish Patra, Paul Mackerras, Paul Walmsley, Palmer Dabbelt,
	linux-riscv, linuxppc-dev
In-Reply-To: <6d6b09bc-32e4-4969-7020-12f9f02e557e@ghiti.fr>

On Wed, May 27, 2020 at 1:06 AM Alex Ghiti <alex@ghiti.fr> wrote:
>
> Hi Zong,
>
> Le 5/26/20 à 5:43 AM, Zong Li a écrit :
> > On Sun, May 24, 2020 at 4:54 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
> >> This is a preparatory patch for relocatable kernel.
> >>
> >> The kernel used to be linked at PAGE_OFFSET address and used to be loaded
> >> physically at the beginning of the main memory. Therefore, we could use
> >> the linear mapping for the kernel mapping.
> >>
> >> But the relocated kernel base address will be different from PAGE_OFFSET
> >> and since in the linear mapping, two different virtual addresses cannot
> >> point to the same physical address, the kernel mapping needs to lie outside
> >> the linear mapping.
> >>
> >> In addition, because modules and BPF must be close to the kernel (inside
> >> +-2GB window), the kernel is placed at the end of the vmalloc zone minus
> >> 2GB, which leaves room for modules and BPF. The kernel could not be
> >> placed at the beginning of the vmalloc zone since other vmalloc
> >> allocations from the kernel could get all the +-2GB window around the
> >> kernel which would prevent new modules and BPF programs to be loaded.
> >>
> >> Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
> >> ---
> >>   arch/riscv/boot/loader.lds.S     |  3 +-
> >>   arch/riscv/include/asm/page.h    | 10 +++++-
> >>   arch/riscv/include/asm/pgtable.h | 37 +++++++++++++-------
> >>   arch/riscv/kernel/head.S         |  3 +-
> >>   arch/riscv/kernel/module.c       |  4 +--
> >>   arch/riscv/kernel/vmlinux.lds.S  |  3 +-
> >>   arch/riscv/mm/init.c             | 58 +++++++++++++++++++++++++-------
> >>   arch/riscv/mm/physaddr.c         |  2 +-
> >>   8 files changed, 87 insertions(+), 33 deletions(-)
> >>
> >> diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S
> >> index 47a5003c2e28..62d94696a19c 100644
> >> --- a/arch/riscv/boot/loader.lds.S
> >> +++ b/arch/riscv/boot/loader.lds.S
> >> @@ -1,13 +1,14 @@
> >>   /* SPDX-License-Identifier: GPL-2.0 */
> >>
> >>   #include <asm/page.h>
> >> +#include <asm/pgtable.h>
> >>
> >>   OUTPUT_ARCH(riscv)
> >>   ENTRY(_start)
> >>
> >>   SECTIONS
> >>   {
> >> -       . = PAGE_OFFSET;
> >> +       . = KERNEL_LINK_ADDR;
> >>
> >>          .payload : {
> >>                  *(.payload)
> >> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> >> index 2d50f76efe48..48bb09b6a9b7 100644
> >> --- a/arch/riscv/include/asm/page.h
> >> +++ b/arch/riscv/include/asm/page.h
> >> @@ -90,18 +90,26 @@ typedef struct page *pgtable_t;
> >>
> >>   #ifdef CONFIG_MMU
> >>   extern unsigned long va_pa_offset;
> >> +extern unsigned long va_kernel_pa_offset;
> >>   extern unsigned long pfn_base;
> >>   #define ARCH_PFN_OFFSET                (pfn_base)
> >>   #else
> >>   #define va_pa_offset           0
> >> +#define va_kernel_pa_offset    0
> >>   #define ARCH_PFN_OFFSET                (PAGE_OFFSET >> PAGE_SHIFT)
> >>   #endif /* CONFIG_MMU */
> >>
> >>   extern unsigned long max_low_pfn;
> >>   extern unsigned long min_low_pfn;
> >> +extern unsigned long kernel_virt_addr;
> >>
> >>   #define __pa_to_va_nodebug(x)  ((void *)((unsigned long) (x) + va_pa_offset))
> >> -#define __va_to_pa_nodebug(x)  ((unsigned long)(x) - va_pa_offset)
> >> +#define linear_mapping_va_to_pa(x)     ((unsigned long)(x) - va_pa_offset)
> >> +#define kernel_mapping_va_to_pa(x)     \
> >> +       ((unsigned long)(x) - va_kernel_pa_offset)
> >> +#define __va_to_pa_nodebug(x)          \
> >> +       (((x) >= PAGE_OFFSET) ?         \
> >> +               linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x))
> >>
> >>   #ifdef CONFIG_DEBUG_VIRTUAL
> >>   extern phys_addr_t __virt_to_phys(unsigned long x);
> >> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> >> index 35b60035b6b0..25213cfaf680 100644
> >> --- a/arch/riscv/include/asm/pgtable.h
> >> +++ b/arch/riscv/include/asm/pgtable.h
> >> @@ -11,23 +11,29 @@
> >>
> >>   #include <asm/pgtable-bits.h>
> >>
> >> -#ifndef __ASSEMBLY__
> >> -
> >> -/* Page Upper Directory not used in RISC-V */
> >> -#include <asm-generic/pgtable-nopud.h>
> >> -#include <asm/page.h>
> >> -#include <asm/tlbflush.h>
> >> -#include <linux/mm_types.h>
> >> -
> >> -#ifdef CONFIG_MMU
> >> +#ifndef CONFIG_MMU
> >> +#define KERNEL_VIRT_ADDR       PAGE_OFFSET
> >> +#define KERNEL_LINK_ADDR       PAGE_OFFSET
> >> +#else
> >> +/*
> >> + * Leave 2GB for modules and BPF that must lie within a 2GB range around
> >> + * the kernel.
> >> + */
> >> +#define KERNEL_VIRT_ADDR       (VMALLOC_END - SZ_2G + 1)
> >> +#define KERNEL_LINK_ADDR       KERNEL_VIRT_ADDR
> >>
> >>   #define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
> >>   #define VMALLOC_END      (PAGE_OFFSET - 1)
> >>   #define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
> >>
> >>   #define BPF_JIT_REGION_SIZE    (SZ_128M)
> >> -#define BPF_JIT_REGION_START   (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> >> -#define BPF_JIT_REGION_END     (VMALLOC_END)
> >> +#define BPF_JIT_REGION_START   (kernel_virt_addr)
> >> +#define BPF_JIT_REGION_END     (kernel_virt_addr + BPF_JIT_REGION_SIZE)
> > It seems to have a potential risk here, the region of bpf is
> > overlapping with kernel mapping, so if kernel size is bigger than
> > 128MB, bpf region would be occupied and run out by kernel mapping.

Is there the risk as I mentioned?

> >
> >> +
> >> +#ifdef CONFIG_64BIT
> >> +#define VMALLOC_MODULE_START   BPF_JIT_REGION_END
> >> +#define VMALLOC_MODULE_END     VMALLOC_END
> >> +#endif
> >>
> > Although kernel_virt_addr is a fixed address now, I think it could be
> > changed for the purpose of relocatable or KASLR, so if
> > kernel_virt_addr is moved to far from VMALLOC_END than 2G, the region
> > of module would be too big.
>
>
> Yes you're right, that's wrong to allow modules to lie outside
> the 2G window, thanks for noticing.
>
>
> > In addition, the region of module could be
> > +-2G around the kernel, so we don't be limited in one direction as
> > before. It seems to me that the region of the module could be decided
> > at runtime, for example, VMALLOC_MODULE_START is "&_end - 2G" and
> > VMLLOC_MODULE_END is "&_start + 2G".
>
>
> I had tried that, but as we need to make sure BPF region is different
> from the module's
> that makes the macro definitions really cumbersome. I'll give a try
> again anyway. And
> I tried to use _end and _start here but it failed, I have to debug this.
>
>
> >   I'm not sure whether the size of
> > region of bpf has to be 128MB for some particular reason, if not,
> > maybe the region of bpf could be the same with module to avoid being
> > run out by module.
>
>
> On the contrary, BPF region must not be the same as module's since in
> that case,
> modules could take all the space and make BPF fail.

ok, I got it. Thanks for the explaining.


>
>
> Thanks for your review Zong,
>
>
> Alex
>
>
> >
> >>   /*
> >>    * Roughly size the vmemmap space to be large enough to fit enough
> >> @@ -57,9 +63,16 @@
> >>   #define FIXADDR_SIZE     PGDIR_SIZE
> >>   #endif
> >>   #define FIXADDR_START    (FIXADDR_TOP - FIXADDR_SIZE)
> >> -
> >>   #endif
> >>
> >> +#ifndef __ASSEMBLY__
> >> +
> >> +/* Page Upper Directory not used in RISC-V */
> >> +#include <asm-generic/pgtable-nopud.h>
> >> +#include <asm/page.h>
> >> +#include <asm/tlbflush.h>
> >> +#include <linux/mm_types.h>
> >> +
> >>   #ifdef CONFIG_64BIT
> >>   #include <asm/pgtable-64.h>
> >>   #else
> >> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> >> index 98a406474e7d..8f5bb7731327 100644
> >> --- a/arch/riscv/kernel/head.S
> >> +++ b/arch/riscv/kernel/head.S
> >> @@ -49,7 +49,8 @@ ENTRY(_start)
> >>   #ifdef CONFIG_MMU
> >>   relocate:
> >>          /* Relocate return address */
> >> -       li a1, PAGE_OFFSET
> >> +       la a1, kernel_virt_addr
> >> +       REG_L a1, 0(a1)
> >>          la a2, _start
> >>          sub a1, a1, a2
> >>          add ra, ra, a1
> >> diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
> >> index 8bbe5dbe1341..1a8fbe05accf 100644
> >> --- a/arch/riscv/kernel/module.c
> >> +++ b/arch/riscv/kernel/module.c
> >> @@ -392,12 +392,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
> >>   }
> >>
> >>   #if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
> >> -#define VMALLOC_MODULE_START \
> >> -        max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
> >>   void *module_alloc(unsigned long size)
> >>   {
> >>          return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
> >> -                                   VMALLOC_END, GFP_KERNEL,
> >> +                                   VMALLOC_MODULE_END, GFP_KERNEL,
> >>                                      PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> >>                                      __builtin_return_address(0));
> >>   }
> >> diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
> >> index 0339b6bbe11a..a9abde62909f 100644
> >> --- a/arch/riscv/kernel/vmlinux.lds.S
> >> +++ b/arch/riscv/kernel/vmlinux.lds.S
> >> @@ -4,7 +4,8 @@
> >>    * Copyright (C) 2017 SiFive
> >>    */
> >>
> >> -#define LOAD_OFFSET PAGE_OFFSET
> >> +#include <asm/pgtable.h>
> >> +#define LOAD_OFFSET KERNEL_LINK_ADDR
> >>   #include <asm/vmlinux.lds.h>
> >>   #include <asm/page.h>
> >>   #include <asm/cache.h>
> >> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> >> index 27a334106708..17f108baec4f 100644
> >> --- a/arch/riscv/mm/init.c
> >> +++ b/arch/riscv/mm/init.c
> >> @@ -22,6 +22,9 @@
> >>
> >>   #include "../kernel/head.h"
> >>
> >> +unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR;
> >> +EXPORT_SYMBOL(kernel_virt_addr);
> >> +
> >>   unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
> >>                                                          __page_aligned_bss;
> >>   EXPORT_SYMBOL(empty_zero_page);
> >> @@ -178,8 +181,12 @@ void __init setup_bootmem(void)
> >>   }
> >>
> >>   #ifdef CONFIG_MMU
> >> +/* Offset between linear mapping virtual address and kernel load address */
> >>   unsigned long va_pa_offset;
> >>   EXPORT_SYMBOL(va_pa_offset);
> >> +/* Offset between kernel mapping virtual address and kernel load address */
> >> +unsigned long va_kernel_pa_offset;
> >> +EXPORT_SYMBOL(va_kernel_pa_offset);
> >>   unsigned long pfn_base;
> >>   EXPORT_SYMBOL(pfn_base);
> >>
> >> @@ -271,7 +278,7 @@ static phys_addr_t __init alloc_pmd(uintptr_t va)
> >>          if (mmu_enabled)
> >>                  return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
> >>
> >> -       pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT;
> >> +       pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT;
> >>          BUG_ON(pmd_num >= NUM_EARLY_PMDS);
> >>          return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD];
> >>   }
> >> @@ -372,14 +379,30 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
> >>   #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing."
> >>   #endif
> >>
> >> +static uintptr_t load_pa, load_sz;
> >> +
> >> +void create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size)
> >> +{
> >> +       uintptr_t va, end_va;
> >> +
> >> +       end_va = kernel_virt_addr + load_sz;
> >> +       for (va = kernel_virt_addr; va < end_va; va += map_size)
> >> +               create_pgd_mapping(pgdir, va,
> >> +                                  load_pa + (va - kernel_virt_addr),
> >> +                                  map_size, PAGE_KERNEL_EXEC);
> >> +}
> >> +
> >>   asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> >>   {
> >>          uintptr_t va, end_va;
> >> -       uintptr_t load_pa = (uintptr_t)(&_start);
> >> -       uintptr_t load_sz = (uintptr_t)(&_end) - load_pa;
> >>          uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE);
> >>
> >> +       load_pa = (uintptr_t)(&_start);
> >> +       load_sz = (uintptr_t)(&_end) - load_pa;
> >> +
> >>          va_pa_offset = PAGE_OFFSET - load_pa;
> >> +       va_kernel_pa_offset = kernel_virt_addr - load_pa;
> >> +
> >>          pfn_base = PFN_DOWN(load_pa);
> >>
> >>          /*
> >> @@ -402,26 +425,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> >>          create_pmd_mapping(fixmap_pmd, FIXADDR_START,
> >>                             (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
> >>          /* Setup trampoline PGD and PMD */
> >> -       create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
> >> +       create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
> >>                             (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
> >> -       create_pmd_mapping(trampoline_pmd, PAGE_OFFSET,
> >> +       create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
> >>                             load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
> >>   #else
> >>          /* Setup trampoline PGD */
> >> -       create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
> >> +       create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
> >>                             load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC);
> >>   #endif
> >>
> >>          /*
> >> -        * Setup early PGD covering entire kernel which will allows
> >> +        * Setup early PGD covering entire kernel which will allow
> >>           * us to reach paging_init(). We map all memory banks later
> >>           * in setup_vm_final() below.
> >>           */
> >> -       end_va = PAGE_OFFSET + load_sz;
> >> -       for (va = PAGE_OFFSET; va < end_va; va += map_size)
> >> -               create_pgd_mapping(early_pg_dir, va,
> >> -                                  load_pa + (va - PAGE_OFFSET),
> >> -                                  map_size, PAGE_KERNEL_EXEC);
> >> +       create_kernel_page_table(early_pg_dir, map_size);
> >>
> >>          /* Create fixed mapping for early FDT parsing */
> >>          end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE;
> >> @@ -441,6 +460,7 @@ static void __init setup_vm_final(void)
> >>          uintptr_t va, map_size;
> >>          phys_addr_t pa, start, end;
> >>          struct memblock_region *reg;
> >> +       static struct vm_struct vm_kernel = { 0 };
> >>
> >>          /* Set mmu_enabled flag */
> >>          mmu_enabled = true;
> >> @@ -467,10 +487,22 @@ static void __init setup_vm_final(void)
> >>                  for (pa = start; pa < end; pa += map_size) {
> >>                          va = (uintptr_t)__va(pa);
> >>                          create_pgd_mapping(swapper_pg_dir, va, pa,
> >> -                                          map_size, PAGE_KERNEL_EXEC);
> >> +                                          map_size, PAGE_KERNEL);
> >>                  }
> >>          }
> >>
> >> +       /* Map the kernel */
> >> +       create_kernel_page_table(swapper_pg_dir, PMD_SIZE);
> >> +
> >> +       /* Reserve the vmalloc area occupied by the kernel */
> >> +       vm_kernel.addr = (void *)kernel_virt_addr;
> >> +       vm_kernel.phys_addr = load_pa;
> >> +       vm_kernel.size = (load_sz + PMD_SIZE) & ~(PMD_SIZE - 1);
> >> +       vm_kernel.flags = VM_MAP | VM_NO_GUARD;
> >> +       vm_kernel.caller = __builtin_return_address(0);
> >> +
> >> +       vm_area_add_early(&vm_kernel);
> >> +
> >>          /* Clear fixmap PTE and PMD mappings */
> >>          clear_fixmap(FIX_PTE);
> >>          clear_fixmap(FIX_PMD);
> >> diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c
> >> index e8e4dcd39fed..35703d5ef5fd 100644
> >> --- a/arch/riscv/mm/physaddr.c
> >> +++ b/arch/riscv/mm/physaddr.c
> >> @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys);
> >>
> >>   phys_addr_t __phys_addr_symbol(unsigned long x)
> >>   {
> >> -       unsigned long kernel_start = (unsigned long)PAGE_OFFSET;
> >> +       unsigned long kernel_start = (unsigned long)kernel_virt_addr;
> >>          unsigned long kernel_end = (unsigned long)_end;
> >>
> >>          /*
> >> --
> >> 2.20.1
> >>

^ permalink raw reply

* Re: [PATCH v4 6/7] KVM: MIPS: clean up redundant 'kvm_run' parameters
From: Tianjia Zhang @ 2020-05-27  6:24 UTC (permalink / raw)
  To: Huacai Chen
  Cc: wanpengli, kvm, david, heiko.carstens, Peter Xu, open list:MIPS,
	hpa, kvmarm, linux-s390, frankja, Marc Zyngier, joro, x86,
	borntraeger, mingo, julien.thierry.kdev, thuth, gor,
	suzuki.poulose, kvm-ppc, Borislav Petkov, Thomas Gleixner,
	linux-arm-kernel, jmattson, Thomas Bogendoerfer, cohuck,
	christoffer.dall, sean.j.christopherson, LKML, james.morse,
	Paolo Bonzini, vkuznets, linuxppc-dev
In-Reply-To: <CAAhV-H7kpKUfQoWid6GSNL5+4hTTroGyL83EaW6yZwS2+Ti9kA@mail.gmail.com>



On 2020/4/27 13:40, Huacai Chen wrote:
> Reviewed-by: Huacai Chen <chenhc@lemote.com>
> 
> On Mon, Apr 27, 2020 at 12:35 PM Tianjia Zhang
> <tianjia.zhang@linux.alibaba.com> wrote:
>>
>> In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu'
>> structure. For historical reasons, many kvm-related function parameters
>> retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This
>> patch does a unified cleanup of these remaining redundant parameters.
>>
>> Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
>> ---
>>   arch/mips/include/asm/kvm_host.h |  28 +-------
>>   arch/mips/kvm/emulate.c          |  59 ++++++----------
>>   arch/mips/kvm/mips.c             |  11 ++-
>>   arch/mips/kvm/trap_emul.c        | 114 ++++++++++++++-----------------
>>   arch/mips/kvm/vz.c               |  26 +++----
>>   5 files changed, 87 insertions(+), 151 deletions(-)
>>

Hi Huacai,

These two patches(6/7 and 7/7) should be merged into the tree of the 
mips architecture separately. At present, there seems to be no good way 
to merge the whole architecture patchs.

For this series of patches, some architectures have been merged, some 
need to update the patch.

Thanks and best,
Tianjia

^ permalink raw reply

* Re: [PATCH] media: omap3isp: Shuffle cacheflush.h and include mm.h
From: Geert Uytterhoeven @ 2020-05-27  7:02 UTC (permalink / raw)
  To: Nathan Chancellor
  Cc: linux-ia64@vger.kernel.org, Linux-sh list, Roman Zippel,
	Linux Kernel Mailing List, Linux MM, Laurent Pinchart, sparclinux,
	linux-riscv, Christoph Hellwig, Linux-Arch, linux-c6x-dev,
	open list:QUALCOMM HEXAGON..., the arch/x86 maintainers,
	open list:TENSILICA XTENSA PORT (xtensa), Arnd Bergmann, alpha,
	linux-um, linux-m68k, Openrisc, Linux ARM, Michal Simek,
	open list:BROADCOM NVRAM DRIVER, Jessica Yu, Linux FS Devel,
	Andrew Morton, linuxppc-dev
In-Reply-To: <20200527043426.3242439-1-natechancellor@gmail.com>

Hi Nathan,

CC Laurent

On Wed, May 27, 2020 at 6:37 AM Nathan Chancellor
<natechancellor@gmail.com> wrote:
> After mm.h was removed from the asm-generic version of cacheflush.h,
> s390 allyesconfig shows several warnings of the following nature:
>
> In file included from ./arch/s390/include/generated/asm/cacheflush.h:1,
>                  from drivers/media/platform/omap3isp/isp.c:42:
> ./include/asm-generic/cacheflush.h:16:42: warning: 'struct mm_struct'
> declared inside parameter list will not be visible outside of this
> definition or declaration
>
> cacheflush.h does not include mm.h nor does it include any forward
> declaration of these structures hence the warning. To avoid this,
> include mm.h explicitly in this file and shuffle cacheflush.h below it.
>
> Fixes: 19c0054597a0 ("asm-generic: don't include <linux/mm.h> in cacheflush.h")
> Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>

Thanks for your patch!

> I am aware the fixes tag is kind of irrelevant because that SHA will
> change in the next linux-next revision and this will probably get folded
> into the original patch anyways but still.
>
> The other solution would be to add forward declarations of these structs
> to the top of cacheflush.h, I just chose to do what Christoph did in the
> original patch. I am happy to do that instead if you all feel that is
> better.

That actually looks like a better solution to me, as it would address the
problem for all users.

>  drivers/media/platform/omap3isp/isp.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/media/platform/omap3isp/isp.c b/drivers/media/platform/omap3isp/isp.c
> index a4ee6b86663e..54106a768e54 100644
> --- a/drivers/media/platform/omap3isp/isp.c
> +++ b/drivers/media/platform/omap3isp/isp.c
> @@ -39,8 +39,6 @@
>   *     Troy Laramy <t-laramy@ti.com>
>   */
>
> -#include <asm/cacheflush.h>
> -
>  #include <linux/clk.h>
>  #include <linux/clkdev.h>
>  #include <linux/delay.h>
> @@ -49,6 +47,7 @@
>  #include <linux/i2c.h>
>  #include <linux/interrupt.h>
>  #include <linux/mfd/syscon.h>
> +#include <linux/mm.h>
>  #include <linux/module.h>
>  #include <linux/omap-iommu.h>
>  #include <linux/platform_device.h>
> @@ -58,6 +57,8 @@
>  #include <linux/sched.h>
>  #include <linux/vmalloc.h>
>
> +#include <asm/cacheflush.h>
> +
>  #ifdef CONFIG_ARM_DMA_USE_IOMMU
>  #include <asm/dma-iommu.h>
>  #endif

Why does this file need <asm/cacheflush.h> at all?
It doesn't call any of the flush_*() functions, and seems to compile fine
without (on arm32).

Perhaps it was included at the top intentionally, to override the definitions
of copy_{to,from}_user_page()? Fortunately that doesn't seem to be the
case, from a quick look at the assembler output.

So let's just remove the #include instead?

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH v3 1/3] riscv: Move kernel mapping to vmalloc zone
From: Alex Ghiti @ 2020-05-27  7:29 UTC (permalink / raw)
  To: Zong Li
  Cc: Albert Ou, Anup Patel, linux-kernel@vger.kernel.org List,
	Atish Patra, Paul Mackerras, Paul Walmsley, Palmer Dabbelt,
	linux-riscv, linuxppc-dev
In-Reply-To: <CANXhq0qBOjFNvRB4QL-AoWzr5dU6Pz=tbf2qJMQxJjR99NHU3A@mail.gmail.com>

Le 5/27/20 à 2:05 AM, Zong Li a écrit :
> On Wed, May 27, 2020 at 1:06 AM Alex Ghiti <alex@ghiti.fr> wrote:
>> Hi Zong,
>>
>> Le 5/26/20 à 5:43 AM, Zong Li a écrit :
>>> On Sun, May 24, 2020 at 4:54 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
>>>> This is a preparatory patch for relocatable kernel.
>>>>
>>>> The kernel used to be linked at PAGE_OFFSET address and used to be loaded
>>>> physically at the beginning of the main memory. Therefore, we could use
>>>> the linear mapping for the kernel mapping.
>>>>
>>>> But the relocated kernel base address will be different from PAGE_OFFSET
>>>> and since in the linear mapping, two different virtual addresses cannot
>>>> point to the same physical address, the kernel mapping needs to lie outside
>>>> the linear mapping.
>>>>
>>>> In addition, because modules and BPF must be close to the kernel (inside
>>>> +-2GB window), the kernel is placed at the end of the vmalloc zone minus
>>>> 2GB, which leaves room for modules and BPF. The kernel could not be
>>>> placed at the beginning of the vmalloc zone since other vmalloc
>>>> allocations from the kernel could get all the +-2GB window around the
>>>> kernel which would prevent new modules and BPF programs to be loaded.
>>>>
>>>> Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
>>>> ---
>>>>    arch/riscv/boot/loader.lds.S     |  3 +-
>>>>    arch/riscv/include/asm/page.h    | 10 +++++-
>>>>    arch/riscv/include/asm/pgtable.h | 37 +++++++++++++-------
>>>>    arch/riscv/kernel/head.S         |  3 +-
>>>>    arch/riscv/kernel/module.c       |  4 +--
>>>>    arch/riscv/kernel/vmlinux.lds.S  |  3 +-
>>>>    arch/riscv/mm/init.c             | 58 +++++++++++++++++++++++++-------
>>>>    arch/riscv/mm/physaddr.c         |  2 +-
>>>>    8 files changed, 87 insertions(+), 33 deletions(-)
>>>>
>>>> diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S
>>>> index 47a5003c2e28..62d94696a19c 100644
>>>> --- a/arch/riscv/boot/loader.lds.S
>>>> +++ b/arch/riscv/boot/loader.lds.S
>>>> @@ -1,13 +1,14 @@
>>>>    /* SPDX-License-Identifier: GPL-2.0 */
>>>>
>>>>    #include <asm/page.h>
>>>> +#include <asm/pgtable.h>
>>>>
>>>>    OUTPUT_ARCH(riscv)
>>>>    ENTRY(_start)
>>>>
>>>>    SECTIONS
>>>>    {
>>>> -       . = PAGE_OFFSET;
>>>> +       . = KERNEL_LINK_ADDR;
>>>>
>>>>           .payload : {
>>>>                   *(.payload)
>>>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
>>>> index 2d50f76efe48..48bb09b6a9b7 100644
>>>> --- a/arch/riscv/include/asm/page.h
>>>> +++ b/arch/riscv/include/asm/page.h
>>>> @@ -90,18 +90,26 @@ typedef struct page *pgtable_t;
>>>>
>>>>    #ifdef CONFIG_MMU
>>>>    extern unsigned long va_pa_offset;
>>>> +extern unsigned long va_kernel_pa_offset;
>>>>    extern unsigned long pfn_base;
>>>>    #define ARCH_PFN_OFFSET                (pfn_base)
>>>>    #else
>>>>    #define va_pa_offset           0
>>>> +#define va_kernel_pa_offset    0
>>>>    #define ARCH_PFN_OFFSET                (PAGE_OFFSET >> PAGE_SHIFT)
>>>>    #endif /* CONFIG_MMU */
>>>>
>>>>    extern unsigned long max_low_pfn;
>>>>    extern unsigned long min_low_pfn;
>>>> +extern unsigned long kernel_virt_addr;
>>>>
>>>>    #define __pa_to_va_nodebug(x)  ((void *)((unsigned long) (x) + va_pa_offset))
>>>> -#define __va_to_pa_nodebug(x)  ((unsigned long)(x) - va_pa_offset)
>>>> +#define linear_mapping_va_to_pa(x)     ((unsigned long)(x) - va_pa_offset)
>>>> +#define kernel_mapping_va_to_pa(x)     \
>>>> +       ((unsigned long)(x) - va_kernel_pa_offset)
>>>> +#define __va_to_pa_nodebug(x)          \
>>>> +       (((x) >= PAGE_OFFSET) ?         \
>>>> +               linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x))
>>>>
>>>>    #ifdef CONFIG_DEBUG_VIRTUAL
>>>>    extern phys_addr_t __virt_to_phys(unsigned long x);
>>>> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
>>>> index 35b60035b6b0..25213cfaf680 100644
>>>> --- a/arch/riscv/include/asm/pgtable.h
>>>> +++ b/arch/riscv/include/asm/pgtable.h
>>>> @@ -11,23 +11,29 @@
>>>>
>>>>    #include <asm/pgtable-bits.h>
>>>>
>>>> -#ifndef __ASSEMBLY__
>>>> -
>>>> -/* Page Upper Directory not used in RISC-V */
>>>> -#include <asm-generic/pgtable-nopud.h>
>>>> -#include <asm/page.h>
>>>> -#include <asm/tlbflush.h>
>>>> -#include <linux/mm_types.h>
>>>> -
>>>> -#ifdef CONFIG_MMU
>>>> +#ifndef CONFIG_MMU
>>>> +#define KERNEL_VIRT_ADDR       PAGE_OFFSET
>>>> +#define KERNEL_LINK_ADDR       PAGE_OFFSET
>>>> +#else
>>>> +/*
>>>> + * Leave 2GB for modules and BPF that must lie within a 2GB range around
>>>> + * the kernel.
>>>> + */
>>>> +#define KERNEL_VIRT_ADDR       (VMALLOC_END - SZ_2G + 1)
>>>> +#define KERNEL_LINK_ADDR       KERNEL_VIRT_ADDR
>>>>
>>>>    #define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
>>>>    #define VMALLOC_END      (PAGE_OFFSET - 1)
>>>>    #define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
>>>>
>>>>    #define BPF_JIT_REGION_SIZE    (SZ_128M)
>>>> -#define BPF_JIT_REGION_START   (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
>>>> -#define BPF_JIT_REGION_END     (VMALLOC_END)
>>>> +#define BPF_JIT_REGION_START   (kernel_virt_addr)
>>>> +#define BPF_JIT_REGION_END     (kernel_virt_addr + BPF_JIT_REGION_SIZE)
>>> It seems to have a potential risk here, the region of bpf is
>>> overlapping with kernel mapping, so if kernel size is bigger than
>>> 128MB, bpf region would be occupied and run out by kernel mapping.
> Is there the risk as I mentioned?


Sorry I forgot to answer this one: I was confident that 128MB was large 
enough for kernel
and BPF. But I see no reason to leave this risk so I'll change 
kernel_virt_addr for _end so
BPF will have its 128MB reserved.

Thanks !

Alex


>
>>>> +
>>>> +#ifdef CONFIG_64BIT
>>>> +#define VMALLOC_MODULE_START   BPF_JIT_REGION_END
>>>> +#define VMALLOC_MODULE_END     VMALLOC_END
>>>> +#endif
>>>>
>>> Although kernel_virt_addr is a fixed address now, I think it could be
>>> changed for the purpose of relocatable or KASLR, so if
>>> kernel_virt_addr is moved to far from VMALLOC_END than 2G, the region
>>> of module would be too big.
>>
>> Yes you're right, that's wrong to allow modules to lie outside
>> the 2G window, thanks for noticing.
>>
>>
>>> In addition, the region of module could be
>>> +-2G around the kernel, so we don't be limited in one direction as
>>> before. It seems to me that the region of the module could be decided
>>> at runtime, for example, VMALLOC_MODULE_START is "&_end - 2G" and
>>> VMLLOC_MODULE_END is "&_start + 2G".
>>
>> I had tried that, but as we need to make sure BPF region is different
>> from the module's
>> that makes the macro definitions really cumbersome. I'll give a try
>> again anyway. And
>> I tried to use _end and _start here but it failed, I have to debug this.
>>
>>
>>>    I'm not sure whether the size of
>>> region of bpf has to be 128MB for some particular reason, if not,
>>> maybe the region of bpf could be the same with module to avoid being
>>> run out by module.
>>
>> On the contrary, BPF region must not be the same as module's since in
>> that case,
>> modules could take all the space and make BPF fail.
> ok, I got it. Thanks for the explaining.
>
>
>>
>> Thanks for your review Zong,
>>
>>
>> Alex
>>
>>
>>>>    /*
>>>>     * Roughly size the vmemmap space to be large enough to fit enough
>>>> @@ -57,9 +63,16 @@
>>>>    #define FIXADDR_SIZE     PGDIR_SIZE
>>>>    #endif
>>>>    #define FIXADDR_START    (FIXADDR_TOP - FIXADDR_SIZE)
>>>> -
>>>>    #endif
>>>>
>>>> +#ifndef __ASSEMBLY__
>>>> +
>>>> +/* Page Upper Directory not used in RISC-V */
>>>> +#include <asm-generic/pgtable-nopud.h>
>>>> +#include <asm/page.h>
>>>> +#include <asm/tlbflush.h>
>>>> +#include <linux/mm_types.h>
>>>> +
>>>>    #ifdef CONFIG_64BIT
>>>>    #include <asm/pgtable-64.h>
>>>>    #else
>>>> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
>>>> index 98a406474e7d..8f5bb7731327 100644
>>>> --- a/arch/riscv/kernel/head.S
>>>> +++ b/arch/riscv/kernel/head.S
>>>> @@ -49,7 +49,8 @@ ENTRY(_start)
>>>>    #ifdef CONFIG_MMU
>>>>    relocate:
>>>>           /* Relocate return address */
>>>> -       li a1, PAGE_OFFSET
>>>> +       la a1, kernel_virt_addr
>>>> +       REG_L a1, 0(a1)
>>>>           la a2, _start
>>>>           sub a1, a1, a2
>>>>           add ra, ra, a1
>>>> diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
>>>> index 8bbe5dbe1341..1a8fbe05accf 100644
>>>> --- a/arch/riscv/kernel/module.c
>>>> +++ b/arch/riscv/kernel/module.c
>>>> @@ -392,12 +392,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
>>>>    }
>>>>
>>>>    #if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
>>>> -#define VMALLOC_MODULE_START \
>>>> -        max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
>>>>    void *module_alloc(unsigned long size)
>>>>    {
>>>>           return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
>>>> -                                   VMALLOC_END, GFP_KERNEL,
>>>> +                                   VMALLOC_MODULE_END, GFP_KERNEL,
>>>>                                       PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
>>>>                                       __builtin_return_address(0));
>>>>    }
>>>> diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
>>>> index 0339b6bbe11a..a9abde62909f 100644
>>>> --- a/arch/riscv/kernel/vmlinux.lds.S
>>>> +++ b/arch/riscv/kernel/vmlinux.lds.S
>>>> @@ -4,7 +4,8 @@
>>>>     * Copyright (C) 2017 SiFive
>>>>     */
>>>>
>>>> -#define LOAD_OFFSET PAGE_OFFSET
>>>> +#include <asm/pgtable.h>
>>>> +#define LOAD_OFFSET KERNEL_LINK_ADDR
>>>>    #include <asm/vmlinux.lds.h>
>>>>    #include <asm/page.h>
>>>>    #include <asm/cache.h>
>>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>>>> index 27a334106708..17f108baec4f 100644
>>>> --- a/arch/riscv/mm/init.c
>>>> +++ b/arch/riscv/mm/init.c
>>>> @@ -22,6 +22,9 @@
>>>>
>>>>    #include "../kernel/head.h"
>>>>
>>>> +unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR;
>>>> +EXPORT_SYMBOL(kernel_virt_addr);
>>>> +
>>>>    unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
>>>>                                                           __page_aligned_bss;
>>>>    EXPORT_SYMBOL(empty_zero_page);
>>>> @@ -178,8 +181,12 @@ void __init setup_bootmem(void)
>>>>    }
>>>>
>>>>    #ifdef CONFIG_MMU
>>>> +/* Offset between linear mapping virtual address and kernel load address */
>>>>    unsigned long va_pa_offset;
>>>>    EXPORT_SYMBOL(va_pa_offset);
>>>> +/* Offset between kernel mapping virtual address and kernel load address */
>>>> +unsigned long va_kernel_pa_offset;
>>>> +EXPORT_SYMBOL(va_kernel_pa_offset);
>>>>    unsigned long pfn_base;
>>>>    EXPORT_SYMBOL(pfn_base);
>>>>
>>>> @@ -271,7 +278,7 @@ static phys_addr_t __init alloc_pmd(uintptr_t va)
>>>>           if (mmu_enabled)
>>>>                   return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
>>>>
>>>> -       pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT;
>>>> +       pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT;
>>>>           BUG_ON(pmd_num >= NUM_EARLY_PMDS);
>>>>           return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD];
>>>>    }
>>>> @@ -372,14 +379,30 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
>>>>    #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing."
>>>>    #endif
>>>>
>>>> +static uintptr_t load_pa, load_sz;
>>>> +
>>>> +void create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size)
>>>> +{
>>>> +       uintptr_t va, end_va;
>>>> +
>>>> +       end_va = kernel_virt_addr + load_sz;
>>>> +       for (va = kernel_virt_addr; va < end_va; va += map_size)
>>>> +               create_pgd_mapping(pgdir, va,
>>>> +                                  load_pa + (va - kernel_virt_addr),
>>>> +                                  map_size, PAGE_KERNEL_EXEC);
>>>> +}
>>>> +
>>>>    asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>>>>    {
>>>>           uintptr_t va, end_va;
>>>> -       uintptr_t load_pa = (uintptr_t)(&_start);
>>>> -       uintptr_t load_sz = (uintptr_t)(&_end) - load_pa;
>>>>           uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE);
>>>>
>>>> +       load_pa = (uintptr_t)(&_start);
>>>> +       load_sz = (uintptr_t)(&_end) - load_pa;
>>>> +
>>>>           va_pa_offset = PAGE_OFFSET - load_pa;
>>>> +       va_kernel_pa_offset = kernel_virt_addr - load_pa;
>>>> +
>>>>           pfn_base = PFN_DOWN(load_pa);
>>>>
>>>>           /*
>>>> @@ -402,26 +425,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>>>>           create_pmd_mapping(fixmap_pmd, FIXADDR_START,
>>>>                              (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
>>>>           /* Setup trampoline PGD and PMD */
>>>> -       create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
>>>> +       create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
>>>>                              (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
>>>> -       create_pmd_mapping(trampoline_pmd, PAGE_OFFSET,
>>>> +       create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
>>>>                              load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
>>>>    #else
>>>>           /* Setup trampoline PGD */
>>>> -       create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
>>>> +       create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
>>>>                              load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC);
>>>>    #endif
>>>>
>>>>           /*
>>>> -        * Setup early PGD covering entire kernel which will allows
>>>> +        * Setup early PGD covering entire kernel which will allow
>>>>            * us to reach paging_init(). We map all memory banks later
>>>>            * in setup_vm_final() below.
>>>>            */
>>>> -       end_va = PAGE_OFFSET + load_sz;
>>>> -       for (va = PAGE_OFFSET; va < end_va; va += map_size)
>>>> -               create_pgd_mapping(early_pg_dir, va,
>>>> -                                  load_pa + (va - PAGE_OFFSET),
>>>> -                                  map_size, PAGE_KERNEL_EXEC);
>>>> +       create_kernel_page_table(early_pg_dir, map_size);
>>>>
>>>>           /* Create fixed mapping for early FDT parsing */
>>>>           end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE;
>>>> @@ -441,6 +460,7 @@ static void __init setup_vm_final(void)
>>>>           uintptr_t va, map_size;
>>>>           phys_addr_t pa, start, end;
>>>>           struct memblock_region *reg;
>>>> +       static struct vm_struct vm_kernel = { 0 };
>>>>
>>>>           /* Set mmu_enabled flag */
>>>>           mmu_enabled = true;
>>>> @@ -467,10 +487,22 @@ static void __init setup_vm_final(void)
>>>>                   for (pa = start; pa < end; pa += map_size) {
>>>>                           va = (uintptr_t)__va(pa);
>>>>                           create_pgd_mapping(swapper_pg_dir, va, pa,
>>>> -                                          map_size, PAGE_KERNEL_EXEC);
>>>> +                                          map_size, PAGE_KERNEL);
>>>>                   }
>>>>           }
>>>>
>>>> +       /* Map the kernel */
>>>> +       create_kernel_page_table(swapper_pg_dir, PMD_SIZE);
>>>> +
>>>> +       /* Reserve the vmalloc area occupied by the kernel */
>>>> +       vm_kernel.addr = (void *)kernel_virt_addr;
>>>> +       vm_kernel.phys_addr = load_pa;
>>>> +       vm_kernel.size = (load_sz + PMD_SIZE) & ~(PMD_SIZE - 1);
>>>> +       vm_kernel.flags = VM_MAP | VM_NO_GUARD;
>>>> +       vm_kernel.caller = __builtin_return_address(0);
>>>> +
>>>> +       vm_area_add_early(&vm_kernel);
>>>> +
>>>>           /* Clear fixmap PTE and PMD mappings */
>>>>           clear_fixmap(FIX_PTE);
>>>>           clear_fixmap(FIX_PMD);
>>>> diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c
>>>> index e8e4dcd39fed..35703d5ef5fd 100644
>>>> --- a/arch/riscv/mm/physaddr.c
>>>> +++ b/arch/riscv/mm/physaddr.c
>>>> @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys);
>>>>
>>>>    phys_addr_t __phys_addr_symbol(unsigned long x)
>>>>    {
>>>> -       unsigned long kernel_start = (unsigned long)PAGE_OFFSET;
>>>> +       unsigned long kernel_start = (unsigned long)kernel_virt_addr;
>>>>           unsigned long kernel_end = (unsigned long)_end;
>>>>
>>>>           /*
>>>> --
>>>> 2.20.1
>>>>

^ permalink raw reply

* Re: [PATCH v3 1/3] riscv: Move kernel mapping to vmalloc zone
From: kbuild test robot @ 2020-05-27  7:33 UTC (permalink / raw)
  To: Alexandre Ghiti, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Paul Walmsley, Palmer Dabbelt, Albert Ou,
	Anup Patel, Atish Patra, Zong Li, linux-kernel, linuxppc-dev,
	linux-riscv
  Cc: kbuild-all, Alexandre Ghiti
In-Reply-To: <20200524085259.24784-2-alex@ghiti.fr>

[-- Attachment #1: Type: text/plain, Size: 1976 bytes --]

Hi Alexandre,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on powerpc/next]
[also build test WARNING on linus/master v5.7-rc7 next-20200526]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Alexandre-Ghiti/vmalloc-kernel-mapping-and-relocatable-kernel/20200524-170109
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: riscv-allyesconfig (attached as .config)
compiler: riscv64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=riscv 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot <lkp@intel.com>

All warnings (new ones prefixed by >>, old ones prefixed by <<):

>> arch/riscv/mm/init.c:383:6: warning: no previous prototype for 'create_kernel_page_table' [-Wmissing-prototypes]
383 | void create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size)
|      ^~~~~~~~~~~~~~~~~~~~~~~~

vim +/create_kernel_page_table +383 arch/riscv/mm/init.c

   382	
 > 383	void create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size)
   384	{
   385		uintptr_t va, end_va;
   386	
   387		end_va = kernel_virt_addr + load_sz;
   388		for (va = kernel_virt_addr; va < end_va; va += map_size)
   389			create_pgd_mapping(pgdir, va,
   390					   load_pa + (va - kernel_virt_addr),
   391					   map_size, PAGE_KERNEL_EXEC);
   392	}
   393	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 63405 bytes --]

^ permalink raw reply

* Re: [PATCH] media: omap3isp: Shuffle cacheflush.h and include mm.h
From: Nathan Chancellor @ 2020-05-27  8:13 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: linux-ia64@vger.kernel.org, Linux-sh list, Roman Zippel,
	Linux Kernel Mailing List, Linux MM, Laurent Pinchart, sparclinux,
	linux-riscv, Christoph Hellwig, Linux-Arch, linux-c6x-dev,
	open list:QUALCOMM HEXAGON..., the arch/x86 maintainers,
	open list:TENSILICA XTENSA PORT (xtensa), Arnd Bergmann, alpha,
	linux-um, linux-m68k, Openrisc, Linux ARM, Michal Simek,
	open list:BROADCOM NVRAM DRIVER, Jessica Yu, Linux FS Devel,
	Andrew Morton, linuxppc-dev
In-Reply-To: <CAMuHMdVSduTOi5bUgF9sLQdGADwyL1+qALWsKgin1TeOLGhAKQ@mail.gmail.com>

Hi Geert,

On Wed, May 27, 2020 at 09:02:51AM +0200, Geert Uytterhoeven wrote:
> Hi Nathan,
> 
> CC Laurent
> 
> On Wed, May 27, 2020 at 6:37 AM Nathan Chancellor
> <natechancellor@gmail.com> wrote:
> > After mm.h was removed from the asm-generic version of cacheflush.h,
> > s390 allyesconfig shows several warnings of the following nature:
> >
> > In file included from ./arch/s390/include/generated/asm/cacheflush.h:1,
> >                  from drivers/media/platform/omap3isp/isp.c:42:
> > ./include/asm-generic/cacheflush.h:16:42: warning: 'struct mm_struct'
> > declared inside parameter list will not be visible outside of this
> > definition or declaration
> >
> > cacheflush.h does not include mm.h nor does it include any forward
> > declaration of these structures hence the warning. To avoid this,
> > include mm.h explicitly in this file and shuffle cacheflush.h below it.
> >
> > Fixes: 19c0054597a0 ("asm-generic: don't include <linux/mm.h> in cacheflush.h")
> > Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
> 
> Thanks for your patch!
> 
> > I am aware the fixes tag is kind of irrelevant because that SHA will
> > change in the next linux-next revision and this will probably get folded
> > into the original patch anyways but still.
> >
> > The other solution would be to add forward declarations of these structs
> > to the top of cacheflush.h, I just chose to do what Christoph did in the
> > original patch. I am happy to do that instead if you all feel that is
> > better.
> 
> That actually looks like a better solution to me, as it would address the
> problem for all users.
> 
> >  drivers/media/platform/omap3isp/isp.c | 5 +++--
> >  1 file changed, 3 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/media/platform/omap3isp/isp.c b/drivers/media/platform/omap3isp/isp.c
> > index a4ee6b86663e..54106a768e54 100644
> > --- a/drivers/media/platform/omap3isp/isp.c
> > +++ b/drivers/media/platform/omap3isp/isp.c
> > @@ -39,8 +39,6 @@
> >   *     Troy Laramy <t-laramy@ti.com>
> >   */
> >
> > -#include <asm/cacheflush.h>
> > -
> >  #include <linux/clk.h>
> >  #include <linux/clkdev.h>
> >  #include <linux/delay.h>
> > @@ -49,6 +47,7 @@
> >  #include <linux/i2c.h>
> >  #include <linux/interrupt.h>
> >  #include <linux/mfd/syscon.h>
> > +#include <linux/mm.h>
> >  #include <linux/module.h>
> >  #include <linux/omap-iommu.h>
> >  #include <linux/platform_device.h>
> > @@ -58,6 +57,8 @@
> >  #include <linux/sched.h>
> >  #include <linux/vmalloc.h>
> >
> > +#include <asm/cacheflush.h>
> > +
> >  #ifdef CONFIG_ARM_DMA_USE_IOMMU
> >  #include <asm/dma-iommu.h>
> >  #endif
> 
> Why does this file need <asm/cacheflush.h> at all?
> It doesn't call any of the flush_*() functions, and seems to compile fine
> without (on arm32).
> 
> Perhaps it was included at the top intentionally, to override the definitions
> of copy_{to,from}_user_page()? Fortunately that doesn't seem to be the
> case, from a quick look at the assembler output.
> 
> So let's just remove the #include instead?

Sounds good to me. I can send a patch if needed or I suppose Andrew can
just make a small fixup patch for it. Let me know what I should do.

Cheers,
Nathan

^ permalink raw reply

* Re: [PATCH v2] KVM: PPC: Book3S HV: relax check on H_SVM_INIT_ABORT
From: Laurent Dufour @ 2020-05-27  9:23 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: linuxram, linux-kernel, kvm-ppc, groug, sukadev, linuxppc-dev
In-Reply-To: <20200527041649.GD293451@thinks.paulus.ozlabs.org>

Le 27/05/2020 à 06:16, Paul Mackerras a écrit :
> On Wed, May 20, 2020 at 07:43:08PM +0200, Laurent Dufour wrote:
>> The commit 8c47b6ff29e3 ("KVM: PPC: Book3S HV: Check caller of H_SVM_*
>> Hcalls") added checks of secure bit of SRR1 to filter out the Hcall
>> reserved to the Ultravisor.
>>
>> However, the Hcall H_SVM_INIT_ABORT is made by the Ultravisor passing the
>> context of the VM calling UV_ESM. This allows the Hypervisor to return to
>> the guest without going through the Ultravisor. Thus the Secure bit of SRR1
>> is not set in that particular case.
>>
>> In the case a regular VM is calling H_SVM_INIT_ABORT, this hcall will be
>> filtered out in kvmppc_h_svm_init_abort() because kvm->arch.secure_guest is
>> not set in that case.
>>
>> Fixes: 8c47b6ff29e3 ("KVM: PPC: Book3S HV: Check caller of H_SVM_* Hcalls")
>> Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
> 
> Thanks, applied to my kvm-ppc-next branch.  I expanded the comment in
> the code a little.

Thanks, the comment is more explicit now.

Laurent.

^ permalink raw reply

* [PATCH] powerpc/fadump: account for memory_limit while reserving memory
From: Hari Bathini @ 2020-05-27  9:44 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev
  Cc: stable, Mahesh J Salgaonkar, Vasant Hegde, Sourabh Jain,
	kbuild test robot

If the memory chunk found for reserving memory overshoots the memory
limit imposed, do not proceed with reserving memory. Default behavior
was this until commit 140777a3d8df ("powerpc/fadump: consider reserved
ranges while reserving memory") changed it unwittingly.

Reported-by: kbuild test robot <lkp@intel.com>
Fixes: 140777a3d8df ("powerpc/fadump: consider reserved ranges while reserving memory")
Cc: stable@vger.kernel.org
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
---

For reference:
- https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-May/211136.html


 arch/powerpc/kernel/fadump.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 63aac8b..78ab9a6 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -603,7 +603,7 @@ int __init fadump_reserve_mem(void)
 		 */
 		base = fadump_locate_reserve_mem(base, size);
 
-		if (!base) {
+		if (!base || (base + size > mem_boundary)) {
 			pr_err("Failed to find memory chunk for reservation!\n");
 			goto error_out;
 		}


^ permalink raw reply related

* Re: [PATCH v8 3/5] powerpc/papr_scm: Fetch nvdimm health information from PHYP
From: Aneesh Kumar K.V @ 2020-05-27  9:54 UTC (permalink / raw)
  To: Vaibhav Jain, linuxppc-dev, linux-nvdimm, linux-kernel
  Cc: Vaibhav Jain, Steven Rostedt
In-Reply-To: <20200527041244.37821-4-vaibhav@linux.ibm.com>

Vaibhav Jain <vaibhav@linux.ibm.com> writes:

> Implement support for fetching nvdimm health information via
> H_SCM_HEALTH hcall as documented in Ref[1]. The hcall returns a pair
> of 64-bit bitmap, bitwise-and of which is then stored in
> 'struct papr_scm_priv' and subsequently partially exposed to
> user-space via newly introduced dimm specific attribute
> 'papr/flags'. Since the hcall is costly, the health information is
> cached and only re-queried, 60s after the previous successful hcall.
>
> The patch also adds a  documentation text describing flags reported by
> the the new sysfs attribute 'papr/flags' is also introduced at
> Documentation/ABI/testing/sysfs-bus-papr-scm.
>
> [1] commit 58b278f568f0 ("powerpc: Provide initial documentation for
> PAPR hcalls")
>

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

> Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>

^ permalink raw reply

* Re: [PATCH v8 4/5] ndctl/papr_scm,uapi: Add support for PAPR nvdimm specific methods
From: Aneesh Kumar K.V @ 2020-05-27  9:55 UTC (permalink / raw)
  To: Vaibhav Jain, linuxppc-dev, linux-nvdimm, linux-kernel
  Cc: Vaibhav Jain, Steven Rostedt
In-Reply-To: <20200527041244.37821-5-vaibhav@linux.ibm.com>

Vaibhav Jain <vaibhav@linux.ibm.com> writes:

> Introduce support for PAPR NVDIMM Specific Methods (PDSM) in papr_scm
> module and add the command family to the white list of NVDIMM command
> sets. Also advertise support for ND_CMD_CALL for the nvdimm
> command mask and implement necessary scaffolding in the module to
> handle ND_CMD_CALL ioctl and PDSM requests that we receive.
>
> The layout of the PDSM request as we expect from libnvdimm/libndctl is
> described in newly introduced uapi header 'papr_scm_pdsm.h' which
> defines a new 'struct nd_pdsm_cmd_pkg' header. This header is used
> to communicate the PDSM request via member
> 'nd_pkg_papr_scm->nd_command' and size of payload that need to be
> sent/received for servicing the PDSM.
>
> A new function is_cmd_valid() is implemented that reads the args to
> papr_scm_ndctl() and performs sanity tests on them. A new function
> papr_scm_service_pdsm() is introduced and is called from
> papr_scm_ndctl() in case of a PDSM request is received via ND_CMD_CALL
> command from libnvdimm.
>

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

> Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
> ---

-aneesh

^ permalink raw reply

* Re: [PATCH v8 5/5] powerpc/papr_scm: Implement support for PAPR_SCM_PDSM_HEALTH
From: Aneesh Kumar K.V @ 2020-05-27  9:55 UTC (permalink / raw)
  To: Vaibhav Jain, linuxppc-dev, linux-nvdimm, linux-kernel
  Cc: Vaibhav Jain, Steven Rostedt
In-Reply-To: <20200527041244.37821-6-vaibhav@linux.ibm.com>

Vaibhav Jain <vaibhav@linux.ibm.com> writes:

> This patch implements support for PDSM request 'PAPR_SCM_PDSM_HEALTH'
> that returns a newly introduced 'struct nd_papr_pdsm_health' instance
> containing dimm health information back to user space in response to
> ND_CMD_CALL. This functionality is implemented in newly introduced
> papr_scm_get_health() that queries the scm-dimm health information and
> then copies this information to the package payload whose layout is
> defined by 'struct nd_papr_pdsm_health'.
>
> The patch also introduces a new member 'struct papr_scm_priv.health'
> thats an instance of 'struct nd_papr_pdsm_health' to cache the health
> information of a nvdimm. As a result functions drc_pmem_query_health()
> and flags_show() are updated to populate and use this new struct
> instead of a u64 integer that was earlier used.
>

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

> Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>

-aneesh

^ permalink raw reply

* Re: [PATCH 2/3] powerpc/pci: unmap legacy INTx interrupts of passthrough IO adapters
From: Cédric Le Goater @ 2020-05-27  7:31 UTC (permalink / raw)
  To: Oliver O'Halloran; +Cc: linuxppc-dev
In-Reply-To: <CAOSf1CHZ4+vEHotKzPDu2czVDBBM_oerxcCRS5QOFxsMbSknKQ@mail.gmail.com>

On 5/27/20 2:57 AM, Oliver O'Halloran wrote:
> On Wed, Apr 29, 2020 at 5:51 PM Cédric Le Goater <clg@kaod.org> wrote:
>>
>> When a passthrough IO adapter is removed from a pseries machine using
>> hash MMU and the XIVE interrupt mode, the POWER hypervisor, pHyp,
>> expects the guest OS to have cleared all page table entries related to
>> the adapter. If some are still present, the RTAS call which isolates
>> the PCI slot returns error 9001 "valid outstanding translations" and
>> the removal of the IO adapter fails.
>>
>> INTx interrupt numbers need special care because Linux maps the
>> interrupts automatically in the Linux interrupt number space if they
>> are presented in the device tree node describing the IO adapter. These
>> interrupts are not un-mapped automatically and in case of an hot-plug
>> adapter, the PCI hot-plug layer needs to handle the cleanup to make
>> sure that all the page table entries of the XIVE ESB pages are
>> cleared.
>>
>> Cc: "Oliver O'Halloran" <oohall@gmail.com>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  arch/powerpc/kernel/pci-hotplug.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
>> index bf83f76563a3..9e9c6befd7ea 100644
>> --- a/arch/powerpc/kernel/pci-hotplug.c
>> +++ b/arch/powerpc/kernel/pci-hotplug.c
>> @@ -57,6 +57,8 @@ void pcibios_release_device(struct pci_dev *dev)
>>         struct pci_controller *phb = pci_bus_to_host(dev->bus);
>>         struct pci_dn *pdn = pci_get_pdn(dev);
>>
>> +       irq_dispose_mapping(dev->irq);
> 
> What does the original mapping? Powerpc arch code or the PCI core?

Powerpc. In pci_read_irq_line() when a device is added.

> Tearing down the mapping in pcibios_release_device() seems a bit fishy
> to me since the PCI core has already torn down the device state at
> that point. If the release is delayed it's possible that another
> pci_dev has mapped the IRQ before we get here, but maybe that's ok.

Which scenario would that be ? multiple devices mapping the same INTx
interrupt because all are used already ? 

Where should we drop the mapping ?

Thanks,

C.

>> +
>>         eeh_remove_device(dev);
>>
>>         if (phb->controller_ops.release_device)
>> --
>> 2.25.4
>>


^ permalink raw reply

* Re: [PATCH 2/3] powerpc/pci: unmap legacy INTx interrupts of passthrough IO adapters
From: Cédric Le Goater @ 2020-05-27 11:05 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200429075122.1216388-3-clg@kaod.org>

Hello Michael,

On 4/29/20 9:51 AM, Cédric Le Goater wrote:
> When a passthrough IO adapter is removed from a pseries machine using
> hash MMU and the XIVE interrupt mode, the POWER hypervisor, pHyp,
> expects the guest OS to have cleared all page table entries related to
> the adapter. If some are still present, the RTAS call which isolates
> the PCI slot returns error 9001 "valid outstanding translations" and
> the removal of the IO adapter fails.
> 
> INTx interrupt numbers need special care because Linux maps the
> interrupts automatically in the Linux interrupt number space if they
> are presented in the device tree node describing the IO adapter. These
> interrupts are not un-mapped automatically and in case of an hot-plug
> adapter, the PCI hot-plug layer needs to handle the cleanup to make
> sure that all the page table entries of the XIVE ESB pages are
> cleared.

It seems this patch needs more digging to make sure we are handling
the IRQ unmapping in the correct PCI handler. Could you please keep
it back for the moment ? 

Thanks,

C.

^ permalink raw reply

* [PATCH] powerpc/bpf: Enable bpf_probe_read{, str}() on powerpc again
From: Petr Mladek @ 2020-05-27 12:28 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Petr Mladek, Daniel Borkmann, linux-kernel, Alexei Starovoitov,
	Paul Mackerras, Masami Hiramatsu, Brendan Gregg, Miroslav Benes,
	linuxppc-dev, Christoph Hellwig

The commit 0ebeea8ca8a4d1d453a ("bpf: Restrict bpf_probe_read{, str}() only
to archs where they work") caused that bpf_probe_read{, str}() functions
were not longer available on architectures where the same logical address
might have different content in kernel and user memory mapping. These
architectures should use probe_read_{user,kernel}_str helpers.

For backward compatibility, the problematic functions are still available
on architectures where the user and kernel address spaces are not
overlapping. This is defined CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE.

At the moment, these backward compatible functions are enabled only
on x86_64, arm, and arm64. Let's do it also on powerpc that has
the non overlapping address space as well.

Signed-off-by: Petr Mladek <pmladek@suse.com>
To: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-kernel@vger.kernel.org
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d13b5328ca10..b29d7cb38368 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -126,6 +126,7 @@ config PPC
 	select ARCH_HAS_MMIOWB			if PPC64
 	select ARCH_HAS_PHYS_TO_DMA
 	select ARCH_HAS_PMEM_API
+	select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	select ARCH_HAS_PTE_DEVMAP		if PPC_BOOK3S_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_MEMBARRIER_CALLBACKS
-- 
2.26.2


^ permalink raw reply related

* [PATCH V4 0/2] powerpc/perf: Add support for perf extended regs in powerpc
From: Athira Rajeev @ 2020-05-27  9:50 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: ravi.bangoria, atrajeev, maddy, linux-kernel, acme, anju, jolsa

Patch set to add support for perf extended register capability in
powerpc. The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to
indicate the PMU which support extended registers. The generic code
define the mask of extended registers as 0 for non supported architectures.

patch 1/2 defines this PERF_PMU_CAP_EXTENDED_REGS mask to output the
values of mmcr0,mmcr1,mmcr2 for POWER9. Defines `PERF_REG_EXTENDED_MASK`
at runtime which contains mask value of the supported registers under
extended regs.

Patch 2/2 adds extended regs to sample_reg_mask in the tool side to use
with `-I?` option.

Anju T Sudhakar (2):
  powerpc/perf: Add support for outputting extended regs in perf
    intr_regs
  tools/perf: Add perf tools support for extended register capability in
    powerpc

---
Changes from v3 -> v4
- Addressed the comments for new line/tab issue
  and added "Reviewed-by" from Madhavan Srinivasn.

Changes from v2 -> v3
- Split kernel and tools side patches as suggested by Arnaldo
- Addressed review comment from Madhavan Srinivasn

Changes from v1 -> v2

- PERF_REG_EXTENDED_MASK` is defined at runtime in the kernel
based on platform. This will give flexibility in using extended
regs for all processor versions where the supported registers may differ.
- removed PERF_REG_EXTENDED_MASK from the perf tools side. Based on the
processor version(from PVR value), tool side will return the appropriate
extended mask
- Since tool changes can handle without a "PERF_REG_EXTENDED_MASK" macro,
dropped patch to set NO_AUXTRACE.
- Addressed review comments from Ravi Bangoria for V1

---

 arch/powerpc/include/asm/perf_event_server.h    |  8 ++++
 arch/powerpc/include/uapi/asm/perf_regs.h       | 14 ++++++-
 arch/powerpc/perf/core-book3s.c                 |  1 +
 arch/powerpc/perf/perf_regs.c                   | 34 +++++++++++++--
 arch/powerpc/perf/power9-pmu.c                  |  6 +++
 tools/arch/powerpc/include/uapi/asm/perf_regs.h | 14 ++++++-
 tools/perf/arch/powerpc/include/perf_regs.h     |  5 ++-
 tools/perf/arch/powerpc/util/perf_regs.c        | 55 +++++++++++++++++++++++++
 8 files changed, 131 insertions(+), 6 deletions(-)

-- 
1.8.3.1


^ permalink raw reply

* [PATCH V4 1/2] powerpc/perf: Add support for outputting extended regs in perf intr_regs
From: Athira Rajeev @ 2020-05-27  9:50 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: ravi.bangoria, atrajeev, maddy, linux-kernel, acme, anju, jolsa
In-Reply-To: <1590573018-5201-1-git-send-email-atrajeev@linux.vnet.ibm.com>

From: Anju T Sudhakar <anju@linux.vnet.ibm.com>

Add support for perf extended register capability in powerpc.
The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to indicate the
PMU which support extended registers. The generic code define the mask
of extended registers as 0 for non supported architectures.

Patch adds extended regs support for power9 platform by
exposing MMCR0, MMCR1 and MMCR2 registers.

REG_RESERVED mask needs update to include extended regs.
`PERF_REG_EXTENDED_MASK`, contains mask value of the supported registers,
is defined at runtime in the kernel based on platform since the supported
registers may differ from one processor version to another and hence the
MASK value.

with patch
----------

available registers: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11
r12 r13 r14 r15 r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r26
r27 r28 r29 r30 r31 nip msr orig_r3 ctr link xer ccr softe
trap dar dsisr sier mmcra mmcr0 mmcr1 mmcr2

PERF_RECORD_SAMPLE(IP, 0x1): 4784/4784: 0 period: 1 addr: 0
... intr regs: mask 0xffffffffffff ABI 64-bit
.... r0    0xc00000000012b77c
.... r1    0xc000003fe5e03930
.... r2    0xc000000001b0e000
.... r3    0xc000003fdcddf800
.... r4    0xc000003fc7880000
.... r5    0x9c422724be
.... r6    0xc000003fe5e03908
.... r7    0xffffff63bddc8706
.... r8    0x9e4
.... r9    0x0
.... r10   0x1
.... r11   0x0
.... r12   0xc0000000001299c0
.... r13   0xc000003ffffc4800
.... r14   0x0
.... r15   0x7fffdd8b8b00
.... r16   0x0
.... r17   0x7fffdd8be6b8
.... r18   0x7e7076607730
.... r19   0x2f
.... r20   0xc00000001fc26c68
.... r21   0xc0002041e4227e00
.... r22   0xc00000002018fb60
.... r23   0x1
.... r24   0xc000003ffec4d900
.... r25   0x80000000
.... r26   0x0
.... r27   0x1
.... r28   0x1
.... r29   0xc000000001be1260
.... r30   0x6008010
.... r31   0xc000003ffebb7218
.... nip   0xc00000000012b910
.... msr   0x9000000000009033
.... orig_r3 0xc00000000012b86c
.... ctr   0xc0000000001299c0
.... link  0xc00000000012b77c
.... xer   0x0
.... ccr   0x28002222
.... softe 0x1
.... trap  0xf00
.... dar   0x0
.... dsisr 0x80000000000
.... sier  0x0
.... mmcra 0x80000000000
.... mmcr0 0x82008090
.... mmcr1 0x1e000000
.... mmcr2 0x0
 ... thread: perf:4784

Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
[Defined PERF_REG_EXTENDED_MASK at run time to add support for different platforms ]
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/perf_event_server.h |  8 +++++++
 arch/powerpc/include/uapi/asm/perf_regs.h    | 14 +++++++++++-
 arch/powerpc/perf/core-book3s.c              |  1 +
 arch/powerpc/perf/perf_regs.c                | 34 +++++++++++++++++++++++++---
 arch/powerpc/perf/power9-pmu.c               |  6 +++++
 5 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 3e9703f..1458e1a 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -15,6 +15,9 @@
 #define MAX_EVENT_ALTERNATIVES	8
 #define MAX_LIMITED_HWCOUNTERS	2
 
+extern u64 mask_var;
+#define PERF_REG_EXTENDED_MASK          mask_var
+
 struct perf_event;
 
 /*
@@ -55,6 +58,11 @@ struct power_pmu {
 	int 		*blacklist_ev;
 	/* BHRB entries in the PMU */
 	int		bhrb_nr;
+	/*
+	 * set this flag with `PERF_PMU_CAP_EXTENDED_REGS` if
+	 * the pmu supports extended perf regs capability
+	 */
+	int		capabilities;
 };
 
 /*
diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h
index f599064..485b1d5 100644
--- a/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -48,6 +48,18 @@ enum perf_event_powerpc_regs {
 	PERF_REG_POWERPC_DSISR,
 	PERF_REG_POWERPC_SIER,
 	PERF_REG_POWERPC_MMCRA,
-	PERF_REG_POWERPC_MAX,
+	/* Extended registers */
+	PERF_REG_POWERPC_MMCR0,
+	PERF_REG_POWERPC_MMCR1,
+	PERF_REG_POWERPC_MMCR2,
+	/* Max regs without the extended regs */
+	PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
 };
+
+#define PERF_REG_PMU_MASK	((1ULL << PERF_REG_POWERPC_MAX) - 1)
+
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
+#define PERF_REG_PMU_MASK_300   (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) \
+				- PERF_REG_PMU_MASK)
+
 #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 3dcfecf..7f63edf 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2275,6 +2275,7 @@ int register_power_pmu(struct power_pmu *pmu)
 		pmu->name);
 
 	power_pmu.attr_groups = ppmu->attr_groups;
+	power_pmu.capabilities |= (ppmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS);
 
 #ifdef MSR_HV
 	/*
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
index a213a0a..c8a7e8c 100644
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -13,9 +13,11 @@
 #include <asm/ptrace.h>
 #include <asm/perf_regs.h>
 
+u64 mask_var;
+
 #define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
 
-#define REG_RESERVED (~((1ULL << PERF_REG_POWERPC_MAX) - 1))
+#define REG_RESERVED (~(PERF_REG_EXTENDED_MASK | PERF_REG_PMU_MASK))
 
 static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = {
 	PT_REGS_OFFSET(PERF_REG_POWERPC_R0,  gpr[0]),
@@ -69,10 +71,26 @@
 	PT_REGS_OFFSET(PERF_REG_POWERPC_MMCRA, dsisr),
 };
 
+/* Function to return the extended register values */
+static u64 get_ext_regs_value(int idx)
+{
+	switch (idx) {
+	case PERF_REG_POWERPC_MMCR0:
+		return mfspr(SPRN_MMCR0);
+	case PERF_REG_POWERPC_MMCR1:
+		return mfspr(SPRN_MMCR1);
+	case PERF_REG_POWERPC_MMCR2:
+		return mfspr(SPRN_MMCR2);
+	default: return 0;
+	}
+}
+
 u64 perf_reg_value(struct pt_regs *regs, int idx)
 {
-	if (WARN_ON_ONCE(idx >= PERF_REG_POWERPC_MAX))
-		return 0;
+	u64 PERF_REG_EXTENDED_MAX;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		PERF_REG_EXTENDED_MAX = PERF_REG_POWERPC_MMCR2 + 1;
 
 	if (idx == PERF_REG_POWERPC_SIER &&
 	   (IS_ENABLED(CONFIG_FSL_EMB_PERF_EVENT) ||
@@ -85,6 +103,16 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
 	    IS_ENABLED(CONFIG_PPC32)))
 		return 0;
 
+	if (idx >= PERF_REG_POWERPC_MAX && idx < PERF_REG_EXTENDED_MAX)
+		return get_ext_regs_value(idx);
+
+	/*
+	 * If the idx is referring to value beyond the
+	 * supported registers, return 0 with a warning
+	 */
+	if (WARN_ON_ONCE(idx >= PERF_REG_EXTENDED_MAX))
+		return 0;
+
 	return regs_get_register(regs, pt_regs_offset[idx]);
 }
 
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 08c3ef7..4525090 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -90,6 +90,8 @@ enum {
 #define POWER9_MMCRA_IFM3		0x00000000C0000000UL
 #define POWER9_MMCRA_BHRB_MASK		0x00000000C0000000UL
 
+extern u64 mask_var;
+
 /* Nasty Power9 specific hack */
 #define PVR_POWER9_CUMULUS		0x00002000
 
@@ -434,6 +436,7 @@ static void power9_config_bhrb(u64 pmu_bhrb_filter)
 	.cache_events		= &power9_cache_events,
 	.attr_groups		= power9_pmu_attr_groups,
 	.bhrb_nr		= 32,
+	.capabilities           = PERF_PMU_CAP_EXTENDED_REGS,
 };
 
 int init_power9_pmu(void)
@@ -457,6 +460,9 @@ int init_power9_pmu(void)
 		}
 	}
 
+	/* Set the PERF_REG_EXTENDED_MASK here */
+	mask_var = PERF_REG_PMU_MASK_300;
+
 	rc = register_power_pmu(&power9_pmu);
 	if (rc)
 		return rc;
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH V4 2/2] tools/perf: Add perf tools support for extended register capability in powerpc
From: Athira Rajeev @ 2020-05-27  9:50 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: ravi.bangoria, atrajeev, maddy, linux-kernel, acme, anju, jolsa
In-Reply-To: <1590573018-5201-1-git-send-email-atrajeev@linux.vnet.ibm.com>

From: Anju T Sudhakar <anju@linux.vnet.ibm.com>

Add extended regs to sample_reg_mask in the tool side to use
with `-I?` option. Perf tools side uses extended mask to display
the platform supported register names (with -I? option) to the user
and also send this mask to the kernel to capture the extended registers
in each sample. Hence decide the mask value based on the processor
version.

Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
[Decide extended mask at run time based on platform]
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
---
 tools/arch/powerpc/include/uapi/asm/perf_regs.h | 14 ++++++-
 tools/perf/arch/powerpc/include/perf_regs.h     |  5 ++-
 tools/perf/arch/powerpc/util/perf_regs.c        | 55 +++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/tools/arch/powerpc/include/uapi/asm/perf_regs.h b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
index f599064..485b1d5 100644
--- a/tools/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -48,6 +48,18 @@ enum perf_event_powerpc_regs {
 	PERF_REG_POWERPC_DSISR,
 	PERF_REG_POWERPC_SIER,
 	PERF_REG_POWERPC_MMCRA,
-	PERF_REG_POWERPC_MAX,
+	/* Extended registers */
+	PERF_REG_POWERPC_MMCR0,
+	PERF_REG_POWERPC_MMCR1,
+	PERF_REG_POWERPC_MMCR2,
+	/* Max regs without the extended regs */
+	PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
 };
+
+#define PERF_REG_PMU_MASK	((1ULL << PERF_REG_POWERPC_MAX) - 1)
+
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
+#define PERF_REG_PMU_MASK_300   (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) \
+				- PERF_REG_PMU_MASK)
+
 #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h
index e18a355..46ed00d 100644
--- a/tools/perf/arch/powerpc/include/perf_regs.h
+++ b/tools/perf/arch/powerpc/include/perf_regs.h
@@ -64,7 +64,10 @@
 	[PERF_REG_POWERPC_DAR] = "dar",
 	[PERF_REG_POWERPC_DSISR] = "dsisr",
 	[PERF_REG_POWERPC_SIER] = "sier",
-	[PERF_REG_POWERPC_MMCRA] = "mmcra"
+	[PERF_REG_POWERPC_MMCRA] = "mmcra",
+	[PERF_REG_POWERPC_MMCR0] = "mmcr0",
+	[PERF_REG_POWERPC_MMCR1] = "mmcr1",
+	[PERF_REG_POWERPC_MMCR2] = "mmcr2",
 };
 
 static inline const char *perf_reg_name(int id)
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
index 0a52429..9179230 100644
--- a/tools/perf/arch/powerpc/util/perf_regs.c
+++ b/tools/perf/arch/powerpc/util/perf_regs.c
@@ -6,9 +6,14 @@
 
 #include "../../../util/perf_regs.h"
 #include "../../../util/debug.h"
+#include "../../../util/event.h"
+#include "../../../util/header.h"
+#include "../../../perf-sys.h"
 
 #include <linux/kernel.h>
 
+#define PVR_POWER9		0x004E
+
 const struct sample_reg sample_reg_masks[] = {
 	SMPL_REG(r0, PERF_REG_POWERPC_R0),
 	SMPL_REG(r1, PERF_REG_POWERPC_R1),
@@ -55,6 +60,9 @@
 	SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR),
 	SMPL_REG(sier, PERF_REG_POWERPC_SIER),
 	SMPL_REG(mmcra, PERF_REG_POWERPC_MMCRA),
+	SMPL_REG(mmcr0, PERF_REG_POWERPC_MMCR0),
+	SMPL_REG(mmcr1, PERF_REG_POWERPC_MMCR1),
+	SMPL_REG(mmcr2, PERF_REG_POWERPC_MMCR2),
 	SMPL_REG_END
 };
 
@@ -163,3 +171,50 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
 
 	return SDT_ARG_VALID;
 }
+
+uint64_t arch__intr_reg_mask(void)
+{
+	struct perf_event_attr attr = {
+		.type                   = PERF_TYPE_HARDWARE,
+		.config                 = PERF_COUNT_HW_CPU_CYCLES,
+		.sample_type            = PERF_SAMPLE_REGS_INTR,
+		.precise_ip             = 1,
+		.disabled               = 1,
+		.exclude_kernel         = 1,
+	};
+	int fd, ret;
+	char buffer[64];
+	u32 version;
+	u64 extended_mask = 0;
+
+	/* Get the PVR value to set the extended
+	 * mask specific to platform
+	 */
+	get_cpuid(buffer, sizeof(buffer));
+	ret = sscanf(buffer, "%u,", &version);
+
+	if (ret != 1) {
+		pr_debug("Failed to get the processor version, unable to output extended registers\n");
+		return PERF_REGS_MASK;
+	}
+
+	if (version == PVR_POWER9)
+		extended_mask = PERF_REG_PMU_MASK_300;
+	else
+		return PERF_REGS_MASK;
+
+	attr.sample_regs_intr = extended_mask;
+	attr.sample_period = 1;
+	event_attr_init(&attr);
+
+	/*
+	 * check if the pmu supports perf extended regs, before
+	 * returning the register mask to sample.
+	 */
+	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+	if (fd != -1) {
+		close(fd);
+		return (extended_mask | PERF_REGS_MASK);
+	}
+	return PERF_REGS_MASK;
+}
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH v3 3/7] kunit: tests for stats_fs API
From: Alan Maguire @ 2020-05-27 10:05 UTC (permalink / raw)
  To: Emanuele Giuseppe Esposito
  Cc: linux-s390, kvm, linux-doc, netdev, kunit-dev,
	Emanuele Giuseppe Esposito, linux-kernel, kvm-ppc, Jonathan Adams,
	brendanhiggins, Christian Borntraeger, Alexander Viro,
	linux-kselftest, David Rientjes, linux-fsdevel, Paolo Bonzini,
	linux-mips, linuxppc-dev, linux-arm-kernel, Jim Mattson
In-Reply-To: <20200526110318.69006-4-eesposit@redhat.com>

On Tue, 26 May 2020, Emanuele Giuseppe Esposito wrote:

> Add kunit tests to extensively test the stats_fs API functionality.
>

I've added in the kunit-related folks.
 
> In order to run them, the kernel .config must set CONFIG_KUNIT=y
> and a new .kunitconfig file must be created with CONFIG_STATS_FS=y
> and CONFIG_STATS_FS_TEST=y
>

It looks like CONFIG_STATS_FS is built-in, but it exports
much of the functionality you are testing.  However could the
tests also be built as a module (i.e. make CONFIG_STATS_FS_TEST
a tristate variable)? To test this you'd need to specify
CONFIG_KUNIT=m and CONFIG_STATS_FS_TEST=m, and testing would
simply be a case of "modprobe"ing the stats fs module and collecting
results in /sys/kernel/debug/kunit/<module_name> (rather 
than running kunit.py). Are you relying on unexported internals in
the the tests that would prevent building them as a module?

Thanks!

Alan

^ permalink raw reply

* Re: [PATCH v3 0/7] Statsfs: a new ram-based file system for Linux kernel statistics
From: Emanuele Giuseppe Esposito @ 2020-05-27 13:14 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: linux-s390, kvm, linux-doc, netdev, Emanuele Giuseppe Esposito,
	linux-kernel, kvm-ppc, Jonathan Adams, Christian Borntraeger,
	Andrew Lunn, Alexander Viro, David Rientjes, linux-fsdevel,
	Paolo Bonzini, linux-mips, linuxppc-dev, linux-arm-kernel,
	Jim Mattson
In-Reply-To: <20200526153128.448bfb43@kicinski-fedora-PC1C0HJN.hsd1.ca.comcast.net>


>>
>> The file system is mounted on /sys/kernel/stats and would be already used
>> by kvm. Statsfs was initially introduced by Paolo Bonzini [1].
> 
> What's the direct motivation for this work? Moving KVM stats out of
> debugfs?

There's many reasons: one of these is not using debugfs for statistics, 
but also (and mainly) to try and have a single tool that automatically 
takes care and displays them, instead of leaving each subsystem "on its 
own".

Sure, everyone gathers and processes stats in different ways, and the 
aim of this tool is to hopefully be extensible enough to cover all needs.
> In my experience stats belong in the API used for creating/enumerating
> objects, statsfs sounds like going in the exact opposite direction -
> creating a parallel structure / hierarchy for exposing stats.

  I know
> nothing about KVM but are you sure all the info that has to be exposed
> will be stats?I don't understand, what do you mean here?

> 
> In case of networking we have the basic stats in sysfs, under the
> netdevice's kobject. But since we're not using sysfs much any more
> for config, new stats are added in netlink APIs. Again - same APIs
> used for enumeration and config.

I don't really know a lot about the networking subsystem, and as it was 
pointed out in another email on patch 7 by Andrew, networking needs to 
atomically gather and display statistics in order to make them 
consistent, and currently this is not supported by stats_fs but could be 
added in future.

In addition, right now it won't work properly if the networking 
namespaces are enabled. That is another issue to take into 
consideration. That's also why I marked patch 7 as "not for merge"

Regarding the config, as I said the idea is to gather multiple 
subsystems' statistics, therefore there wouldn't be a single 
configuration method like in netlink.
For example in kvm there are file descriptors for configuration, and 
creating them requires no privilege, contrary to the network interfaces.

Thank you,
Emanuele


^ permalink raw reply

* Re: [PATCH v3 3/7] kunit: tests for stats_fs API
From: Emanuele Giuseppe Esposito @ 2020-05-27 13:26 UTC (permalink / raw)
  To: Alan Maguire
  Cc: linux-s390, kvm, linux-doc, netdev, kunit-dev,
	Emanuele Giuseppe Esposito, linux-kernel, kvm-ppc, Jonathan Adams,
	brendanhiggins, Christian Borntraeger, Alexander Viro,
	linux-kselftest, David Rientjes, linux-fsdevel, Paolo Bonzini,
	linux-mips, linuxppc-dev, linux-arm-kernel, Jim Mattson
In-Reply-To: <alpine.LRH.2.21.2005271054360.24819@localhost>


>> In order to run them, the kernel .config must set CONFIG_KUNIT=y
>> and a new .kunitconfig file must be created with CONFIG_STATS_FS=y
>> and CONFIG_STATS_FS_TEST=y
>>
> 
> It looks like CONFIG_STATS_FS is built-in, but it exports
> much of the functionality you are testing.  However could the
> tests also be built as a module (i.e. make CONFIG_STATS_FS_TEST
> a tristate variable)? To test this you'd need to specify
> CONFIG_KUNIT=m and CONFIG_STATS_FS_TEST=m, and testing would
> simply be a case of "modprobe"ing the stats fs module and collecting
> results in /sys/kernel/debug/kunit/<module_name> (rather
> than running kunit.py). Are you relying on unexported internals in
> the the tests that would prevent building them as a module?
> 

I haven't checked it yet, but tests should work as separate module.
I will look into it, thanks.

Emanuele


^ permalink raw reply

* Re: [PATCH v3 0/7] Statsfs: a new ram-based file system for Linux kernel statistics
From: Andrew Lunn @ 2020-05-27 13:33 UTC (permalink / raw)
  To: Emanuele Giuseppe Esposito
  Cc: linux-s390, kvm, linux-doc, netdev, David Rientjes,
	Emanuele Giuseppe Esposito, linux-kernel, kvm-ppc, Jonathan Adams,
	Christian Borntraeger, Alexander Viro, Paolo Bonzini,
	linux-fsdevel, Jakub Kicinski, linux-mips, linuxppc-dev,
	linux-arm-kernel, Jim Mattson
In-Reply-To: <6a754b40-b148-867d-071d-8f31c5c0d172@redhat.com>

> I don't really know a lot about the networking subsystem, and as it was
> pointed out in another email on patch 7 by Andrew, networking needs to
> atomically gather and display statistics in order to make them consistent,
> and currently this is not supported by stats_fs but could be added in
> future.

Hi Emanuele

Do you have any idea how you will support atomic access? It does not
seem easy to implement in a filesystem based model.

     Andrew

^ permalink raw reply

* Re: [PATCH] media: omap3isp: Shuffle cacheflush.h and include mm.h
From: Laurent Pinchart @ 2020-05-27 13:45 UTC (permalink / raw)
  To: Nathan Chancellor
  Cc: linux-ia64@vger.kernel.org, Linux-sh list, Roman Zippel,
	Linux Kernel Mailing List, Linux MM, sparclinux, linux-riscv,
	Christoph Hellwig, Linux-Arch, linux-c6x-dev,
	open list:QUALCOMM HEXAGON..., the arch/x86 maintainers,
	Geert Uytterhoeven, linux-media,
	open list:TENSILICA XTENSA PORT (xtensa), Arnd Bergmann,
	Jessica Yu, linux-um, linux-m68k, Openrisc, Linux ARM,
	Michal Simek, open list:BROADCOM NVRAM DRIVER, Sakari Ailus,
	alpha, Linux FS Devel, Andrew Morton, linuxppc-dev
In-Reply-To: <20200527081337.GA3506499@ubuntu-s3-xlarge-x86>

Hi Nathan,

(CC'ing Sakari Ailus and the linux-media mailing list)

On Wed, May 27, 2020 at 01:13:37AM -0700, Nathan Chancellor wrote:
> On Wed, May 27, 2020 at 09:02:51AM +0200, Geert Uytterhoeven wrote:
> > On Wed, May 27, 2020 at 6:37 AM Nathan Chancellor wrote:
> > > After mm.h was removed from the asm-generic version of cacheflush.h,
> > > s390 allyesconfig shows several warnings of the following nature:
> > >
> > > In file included from ./arch/s390/include/generated/asm/cacheflush.h:1,
> > >                  from drivers/media/platform/omap3isp/isp.c:42:
> > > ./include/asm-generic/cacheflush.h:16:42: warning: 'struct mm_struct'
> > > declared inside parameter list will not be visible outside of this
> > > definition or declaration
> > >
> > > cacheflush.h does not include mm.h nor does it include any forward
> > > declaration of these structures hence the warning. To avoid this,
> > > include mm.h explicitly in this file and shuffle cacheflush.h below it.
> > >
> > > Fixes: 19c0054597a0 ("asm-generic: don't include <linux/mm.h> in cacheflush.h")
> > > Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
> > 
> > Thanks for your patch!
> > 
> > > I am aware the fixes tag is kind of irrelevant because that SHA will
> > > change in the next linux-next revision and this will probably get folded
> > > into the original patch anyways but still.
> > >
> > > The other solution would be to add forward declarations of these structs
> > > to the top of cacheflush.h, I just chose to do what Christoph did in the
> > > original patch. I am happy to do that instead if you all feel that is
> > > better.
> > 
> > That actually looks like a better solution to me, as it would address the
> > problem for all users.

Headers should be self-contained, so that would be the best fix in my
opinion.

This being said, as cacheflush.h isn't needed in isp.c, I think we
should also drop it. It seems to have been included there since the
first driver version, and was likely a left-over from the out-of-tree
development. Manual cache handling was part of
drivers/media/platform/omap3isp/ispqueue.c and has been removed in
commit fbac1400bd1a ("[media] omap3isp: Move to videobuf2").

cacheflush.h can also be dropped from ispvideo.c which suffers from the
same issue.

> > >  drivers/media/platform/omap3isp/isp.c | 5 +++--
> > >  1 file changed, 3 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/media/platform/omap3isp/isp.c b/drivers/media/platform/omap3isp/isp.c
> > > index a4ee6b86663e..54106a768e54 100644
> > > --- a/drivers/media/platform/omap3isp/isp.c
> > > +++ b/drivers/media/platform/omap3isp/isp.c
> > > @@ -39,8 +39,6 @@
> > >   *     Troy Laramy <t-laramy@ti.com>
> > >   */
> > >
> > > -#include <asm/cacheflush.h>
> > > -
> > >  #include <linux/clk.h>
> > >  #include <linux/clkdev.h>
> > >  #include <linux/delay.h>
> > > @@ -49,6 +47,7 @@
> > >  #include <linux/i2c.h>
> > >  #include <linux/interrupt.h>
> > >  #include <linux/mfd/syscon.h>
> > > +#include <linux/mm.h>
> > >  #include <linux/module.h>
> > >  #include <linux/omap-iommu.h>
> > >  #include <linux/platform_device.h>
> > > @@ -58,6 +57,8 @@
> > >  #include <linux/sched.h>
> > >  #include <linux/vmalloc.h>
> > >
> > > +#include <asm/cacheflush.h>
> > > +
> > >  #ifdef CONFIG_ARM_DMA_USE_IOMMU
> > >  #include <asm/dma-iommu.h>
> > >  #endif
> > 
> > Why does this file need <asm/cacheflush.h> at all?
> > It doesn't call any of the flush_*() functions, and seems to compile fine
> > without (on arm32).
> > 
> > Perhaps it was included at the top intentionally, to override the definitions
> > of copy_{to,from}_user_page()? Fortunately that doesn't seem to be the
> > case, from a quick look at the assembler output.
> > 
> > So let's just remove the #include instead?
> 
> Sounds good to me. I can send a patch if needed or I suppose Andrew can
> just make a small fixup patch for it. Let me know what I should do.

-- 
Regards,

Laurent Pinchart

^ permalink raw reply

* [RFC PATCH 1/4] powerpc/64s: Don't init FSCR_DSCR in __init_FSCR()
From: Michael Ellerman @ 2020-05-27 14:58 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: alistair, mikey, npiggin, jniethe5

__init_FSCR() was added originally in commit 2468dcf641e4 ("powerpc:
Add support for context switching the TAR register") (Feb 2013), and
only set FSCR_TAR.

At that point FSCR (Facility Status and Control Register) was not
context switched, so the setting was permanent after boot.

Later we added initialisation of FSCR_DSCR to __init_FSCR(), in commit
54c9b2253d34 ("powerpc: Set DSCR bit in FSCR setup") (Mar 2013), again
that was permanent after boot.

Then commit 2517617e0de6 ("powerpc: Fix context switch DSCR on
POWER8") (Aug 2013) added a limited context switch of FSCR, just the
FSCR_DSCR bit was context switched based on thread.dscr_inherit. That
commit said "This clears the H/FSCR DSCR bit initially", but it
didn't, it left the initialisation of FSCR_DSCR in __init_FSCR().
However the initial context switch from init_task to pid 1 would clear
FSCR_DSCR because thread.dscr_inherit was 0.

That commit also introduced the requirement that FSCR_DSCR be clear
for user processes, so that we can take the facility unavailable
interrupt in order to manage dscr_inherit.

Then in commit 152d523e6307 ("powerpc: Create context switch helpers
save_sprs() and restore_sprs()") (Dec 2015) FSCR was added to
thread_struct. However it still wasn't fully context switched, we just
took the existing value and set FSCR_DSCR if the new thread had
dscr_inherit set. FSCR was still initialised at boot to FSCR_DSCR |
FSCR_TAR, but that value was not propagated into the thread_struct, so
the initial context switch set FSCR_DSCR back to 0.

Finally commit b57bd2de8c6c ("powerpc: Improve FSCR init and context
switching") (Jun 2016) added a full context switch of the FSCR, and
added an initialisation of init_task.thread.fscr to FSCR_TAR |
FSCR_EBB, but omitted FSCR_DSCR.

The end result is that swapper runs with FSCR_DSCR set because of the
initialisation in __init_FSCR(), but no other processes do, they use
the value from init_task.thread.fscr.

Having FSCR_DSCR set for swapper allows it to access SPR 3 from
userspace, but swapper never runs userspace, so it has no useful
effect. It's also confusing to have the value initialised in two
places to two different values.

So remove FSCR_DSCR from __init_FSCR(), this at least gets us to the
point where there's a single value of FSCR, even if it's still set in
two places.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/cpu_setup_power.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index a460298c7ddb..f91ecb10d0ae 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -184,7 +184,7 @@ _GLOBAL(__restore_cpu_power9)
 
 __init_FSCR:
 	mfspr	r3,SPRN_FSCR
-	ori	r3,r3,FSCR_TAR|FSCR_DSCR|FSCR_EBB
+	ori	r3,r3,FSCR_TAR|FSCR_EBB
 	mtspr	SPRN_FSCR,r3
 	blr
 
-- 
2.25.1


^ permalink raw reply related

* [RFC PATCH 3/4] powerpc/64s: Save FSCR to init_task.thread.fscr after feature init
From: Michael Ellerman @ 2020-05-27 14:58 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: alistair, mikey, npiggin, jniethe5
In-Reply-To: <20200527145843.2761782-1-mpe@ellerman.id.au>

At boot the FSCR is initialised via one of two paths. On most systems
it's set to a hard coded value in __init_FSCR().

On newer skiboot systems we use the device tree CPU features binding,
where firmware can tell Linux what bits to set in FSCR (and HFSCR).

In both cases the value that's configured at boot is not propagated
into the init_task.thread.fscr value prior to the initial fork of init
(pid 1), which means the value is not used by any processes other than
swapper (the idle task).

For the __init_FSCR() case this is OK, because the value in
init_task.thread.fscr is initialised to something sensible. However it
does mean that the value set in __init_FSCR() is not used other than
for swapper, which is odd and confusing.

The bigger problem is for the device tree CPU features case it
prevents firmware from setting (or clearing) FSCR bits for use by user
space. This means all existing kernels can not have features
enabled/disabled by firmware if those features require
setting/clearing FSCR bits.

We can handle both cases by saving the FSCR value into
init_task.thread.fscr after we have initialised it at boot. This fixes
the bug for device tree CPU features, and will allow us to simplify
the initialisation for the __init_FSCR() case in a future patch.

Fixes: 5a61ef74f269 ("powerpc/64s: Support new device tree binding for discovering CPU features")
Cc: stable@vger.kernel.org # v4.12+
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/prom.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 1dcf0e214a22..a74bfd09cb38 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -685,6 +685,23 @@ static void __init tm_init(void)
 static void tm_init(void) { }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
+#ifdef CONFIG_PPC64
+static void __init save_fscr_to_task(void)
+{
+	/*
+	 * Ensure the init_task (pid 0, aka swapper) uses the value of FSCR we
+	 * have configured via the device tree features or via __init_FSCR().
+	 * That value will then be propagated to pid 1 (init) and all future
+	 * processes.
+	 */
+	if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
+		init_task.thread.fscr = mfspr(SPRN_FSCR);
+}
+#else
+static inline void save_fscr_to_task(void) {};
+#endif
+
+
 void __init early_init_devtree(void *params)
 {
 	phys_addr_t limit;
@@ -773,6 +790,8 @@ void __init early_init_devtree(void *params)
 		BUG();
 	}
 
+	save_fscr_to_task();
+
 #if defined(CONFIG_SMP) && defined(CONFIG_PPC64)
 	/* We'll later wait for secondaries to check in; there are
 	 * NCPUS-1 non-boot CPUs  :-)
-- 
2.25.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox