LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH v2 10/16] powerpc/powernv/pci: Refactor pnv_ioda_alloc_pe()
From: Alexey Kardashevskiy @ 2020-07-24  5:20 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200722065715.1432738-10-oohall@gmail.com>



On 22/07/2020 16:57, Oliver O'Halloran wrote:
> Rework the PE allocation logic to allow allocating blocks of PEs rather
> than individually. We'll use this to allocate contigious blocks of PEs for
> the SR-IOVs.
> 
> This patch also adds code to pnv_ioda_alloc_pe() and pnv_ioda_reserve_pe() to
> use the existing, but unused, phb->pe_alloc_mutex. Currently these functions
> use atomic bit ops to release a currently allocated PE number. However,
> the pnv_ioda_alloc_pe() wants to have exclusive access to the bit map while
> scanning for hole large enough to accomodate the allocation size.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
> v2: Add some details about the pe_alloc mutex and why we're using it.


Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>




> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 41 ++++++++++++++++++-----
>  arch/powerpc/platforms/powernv/pci.h      |  2 +-
>  2 files changed, 34 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 2d36a9ebf0e9..c9c25fb0783c 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -145,23 +145,45 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
>  		return;
>  	}
>  
> +	mutex_lock(&phb->ioda.pe_alloc_mutex);
>  	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
>  		pr_debug("%s: PE %x was reserved on PHB#%x\n",
>  			 __func__, pe_no, phb->hose->global_number);
> +	mutex_unlock(&phb->ioda.pe_alloc_mutex);
>  
>  	pnv_ioda_init_pe(phb, pe_no);
>  }
>  
> -struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
> +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count)
>  {
> -	long pe;
> +	struct pnv_ioda_pe *ret = NULL;
> +	int run = 0, pe, i;
>  
> +	mutex_lock(&phb->ioda.pe_alloc_mutex);
> +
> +	/* scan backwards for a run of @count cleared bits */
>  	for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
> -		if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
> -			return pnv_ioda_init_pe(phb, pe);
> +		if (test_bit(pe, phb->ioda.pe_alloc)) {
> +			run = 0;
> +			continue;
> +		}
> +
> +		run++;
> +		if (run == count)
> +			break;
>  	}
> +	if (run != count)
> +		goto out;
>  
> -	return NULL;
> +	for (i = pe; i < pe + count; i++) {
> +		set_bit(i, phb->ioda.pe_alloc);
> +		pnv_ioda_init_pe(phb, i);
> +	}
> +	ret = &phb->ioda.pe_array[pe];
> +
> +out:
> +	mutex_unlock(&phb->ioda.pe_alloc_mutex);
> +	return ret;
>  }
>  
>  void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
> @@ -173,7 +195,10 @@ void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
>  	WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */
>  	kfree(pe->npucomp);
>  	memset(pe, 0, sizeof(struct pnv_ioda_pe));
> +
> +	mutex_lock(&phb->ioda.pe_alloc_mutex);
>  	clear_bit(pe_num, phb->ioda.pe_alloc);
> +	mutex_unlock(&phb->ioda.pe_alloc_mutex);
>  }
>  
>  /* The default M64 BAR is shared by all PEs */
> @@ -976,7 +1001,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>  	if (pdn->pe_number != IODA_INVALID_PE)
>  		return NULL;
>  
> -	pe = pnv_ioda_alloc_pe(phb);
> +	pe = pnv_ioda_alloc_pe(phb, 1);
>  	if (!pe) {
>  		pr_warn("%s: Not enough PE# available, disabling device\n",
>  			pci_name(dev));
> @@ -1047,7 +1072,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
>  
>  	/* The PE number isn't pinned by M64 */
>  	if (!pe)
> -		pe = pnv_ioda_alloc_pe(phb);
> +		pe = pnv_ioda_alloc_pe(phb, 1);
>  
>  	if (!pe) {
>  		pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
> @@ -3065,7 +3090,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>  		pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
>  	} else {
>  		/* otherwise just allocate one */
> -		root_pe = pnv_ioda_alloc_pe(phb);
> +		root_pe = pnv_ioda_alloc_pe(phb, 1);
>  		phb->ioda.root_pe_idx = root_pe->pe_number;
>  	}
>  
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index 23fc5e391c7f..06431a452130 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -224,7 +224,7 @@ int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
>  void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
>  void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe);
>  
> -struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb);
> +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count);
>  void pnv_ioda_free_pe(struct pnv_ioda_pe *pe);
>  
>  #ifdef CONFIG_PCI_IOV
> 

-- 
Alexey

^ permalink raw reply

* [powerpc:next] BUILD SUCCESS 335aca5f65f1a39670944930131da5f2276f888f
From: kernel test robot @ 2020-07-24  5:41 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  next
branch HEAD: 335aca5f65f1a39670944930131da5f2276f888f  Merge branch 'scv' support into next

elapsed time: 951m

configs tested: 74
configs skipped: 1

The following configs have been built successfully.
More configs may be tested in the coming days.

arm64                            allyesconfig
arm64                               defconfig
arm64                            allmodconfig
arm64                             allnoconfig
arm                                 defconfig
arm                              allyesconfig
arm                              allmodconfig
arm                               allnoconfig
i386                                defconfig
i386                             allyesconfig
i386                              debian-10.3
i386                              allnoconfig
ia64                             allmodconfig
ia64                                defconfig
ia64                              allnoconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                              allnoconfig
m68k                           sun3_defconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
nios2                            allyesconfig
c6x                              allyesconfig
c6x                               allnoconfig
openrisc                         allyesconfig
openrisc                            defconfig
nds32                               defconfig
nds32                             allnoconfig
csky                             allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
h8300                            allyesconfig
h8300                            allmodconfig
xtensa                              defconfig
arc                                 defconfig
arc                              allyesconfig
sh                               allmodconfig
sh                                allnoconfig
microblaze                        allnoconfig
mips                             allyesconfig
mips                              allnoconfig
mips                             allmodconfig
parisc                            allnoconfig
parisc                              defconfig
parisc                           allyesconfig
parisc                           allmodconfig
powerpc                             defconfig
powerpc                          allyesconfig
powerpc                          rhel-kconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
riscv                            allyesconfig
riscv                             allnoconfig
riscv                               defconfig
riscv                            allmodconfig
s390                             allyesconfig
s390                              allnoconfig
s390                             allmodconfig
s390                                defconfig
sparc                            allyesconfig
sparc                               defconfig
sparc64                           allnoconfig
sparc64                          allmodconfig
sparc64                             defconfig
sparc64                          allyesconfig
x86_64                    rhel-7.6-kselftests
x86_64                               rhel-8.3
x86_64                                  kexec
x86_64                                   rhel
x86_64                                    lkp
x86_64                              fedora-25

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
From: Nicholas Piggin @ 2020-07-24  5:59 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Peter Zijlstra
  Cc: linux-arch, Will Deacon, Ingo Molnar, linuxppc-dev, linux-kernel
In-Reply-To: <1884dcea-9ecd-a1f3-21bb-213c655e2480@ozlabs.ru>

Excerpts from Alexey Kardashevskiy's message of July 24, 2020 2:16 pm:
> 
> 
> On 23/07/2020 23:11, Nicholas Piggin wrote:
>> Excerpts from Peter Zijlstra's message of July 23, 2020 9:40 pm:
>>> On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote:
>>>
>>>> diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
>>>> index 3a0db7b0b46e..35060be09073 100644
>>>> --- a/arch/powerpc/include/asm/hw_irq.h
>>>> +++ b/arch/powerpc/include/asm/hw_irq.h
>>>> @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void)
>>>>  #define powerpc_local_irq_pmu_save(flags)			\
>>>>  	 do {							\
>>>>  		raw_local_irq_pmu_save(flags);			\
>>>> -		trace_hardirqs_off();				\
>>>> +		if (!raw_irqs_disabled_flags(flags))		\
>>>> +			trace_hardirqs_off();			\
>>>>  	} while(0)
>>>>  #define powerpc_local_irq_pmu_restore(flags)			\
>>>>  	do {							\
>>>> -		if (raw_irqs_disabled_flags(flags)) {		\
>>>> -			raw_local_irq_pmu_restore(flags);	\
>>>> -			trace_hardirqs_off();			\
>>>> -		} else {					\
>>>> +		if (!raw_irqs_disabled_flags(flags))		\
>>>>  			trace_hardirqs_on();			\
>>>> -			raw_local_irq_pmu_restore(flags);	\
>>>> -		}						\
>>>> +		raw_local_irq_pmu_restore(flags);		\
>>>>  	} while(0)
>>>
>>> You shouldn't be calling lockdep from NMI context!
>> 
>> After this patch it doesn't.
>> 
>> trace_hardirqs_on/off implementation appears to expect to be called in NMI 
>> context though, for some reason.
>> 
>>> That is, I recently
>>> added suport for that on x86:
>>>
>>>   https://lkml.kernel.org/r/20200623083721.155449112@infradead.org
>>>   https://lkml.kernel.org/r/20200623083721.216740948@infradead.org
>>>
>>> But you need to be very careful on how you order things, as you can see
>>> the above relies on preempt_count() already having been incremented with
>>> NMI_MASK.
>> 
>> Hmm. My patch seems simpler.
> 
> And your patches fix my error while Peter's do not:
> 
> 
> IRQs not enabled as expected
> WARNING: CPU: 0 PID: 1377 at /home/aik/p/kernel/kernel/softirq.c:169
> __local_bh_enable_ip+0x118/0x190

I think they would have needed some powerpc bits as well. But I don't
see a reason we can't merge my patches, at least they fix this case and
don't seem to make things worse in any way.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
From: Athira Rajeev @ 2020-07-24  6:16 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linux-arch, Peter Zijlstra, linuxppc-dev, linux-kernel,
	Nicholas Piggin, Ingo Molnar, Will Deacon
In-Reply-To: <1884dcea-9ecd-a1f3-21bb-213c655e2480@ozlabs.ru>



> On 24-Jul-2020, at 9:46 AM, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> 
> 
> 
> On 23/07/2020 23:11, Nicholas Piggin wrote:
>> Excerpts from Peter Zijlstra's message of July 23, 2020 9:40 pm:
>>> On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote:
>>> 
>>>> diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
>>>> index 3a0db7b0b46e..35060be09073 100644
>>>> --- a/arch/powerpc/include/asm/hw_irq.h
>>>> +++ b/arch/powerpc/include/asm/hw_irq.h
>>>> @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void)
>>>> #define powerpc_local_irq_pmu_save(flags)			\
>>>> 	 do {							\
>>>> 		raw_local_irq_pmu_save(flags);			\
>>>> -		trace_hardirqs_off();				\
>>>> +		if (!raw_irqs_disabled_flags(flags))		\
>>>> +			trace_hardirqs_off();			\
>>>> 	} while(0)
>>>> #define powerpc_local_irq_pmu_restore(flags)			\
>>>> 	do {							\
>>>> -		if (raw_irqs_disabled_flags(flags)) {		\
>>>> -			raw_local_irq_pmu_restore(flags);	\
>>>> -			trace_hardirqs_off();			\
>>>> -		} else {					\
>>>> +		if (!raw_irqs_disabled_flags(flags))		\
>>>> 			trace_hardirqs_on();			\
>>>> -			raw_local_irq_pmu_restore(flags);	\
>>>> -		}						\
>>>> +		raw_local_irq_pmu_restore(flags);		\
>>>> 	} while(0)
>>> 
>>> You shouldn't be calling lockdep from NMI context!
>> 
>> After this patch it doesn't.
>> 
>> trace_hardirqs_on/off implementation appears to expect to be called in NMI 
>> context though, for some reason.
>> 
>>> That is, I recently
>>> added suport for that on x86:
>>> 
>>>  https://lkml.kernel.org/r/20200623083721.155449112@infradead.org
>>>  https://lkml.kernel.org/r/20200623083721.216740948@infradead.org
>>> 
>>> But you need to be very careful on how you order things, as you can see
>>> the above relies on preempt_count() already having been incremented with
>>> NMI_MASK.
>> 
>> Hmm. My patch seems simpler.
> 
> And your patches fix my error while Peter's do not:
> 
> 
> IRQs not enabled as expected
> WARNING: CPU: 0 PID: 1377 at /home/aik/p/kernel/kernel/softirq.c:169
> __local_bh_enable_ip+0x118/0x190

Hi Nicholas, Alexey

I was able to reproduce the warning which Alexey reported using perf_fuzzer test suite. 
With the patch provided by Nick, I don’t see the issue anymore. This patch fixes the
warnings I got with perf fuzzer run.

Thanks Nick for the fix. 

Tested-by: Athira Rajeev<atrajeev@linux.ibm.com>


> 
> 
>> 
>> I don't know this stuff very well, I don't really understand what your patch 
>> enables for x86 but at least it shouldn't be incompatible with this one 
>> AFAIKS.
>> 
>> Thanks,
>> Nick
>> 
> 
> -- 
> Alexey


^ permalink raw reply

* Re: [v4] powerpc/perf: Initialize power10 PMU registers in cpu setup routine
From: Athira Rajeev @ 2020-07-24  6:32 UTC (permalink / raw)
  To: kernel test robot
  Cc: Michael Neuling, maddy, kbuild-all, Jordan Niethe, linuxppc-dev
In-Reply-To: <202007241030.wtjRnkIR%lkp@intel.com>



> On 24-Jul-2020, at 8:32 AM, kernel test robot <lkp@intel.com> wrote:
> 
> Hi Athira,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on powerpc/next]
> [also build test ERROR on v5.8-rc6 next-20200723]
> [cannot apply to mpe/next scottwood/next]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch]
> 
> url:    https://github.com/0day-ci/linux/commits/Athira-Rajeev/powerpc-perf-Initialize-power10-PMU-registers-in-cpu-setup-routine/20200723-153537
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
> config: powerpc-defconfig (attached as .config)
> compiler: powerpc64-linux-gcc (GCC) 9.3.0
> reproduce (this is a W=1 build):
>        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>        chmod +x ~/bin/make.cross
>        # save the attached .config to linux build tree
>        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc 
> 
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
> 
> All errors (new ones prefixed by >>):
> 
>   arch/powerpc/kernel/cpu_setup_power.S: Assembler messages:
>>> arch/powerpc/kernel/cpu_setup_power.S:244: Error: non-constant expression in ".if" statement
>>> arch/powerpc/kernel/cpu_setup_power.S:244: Error: non-constant expression in ".if" statement
>>> arch/powerpc/kernel/cpu_setup_power.S:243: Error: unsupported relocation against SPRN_MMCR3

Hi,

As mentioned in changelog:

This patch was initially part of Power10 PMU base enablement
 series ( https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=190462 ).
 Moving this as separate patch as suggested by Michael
 Ellerman. Hence there is a dependency on initial series ( Patch 7 ) which defines
 MMCRA_BHRB_DISABLE and new SPR’s. And the build error is because of that.

Thanks
Athira

> 
> vim +244 arch/powerpc/kernel/cpu_setup_power.S
> 
>    14	
>    15	/* Entry: r3 = crap, r4 = ptr to cputable entry
>    16	 *
>    17	 * Note that we can be called twice for pseudo-PVRs
>    18	 */
>    19	_GLOBAL(__setup_cpu_power7)
>    20		mflr	r11
>    21		bl	__init_hvmode_206
>    22		mtlr	r11
>    23		beqlr
>    24		li	r0,0
>    25		mtspr	SPRN_LPID,r0
>    26		LOAD_REG_IMMEDIATE(r0, PCR_MASK)
>    27		mtspr	SPRN_PCR,r0
>    28		mfspr	r3,SPRN_LPCR
>    29		li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
>    30		bl	__init_LPCR_ISA206
>    31		mtlr	r11
>    32		blr
>    33	
>    34	_GLOBAL(__restore_cpu_power7)
>    35		mflr	r11
>    36		mfmsr	r3
>    37		rldicl.	r0,r3,4,63
>    38		beqlr
>    39		li	r0,0
>    40		mtspr	SPRN_LPID,r0
>    41		LOAD_REG_IMMEDIATE(r0, PCR_MASK)
>    42		mtspr	SPRN_PCR,r0
>    43		mfspr	r3,SPRN_LPCR
>    44		li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
>    45		bl	__init_LPCR_ISA206
>    46		mtlr	r11
>    47		blr
>    48	
>    49	_GLOBAL(__setup_cpu_power8)
>    50		mflr	r11
>    51		bl	__init_FSCR
>    52		bl	__init_PMU
>    53		bl	__init_PMU_ISA207
>    54		bl	__init_hvmode_206
>    55		mtlr	r11
>    56		beqlr
>    57		li	r0,0
>    58		mtspr	SPRN_LPID,r0
>    59		LOAD_REG_IMMEDIATE(r0, PCR_MASK)
>    60		mtspr	SPRN_PCR,r0
>    61		mfspr	r3,SPRN_LPCR
>    62		ori	r3, r3, LPCR_PECEDH
>    63		li	r4,0 /* LPES = 0 */
>    64		bl	__init_LPCR_ISA206
>    65		bl	__init_HFSCR
>    66		bl	__init_PMU_HV
>    67		bl	__init_PMU_HV_ISA207
>    68		mtlr	r11
>    69		blr
>    70	
>    71	_GLOBAL(__restore_cpu_power8)
>    72		mflr	r11
>    73		bl	__init_FSCR
>    74		bl	__init_PMU
>    75		bl	__init_PMU_ISA207
>    76		mfmsr	r3
>    77		rldicl.	r0,r3,4,63
>    78		mtlr	r11
>    79		beqlr
>    80		li	r0,0
>    81		mtspr	SPRN_LPID,r0
>    82		LOAD_REG_IMMEDIATE(r0, PCR_MASK)
>    83		mtspr	SPRN_PCR,r0
>    84		mfspr   r3,SPRN_LPCR
>    85		ori	r3, r3, LPCR_PECEDH
>    86		li	r4,0 /* LPES = 0 */
>    87		bl	__init_LPCR_ISA206
>    88		bl	__init_HFSCR
>    89		bl	__init_PMU_HV
>    90		bl	__init_PMU_HV_ISA207
>    91		mtlr	r11
>    92		blr
>    93	
>    94	_GLOBAL(__setup_cpu_power10)
>    95		mflr	r11
>    96		bl	__init_FSCR_power10
>    97		bl	__init_PMU
>    98		bl	__init_PMU_ISA31
>    99		b	1f
>   100	
>   101	_GLOBAL(__setup_cpu_power9)
>   102		mflr	r11
>   103		bl	__init_FSCR
>   104		bl	__init_PMU
>   105	1:	bl	__init_hvmode_206
>   106		mtlr	r11
>   107		beqlr
>   108		li	r0,0
>   109		mtspr	SPRN_PSSCR,r0
>   110		mtspr	SPRN_LPID,r0
>   111		mtspr	SPRN_PID,r0
>   112		LOAD_REG_IMMEDIATE(r0, PCR_MASK)
>   113		mtspr	SPRN_PCR,r0
>   114		mfspr	r3,SPRN_LPCR
>   115		LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE  | LPCR_HEIC)
>   116		or	r3, r3, r4
>   117		LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
>   118		andc	r3, r3, r4
>   119		li	r4,0 /* LPES = 0 */
>   120		bl	__init_LPCR_ISA300
>   121		bl	__init_HFSCR
>   122		bl	__init_PMU_HV
>   123		mtlr	r11
>   124		blr
>   125	
>   126	_GLOBAL(__restore_cpu_power10)
>   127		mflr	r11
>   128		bl	__init_FSCR_power10
>   129		bl	__init_PMU
>   130		bl	__init_PMU_ISA31
>   131		b	1f
>   132	
>   133	_GLOBAL(__restore_cpu_power9)
>   134		mflr	r11
>   135		bl	__init_FSCR
>   136		bl	__init_PMU
>   137	1:	mfmsr	r3
>   138		rldicl.	r0,r3,4,63
>   139		mtlr	r11
>   140		beqlr
>   141		li	r0,0
>   142		mtspr	SPRN_PSSCR,r0
>   143		mtspr	SPRN_LPID,r0
>   144		mtspr	SPRN_PID,r0
>   145		LOAD_REG_IMMEDIATE(r0, PCR_MASK)
>   146		mtspr	SPRN_PCR,r0
>   147		mfspr   r3,SPRN_LPCR
>   148		LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC)
>   149		or	r3, r3, r4
>   150		LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
>   151		andc	r3, r3, r4
>   152		li	r4,0 /* LPES = 0 */
>   153		bl	__init_LPCR_ISA300
>   154		bl	__init_HFSCR
>   155		bl	__init_PMU_HV
>   156		mtlr	r11
>   157		blr
>   158	
>   159	__init_hvmode_206:
>   160		/* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */
>   161		mfmsr	r3
>   162		rldicl.	r0,r3,4,63
>   163		bnelr
>   164		ld	r5,CPU_SPEC_FEATURES(r4)
>   165		LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST)
>   166		andc	r5,r5,r6
>   167		std	r5,CPU_SPEC_FEATURES(r4)
>   168		blr
>   169	
>   170	__init_LPCR_ISA206:
>   171		/* Setup a sane LPCR:
>   172		 *   Called with initial LPCR in R3 and desired LPES 2-bit value in R4
>   173		 *
>   174		 *   LPES = 0b01 (HSRR0/1 used for 0x500)
>   175		 *   PECE = 0b111
>   176		 *   DPFD = 4
>   177		 *   HDICE = 0
>   178		 *   VC = 0b100 (VPM0=1, VPM1=0, ISL=0)
>   179		 *   VRMASD = 0b10000 (L=1, LP=00)
>   180		 *
>   181		 * Other bits untouched for now
>   182		 */
>   183		li	r5,0x10
>   184		rldimi	r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
>   185	
>   186		/* POWER9 has no VRMASD */
>   187	__init_LPCR_ISA300:
>   188		rldimi	r3,r4, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
>   189		ori	r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
>   190		li	r5,4
>   191		rldimi	r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3
>   192		clrrdi	r3,r3,1		/* clear HDICE */
>   193		li	r5,4
>   194		rldimi	r3,r5, LPCR_VC_SH, 0
>   195		mtspr	SPRN_LPCR,r3
>   196		isync
>   197		blr
>   198	
>   199	__init_FSCR_power10:
>   200		mfspr	r3, SPRN_FSCR
>   201		ori	r3, r3, FSCR_PREFIX
>   202		mtspr	SPRN_FSCR, r3
>   203		// fall through
>   204	
>   205	__init_FSCR:
>   206		mfspr	r3,SPRN_FSCR
>   207		ori	r3,r3,FSCR_TAR|FSCR_EBB
>   208		mtspr	SPRN_FSCR,r3
>   209		blr
>   210	
>   211	__init_HFSCR:
>   212		mfspr	r3,SPRN_HFSCR
>   213		ori	r3,r3,HFSCR_TAR|HFSCR_TM|HFSCR_BHRB|HFSCR_PM|\
>   214			      HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB|HFSCR_MSGP
>   215		mtspr	SPRN_HFSCR,r3
>   216		blr
>   217	
>   218	__init_PMU_HV:
>   219		li	r5,0
>   220		mtspr	SPRN_MMCRC,r5
>   221		blr
>   222	
>   223	__init_PMU_HV_ISA207:
>   224		li	r5,0
>   225		mtspr	SPRN_MMCRH,r5
>   226		blr
>   227	
>   228	__init_PMU:
>   229		li	r5,0
>   230		mtspr	SPRN_MMCRA,r5
>   231		mtspr	SPRN_MMCR0,r5
>   232		mtspr	SPRN_MMCR1,r5
>   233		mtspr	SPRN_MMCR2,r5
>   234		blr
>   235	
>   236	__init_PMU_ISA207:
>   237		li	r5,0
>   238		mtspr	SPRN_MMCRS,r5
>   239		blr
>   240	
>   241	__init_PMU_ISA31:
>   242		li	r5,0
>> 243		mtspr	SPRN_MMCR3,r5
>> 244		LOAD_REG_IMMEDIATE(r5, MMCRA_BHRB_DISABLE)
> 
> ---
> 0-DAY CI Kernel Test Service, Intel Corporation
> https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
> <.config.gz>


^ permalink raw reply

* Re: [PATCH v2 2/3] powerpc/powernv/idle: save-restore DAWR0,DAWRX0 for P10
From: Pratik Sampat @ 2020-07-24  6:34 UTC (permalink / raw)
  To: Michael Neuling, mpe, benh, paulus, ravi.bangoria, ego, svaidy,
	pratik.r.sampat, linuxppc-dev, linux-kernel
In-Reply-To: <b9507631629bfc1f36893a280b2b83ea484516f9.camel@neuling.org>



On 24/07/20 6:55 am, Michael Neuling wrote:
> On Fri, 2020-07-10 at 10:52 +0530, Pratik Rajesh Sampat wrote:
>> Additional registers DAWR0, DAWRX0 may be lost on Power 10 for
>> stop levels < 4.
>> Therefore save the values of these SPRs before entering a  "stop"
>> state and restore their values on wakeup.
>>
>> Signed-off-by: Pratik Rajesh Sampat <psampat@linux.ibm.com>
>> ---
>>   arch/powerpc/platforms/powernv/idle.c | 10 ++++++++++
>>   1 file changed, 10 insertions(+)
>>
>> diff --git a/arch/powerpc/platforms/powernv/idle.c
>> b/arch/powerpc/platforms/powernv/idle.c
>> index 19d94d021357..f2e2a6a4c274 100644
>> --- a/arch/powerpc/platforms/powernv/idle.c
>> +++ b/arch/powerpc/platforms/powernv/idle.c
>> @@ -600,6 +600,8 @@ struct p9_sprs {
>>   	u64 iamr;
>>   	u64 amor;
>>   	u64 uamor;
>> +	u64 dawr0;
>> +	u64 dawrx0;
>>   };
>>   
>>   static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
>> @@ -687,6 +689,10 @@ static unsigned long power9_idle_stop(unsigned long
>> psscr, bool mmu_on)
>>   	sprs.iamr	= mfspr(SPRN_IAMR);
>>   	sprs.amor	= mfspr(SPRN_AMOR);
>>   	sprs.uamor	= mfspr(SPRN_UAMOR);
>> +	if (cpu_has_feature(CPU_FTR_ARCH_31)) {

You are actually viewing an old version of the patches
The main point of change were based on comments from Nick Piggin, I
have changed the top level function check from ARCH_300 to a P9 PVR
check instead.

A similar thing needs to be done for P10, however as the P10 PVR isn't
exposed yet, I've shelved this particular patch.

Nick's comment to check based on PVR:https://lkml.org/lkml/2020/7/13/1018
v4 of the series:https://lkml.org/lkml/2020/7/21/784

Thanks for your review,
Pratik

> Can you add a comment here saying even though DAWR0 is ARCH_30, it's only
> required to be saved on 31. Otherwise this looks pretty odd.
>
>> +		sprs.dawr0 = mfspr(SPRN_DAWR0);
>> +		sprs.dawrx0 = mfspr(SPRN_DAWRX0);
>> +	}
>>   
>>   	srr1 = isa300_idle_stop_mayloss(psscr);		/* go idle */
>>   
>> @@ -710,6 +716,10 @@ static unsigned long power9_idle_stop(unsigned long
>> psscr, bool mmu_on)
>>   		mtspr(SPRN_IAMR,	sprs.iamr);
>>   		mtspr(SPRN_AMOR,	sprs.amor);
>>   		mtspr(SPRN_UAMOR,	sprs.uamor);
>> +		if (cpu_has_feature(CPU_FTR_ARCH_31)) {
>> +			mtspr(SPRN_DAWR0, sprs.dawr0);
>> +			mtspr(SPRN_DAWRX0, sprs.dawrx0);
>> +		}
>>   
>>   		/*
>>   		 * Workaround for POWER9 DD2.0, if we lost resources, the ERAT


^ permalink raw reply

* [PATCH v4] powerpc/pseries: Avoid using addr_to_pfn in real mode
From: Ganesh Goudar @ 2020-07-24  6:39 UTC (permalink / raw)
  To: mpe, linuxppc-dev; +Cc: mahesh, Ganesh Goudar, npiggin, aneesh.kumar

When an UE or memory error exception is encountered the MCE handler
tries to find the pfn using addr_to_pfn() which takes effective
address as an argument, later pfn is used to poison the page where
memory error occurred, recent rework in this area made addr_to_pfn
to run in real mode, which can be fatal as it may try to access
memory outside RMO region.

Have two helper functions to separate things to be done in real mode
and virtual mode without changing any functionality. This also fixes
the following error as the use of addr_to_pfn is now moved to virtual
mode.

Without this change following kernel crash is seen on hitting UE.

[  485.128036] Oops: Kernel access of bad area, sig: 11 [#1]
[  485.128040] LE SMP NR_CPUS=2048 NUMA pSeries
[  485.128047] Modules linked in:
[  485.128067] CPU: 15 PID: 6536 Comm: insmod Kdump: loaded Tainted: G OE 5.7.0 #22
[  485.128074] NIP:  c00000000009b24c LR: c0000000000398d8 CTR: c000000000cd57c0
[  485.128078] REGS: c000000003f1f970 TRAP: 0300   Tainted: G OE (5.7.0)
[  485.128082] MSR:  8000000000001003 <SF,ME,RI,LE>  CR: 28008284  XER: 00000001
[  485.128088] CFAR: c00000000009b190 DAR: c0000001fab00000 DSISR: 40000000 IRQMASK: 1
[  485.128088] GPR00: 0000000000000001 c000000003f1fbf0 c000000001634300 0000b0fa01000000
[  485.128088] GPR04: d000000002220000 0000000000000000 00000000fab00000 0000000000000022
[  485.128088] GPR08: c0000001fab00000 0000000000000000 c0000001fab00000 c000000003f1fc14
[  485.128088] GPR12: 0000000000000008 c000000003ff5880 d000000002100008 0000000000000000
[  485.128088] GPR16: 000000000000ff20 000000000000fff1 000000000000fff2 d0000000021a1100
[  485.128088] GPR20: d000000002200000 c00000015c893c50 c000000000d49b28 c00000015c893c50
[  485.128088] GPR24: d0000000021a0d08 c0000000014e5da8 d0000000021a0818 000000000000000a
[  485.128088] GPR28: 0000000000000008 000000000000000a c0000000017e2970 000000000000000a
[  485.128125] NIP [c00000000009b24c] __find_linux_pte+0x11c/0x310
[  485.128130] LR [c0000000000398d8] addr_to_pfn+0x138/0x170
[  485.128133] Call Trace:
[  485.128135] Instruction dump:
[  485.128138] 3929ffff 7d4a3378 7c883c36 7d2907b4 794a1564 7d294038 794af082 3900ffff
[  485.128144] 79291f24 790af00e 78e70020 7d095214 <7c69502a> 2fa30000 419e011c 70690040
[  485.128152] ---[ end trace d34b27e29ae0e340 ]---

Fixes: 9ca766f9891d ("powerpc/64s/pseries: machine check convert to use common event code")
Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
---
V2: Leave bare metal code and save_mce_event as is.

V3: Have separate functions for realmode and virtual mode handling.

V4: Fix build warning, rephrase commit message.
---
 arch/powerpc/platforms/pseries/ras.c | 118 ++++++++++++++++-----------
 1 file changed, 69 insertions(+), 49 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index f3736fcd98fc..c509e43bac23 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -522,18 +522,55 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
 	return 0; /* need to perform reset */
 }
 
+static int mce_handle_err_realmode(int disposition, u8 error_type)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (disposition == RTAS_DISP_NOT_RECOVERED) {
+		switch (error_type) {
+		case	MC_ERROR_TYPE_SLB:
+		case	MC_ERROR_TYPE_ERAT:
+			/*
+			 * Store the old slb content in paca before flushing.
+			 * Print this when we go to virtual mode.
+			 * There are chances that we may hit MCE again if there
+			 * is a parity error on the SLB entry we trying to read
+			 * for saving. Hence limit the slb saving to single
+			 * level of recursion.
+			 */
+			if (local_paca->in_mce == 1)
+				slb_save_contents(local_paca->mce_faulty_slbs);
+			flush_and_reload_slb();
+			disposition = RTAS_DISP_FULLY_RECOVERED;
+			break;
+		default:
+			break;
+		}
+	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
+		/* Platform corrected itself but could be degraded */
+		pr_err("MCE: limited recovery, system may be degraded\n");
+		disposition = RTAS_DISP_FULLY_RECOVERED;
+	}
+#endif
+	return disposition;
+}
 
-static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
+static int mce_handle_err_virtmode(struct pt_regs *regs,
+				   struct rtas_error_log *errp,
+				   struct pseries_mc_errorlog *mce_log,
+				   int disposition)
 {
 	struct mce_error_info mce_err = { 0 };
-	unsigned long eaddr = 0, paddr = 0;
-	struct pseries_errorlog *pseries_log;
-	struct pseries_mc_errorlog *mce_log;
-	int disposition = rtas_error_disposition(errp);
 	int initiator = rtas_error_initiator(errp);
 	int severity = rtas_error_severity(errp);
+	unsigned long eaddr = 0, paddr = 0;
 	u8 error_type, err_sub_type;
 
+	if (!mce_log)
+		goto out;
+
+	error_type = mce_log->error_type;
+	err_sub_type = rtas_mc_error_sub_type(mce_log);
+
 	if (initiator == RTAS_INITIATOR_UNKNOWN)
 		mce_err.initiator = MCE_INITIATOR_UNKNOWN;
 	else if (initiator == RTAS_INITIATOR_CPU)
@@ -572,18 +609,7 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 	mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
 	mce_err.error_class = MCE_ECLASS_UNKNOWN;
 
-	if (!rtas_error_extended(errp))
-		goto out;
-
-	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
-	if (pseries_log == NULL)
-		goto out;
-
-	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
-	error_type = mce_log->error_type;
-	err_sub_type = rtas_mc_error_sub_type(mce_log);
-
-	switch (mce_log->error_type) {
+	switch (error_type) {
 	case MC_ERROR_TYPE_UE:
 		mce_err.error_type = MCE_ERROR_TYPE_UE;
 		mce_common_process_ue(regs, &mce_err);
@@ -683,37 +709,31 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 		mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
 		break;
 	}
+out:
+	save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
+		       &mce_err, regs->nip, eaddr, paddr);
+	return disposition;
+}
 
-#ifdef CONFIG_PPC_BOOK3S_64
-	if (disposition == RTAS_DISP_NOT_RECOVERED) {
-		switch (error_type) {
-		case	MC_ERROR_TYPE_SLB:
-		case	MC_ERROR_TYPE_ERAT:
-			/*
-			 * Store the old slb content in paca before flushing.
-			 * Print this when we go to virtual mode.
-			 * There are chances that we may hit MCE again if there
-			 * is a parity error on the SLB entry we trying to read
-			 * for saving. Hence limit the slb saving to single
-			 * level of recursion.
-			 */
-			if (local_paca->in_mce == 1)
-				slb_save_contents(local_paca->mce_faulty_slbs);
-			flush_and_reload_slb();
-			disposition = RTAS_DISP_FULLY_RECOVERED;
-			break;
-		default:
-			break;
-		}
-	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
-		/* Platform corrected itself but could be degraded */
-		printk(KERN_ERR "MCE: limited recovery, system may "
-		       "be degraded\n");
-		disposition = RTAS_DISP_FULLY_RECOVERED;
-	}
-#endif
+static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
+{
+	struct pseries_errorlog *pseries_log;
+	struct pseries_mc_errorlog *mce_log = NULL;
+	int disposition = rtas_error_disposition(errp);
+	u8 error_type;
+
+	if (!rtas_error_extended(errp))
+		goto out;
+
+	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+	if (!pseries_log)
+		goto out;
+
+	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+	error_type = mce_log->error_type;
+
+	disposition = mce_handle_err_realmode(disposition, error_type);
 
-out:
 	/*
 	 * Enable translation as we will be accessing per-cpu variables
 	 * in save_mce_event() which may fall outside RMO region, also
@@ -724,10 +744,10 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 	 * Note: All the realmode handling like flushing SLB entries for
 	 *       SLB multihit is done by now.
 	 */
+out:
 	mtmsr(mfmsr() | MSR_IR | MSR_DR);
-	save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
-			&mce_err, regs->nip, eaddr, paddr);
-
+	disposition = mce_handle_err_virtmode(regs, errp, mce_log,
+					      disposition);
 	return disposition;
 }
 
-- 
2.17.2


^ permalink raw reply related

* Re: [PATCH v3 02/10] powerpc/smp: Merge Power9 topology with Power topology
From: Gautham R Shenoy @ 2020-07-24  7:06 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Michael Neuling, Peter Zijlstra,
	LKML, Nicholas Piggin, Valentin Schneider, Oliver O'Halloran,
	Jordan Niethe, linuxppc-dev, Ingo Molnar
In-Reply-To: <20200723085116.4731-3-srikar@linux.vnet.ibm.com>

On Thu, Jul 23, 2020 at 02:21:08PM +0530, Srikar Dronamraju wrote:
> A new sched_domain_topology_level was added just for Power9. However the
> same can be achieved by merging powerpc_topology with power9_topology
> and makes the code more simpler especially when adding a new sched
> domain.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: LKML <linux-kernel@vger.kernel.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: Anton Blanchard <anton@ozlabs.org>
> Cc: Oliver O'Halloran <oohall@gmail.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@neuling.org>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Valentin Schneider <valentin.schneider@arm.com>
> Cc: Jordan Niethe <jniethe5@gmail.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

LGTM.


Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>

> ---
> Changelog v1 -> v2:
> 	Replaced a reference to cpu_smt_mask with per_cpu(cpu_sibling_map, cpu)
> 	since cpu_smt_mask is only defined under CONFIG_SCHED_SMT
> 
>  arch/powerpc/kernel/smp.c | 33 ++++++++++-----------------------
>  1 file changed, 10 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index edf94ca64eea..283a04e54f52 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1313,7 +1313,7 @@ int setup_profiling_timer(unsigned int multiplier)
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> -/* cpumask of CPUs with asymetric SMT dependancy */
> +/* cpumask of CPUs with asymmetric SMT dependency */
>  static int powerpc_smt_flags(void)
>  {
>  	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> @@ -1326,14 +1326,6 @@ static int powerpc_smt_flags(void)
>  }
>  #endif
> 
> -static struct sched_domain_topology_level powerpc_topology[] = {
> -#ifdef CONFIG_SCHED_SMT
> -	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
> -#endif
> -	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
> -	{ NULL, },
> -};
> -
>  /*
>   * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
>   * This topology makes it *much* cheaper to migrate tasks between adjacent cores
> @@ -1351,7 +1343,13 @@ static int powerpc_shared_cache_flags(void)
>   */
>  static const struct cpumask *shared_cache_mask(int cpu)
>  {
> -	return cpu_l2_cache_mask(cpu);
> +	if (shared_caches)
> +		return cpu_l2_cache_mask(cpu);
> +
> +	if (has_big_cores)
> +		return cpu_smallcore_mask(cpu);
> +
> +	return per_cpu(cpu_sibling_map, cpu);
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> @@ -1361,7 +1359,7 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> -static struct sched_domain_topology_level power9_topology[] = {
> +static struct sched_domain_topology_level powerpc_topology[] = {
>  #ifdef CONFIG_SCHED_SMT
>  	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
> @@ -1386,21 +1384,10 @@ void __init smp_cpus_done(unsigned int max_cpus)
>  #ifdef CONFIG_SCHED_SMT
>  	if (has_big_cores) {
>  		pr_info("Big cores detected but using small core scheduling\n");
> -		power9_topology[0].mask = smallcore_smt_mask;
>  		powerpc_topology[0].mask = smallcore_smt_mask;
>  	}
>  #endif
> -	/*
> -	 * If any CPU detects that it's sharing a cache with another CPU then
> -	 * use the deeper topology that is aware of this sharing.
> -	 */
> -	if (shared_caches) {
> -		pr_info("Using shared cache scheduler topology\n");
> -		set_sched_topology(power9_topology);
> -	} else {
> -		pr_info("Using standard scheduler topology\n");
> -		set_sched_topology(powerpc_topology);
> -	}
> +	set_sched_topology(powerpc_topology);
>  }
> 
>  #ifdef CONFIG_HOTPLUG_CPU
> -- 
> 2.18.2
> 

^ permalink raw reply

* Re: [PATCH v3 04/10] powerpc/smp: Move topology fixups into  a new function
From: Gautham R Shenoy @ 2020-07-24  7:08 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Michael Neuling, Peter Zijlstra,
	LKML, Nicholas Piggin, Valentin Schneider, Oliver O'Halloran,
	Jordan Niethe, linuxppc-dev, Ingo Molnar
In-Reply-To: <20200723085116.4731-5-srikar@linux.vnet.ibm.com>

On Thu, Jul 23, 2020 at 02:21:10PM +0530, Srikar Dronamraju wrote:
> Move topology fixup based on the platform attributes into its own
> function which is called just before set_sched_topology.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: LKML <linux-kernel@vger.kernel.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: Anton Blanchard <anton@ozlabs.org>
> Cc: Oliver O'Halloran <oohall@gmail.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@neuling.org>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Valentin Schneider <valentin.schneider@arm.com>
> Cc: Jordan Niethe <jniethe5@gmail.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>


Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
> ---
> Changelog v2 -> v3:
> 	Rewrote changelog (Gautham)
> 	Renamed to powerpc/smp: Move topology fixups into  a new function
> 
>  arch/powerpc/kernel/smp.c | 17 +++++++++++------
>  1 file changed, 11 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index a685915e5941..da27f6909be1 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1368,6 +1368,16 @@ int setup_profiling_timer(unsigned int multiplier)
>  	return 0;
>  }
> 
> +static void fixup_topology(void)
> +{
> +#ifdef CONFIG_SCHED_SMT
> +	if (has_big_cores) {
> +		pr_info("Big cores detected but using small core scheduling\n");
> +		powerpc_topology[0].mask = smallcore_smt_mask;
> +	}
> +#endif
> +}
> +
>  void __init smp_cpus_done(unsigned int max_cpus)
>  {
>  	/*
> @@ -1381,12 +1391,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
> 
>  	dump_numa_cpu_topology();
> 
> -#ifdef CONFIG_SCHED_SMT
> -	if (has_big_cores) {
> -		pr_info("Big cores detected but using small core scheduling\n");
> -		powerpc_topology[0].mask = smallcore_smt_mask;
> -	}
> -#endif
> +	fixup_topology();
>  	set_sched_topology(powerpc_topology);
>  }
> 
> -- 
> 2.18.2
> 

^ permalink raw reply

* Re: [PATCH v2 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling
From: Gautham R Shenoy @ 2020-07-24  7:10 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Peter Zijlstra, Jordan Niethe, Anton Blanchard,
	LKML, Valentin Schneider, Nick Piggin, linuxppc-dev, Ingo Molnar
In-Reply-To: <20200722065747.GB9290@linux.vnet.ibm.com>

On Wed, Jul 22, 2020 at 12:27:47PM +0530, Srikar Dronamraju wrote:
> * Gautham R Shenoy <ego@linux.vnet.ibm.com> [2020-07-22 11:51:14]:
> 
> > Hi Srikar,
> > 
> > > diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> > > index 72f16dc0cb26..57468877499a 100644
> > > --- a/arch/powerpc/kernel/smp.c
> > > +++ b/arch/powerpc/kernel/smp.c
> > > @@ -1196,6 +1196,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
> > >  	if (!l2_cache)
> > >  		return false;
> > > 
> > > +	cpumask_set_cpu(cpu, mask_fn(cpu));
> > 
> > 
> > Ok, we need to do this because "cpu" is not yet set in the
> > cpu_online_mask. Prior to your patch the "cpu" was getting set in
> > cpu_l2_cache_map(cpu) as a side-effect of the code that is removed in
> > the patch.
> > 
> 
> Right.
> 
> > 
> > >  	for_each_cpu(i, cpu_online_mask) {
> > >  		/*
> > >  		 * when updating the marks the current CPU has not been marked
> > > @@ -1278,29 +1279,30 @@ static void add_cpu_to_masks(int cpu)
> > >  	 * add it to it's own thread sibling mask.
> > >  	 */
> > >  	cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
> > > +	cpumask_set_cpu(cpu, cpu_core_mask(cpu));
> 
> Note: Above, we are explicitly setting the cpu_core_mask.

You are right. I missed this.

> 
> > > 
> > >  	for (i = first_thread; i < first_thread + threads_per_core; i++)
> > >  		if (cpu_online(i))
> > >  			set_cpus_related(i, cpu, cpu_sibling_mask);
> > > 
> > >  	add_cpu_to_smallcore_masks(cpu);
> > > -	/*
> > > -	 * Copy the thread sibling mask into the cache sibling mask
> > > -	 * and mark any CPUs that share an L2 with this CPU.
> > > -	 */
> > > -	for_each_cpu(i, cpu_sibling_mask(cpu))
> > > -		set_cpus_related(cpu, i, cpu_l2_cache_mask);
> > >  	update_mask_by_l2(cpu, cpu_l2_cache_mask);
> > > 
> > > -	/*
> > > -	 * Copy the cache sibling mask into core sibling mask and mark
> > > -	 * any CPUs on the same chip as this CPU.
> > > -	 */
> > > -	for_each_cpu(i, cpu_l2_cache_mask(cpu))
> > > -		set_cpus_related(cpu, i, cpu_core_mask);
> > > +	if (pkg_id == -1) {
> > 
> > I suppose this "if" condition is an optimization, since if pkg_id != -1,
> > we anyway set these CPUs in the cpu_core_mask below.
> > 
> > However...
> 
> This is not just an optimization.
> The hunk removed would only work if cpu_l2_cache_mask is bigger than
> cpu_sibling_mask. (this was the previous assumption that we want to break)
> If the cpu_sibling_mask is bigger than cpu_l2_cache_mask and pkg_id is -1,
> then setting only cpu_l2_cache_mask in cpu_core_mask will result in a broken 
> topology.
> 
> > 
> > > +		struct cpumask *(*mask)(int) = cpu_sibling_mask;
> > > +
> > > +		/*
> > > +		 * Copy the sibling mask into core sibling mask and
> > > +		 * mark any CPUs on the same chip as this CPU.
> > > +		 */
> > > +		if (shared_caches)
> > > +			mask = cpu_l2_cache_mask;
> > > +
> > > +		for_each_cpu(i, mask(cpu))
> > > +			set_cpus_related(cpu, i, cpu_core_mask);
> > > 
> > > -	if (pkg_id == -1)
> > >  		return;
> > > +	}
> > 
> > 
> > ... since "cpu" is not yet set in the cpu_online_mask, do we not miss setting
> > "cpu" in the cpu_core_mask(cpu) in the for-loop below ?
> > 
> > 
> 
> As noted above, we are setting before. So we don't missing the cpu and hence
> have not different from before.


Fair enough.

> 
> > --
> > Thanks and Regards
> > gautham.
> 
> -- 
> Thanks and Regards
> Srikar Dronamraju

^ permalink raw reply

* Re: [PATCH v3 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling
From: Gautham R Shenoy @ 2020-07-24  7:11 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Michael Neuling, Peter Zijlstra,
	LKML, Nicholas Piggin, Valentin Schneider, Oliver O'Halloran,
	Jordan Niethe, linuxppc-dev, Ingo Molnar
In-Reply-To: <20200723085116.4731-6-srikar@linux.vnet.ibm.com>


On Thu, Jul 23, 2020 at 02:21:11PM +0530, Srikar Dronamraju wrote:
> Current code assumes that cpumask of cpus sharing a l2-cache mask will
> always be a superset of cpu_sibling_mask.
> 
> Lets stop that assumption. cpu_l2_cache_mask is a superset of
> cpu_sibling_mask if and only if shared_caches is set.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: LKML <linux-kernel@vger.kernel.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: Anton Blanchard <anton@ozlabs.org>
> Cc: Oliver O'Halloran <oohall@gmail.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@neuling.org>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Valentin Schneider <valentin.schneider@arm.com>
> Cc: Jordan Niethe <jniethe5@gmail.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>

> ---
> Changelog v1 -> v2:
> 	Set cpumask after verifying l2-cache. (Gautham)
> 
>  arch/powerpc/kernel/smp.c | 28 +++++++++++++++-------------
>  1 file changed, 15 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index da27f6909be1..d997c7411664 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1194,6 +1194,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
>  	if (!l2_cache)
>  		return false;
> 
> +	cpumask_set_cpu(cpu, mask_fn(cpu));
>  	for_each_cpu(i, cpu_online_mask) {
>  		/*
>  		 * when updating the marks the current CPU has not been marked
> @@ -1276,29 +1277,30 @@ static void add_cpu_to_masks(int cpu)
>  	 * add it to it's own thread sibling mask.
>  	 */
>  	cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
> +	cpumask_set_cpu(cpu, cpu_core_mask(cpu));
> 
>  	for (i = first_thread; i < first_thread + threads_per_core; i++)
>  		if (cpu_online(i))
>  			set_cpus_related(i, cpu, cpu_sibling_mask);
> 
>  	add_cpu_to_smallcore_masks(cpu);
> -	/*
> -	 * Copy the thread sibling mask into the cache sibling mask
> -	 * and mark any CPUs that share an L2 with this CPU.
> -	 */
> -	for_each_cpu(i, cpu_sibling_mask(cpu))
> -		set_cpus_related(cpu, i, cpu_l2_cache_mask);
>  	update_mask_by_l2(cpu, cpu_l2_cache_mask);
> 
> -	/*
> -	 * Copy the cache sibling mask into core sibling mask and mark
> -	 * any CPUs on the same chip as this CPU.
> -	 */
> -	for_each_cpu(i, cpu_l2_cache_mask(cpu))
> -		set_cpus_related(cpu, i, cpu_core_mask);
> +	if (pkg_id == -1) {
> +		struct cpumask *(*mask)(int) = cpu_sibling_mask;
> +
> +		/*
> +		 * Copy the sibling mask into core sibling mask and
> +		 * mark any CPUs on the same chip as this CPU.
> +		 */
> +		if (shared_caches)
> +			mask = cpu_l2_cache_mask;
> +
> +		for_each_cpu(i, mask(cpu))
> +			set_cpus_related(cpu, i, cpu_core_mask);
> 
> -	if (pkg_id == -1)
>  		return;
> +	}
> 
>  	for_each_cpu(i, cpu_online_mask)
>  		if (get_physical_package_id(i) == pkg_id)
> -- 
> 2.18.2
> 

^ permalink raw reply

* Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone
From: Arnd Bergmann @ 2020-07-24  7:20 UTC (permalink / raw)
  To: Atish Patra
  Cc: Albert Ou, Alexandre Ghiti, Linux-MM, Anup Patel,
	linux-kernel@vger.kernel.org, Atish Patra, Palmer Dabbelt,
	Zong Li, Paul Walmsley, Paul Mackerras, linux-riscv, linuxppc-dev
In-Reply-To: <CAOnJCULmX+vUcpEmBd5w7xjtZSFk=Ju2V=wBJCOXHQ8m9yG9-Q@mail.gmail.com>

On Wed, Jul 22, 2020 at 11:06 PM Atish Patra <atishp@atishpatra.org> wrote:
>
> On Wed, Jul 22, 2020 at 1:23 PM Arnd Bergmann <arnd@arndb.de> wrote:
> >
> > I just noticed that rv32 allows 2GB of lowmem rather than just the usual
> > 768MB or 1GB, at the expense of addressable user memory. This seems
> > like an unusual choice, but I also don't see any reason to change this
> > or make it more flexible unless actual users appear.
> >
>
> I am a bit confused here. As per my understanding, RV32 supports 1GB
> of lowmem only
> as the page offset is set to 0xC0000000. The config option
> MAXPHYSMEM_2GB is misleading
> as RV32 actually allows 1GB of physical memory only.

Ok, in that case I was apparently misled by the Kconfig option name.

I just tried building a kernel to see what the boundaries actually are,
as this is not the only confusing bit. Here is what I see:

0x9dc00000 TASK_SIZE/FIXADDR_START   /* code comment says 0x9fc00000 */
0x9e000000 FIXADDR_TOP/PCI_IO_START
0x9f000000 PCI_IO_END/VMEMMAP_START
0xa0000000 VMEMMAP_END/VMALLOC_START
0xc0000000 VMALLOC_END/PAGE_OFFSET

Having exactly 1GB of linear map does make a lot of sense. Having PCI I/O,
vmemmap and fixmap come out of the user range means you get slightly
different behavior in user space if there are any changes to that set, but
that is probably fine as well, if you want the flexibility to go to a 2GB linear
map and expect user space to deal with that as well.

There is one common trick from arm32 however that you might want to
consider: if vmalloc was moved above the linear map rather than below,
the size of the vmalloc area can dynamically depend on the amount of
RAM that is actually present rather than be set to a fixed value.

On arm32, there is around 240MB of vmalloc space if the linear map
is fully populated with RAM, but it can grow to use all of the avaialable
address space if less RAM was detected at boot time (up to 3GB
depending on CONFIG_VMSPLIT).

> Any memory blocks beyond
> DRAM + 1GB are removed in setup_bootmem. IMHO, The current config
> should clarify that.
>
> Moreover, we should add 2G split under a separate configuration if we
> want to support that.

Right. It's probably not needed immediately, but can't hurt either.

    Arnd

^ permalink raw reply

* Re: [PATCH v5 7/7] KVM: PPC: Book3S HV: rework secure mem slot dropping
From: Laurent Dufour @ 2020-07-24  7:43 UTC (permalink / raw)
  To: bharata, Ram Pai
  Cc: cclaudio, kvm-ppc, sathnaga, aneesh.kumar, sukadev, linuxppc-dev,
	bauerman, david
In-Reply-To: <20200724030337.GC1082478@in.ibm.com>

Le 24/07/2020 à 05:03, Bharata B Rao a écrit :
> On Thu, Jul 23, 2020 at 01:07:24PM -0700, Ram Pai wrote:
>> From: Laurent Dufour <ldufour@linux.ibm.com>
>>
>> When a secure memslot is dropped, all the pages backed in the secure
>> device (aka really backed by secure memory by the Ultravisor)
>> should be paged out to a normal page. Previously, this was
>> achieved by triggering the page fault mechanism which is calling
>> kvmppc_svm_page_out() on each pages.
>>
>> This can't work when hot unplugging a memory slot because the memory
>> slot is flagged as invalid and gfn_to_pfn() is then not trying to access
>> the page, so the page fault mechanism is not triggered.
>>
>> Since the final goal is to make a call to kvmppc_svm_page_out() it seems
>> simpler to call directly instead of triggering such a mechanism. This
>> way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a
>> memslot.
>>
>> Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock,
>> the call to __kvmppc_svm_page_out() is made.  As
>> __kvmppc_svm_page_out needs the vma pointer to migrate the pages,
>> the VMA is fetched in a lazy way, to not trigger find_vma() all
>> the time. In addition, the mmap_sem is held in read mode during
>> that time, not in write mode since the virual memory layout is not
>> impacted, and kvm->arch.uvmem_lock prevents concurrent operation
>> on the secure device.
>>
>> Cc: Ram Pai <linuxram@us.ibm.com>
>> Cc: Bharata B Rao <bharata@linux.ibm.com>
>> Cc: Paul Mackerras <paulus@ozlabs.org>
>> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
>> 	[modified the changelog description]
>> Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
>> ---
>>   arch/powerpc/kvm/book3s_hv_uvmem.c | 54 ++++++++++++++++++++++++++------------
>>   1 file changed, 37 insertions(+), 17 deletions(-)
>>
>> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
>> index c772e92..daffa6e 100644
>> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
>> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
>> @@ -632,35 +632,55 @@ static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
>>    * fault on them, do fault time migration to replace the device PTEs in
>>    * QEMU page table with normal PTEs from newly allocated pages.
>>    */
>> -void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
>> +void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
>>   			     struct kvm *kvm, bool skip_page_out)
>>   {
>>   	int i;
>>   	struct kvmppc_uvmem_page_pvt *pvt;
>> -	unsigned long pfn, uvmem_pfn;
>> -	unsigned long gfn = free->base_gfn;
>> +	struct page *uvmem_page;
>> +	struct vm_area_struct *vma = NULL;
>> +	unsigned long uvmem_pfn, gfn;
>> +	unsigned long addr, end;
>> +
>> +	mmap_read_lock(kvm->mm);
>> +
>> +	addr = slot->userspace_addr;
>> +	end = addr + (slot->npages * PAGE_SIZE);
>>   
>> -	for (i = free->npages; i; --i, ++gfn) {
>> -		struct page *uvmem_page;
>> +	gfn = slot->base_gfn;
>> +	for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) {
>> +
>> +		/* Fetch the VMA if addr is not in the latest fetched one */
>> +		if (!vma || (addr < vma->vm_start || addr >= vma->vm_end)) {
>> +			vma = find_vma_intersection(kvm->mm, addr, end);
>> +			if (!vma ||
>> +			    vma->vm_start > addr || vma->vm_end < end) {
>> +				pr_err("Can't find VMA for gfn:0x%lx\n", gfn);
>> +				break;
>> +			}
> 
> There is a potential issue with the boundary condition check here
> which I discussed with Laurent yesterday. Guess he hasn't gotten around
> to look at it yet.

Right, I'm working on that..



^ permalink raw reply

* [PATCH] powerpc/book3s64/radix: Add kernel command line option to disable radix GTSE
From: Aneesh Kumar K.V @ 2020-07-24  7:56 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, Bharata B Rao

This adds a kernel command line option that can be used to disable GTSE support.
Disabling GTSE implies kernel will make hcalls to invalidate TLB entries.

This was done so that we can do VM migration between configs that enable/disable
GTSE support via hypervisor. To migrate a VM from a system that supports
GTSE to a system that doesn't, we can boot the guest with radix_gtse=off, thereby
forcing the guest to use hcalls for TLB invalidates.

The check for hcall availability is done in pSeries_setup_arch so that
the panic message appears on the console. This should only happen on
a hypervisor that doesn't force the guest to hash translation even
though it can't handle the radix GTSE=0 request via CAS. With radix_gtse=off
if the hypervisor doesn't support hcall_rpt_invalidate hcall it should
force the LPAR to hash translation.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  3 +++
 arch/powerpc/include/asm/firmware.h             |  4 +++-
 arch/powerpc/kernel/prom_init.c                 | 13 +++++++++----
 arch/powerpc/platforms/pseries/firmware.c       |  1 +
 arch/powerpc/platforms/pseries/setup.c          |  5 +++++
 5 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb95fad81c79..df20c98a8920 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -896,6 +896,9 @@
 	disable_radix	[PPC]
 			Disable RADIX MMU mode on POWER9
 
+	radix_gtse=off	[PPC/PSERIES]
+			Disable RADIX GTSE feature.
+
 	disable_tlbie	[PPC]
 			Disable TLBIE instruction. Currently does not work
 			with KVM, with HASH MMU, or with coherent accelerators.
diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index 6003c2e533a0..aa6a5ef5d483 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -52,6 +52,7 @@
 #define FW_FEATURE_PAPR_SCM 	ASM_CONST(0x0000002000000000)
 #define FW_FEATURE_ULTRAVISOR	ASM_CONST(0x0000004000000000)
 #define FW_FEATURE_STUFF_TCE	ASM_CONST(0x0000008000000000)
+#define FW_FEATURE_RPT_INVALIDATE ASM_CONST(0x0000010000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -71,7 +72,8 @@ enum {
 		FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
 		FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
 		FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
-		FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR,
+		FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
+		FW_FEATURE_RPT_INVALIDATE,
 	FW_FEATURE_PSERIES_ALWAYS = 0,
 	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR,
 	FW_FEATURE_POWERNV_ALWAYS = 0,
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index cbc605cfdec0..e7e91965fe6c 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -169,6 +169,7 @@ static unsigned long __prombss prom_tce_alloc_end;
 
 #ifdef CONFIG_PPC_PSERIES
 static bool __prombss prom_radix_disable;
+static bool __prombss prom_radix_gtse_disable;
 static bool __prombss prom_xive_disable;
 #endif
 
@@ -823,6 +824,12 @@ static void __init early_cmdline_parse(void)
 	if (prom_radix_disable)
 		prom_debug("Radix disabled from cmdline\n");
 
+	opt = prom_strstr(prom_cmd_line, "radix_gtse=off");
+	if (opt) {
+		prom_radix_gtse_disable = true;
+		prom_debug("Radix GTSE disabled from cmdline\n");
+	}
+
 	opt = prom_strstr(prom_cmd_line, "xive=off");
 	if (opt) {
 		prom_xive_disable = true;
@@ -1285,10 +1292,8 @@ static void __init prom_parse_platform_support(u8 index, u8 val,
 		prom_parse_mmu_model(val & OV5_FEAT(OV5_MMU_SUPPORT), support);
 		break;
 	case OV5_INDX(OV5_RADIX_GTSE): /* Radix Extensions */
-		if (val & OV5_FEAT(OV5_RADIX_GTSE)) {
-			prom_debug("Radix - GTSE supported\n");
-			support->radix_gtse = true;
-		}
+		if (val & OV5_FEAT(OV5_RADIX_GTSE))
+			support->radix_gtse = !prom_radix_gtse_disable;
 		break;
 	case OV5_INDX(OV5_XIVE_SUPPORT): /* Interrupt mode */
 		prom_parse_xive_model(val & OV5_FEAT(OV5_XIVE_SUPPORT),
diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
index 3e49cc23a97a..4c7b7f5a2ebc 100644
--- a/arch/powerpc/platforms/pseries/firmware.c
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -65,6 +65,7 @@ hypertas_fw_features_table[] = {
 	{FW_FEATURE_HPT_RESIZE,		"hcall-hpt-resize"},
 	{FW_FEATURE_BLOCK_REMOVE,	"hcall-block-remove"},
 	{FW_FEATURE_PAPR_SCM,		"hcall-scm"},
+	{FW_FEATURE_RPT_INVALIDATE,	"hcall-rpt-invalidate"},
 };
 
 /* Build up the firmware features bitmask using the contents of
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 2db8469e475f..9e5e098665f7 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -745,6 +745,11 @@ static void __init pSeries_setup_arch(void)
 	smp_init_pseries();
 
 
+	if (radix_enabled() && !mmu_has_feature(MMU_FTR_GTSE))
+		if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
+			panic("BUG: Radix support requires either GTSE or RPT_INVALIDATE\n");
+
+
 	/* openpic global configuration register (64-bit format). */
 	/* openpic Interrupt Source Unit pointer (64-bit format). */
 	/* python0 facility area (mmio) (64-bit format) REAL address. */
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone
From: Arnd Bergmann @ 2020-07-24  8:14 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Albert Ou, Alex Ghiti, Atish Patra, Anup Patel,
	linux-kernel@vger.kernel.org, Linux-MM, Palmer Dabbelt, Zong Li,
	Paul Walmsley, Paul Mackerras, linux-riscv, linuxppc-dev
In-Reply-To: <418d5f3d3f42bbc79c5cf30e18ec89edfe2dbd26.camel@kernel.crashing.org>

On Fri, Jul 24, 2020 at 12:34 AM Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Thu, 2020-07-23 at 01:21 -0400, Alex Ghiti wrote:
> > > works fine with huge pages, what is your problem there ? You rely on
> > > punching small-page size holes in there ?
> > >
> >
> > ARCH_HAS_STRICT_KERNEL_RWX prevents the use of a hugepage for the kernel
> > mapping in the direct mapping as it sets different permissions to
> > different part of the kernel (data, text..etc).
>
> Ah ok, that can be solved in a couple of ways...
>
> One is to use the linker script to ensure those sections are linked
> HUGE_PAGE_SIZE appart and moved appropriately by early boot code. One
> is to selectively degrade just those huge pages.
>
> I'm not familiar with the RiscV MMU (I should probably go have a look)
> but if it's a classic radix tree with huge pages at PUD/PMD level, then
> you could just degrade the one(s) that cross those boundaries.

That would work, but if the system can otherwise use 1GB-sized pages,
that might mean degrading the first gigabyte into a mix of 2MB pages
and 4KB pages.

If the kernel is in vmalloc space and vmap is able to use 2MB pages
for contiguous chunks of the mapping, you get a somewhat better
TLB usage.

However, this also means that a writable mapping exists in the
linear mapping for any executable part of the kernel (.text in
both vmlinux and modules). Do we have that on other architectures
as well, or is this something that ought to be prevented with
STRICT_KERNEL_RWX/STRICT_MODULE_RWX?

     Arnd

^ permalink raw reply

* Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
From: Will Deacon @ 2020-07-24  8:16 UTC (permalink / raw)
  To: peterz
  Cc: linux-arch, Boqun Feng, linux-kernel, Nicholas Piggin,
	virtualization, Ingo Molnar, kvm-ppc, Waiman Long, linuxppc-dev
In-Reply-To: <20200723184759.GS119549@hirez.programming.kicks-ass.net>

On Thu, Jul 23, 2020 at 08:47:59PM +0200, peterz@infradead.org wrote:
> On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
> > BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
> > I will have to update the patch to fix the reported 0-day test problem, but
> > I want to collect other feedback before sending out v3.
> 
> I want to say I hate it all, it adds instructions to a path we spend an
> aweful lot of time optimizing without really getting anything back for
> it.
> 
> Will, how do you feel about it?

I can see it potentially being useful for debugging, but I hate the
limitation to 256 CPUs. Even arm64 is hitting that now.

Also, you're talking ~1% gains here. I think our collective time would
be better spent off reviewing the CNA series and trying to make it more
deterministic.

Will

^ permalink raw reply

* Re: [v3 12/15] powerpc/perf: Add support for outputting extended regs in perf intr_regs
From: Athira Rajeev @ 2020-07-24  8:25 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Gautham R Shenoy, Michael Neuling, maddy, kvm, kajoljain, kvm-ppc,
	svaidyan, Jiri Olsa, linuxppc-dev
In-Reply-To: <20200723145609.GA1943963@kernel.org>



> On 23-Jul-2020, at 8:26 PM, Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> 
> Em Thu, Jul 23, 2020 at 11:14:16AM +0530, kajoljain escreveu:
>> 
>> 
>> On 7/21/20 11:32 AM, kajoljain wrote:
>>> 
>>> 
>>> On 7/17/20 8:08 PM, Athira Rajeev wrote:
>>>> From: Anju T Sudhakar <anju@linux.vnet.ibm.com>
>>>> 
>>>> Add support for perf extended register capability in powerpc.
>>>> The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to indicate the
>>>> PMU which support extended registers. The generic code define the mask
>>>> of extended registers as 0 for non supported architectures.
>>>> 
>>>> Patch adds extended regs support for power9 platform by
>>>> exposing MMCR0, MMCR1 and MMCR2 registers.
>>>> 
>>>> REG_RESERVED mask needs update to include extended regs.
>>>> `PERF_REG_EXTENDED_MASK`, contains mask value of the supported registers,
>>>> is defined at runtime in the kernel based on platform since the supported
>>>> registers may differ from one processor version to another and hence the
>>>> MASK value.
>>>> 
>>>> with patch
>>>> ----------
>>>> 
>>>> available registers: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11
>>>> r12 r13 r14 r15 r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r26
>>>> r27 r28 r29 r30 r31 nip msr orig_r3 ctr link xer ccr softe
>>>> trap dar dsisr sier mmcra mmcr0 mmcr1 mmcr2
>>>> 
>>>> PERF_RECORD_SAMPLE(IP, 0x1): 4784/4784: 0 period: 1 addr: 0
>>>> ... intr regs: mask 0xffffffffffff ABI 64-bit
>>>> .... r0    0xc00000000012b77c
>>>> .... r1    0xc000003fe5e03930
>>>> .... r2    0xc000000001b0e000
>>>> .... r3    0xc000003fdcddf800
>>>> .... r4    0xc000003fc7880000
>>>> .... r5    0x9c422724be
>>>> .... r6    0xc000003fe5e03908
>>>> .... r7    0xffffff63bddc8706
>>>> .... r8    0x9e4
>>>> .... r9    0x0
>>>> .... r10   0x1
>>>> .... r11   0x0
>>>> .... r12   0xc0000000001299c0
>>>> .... r13   0xc000003ffffc4800
>>>> .... r14   0x0
>>>> .... r15   0x7fffdd8b8b00
>>>> .... r16   0x0
>>>> .... r17   0x7fffdd8be6b8
>>>> .... r18   0x7e7076607730
>>>> .... r19   0x2f
>>>> .... r20   0xc00000001fc26c68
>>>> .... r21   0xc0002041e4227e00
>>>> .... r22   0xc00000002018fb60
>>>> .... r23   0x1
>>>> .... r24   0xc000003ffec4d900
>>>> .... r25   0x80000000
>>>> .... r26   0x0
>>>> .... r27   0x1
>>>> .... r28   0x1
>>>> .... r29   0xc000000001be1260
>>>> .... r30   0x6008010
>>>> .... r31   0xc000003ffebb7218
>>>> .... nip   0xc00000000012b910
>>>> .... msr   0x9000000000009033
>>>> .... orig_r3 0xc00000000012b86c
>>>> .... ctr   0xc0000000001299c0
>>>> .... link  0xc00000000012b77c
>>>> .... xer   0x0
>>>> .... ccr   0x28002222
>>>> .... softe 0x1
>>>> .... trap  0xf00
>>>> .... dar   0x0
>>>> .... dsisr 0x80000000000
>>>> .... sier  0x0
>>>> .... mmcra 0x80000000000
>>>> .... mmcr0 0x82008090
>>>> .... mmcr1 0x1e000000
>>>> .... mmcr2 0x0
>>>> ... thread: perf:4784
>>>> 
>>>> Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
>>>> [Defined PERF_REG_EXTENDED_MASK at run time to add support for different platforms ]
>>>> Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
>>>> Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
>>>> ---
>>> 
>>> Patch looks good to me.
>>> 
>>> Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
>> 
>> Hi Arnaldo and Jiri,
>> 	 Please let me know if you have any comments on these patches. Can you pull/ack these
>> patches if they seems fine to you.
> 
> Can you please clarify something here, I think I saw a kernel build bot
> complaint followed by a fix, in these cases I think, for reviewer's
> sake, that this would entail a v4 patchkit? One that has no such build
> issues?
> 
> Or have I got something wrong?

Hi Arnaldo,

yes you are right, I will send version 4 as a new series with changes to add support for extended regs and including fix for the build issue.
Thanks for your response.

Athira 

> 
> - Arnaldo


^ permalink raw reply

* Re: [PATCH v2 1/3] module: Rename module_alloc() to text_alloc() and move to kernel proper
From: Jarkko Sakkinen @ 2020-07-24  7:36 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Catalin Marinas, Kefeng Wang, Paul Mackerras, Zong Li, Andi Kleen,
	Paul Burton, Vincent Whitchurch, Petr Mladek, Brian Gerst,
	Andy Lutomirski, Thomas Gleixner, Jiri Kosina, Anup Patel,
	Linux Kernel Mailing List, Philipp Rudo, Torsten Duwe,
	Masami Hiramatsu, Andrew Morton, Mark Rutland,
	James E.J. Bottomley, Vincent Chen, Omar Sandoval, open list:S390,
	Joe Lawrence, Helge Deller, John Fastabend, Anil S Keshavamurthy,
	Yonghong Song, Iurii Zaikin, Andrii Nakryiko, Thomas Huth,
	Vasily Gorbik, moderated list:ARM PORT, Daniel Axtens,
	Damien Le Moal, Martin KaFai Lau, Song Liu, Josh Poimboeuf,
	Heiko Carstens, Alexei Starovoitov, Atish Patra, Will Deacon,
	Daniel Borkmann, Masahiro Yamada, Nayna Jain, Ley Foon Tan,
	Christian Borntraeger, Sami Tolvanen, Naveen N. Rao, Mao Han,
	Marco Elver, Steven Rostedt, Babu Moger, Borislav Petkov,
	Greentime Hu, Ben Dooks, Guan Xuetao, Thomas Bogendoerfer,
	open list:PARISC ARCHITECTURE, Jessica Yu,
	open list:BPF JIT for MIPS 32-BIT AND 64-BIT, David S. Miller,
	Thiago Jung Bauermann, Peter Zijlstra, David Howells,
	Amit Daniel Kachhap, Sandipan Das, H. Peter Anvin,
	open list:SPARC + UltraSPARC sparc/sparc64,
	open list:RISC-V ARCHITECTURE, Miroslav Benes, Jiri Olsa,
	Tiezhu Yang, Vincenzo Frascino, Anders Roxell, Sven Schnelle,
	maintainer:X86 ARCHITECTURE 32-BIT AND 64-BIT, Russell King,
	Mike Rapoport, Ingo Molnar, Albert Ou, Paul E. McKenney,
	Paul Walmsley, KP Singh, Dmitry Vyukov, Nick Hu,
	open list:BPF JIT for MIPS 32-BIT AND 64-BIT, open list:MIPS,
	Palmer Dabbelt, open list:LINUX FOR POWERPC 32-BIT AND 64-BIT
In-Reply-To: <CAMj1kXGJhqC+asc6JUNeEkRsHYTzNQVe4-65vKqigbW03gO9Jg@mail.gmail.com>

On Thu, Jul 23, 2020 at 03:42:09PM +0300, Ard Biesheuvel wrote:
> On Thu, 23 Jul 2020 at 04:52, Jarkko Sakkinen
> <jarkko.sakkinen@linux.intel.com> wrote:
> >
> > On Thu, Jul 16, 2020 at 06:49:09PM +0200, Christophe Leroy wrote:
> > > Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> a écrit :
> > >
> > > > Rename module_alloc() to text_alloc() and module_memfree() to
> > > > text_memfree(), and move them to kernel/text.c, which is unconditionally
> > > > compiled to the kernel proper. This allows kprobes, ftrace and bpf to
> > > > allocate space for executable code without requiring to compile the modules
> > > > support (CONFIG_MODULES=y) in.
> > >
> > > You are not changing enough in powerpc to have this work.
> > > On powerpc 32 bits (6xx), when STRICT_KERNEL_RWX is selected, the vmalloc
> > > space is set to NX (no exec) at segment level (ie by 256Mbytes zone) unless
> > > CONFIG_MODULES is selected.
> > >
> > > Christophe
> >
> > This has been deduced down to:
> >
> > https://lore.kernel.org/lkml/20200717030422.679972-1-jarkko.sakkinen@linux.intel.com/
> >
> > I.e. not intruding PPC anymore :-)
> >
> 
> Ok, so after the elaborate discussion we had between Jessica, Russell,
> Peter, Will, Mark, you and myself, where we pointed out that
> a) a single text_alloc() abstraction for bpf, kprobes and ftrace does
> not fit other architectures very well, and
> b) that module_alloc() is not suitable as a default to base text_alloc() on,

In the latest iteration (v5) it is conditionally available only if arch
defines and fallback has been removed.

> you went ahead and implemented that anyway, but only cc'ing Peter,
> akpm, Masami and the mm list this time?

No problems with that. Actually each patch gets everything that
get_maintainer.pl gives with a cc cmd script, not just the ones
explicitly listed in the patch.

Should I explicitly CC you to the next version? I'm happy to grow
the list when requested.

> Sorry, but that is not how it works. Once people get pulled into a
> discussion, you cannot dismiss them or their feedback like that and go
> off and do your own thing anyway. Generic features like this are
> tricky to get right, and it will likely take many iterations and input
> from many different people.

Sure. I'm not expecting this move quickly.

I don't think I've at least purposely done that. As you said it's tricky
to get this right.

/Jarkko

^ permalink raw reply

* Re: [PATCH v2 1/3] module: Rename module_alloc() to text_alloc() and move to kernel proper
From: Jessica Yu @ 2020-07-24  8:05 UTC (permalink / raw)
  To: Jarkko Sakkinen
  Cc: Catalin Marinas, Kefeng Wang, Paul Mackerras, Zong Li, Andi Kleen,
	Paul Burton, Vincent Whitchurch, Petr Mladek, Brian Gerst,
	Andy Lutomirski, Thomas Gleixner, Jiri Kosina, Anup Patel,
	Linux Kernel Mailing List, Philipp Rudo, Torsten Duwe,
	Masami Hiramatsu, Andrew Morton, Mark Rutland,
	James E.J. Bottomley, Vincent Chen, Omar Sandoval, open list:S390,
	Joe Lawrence, Helge Deller, John Fastabend, Anil S Keshavamurthy,
	Yonghong Song, Iurii Zaikin, Andrii Nakryiko, Thomas Huth,
	Vasily Gorbik, moderated list:ARM PORT, Daniel Axtens,
	Damien Le Moal, Martin KaFai Lau, Song Liu, Josh Poimboeuf,
	Heiko Carstens, Alexei Starovoitov, Atish Patra, Will Deacon,
	Daniel Borkmann, Masahiro Yamada, Nayna Jain, Ley Foon Tan,
	Christian Borntraeger, Sami Tolvanen, Naveen N. Rao, Mao Han,
	Marco Elver, Steven Rostedt, Babu Moger, Borislav Petkov,
	Greentime Hu, Ben Dooks, Guan Xuetao, Tiezhu Yang,
	Thomas Bogendoerfer, open list:PARISC ARCHITECTURE,
	open list:BPF JIT for MIPS 32-BIT AND 64-BIT, David S. Miller,
	Thiago Jung Bauermann, Peter Zijlstra, David Howells,
	Amit Daniel Kachhap, Sandipan Das, H. Peter Anvin,
	open list:SPARC + UltraSPARC sparc/sparc64,
	open list:RISC-V ARCHITECTURE, Miroslav Benes, Jiri Olsa,
	Ard Biesheuvel, Vincenzo Frascino, Anders Roxell, Sven Schnelle,
	maintainer:X86 ARCHITECTURE 32-BIT AND 64-BIT, Russell King,
	Mike Rapoport, Ingo Molnar, Albert Ou, Paul E. McKenney,
	Paul Walmsley, KP Singh, Dmitry Vyukov, Nick Hu,
	open list:BPF JIT for MIPS 32-BIT AND 64-BIT, open list:MIPS,
	Palmer Dabbelt, open list:LINUX FOR POWERPC 32-BIT AND 64-BIT
In-Reply-To: <20200724073621.GC1872662@linux.intel.com>

+++ Jarkko Sakkinen [24/07/20 10:36 +0300]:
>On Thu, Jul 23, 2020 at 03:42:09PM +0300, Ard Biesheuvel wrote:
>> On Thu, 23 Jul 2020 at 04:52, Jarkko Sakkinen
>> <jarkko.sakkinen@linux.intel.com> wrote:
>> >
>> > On Thu, Jul 16, 2020 at 06:49:09PM +0200, Christophe Leroy wrote:
>> > > Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> a écrit :
>> > >
>> > > > Rename module_alloc() to text_alloc() and module_memfree() to
>> > > > text_memfree(), and move them to kernel/text.c, which is unconditionally
>> > > > compiled to the kernel proper. This allows kprobes, ftrace and bpf to
>> > > > allocate space for executable code without requiring to compile the modules
>> > > > support (CONFIG_MODULES=y) in.
>> > >
>> > > You are not changing enough in powerpc to have this work.
>> > > On powerpc 32 bits (6xx), when STRICT_KERNEL_RWX is selected, the vmalloc
>> > > space is set to NX (no exec) at segment level (ie by 256Mbytes zone) unless
>> > > CONFIG_MODULES is selected.
>> > >
>> > > Christophe
>> >
>> > This has been deduced down to:
>> >
>> > https://lore.kernel.org/lkml/20200717030422.679972-1-jarkko.sakkinen@linux.intel.com/
>> >
>> > I.e. not intruding PPC anymore :-)
>> >
>>
>> Ok, so after the elaborate discussion we had between Jessica, Russell,
>> Peter, Will, Mark, you and myself, where we pointed out that
>> a) a single text_alloc() abstraction for bpf, kprobes and ftrace does
>> not fit other architectures very well, and
>> b) that module_alloc() is not suitable as a default to base text_alloc() on,
>
>In the latest iteration (v5) it is conditionally available only if arch
>defines and fallback has been removed.
>
>> you went ahead and implemented that anyway, but only cc'ing Peter,
>> akpm, Masami and the mm list this time?
>
>No problems with that. Actually each patch gets everything that
>get_maintainer.pl gives with a cc cmd script, not just the ones
>explicitly listed in the patch.
>
>Should I explicitly CC you to the next version? I'm happy to grow
>the list when requested.

Yes, please CC everybody that was part of the discussion last time
especially during v2, and please use a consistent CC list for the
whole patchset. It is difficult to review when you only receive patch
1 out of 6 with no mention of text_alloc() anywhere and without being
CC'd on the cover letter.

Jessica

^ permalink raw reply

* [PATCH] KVM: PPC: Book3S HV: rework secure mem slot dropping
From: Laurent Dufour @ 2020-07-24  8:35 UTC (permalink / raw)
  To: bharata, linuxram
  Cc: cclaudio, kvm-ppc, sathnaga, aneesh.kumar, sukadev, linuxppc-dev,
	bauerman, david
In-Reply-To: <20200724030337.GC1082478@in.ibm.com>

When a secure memslot is dropped, all the pages backed in the secure
device (aka really backed by secure memory by the Ultravisor)
should be paged out to a normal page. Previously, this was
achieved by triggering the page fault mechanism which is calling
kvmppc_svm_page_out() on each pages.

This can't work when hot unplugging a memory slot because the memory
slot is flagged as invalid and gfn_to_pfn() is then not trying to access
the page, so the page fault mechanism is not triggered.

Since the final goal is to make a call to kvmppc_svm_page_out() it seems
simpler to call directly instead of triggering such a mechanism. This
way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a
memslot.

Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock,
the call to __kvmppc_svm_page_out() is made.  As
__kvmppc_svm_page_out needs the vma pointer to migrate the pages,
the VMA is fetched in a lazy way, to not trigger find_vma() all
the time. In addition, the mmap_sem is held in read mode during
that time, not in write mode since the virual memory layout is not
impacted, and kvm->arch.uvmem_lock prevents concurrent operation
on the secure device.

Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
	[modified the changelog description]
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
        [modified check on the VMA in kvmppc_uvmem_drop_pages]
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 53 ++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index c772e921f769..5dd3e9acdcab 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -632,35 +632,54 @@ static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
  * fault on them, do fault time migration to replace the device PTEs in
  * QEMU page table with normal PTEs from newly allocated pages.
  */
-void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
+void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
 			     struct kvm *kvm, bool skip_page_out)
 {
 	int i;
 	struct kvmppc_uvmem_page_pvt *pvt;
-	unsigned long pfn, uvmem_pfn;
-	unsigned long gfn = free->base_gfn;
+	struct page *uvmem_page;
+	struct vm_area_struct *vma = NULL;
+	unsigned long uvmem_pfn, gfn;
+	unsigned long addr, end;
+
+	mmap_read_lock(kvm->mm);
+
+	addr = slot->userspace_addr;
+	end = addr + (slot->npages * PAGE_SIZE);
 
-	for (i = free->npages; i; --i, ++gfn) {
-		struct page *uvmem_page;
+	gfn = slot->base_gfn;
+	for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) {
+
+		/* Fetch the VMA if addr is not in the latest fetched one */
+		if (!vma || addr >= vma->vm_end) {
+			vma = find_vma_intersection(kvm->mm, addr, addr+1);
+			if (!vma) {
+				pr_err("Can't find VMA for gfn:0x%lx\n", gfn);
+				break;
+			}
+		}
 
 		mutex_lock(&kvm->arch.uvmem_lock);
-		if (!kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
+
+		if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
+			uvmem_page = pfn_to_page(uvmem_pfn);
+			pvt = uvmem_page->zone_device_data;
+			pvt->skip_page_out = skip_page_out;
+			pvt->remove_gfn = true;
+
+			if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE,
+						  PAGE_SHIFT, kvm, pvt->gpa))
+				pr_err("Can't page out gpa:0x%lx addr:0x%lx\n",
+				       pvt->gpa, addr);
+		} else {
+			/* Remove the shared flag if any */
 			kvmppc_gfn_remove(gfn, kvm);
-			mutex_unlock(&kvm->arch.uvmem_lock);
-			continue;
 		}
 
-		uvmem_page = pfn_to_page(uvmem_pfn);
-		pvt = uvmem_page->zone_device_data;
-		pvt->skip_page_out = skip_page_out;
-		pvt->remove_gfn = true;
 		mutex_unlock(&kvm->arch.uvmem_lock);
-
-		pfn = gfn_to_pfn(kvm, gfn);
-		if (is_error_noslot_pfn(pfn))
-			continue;
-		kvm_release_pfn_clean(pfn);
 	}
+
+	mmap_read_unlock(kvm->mm);
 }
 
 unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm)
-- 
2.27.0


^ permalink raw reply related

* Re: [PATCH 2/5] powerpc: Allow 4096 bytes of stack expansion for the signal frame
From: Michael Ellerman @ 2020-07-24  9:20 UTC (permalink / raw)
  To: Daniel Axtens, linuxppc-dev; +Cc: linux-arch, hughd, linux-kernel
In-Reply-To: <87blk6tkuv.fsf@dja-thinkpad.axtens.net>

Daniel Axtens <dja@axtens.net> writes:
> Hi Michael,
>
> Unfortunately, this patch doesn't completely solve the problem.
>
> Trying the original reproducer, I'm still able to trigger the crash even
> with this patch, although not 100% of the time. (If I turn ASLR off
> outside of tmux it reliably crashes, if I turn ASLR off _inside_ of tmux
> it reliably succeeds; all of this is on a serial console.)
>
> ./foo 1241000 & sleep 1; killall -USR1 foo; echo ok
>
> If I add some debugging information, I see that I'm getting
> address + 4096 = 7fffffed0fa0
> gpr1 =           7fffffed1020
>
> So address + 4096 is 0x80 bytes below the 4k window. I haven't been able
> to figure out why, gdb gives me a NIP in __kernel_sigtramp_rt64 but I
> don't know what to make of that.

Thanks for testing.

I looked at it again this morning and it's fairly obvious when it's not
11pm :)

We need space for struct rt_sigframe as well as another 128 bytes,
which is __SIGNAL_FRAMESIZE. It's actually mentioned in the comment
above struct rt_sigframe.

I'll send a v2.

> P.S. I don't know what your policy on linking to kernel bugzilla is, but
> if you want:
>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=205183

In general I prefer to keep things clean with just a single Link: tag
pointing to the archive of the patch submission.

That can then contain further links and other info, and has the
advantage that people can reply to the patch submission in the future to
add information to the thread that wasn't known at the time of the
commit.

cheers

^ permalink raw reply

* [PATCH v2 1/5] selftests/powerpc: Add test of stack expansion logic
From: Michael Ellerman @ 2020-07-24  9:25 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel, dja

We have custom stack expansion checks that it turns out are extremely
badly tested and contain bugs, surprise. So add some tests that
exercise the code and capture the current boundary conditions.

The signal test currently fails on 64-bit kernels because the 2048
byte allowance for the signal frame is too small, we will fix that in
a subsequent patch.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---

v2:
 - Concentrate on used stack around the 1MB size, as that's where our
   custom logic kicks in.
 - Increment the used stack size by 64 so we can exercise the case
   where we overflow the page by less than 128 (__SIGNAL_FRAMESIZE).

---
 tools/testing/selftests/powerpc/mm/.gitignore |   2 +
 tools/testing/selftests/powerpc/mm/Makefile   |   9 +-
 .../powerpc/mm/stack_expansion_ldst.c         | 233 ++++++++++++++++++
 .../powerpc/mm/stack_expansion_signal.c       | 118 +++++++++
 tools/testing/selftests/powerpc/pmu/lib.h     |   1 +
 5 files changed, 362 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
 create mode 100644 tools/testing/selftests/powerpc/mm/stack_expansion_signal.c

diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore
index 8d041f508a51..52308f42b7de 100644
--- a/tools/testing/selftests/powerpc/mm/.gitignore
+++ b/tools/testing/selftests/powerpc/mm/.gitignore
@@ -8,3 +8,5 @@ large_vm_fork_separation
 bad_accesses
 tlbie_test
 pkey_exec_prot
+stack_expansion_ldst
+stack_expansion_signal
diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile
index 5a86d59441dc..6cd772e0e374 100644
--- a/tools/testing/selftests/powerpc/mm/Makefile
+++ b/tools/testing/selftests/powerpc/mm/Makefile
@@ -3,7 +3,9 @@
 	$(MAKE) -C ../
 
 TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot segv_errors wild_bctr \
-		  large_vm_fork_separation bad_accesses pkey_exec_prot
+		  large_vm_fork_separation bad_accesses pkey_exec_prot stack_expansion_signal \
+		  stack_expansion_ldst
+
 TEST_GEN_PROGS_EXTENDED := tlbie_test
 TEST_GEN_FILES := tempfile
 
@@ -17,6 +19,11 @@ $(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64
 $(OUTPUT)/bad_accesses: CFLAGS += -m64
 $(OUTPUT)/pkey_exec_prot: CFLAGS += -m64
 
+$(OUTPUT)/stack_expansion_signal: ../utils.c ../pmu/lib.c
+
+$(OUTPUT)/stack_expansion_ldst: CFLAGS += -fno-stack-protector
+$(OUTPUT)/stack_expansion_ldst: ../utils.c
+
 $(OUTPUT)/tempfile:
 	dd if=/dev/zero of=$@ bs=64k count=1
 
diff --git a/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
new file mode 100644
index 000000000000..0587e11437f5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test that loads/stores expand the stack segment, or trigger a SEGV, in
+ * various conditions.
+ *
+ * Based on test code by Tom Lane.
+ */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <err.h>
+#include <errno.h>
+#include <stdio.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#define _KB (1024)
+#define _MB (1024 * 1024)
+
+volatile char *stack_top_ptr;
+volatile unsigned long stack_top_sp;
+volatile char c;
+
+enum access_type {
+	LOAD,
+	STORE,
+};
+
+/*
+ * Consume stack until the stack pointer is below @target_sp, then do an access
+ * (load or store) at offset @delta from either the base of the stack or the
+ * current stack pointer.
+ */
+__attribute__ ((noinline))
+int consume_stack(unsigned long target_sp, unsigned long stack_high, int delta, enum access_type type)
+{
+	unsigned long target;
+	char stack_cur;
+
+	if ((unsigned long)&stack_cur > target_sp)
+		return consume_stack(target_sp, stack_high, delta, type);
+	else {
+		// We don't really need this, but without it GCC might not
+		// generate a recursive call above.
+		stack_top_ptr = &stack_cur;
+
+#ifdef __powerpc__
+		asm volatile ("mr %[sp], %%r1" : [sp] "=r" (stack_top_sp));
+#else
+		asm volatile ("mov %%rsp, %[sp]" : [sp] "=r" (stack_top_sp));
+#endif
+
+		// Kludge, delta < 0 indicates relative to SP
+		if (delta < 0)
+			target = stack_top_sp + delta;
+		else
+			target = stack_high - delta + 1;
+
+		volatile char *p = (char *)target;
+
+		if (type == STORE)
+			*p = c;
+		else
+			c = *p;
+
+		// Do something to prevent the stack frame being popped prior to
+		// our access above.
+		getpid();
+	}
+
+	return 0;
+}
+
+static int search_proc_maps(char *needle, unsigned long *low, unsigned long *high)
+{
+	unsigned long start, end;
+	static char buf[4096];
+	char name[128];
+	FILE *f;
+	int rc;
+
+	f = fopen("/proc/self/maps", "r");
+	if (!f) {
+		perror("fopen");
+		return -1;
+	}
+
+	while (fgets(buf, sizeof(buf), f)) {
+		rc = sscanf(buf, "%lx-%lx %*c%*c%*c%*c %*x %*d:%*d %*d %127s\n",
+			    &start, &end, name);
+		if (rc == 2)
+			continue;
+
+		if (rc != 3) {
+			printf("sscanf errored\n");
+			rc = -1;
+			break;
+		}
+
+		if (strstr(name, needle)) {
+			*low = start;
+			*high = end - 1;
+			rc = 0;
+			break;
+		}
+	}
+
+	fclose(f);
+
+	return rc;
+}
+
+int child(unsigned int stack_used, int delta, enum access_type type)
+{
+	unsigned long low, stack_high;
+
+	assert(search_proc_maps("[stack]", &low, &stack_high) == 0);
+
+	assert(consume_stack(stack_high - stack_used, stack_high, delta, type) == 0);
+
+	printf("Access OK: %s delta %-7d used size 0x%06x stack high 0x%lx top_ptr %p top sp 0x%lx actual used 0x%lx\n",
+	       type == LOAD ? "load" : "store", delta, stack_used, stack_high,
+	       stack_top_ptr, stack_top_sp, stack_high - stack_top_sp + 1);
+
+	return 0;
+}
+
+static int test_one(unsigned int stack_used, int delta, enum access_type type)
+{
+	pid_t pid;
+	int rc;
+
+	pid = fork();
+	if (pid == 0)
+		exit(child(stack_used, delta, type));
+
+	assert(waitpid(pid, &rc, 0) != -1);
+
+	if (WIFEXITED(rc) && WEXITSTATUS(rc) == 0)
+		return 0;
+
+	// We don't expect a non-zero exit that's not a signal
+	assert(!WIFEXITED(rc));
+
+	printf("Faulted:   %s delta %-7d used size 0x%06x signal %d\n",
+	       type == LOAD ? "load" : "store", delta, stack_used,
+	       WTERMSIG(rc));
+
+	return 1;
+}
+
+// This is fairly arbitrary but is well below any of the targets below,
+// so that the delta between the stack pointer and the target is large.
+#define DEFAULT_SIZE	(32 * _KB)
+
+static void test_one_type(enum access_type type, unsigned long page_size, unsigned long rlim_cur)
+{
+	assert(test_one(DEFAULT_SIZE, 512 * _KB, type) == 0);
+
+	// powerpc has a special case to allow up to 1MB
+	assert(test_one(DEFAULT_SIZE, 1 * _MB, type) == 0);
+
+#ifdef __powerpc__
+	// This fails on powerpc because it's > 1MB and is not a stdu &
+	// not close to r1
+	assert(test_one(DEFAULT_SIZE, 1 * _MB + 8, type) != 0);
+#else
+	assert(test_one(DEFAULT_SIZE, 1 * _MB + 8, type) == 0);
+#endif
+
+#ifdef __powerpc__
+	// Accessing way past the stack pointer is not allowed on powerpc
+	assert(test_one(DEFAULT_SIZE, rlim_cur, type) != 0);
+#else
+	// We should be able to access anywhere within the rlimit
+	assert(test_one(DEFAULT_SIZE, rlim_cur, type) == 0);
+#endif
+
+	// But if we go past the rlimit it should fail
+	assert(test_one(DEFAULT_SIZE, rlim_cur + 1, type) != 0);
+
+	// Above 1MB powerpc only allows accesses within 2048 bytes of
+	// r1 for accesses that aren't stdu
+	assert(test_one(1 * _MB + page_size - 128, -2048, type) == 0);
+#ifdef __powerpc__
+	assert(test_one(1 * _MB + page_size - 128, -2049, type) != 0);
+#else
+	assert(test_one(1 * _MB + page_size - 128, -2049, type) == 0);
+#endif
+
+	// By consuming 2MB of stack we test the stdu case
+	assert(test_one(2 * _MB + page_size - 128, -2048, type) == 0);
+}
+
+static int test(void)
+{
+	unsigned long page_size;
+	struct rlimit rlimit;
+
+	page_size = getpagesize();
+	getrlimit(RLIMIT_STACK, &rlimit);
+	printf("Stack rlimit is 0x%lx\n", rlimit.rlim_cur);
+
+	printf("Testing loads ...\n");
+	test_one_type(LOAD, page_size, rlimit.rlim_cur);
+	printf("Testing stores ...\n");
+	test_one_type(STORE, page_size, rlimit.rlim_cur);
+
+	printf("All OK\n");
+
+	return 0;
+}
+
+#ifdef __powerpc__
+#include "utils.h"
+
+int main(void)
+{
+	return test_harness(test, "stack_expansion_ldst");
+}
+#else
+int main(void)
+{
+	return test();
+}
+#endif
diff --git a/tools/testing/selftests/powerpc/mm/stack_expansion_signal.c b/tools/testing/selftests/powerpc/mm/stack_expansion_signal.c
new file mode 100644
index 000000000000..c8b32a29e274
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/stack_expansion_signal.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test that signal delivery is able to expand the stack segment without
+ * triggering a SEGV.
+ *
+ * Based on test code by Tom Lane.
+ */
+
+#include <err.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../pmu/lib.h"
+#include "utils.h"
+
+#define _KB (1024)
+#define _MB (1024 * 1024)
+
+static char *stack_base_ptr;
+static char *stack_top_ptr;
+
+static volatile sig_atomic_t sig_occurred = 0;
+
+static void sigusr1_handler(int signal_arg)
+{
+	sig_occurred = 1;
+}
+
+static int consume_stack(unsigned int stack_size, union pipe write_pipe)
+{
+	char stack_cur;
+
+	if ((stack_base_ptr - &stack_cur) < stack_size)
+		return consume_stack(stack_size, write_pipe);
+	else {
+		stack_top_ptr = &stack_cur;
+
+		FAIL_IF(notify_parent(write_pipe));
+
+		while (!sig_occurred)
+			barrier();
+	}
+
+	return 0;
+}
+
+static int child(unsigned int stack_size, union pipe write_pipe)
+{
+	struct sigaction act;
+	char stack_base;
+
+	act.sa_handler = sigusr1_handler;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = 0;
+	if (sigaction(SIGUSR1, &act, NULL) < 0)
+		err(1, "sigaction");
+
+	stack_base_ptr = (char *) (((size_t) &stack_base + 65535) & ~65535UL);
+
+	FAIL_IF(consume_stack(stack_size, write_pipe));
+
+	printf("size 0x%06x: OK, stack base %p top %p (%zx used)\n",
+		stack_size, stack_base_ptr, stack_top_ptr,
+		stack_base_ptr - stack_top_ptr);
+
+	return 0;
+}
+
+static int test_one_size(unsigned int stack_size)
+{
+	union pipe read_pipe, write_pipe;
+	pid_t pid;
+
+	FAIL_IF(pipe(read_pipe.fds) == -1);
+	FAIL_IF(pipe(write_pipe.fds) == -1);
+
+	pid = fork();
+	if (pid == 0) {
+		close(read_pipe.read_fd);
+		close(write_pipe.write_fd);
+		exit(child(stack_size, read_pipe));
+	}
+
+	close(read_pipe.write_fd);
+	close(write_pipe.read_fd);
+	FAIL_IF(sync_with_child(read_pipe, write_pipe));
+
+	kill(pid, SIGUSR1);
+
+	FAIL_IF(wait_for_child(pid));
+
+	close(read_pipe.read_fd);
+	close(write_pipe.write_fd);
+
+	return 0;
+}
+
+int test(void)
+{
+	unsigned int i, size;
+
+	// Test with used stack from 1MB - 64K to 1MB + 64K
+	// Increment by 64 to get more coverage of odd sizes
+	for (i = 0; i < (128 * _KB); i += 64) {
+		size = i + (1 * _MB) - (64 * _KB);
+		FAIL_IF(test_one_size(size));
+	}
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(test, "stack_expansion_signal");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/lib.h b/tools/testing/selftests/powerpc/pmu/lib.h
index fa12e7d0b4d3..bf1bec013bbb 100644
--- a/tools/testing/selftests/powerpc/pmu/lib.h
+++ b/tools/testing/selftests/powerpc/pmu/lib.h
@@ -6,6 +6,7 @@
 #ifndef __SELFTESTS_POWERPC_PMU_LIB_H
 #define __SELFTESTS_POWERPC_PMU_LIB_H
 
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
-- 
2.25.1


^ permalink raw reply related

* [PATCH v2 2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame
From: Michael Ellerman @ 2020-07-24  9:25 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel, dja
In-Reply-To: <20200724092528.1578671-1-mpe@ellerman.id.au>

We have powerpc specific logic in our page fault handling to decide if
an access to an unmapped address below the stack pointer should expand
the stack VMA.

The code was originally added in 2004 "ported from 2.4". The rough
logic is that the stack is allowed to grow to 1MB with no extra
checking. Over 1MB the access must be within 2048 bytes of the stack
pointer, or be from a user instruction that updates the stack pointer.

The 2048 byte allowance below the stack pointer is there to cover the
288 byte "red zone" as well as the "about 1.5kB" needed by the signal
delivery code.

Unfortunately since then the signal frame has expanded, and is now
4224 bytes on 64-bit kernels with transactional memory enabled. This
means if a process has consumed more than 1MB of stack, and its stack
pointer lies less than 4224 bytes from the next page boundary, signal
delivery will fault when trying to expand the stack and the process
will see a SEGV.

The total size of the signal frame is the size of struct rt_sigframe
(which includes the red zone) plus __SIGNAL_FRAMESIZE (128 bytes on
64-bit).

The 2048 byte allowance was correct until 2008 as the signal frame
was:

struct rt_sigframe {
        struct ucontext    uc;                           /*     0  1440 */
        /* --- cacheline 11 boundary (1408 bytes) was 32 bytes ago --- */
        long unsigned int          _unused[2];           /*  1440    16 */
        unsigned int               tramp[6];             /*  1456    24 */
        struct siginfo *           pinfo;                /*  1480     8 */
        void *                     puc;                  /*  1488     8 */
        struct siginfo     info;                         /*  1496   128 */
        /* --- cacheline 12 boundary (1536 bytes) was 88 bytes ago --- */
        char                       abigap[288];          /*  1624   288 */

        /* size: 1920, cachelines: 15, members: 7 */
        /* padding: 8 */
};

1920 + 128 = 2048

Then in commit ce48b2100785 ("powerpc: Add VSX context save/restore,
ptrace and signal support") (Jul 2008) the signal frame expanded to
2304 bytes:

struct rt_sigframe {
        struct ucontext    uc;                           /*     0  1696 */	<--
        /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
        long unsigned int          _unused[2];           /*  1696    16 */
        unsigned int               tramp[6];             /*  1712    24 */
        struct siginfo *           pinfo;                /*  1736     8 */
        void *                     puc;                  /*  1744     8 */
        struct siginfo     info;                         /*  1752   128 */
        /* --- cacheline 14 boundary (1792 bytes) was 88 bytes ago --- */
        char                       abigap[288];          /*  1880   288 */

        /* size: 2176, cachelines: 17, members: 7 */
        /* padding: 8 */
};

2176 + 128 = 2304

At this point we should have been exposed to the bug, though as far as
I know it was never reported. I no longer have a system old enough to
easily test on.

Then in 2010 commit 320b2b8de126 ("mm: keep a guard page below a
grow-down stack segment") caused our stack expansion code to never
trigger, as there was always a VMA found for a write up to PAGE_SIZE
below r1.

That meant the bug was hidden as we continued to expand the signal
frame in commit 2b0a576d15e0 ("powerpc: Add new transactional memory
state to the signal context") (Feb 2013):

struct rt_sigframe {
        struct ucontext    uc;                           /*     0  1696 */
        /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
        struct ucontext    uc_transact;                  /*  1696  1696 */	<--
        /* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
        long unsigned int          _unused[2];           /*  3392    16 */
        unsigned int               tramp[6];             /*  3408    24 */
        struct siginfo *           pinfo;                /*  3432     8 */
        void *                     puc;                  /*  3440     8 */
        struct siginfo     info;                         /*  3448   128 */
        /* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
        char                       abigap[288];          /*  3576   288 */

        /* size: 3872, cachelines: 31, members: 8 */
        /* padding: 8 */
        /* last cacheline: 32 bytes */
};

3872 + 128 = 4000

And commit 573ebfa6601f ("powerpc: Increase stack redzone for 64-bit
userspace to 512 bytes") (Feb 2014):

struct rt_sigframe {
        struct ucontext    uc;                           /*     0  1696 */
        /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
        struct ucontext    uc_transact;                  /*  1696  1696 */
        /* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
        long unsigned int          _unused[2];           /*  3392    16 */
        unsigned int               tramp[6];             /*  3408    24 */
        struct siginfo *           pinfo;                /*  3432     8 */
        void *                     puc;                  /*  3440     8 */
        struct siginfo     info;                         /*  3448   128 */
        /* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
        char                       abigap[512];          /*  3576   512 */	<--

        /* size: 4096, cachelines: 32, members: 8 */
        /* padding: 8 */
};

4096 + 128 = 4224

Then finally in 2017, commit 1be7107fbe18 ("mm: larger stack guard
gap, between vmas") exposed us to the existing bug, because it changed
the stack VMA to be the correct/real size, meaning our stack expansion
code is now triggered.

Fix it by increasing the allowance to 4224 bytes.

Hard-coding 4224 is obviously unsafe against future expansions of the
signal frame in the same way as the existing code. We can't easily use
sizeof() because the signal frame structure is not in a header. We
will either fix that, or rip out all the custom stack expansion
checking logic entirely.

Fixes: ce48b2100785 ("powerpc: Add VSX context save/restore, ptrace and signal support")
Cc: stable@vger.kernel.org # v2.6.27+
Reported-by: Tom Lane <tgl@sss.pgh.pa.us>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---

v2: Account for the extra 128 bytes of __SIGNAL_FRAMESIZE, making the
    total size 4224, as noticed by dja.

See also https://bugzilla.kernel.org/show_bug.cgi?id=205183
---
 arch/powerpc/mm/fault.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 641fc5f3d7dd..3ebb1792e636 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -267,6 +267,9 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
 	return false;
 }
 
+// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
+#define SIGFRAME_MAX_SIZE	(4096 + 128)
+
 static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
 				struct vm_area_struct *vma, unsigned int flags,
 				bool *must_retry)
@@ -274,7 +277,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
 	/*
 	 * N.B. The POWER/Open ABI allows programs to access up to
 	 * 288 bytes below the stack pointer.
-	 * The kernel signal delivery code writes up to about 1.5kB
+	 * The kernel signal delivery code writes a bit over 4KB
 	 * below the stack pointer (r1) before decrementing it.
 	 * The exec code can write slightly over 640kB to the stack
 	 * before setting the user r1.  Thus we allow the stack to
@@ -299,7 +302,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
 		 * between the last mapped region and the stack will
 		 * expand the stack rather than segfaulting.
 		 */
-		if (address + 2048 >= uregs->gpr[1])
+		if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
 			return false;
 
 		if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
-- 
2.25.1


^ permalink raw reply related

* [PATCH v2 3/5] selftests/powerpc: Update the stack expansion test
From: Michael Ellerman @ 2020-07-24  9:25 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel, dja
In-Reply-To: <20200724092528.1578671-1-mpe@ellerman.id.au>

Update the stack expansion load/store test to take into account the
new allowance of 4224 bytes below the stack pointer.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 .../selftests/powerpc/mm/stack_expansion_ldst.c        | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

v2: Update for change of size to 4224.

diff --git a/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
index 0587e11437f5..8dbfb51acf0f 100644
--- a/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
+++ b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
@@ -186,17 +186,17 @@ static void test_one_type(enum access_type type, unsigned long page_size, unsign
 	// But if we go past the rlimit it should fail
 	assert(test_one(DEFAULT_SIZE, rlim_cur + 1, type) != 0);
 
-	// Above 1MB powerpc only allows accesses within 2048 bytes of
+	// Above 1MB powerpc only allows accesses within 4224 bytes of
 	// r1 for accesses that aren't stdu
-	assert(test_one(1 * _MB + page_size - 128, -2048, type) == 0);
+	assert(test_one(1 * _MB + page_size - 128, -4224, type) == 0);
 #ifdef __powerpc__
-	assert(test_one(1 * _MB + page_size - 128, -2049, type) != 0);
+	assert(test_one(1 * _MB + page_size - 128, -4225, type) != 0);
 #else
-	assert(test_one(1 * _MB + page_size - 128, -2049, type) == 0);
+	assert(test_one(1 * _MB + page_size - 128, -4225, type) == 0);
 #endif
 
 	// By consuming 2MB of stack we test the stdu case
-	assert(test_one(2 * _MB + page_size - 128, -2048, type) == 0);
+	assert(test_one(2 * _MB + page_size - 128, -4224, type) == 0);
 }
 
 static int test(void)
-- 
2.25.1


^ permalink raw reply related

* [PATCH v2 4/5] powerpc/mm: Remove custom stack expansion checking
From: Michael Ellerman @ 2020-07-24  9:25 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel, dja
In-Reply-To: <20200724092528.1578671-1-mpe@ellerman.id.au>

We have powerpc specific logic in our page fault handling to decide if
an access to an unmapped address below the stack pointer should expand
the stack VMA.

The logic aims to prevent userspace from doing bad accesses below the
stack pointer. However as long as the stack is < 1MB in size, we allow
all accesses without further checks. Adding some debug I see that I
can do a full kernel build and LTP run, and not a single process has
used more than 1MB of stack. So for the majority of processes the
logic never even fires.

We also recently found a nasty bug in this code which could cause
userspace programs to be killed during signal delivery. It went
unnoticed presumably because most processes use < 1MB of stack.

The generic mm code has also grown support for stack guard pages since
this code was originally written, so the most heinous case of the
stack expanding into other mappings is now handled for us.

Finally although some other arches have special logic in this path,
from what I can tell none of x86, arm64, arm and s390 impose any extra
checks other than those in expand_stack().

So drop our complicated logic and like other architectures just let
the stack expand as long as its within the rlimit.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/fault.c | 109 ++--------------------------------------
 1 file changed, 5 insertions(+), 104 deletions(-)

v2: no change just rebased.

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 3ebb1792e636..925a7231abb3 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -42,39 +42,7 @@
 #include <asm/kup.h>
 #include <asm/inst.h>
 
-/*
- * Check whether the instruction inst is a store using
- * an update addressing form which will update r1.
- */
-static bool store_updates_sp(struct ppc_inst inst)
-{
-	/* check for 1 in the rA field */
-	if (((ppc_inst_val(inst) >> 16) & 0x1f) != 1)
-		return false;
-	/* check major opcode */
-	switch (ppc_inst_primary_opcode(inst)) {
-	case OP_STWU:
-	case OP_STBU:
-	case OP_STHU:
-	case OP_STFSU:
-	case OP_STFDU:
-		return true;
-	case OP_STD:	/* std or stdu */
-		return (ppc_inst_val(inst) & 3) == 1;
-	case OP_31:
-		/* check minor opcode */
-		switch ((ppc_inst_val(inst) >> 1) & 0x3ff) {
-		case OP_31_XOP_STDUX:
-		case OP_31_XOP_STWUX:
-		case OP_31_XOP_STBUX:
-		case OP_31_XOP_STHUX:
-		case OP_31_XOP_STFSUX:
-		case OP_31_XOP_STFDUX:
-			return true;
-		}
-	}
-	return false;
-}
+
 /*
  * do_page_fault error handling helpers
  */
@@ -267,57 +235,6 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
 	return false;
 }
 
-// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
-#define SIGFRAME_MAX_SIZE	(4096 + 128)
-
-static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
-				struct vm_area_struct *vma, unsigned int flags,
-				bool *must_retry)
-{
-	/*
-	 * N.B. The POWER/Open ABI allows programs to access up to
-	 * 288 bytes below the stack pointer.
-	 * The kernel signal delivery code writes a bit over 4KB
-	 * below the stack pointer (r1) before decrementing it.
-	 * The exec code can write slightly over 640kB to the stack
-	 * before setting the user r1.  Thus we allow the stack to
-	 * expand to 1MB without further checks.
-	 */
-	if (address + 0x100000 < vma->vm_end) {
-		struct ppc_inst __user *nip = (struct ppc_inst __user *)regs->nip;
-		/* get user regs even if this fault is in kernel mode */
-		struct pt_regs *uregs = current->thread.regs;
-		if (uregs == NULL)
-			return true;
-
-		/*
-		 * A user-mode access to an address a long way below
-		 * the stack pointer is only valid if the instruction
-		 * is one which would update the stack pointer to the
-		 * address accessed if the instruction completed,
-		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
-		 * (or the byte, halfword, float or double forms).
-		 *
-		 * If we don't check this then any write to the area
-		 * between the last mapped region and the stack will
-		 * expand the stack rather than segfaulting.
-		 */
-		if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
-			return false;
-
-		if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
-		    access_ok(nip, sizeof(*nip))) {
-			struct ppc_inst inst;
-
-			if (!probe_user_read_inst(&inst, nip))
-				return !store_updates_sp(inst);
-			*must_retry = true;
-		}
-		return true;
-	}
-	return false;
-}
-
 #ifdef CONFIG_PPC_MEM_KEYS
 static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey,
 			      struct vm_area_struct *vma)
@@ -483,7 +400,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
 	int is_user = user_mode(regs);
 	int is_write = page_fault_is_write(error_code);
 	vm_fault_t fault, major = 0;
-	bool must_retry = false;
 	bool kprobe_fault = kprobe_page_fault(regs, 11);
 
 	if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
@@ -572,30 +488,15 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
 	vma = find_vma(mm, address);
 	if (unlikely(!vma))
 		return bad_area(regs, address);
-	if (likely(vma->vm_start <= address))
-		goto good_area;
-	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
-		return bad_area(regs, address);
 
-	/* The stack is being expanded, check if it's valid */
-	if (unlikely(bad_stack_expansion(regs, address, vma, flags,
-					 &must_retry))) {
-		if (!must_retry)
+	if (unlikely(vma->vm_start > address)) {
+		if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
 			return bad_area(regs, address);
 
-		mmap_read_unlock(mm);
-		if (fault_in_pages_readable((const char __user *)regs->nip,
-					    sizeof(unsigned int)))
-			return bad_area_nosemaphore(regs, address);
-		goto retry;
+		if (unlikely(expand_stack(vma, address)))
+			return bad_area(regs, address);
 	}
 
-	/* Try to expand it */
-	if (unlikely(expand_stack(vma, address)))
-		return bad_area(regs, address);
-
-good_area:
-
 #ifdef CONFIG_PPC_MEM_KEYS
 	if (unlikely(access_pkey_error(is_write, is_exec,
 				       (error_code & DSISR_KEYFAULT), vma)))
-- 
2.25.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox