LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH v2 1/3] powerpc/numa: Introduce logical numa id
From: Srikar Dronamraju @ 2020-08-17 11:49 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, linuxppc-dev
In-Reply-To: <15a2f88d-e609-cce9-a82c-321073b9574b@linux.ibm.com>

* Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> [2020-08-17 17:04:24]:

> On 8/17/20 4:29 PM, Srikar Dronamraju wrote:
> > * Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> [2020-08-17 16:02:36]:
> > 
> > > We use ibm,associativity and ibm,associativity-lookup-arrays to derive the numa
> > > node numbers. These device tree properties are firmware indicated grouping of
> > > resources based on their hierarchy in the platform. These numbers (group id) are
> > > not sequential and hypervisor/firmware can follow different numbering schemes.
> > > For ex: on powernv platforms, we group them in the below order.
> > > 
> > >   *     - CCM node ID
> > >   *     - HW card ID
> > >   *     - HW module ID
> > >   *     - Chip ID
> > >   *     - Core ID
> > > 
> > > Based on ibm,associativity-reference-points we use one of the above group ids as
> > > Linux NUMA node id. (On PowerNV platform Chip ID is used). This results
> > > in Linux reporting non-linear NUMA node id and which also results in Linux
> > > reporting empty node 0 NUMA nodes.
> > > 
> > > This can  be resolved by mapping the firmware provided group id to a logical Linux
> > > NUMA id. In this patch, we do this only for pseries platforms considering the
> > > firmware group id is a virtualized entity and users would not have drawn any
> > > conclusion based on the Linux Numa Node id.
> > > 
> > > On PowerNV platform since we have historically mapped Chip ID as Linux NUMA node
> > > id, we keep the existing Linux NUMA node id numbering.
> > 
> > I still don't understand how you are going to handle numa distances.
> > With your patch, have you tried dlpar add/remove on a sparsely noded machine?
> > 
> 
> We follow the same steps when fetching distance information. Instead of
> using affinity domain id, we now use the mapped node id. The relevant hunk
> in the patch is
> 
> +	nid = affinity_domain_to_nid(&domain);
> 
>  	if (nid > 0 &&
> -		of_read_number(associativity, 1) >= distance_ref_points_depth) {
> +	    of_read_number(associativity, 1) >= distance_ref_points_depth) {
>  		/*
>  		 * Skip the length field and send start of associativity array
>  		 */
> 
> I haven't tried dlpar add/remove. I don't have a setup to try that. Do you
> see a problem there?
> 

Yes, I think there can be 2 problems.

1. distance table may be filled with incorrect data.
2. The distance table shown by numactl -H is symmetric; that symmetry may
be lost.

> -aneesh
> 
> 

-- 
Thanks and Regards
Srikar Dronamraju

^ permalink raw reply

* Re: [PATCH v2 2/4] powerpc/mem: Store the dt_root_size/addr cell values for later usage
From: Hari Bathini @ 2020-08-17 15:30 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linuxppc-dev, mpe; +Cc: Nathan Lynch
In-Reply-To: <20200806162329.276534-2-aneesh.kumar@linux.ibm.com>



On 06/08/20 9:53 pm, Aneesh Kumar K.V wrote:
> dt_root_addr_cells and dt_root_size_cells are __initdata variables.
> So make a copy of the same which can be used post init.
> 

This avoids doing the same thing at multiple places.
So, thanks for the patch, Aneesh.

Looks good to me.

but nitpick below...

> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>   arch/powerpc/include/asm/drmem.h | 2 ++
>   arch/powerpc/kernel/prom.c       | 7 +++++++
>   arch/powerpc/mm/numa.c           | 1 +
>   3 files changed, 10 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
> index d719cbac34b2..ffb59caa88ee 100644
> --- a/arch/powerpc/include/asm/drmem.h
> +++ b/arch/powerpc/include/asm/drmem.h
> @@ -123,4 +123,6 @@ static inline void lmb_clear_nid(struct drmem_lmb *lmb)
>   }
>   #endif
>   
> +extern int mem_addr_cells, mem_size_cells;

Should this be in include/asm/prom.h instead, given the definition
comes from kernel/prom.c file?

> +
>   #endif /* _ASM_POWERPC_LMB_H */
> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
> index d8a2fb87ba0c..9a1701e85747 100644
> --- a/arch/powerpc/kernel/prom.c
> +++ b/arch/powerpc/kernel/prom.c
> @@ -73,6 +73,7 @@ u64 ppc64_rma_size;
>   #endif
>   static phys_addr_t first_memblock_size;
>   static int __initdata boot_cpu_count;
> +int mem_addr_cells, mem_size_cells;
>   
>   static int __init early_parse_mem(char *p)
>   {
> @@ -536,6 +537,12 @@ static int __init early_init_dt_scan_memory_ppc(unsigned long node,
>   						const char *uname,
>   						int depth, void *data)
>   {
> +	/*
> +	 * Make a copy from __initdata variable
> +	 */
> +	mem_addr_cells = dt_root_addr_cells;
> +	mem_size_cells = dt_root_size_cells;
> +
>   #ifdef CONFIG_PPC_PSERIES
>   	if (depth == 1 &&
>   	    strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) {
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 058fee9a0835..77d41d9775d2 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -368,6 +368,7 @@ static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
>   	of_node_put(memory);
>   }
>   
> +/*  dt_mem_next_cell is __init  */
>   static unsigned long read_n_cells(int n, const __be32 **buf)
>   {
>   	unsigned long result = 0;
> 

^ permalink raw reply

* Re: [PATCH 06/20] ethernet: chelsio: convert tasklets to use new tasklet_setup() API
From: Jakub Kicinski @ 2020-08-17 15:32 UTC (permalink / raw)
  To: Allen Pais
  Cc: jes, borisp, keescook, linux-rdma, netdev, kda, cooldavid,
	dougmill, linux-kernel, linux-acenic, oss-drivers, Romain Perier,
	linuxppc-dev, davem, linux-arm-kernel, mlindner
In-Reply-To: <20200817082434.21176-8-allen.lkml@gmail.com>

On Mon, 17 Aug 2020 13:54:20 +0530 Allen Pais wrote:
> In preparation for unconditionally passing the
> struct tasklet_struct pointer to all tasklet
> callbacks, switch to using the new tasklet_setup()
> and from_tasklet() to pass the tasklet pointer explicitly.
> 
> Signed-off-by: Romain Perier <romain.perier@gmail.com>
> Signed-off-by: Allen Pais <allen.lkml@gmail.com>

You need to adjust kdoc when you change functions:

drivers/net/ethernet/chelsio/cxgb4/sge.c:2664: warning: Function parameter or member 't' not described in 'restart_ctrlq'
drivers/net/ethernet/chelsio/cxgb4/sge.c:2664: warning: Excess function parameter 'data' description in 'restart_ctrlq'
drivers/net/ethernet/chelsio/cxgb4/sge.c:2965: warning: Function parameter or member 't' not described in 'restart_ofldq'
drivers/net/ethernet/chelsio/cxgb4/sge.c:2965: warning: Excess function parameter 'data' description in 'restart_ofldq'

^ permalink raw reply

* Re: [PATCH 08/20] ethernet: hinic: convert tasklets to use new tasklet_setup() API
From: Jakub Kicinski @ 2020-08-17 15:33 UTC (permalink / raw)
  To: Allen Pais
  Cc: jes, borisp, keescook, linux-rdma, netdev, kda, cooldavid,
	dougmill, linux-kernel, linux-acenic, oss-drivers, Romain Perier,
	linuxppc-dev, davem, linux-arm-kernel, mlindner
In-Reply-To: <20200817082434.21176-10-allen.lkml@gmail.com>

On Mon, 17 Aug 2020 13:54:22 +0530 Allen Pais wrote:
> In preparation for unconditionally passing the
> struct tasklet_struct pointer to all tasklet
> callbacks, switch to using the new tasklet_setup()
> and from_tasklet() to pass the tasklet pointer explicitly.
> 
> Signed-off-by: Romain Perier <romain.perier@gmail.com>
> Signed-off-by: Allen Pais <allen.lkml@gmail.com>

drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c:374: warning: Function parameter or member 't' not described in 'ceq_tasklet'
drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c:374: warning: Excess function parameter 'ceq_data' description in 'ceq_tasklet'

^ permalink raw reply

* Re: [PATCH] powerpc/book3s64/radix: Fix boot failure with large amount of guest memory
From: Hari Bathini @ 2020-08-17 15:43 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linuxppc-dev, mpe; +Cc: Shirisha Ganta, Sandipan Das, npiggin
In-Reply-To: <20200813162039.608649-1-aneesh.kumar@linux.ibm.com>



On 13/08/20 9:50 pm, Aneesh Kumar K.V wrote:
> If the hypervisor doesn't support hugepages, the kernel ends up allocating a large
> number of page table pages. The early page table allocation was wrongly
> setting the max memblock limit to ppc64_rma_size with radix translation
> which resulted in boot failure as shown below.
> 
> Kernel panic - not syncing:
> early_alloc_pgtable: Failed to allocate 16777216 bytes align=0x1000000 nid=-1 from=0x0000000000000000 max_addr=0xffffffffffffffff
>   CPU: 0 PID: 0 Comm: swapper Not tainted 5.8.0-24.9-default+ #2
>   Call Trace:
>   [c0000000016f3d00] [c0000000007c6470] dump_stack+0xc4/0x114 (unreliable)
>   [c0000000016f3d40] [c00000000014c78c] panic+0x164/0x418
>   [c0000000016f3dd0] [c000000000098890] early_alloc_pgtable+0xe0/0xec
>   [c0000000016f3e60] [c0000000010a5440] radix__early_init_mmu+0x360/0x4b4
>   [c0000000016f3ef0] [c000000001099bac] early_init_mmu+0x1c/0x3c
>   [c0000000016f3f10] [c00000000109a320] early_setup+0x134/0x170
> 
> This was because the kernel was checking for the radix feature before we enable the
> feature via mmu_features. This resulted in the kernel using hash restrictions on
> radix.
> 
> Rework the early init code such that the kernel boots with memblock restrictions
> as imposed by hash. At that point, the kernel still hasn't finalized the
> translation the kernel will end up using.
> 
> We have three different ways of detecting radix.
> 
> 1. dt_cpu_ftrs_scan -> used only in case of PowerNV
> 2. ibm,pa-features -> Used when we don't use dt_cpu_ftrs_scan
> 3. CAS -> Where we negotiate with hypervisor about the supported translation.
> 
> We look at 1 or 2 early in the boot and after that, we look at the CAS vector to
> finalize the translation the kernel will use. We also support a kernel command
> line option (disable_radix) to switch to hash.
> 
> Update the memblock limit after mmu_early_init_devtree() if the kernel is going
> to use radix translation. This forces some of the memblock allocations we do before
> mmu_early_init_devtree() to be within the RMA limit.

Minor comments below. Nonetheless...

Reviewed-by: Hari Bathini <hbathini@linux.ibm.com>

> 
> Fixes: 2bfd65e45e87 ("powerpc/mm/radix: Add radix callbacks for early init routines")
> Reported-by: Shirisha Ganta <shiganta@in.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>   arch/powerpc/include/asm/book3s/64/mmu.h | 8 +++++---
>   arch/powerpc/kernel/prom.c               | 6 ++++++
>   arch/powerpc/mm/book3s64/radix_pgtable.c | 2 ++
>   3 files changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
> index 55442d45c597..4245f99453f5 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
> @@ -244,9 +244,11 @@ extern void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
>   static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
>   					      phys_addr_t first_memblock_size)
>   {
> -	if (early_radix_enabled())
> -		return radix__setup_initial_memory_limit(first_memblock_base,
> -						   first_memblock_size);
> +	/*
> +	 * Hash has more strict restrictions. At this point we don't
> +	 * know which translations we will pick. Hence got with hash

:s/got with/go with/

> +	 * restrictions.
> +	 */
>   	return hash__setup_initial_memory_limit(first_memblock_base,
>   					   first_memblock_size);
>   }
> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
> index d8a2fb87ba0c..340900ae95a4 100644
> --- a/arch/powerpc/kernel/prom.c
> +++ b/arch/powerpc/kernel/prom.c
> @@ -811,6 +811,12 @@ void __init early_init_devtree(void *params)
>   
>   	mmu_early_init_devtree();
>   
> +	/*
> +	 * Reset ppc64_rma_size and memblock memory limit
> +	 */
> +	if (early_radix_enabled())
> +		radix__setup_initial_memory_limit(memstart_addr, first_memblock_size);
> +
>   #ifdef CONFIG_PPC_POWERNV
>   	/* Scan and build the list of machine check recoverable ranges */
>   	of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL);
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 28c784976bed..094daf16acac 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -747,6 +747,8 @@ void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
>   	 * Radix mode is not limited by RMA / VRMA addressing.
>   	 */
>   	ppc64_rma_size = ULONG_MAX;

> +
> +	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

Probably the same thing but I would prefer the below instead:

memblock_set_current_limit(ppc64_rma_size);

Thanks
Hari

^ permalink raw reply

* Re: [PATCH 10/11] powerpc: use non-set_fs based maccess routines
From: Christophe Leroy @ 2020-08-17 15:47 UTC (permalink / raw)
  To: Christoph Hellwig, Al Viro, Michael Ellerman, x86
  Cc: linux-fsdevel, linux-arch, linuxppc-dev, Kees Cook, linux-kernel
In-Reply-To: <20200817073212.830069-11-hch@lst.de>



Le 17/08/2020 à 09:32, Christoph Hellwig a écrit :
> Provide __get_kernel_nofault and __put_kernel_nofault routines to
> implement the maccess routines without messing with set_fs and without
> opening up access to user space.

__get_user_size() opens access to user space. You have to use 
__get_user_size_allowed() when user access is already allowed (or when 
not needed to allow it).

Christophe

> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   arch/powerpc/include/asm/uaccess.h | 16 ++++++++++++++++
>   1 file changed, 16 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
> index 00699903f1efca..a31de40ac00b62 100644
> --- a/arch/powerpc/include/asm/uaccess.h
> +++ b/arch/powerpc/include/asm/uaccess.h
> @@ -623,4 +623,20 @@ do {									\
>   		__put_user_goto(*(u8*)(_src + _i), (u8 __user *)(_dst + _i), e);\
>   } while (0)
>   
> +#define HAVE_GET_KERNEL_NOFAULT
> +
> +#define __get_kernel_nofault(dst, src, type, err_label)			\
> +do {									\
> +	int __kr_err;							\
> +									\
> +	__get_user_size(*((type *)(dst)), (__force type __user *)(src),	\
> +			sizeof(type), __kr_err);			\
> +	if (unlikely(__kr_err))						\
> +		goto err_label;						\
> +} while (0)
> +
> +#define __put_kernel_nofault(dst, src, type, err_label)			\
> +	__put_user_size_goto(*((type *)(src)),				\
> +		(__force type __user *)(dst), sizeof(type), err_label)
> +
>   #endif	/* _ARCH_POWERPC_UACCESS_H */
> 

^ permalink raw reply

* Re: [PATCH v2 2/4] powerpc/mem: Store the dt_root_size/addr cell values for later usage
From: Aneesh Kumar K.V @ 2020-08-17 15:56 UTC (permalink / raw)
  To: Hari Bathini, linuxppc-dev, mpe; +Cc: Nathan Lynch
In-Reply-To: <f0c90b3b-5192-de01-c18c-0c69e895237f@linux.ibm.com>

On 8/17/20 9:00 PM, Hari Bathini wrote:
> 
> 
> On 06/08/20 9:53 pm, Aneesh Kumar K.V wrote:
>> dt_root_addr_cells and dt_root_size_cells are __initdata variables.
>> So make a copy of the same which can be used post init.
>>
> 
> This avoids doing the same thing at multiple places.
> So, thanks for the patch, Aneesh.
> 
> Looks good to me.
> 
> but nitpick below...
> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>   arch/powerpc/include/asm/drmem.h | 2 ++
>>   arch/powerpc/kernel/prom.c       | 7 +++++++
>>   arch/powerpc/mm/numa.c           | 1 +
>>   3 files changed, 10 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/drmem.h 
>> b/arch/powerpc/include/asm/drmem.h
>> index d719cbac34b2..ffb59caa88ee 100644
>> --- a/arch/powerpc/include/asm/drmem.h
>> +++ b/arch/powerpc/include/asm/drmem.h
>> @@ -123,4 +123,6 @@ static inline void lmb_clear_nid(struct drmem_lmb 
>> *lmb)
>>   }
>>   #endif
>> +extern int mem_addr_cells, mem_size_cells;
> 
> Should this be in include/asm/prom.h instead, given the definition
> comes from kernel/prom.c file?
> 

We added the variable definition to prom.c because that is where we are 
doing early device tree scanning. But the users should not really be 
including prom.h. The variables are related drmem and hence I used 
drmem.h for include.

>> +
>>   #endif /* _ASM_POWERPC_LMB_H */
>> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
>> index d8a2fb87ba0c..9a1701e85747 100644
>> --- a/arch/powerpc/kernel/prom.c
>> +++ b/arch/powerpc/kernel/prom.c
>> @@ -73,6 +73,7 @@ u64 ppc64_rma_size;
>>   #endif
>>   static phys_addr_t first_memblock_size;
>>   static int __initdata boot_cpu_count;
>> +int mem_addr_cells, mem_size_cells;
>>   static int __init early_parse_mem(char *p)
>>   {
>> @@ -536,6 +537,12 @@ static int __init 
>> early_init_dt_scan_memory_ppc(unsigned long node,
>>                           const char *uname,
>>                           int depth, void *data)
>>   {
>> +    /*
>> +     * Make a copy from __initdata variable
>> +     */
>> +    mem_addr_cells = dt_root_addr_cells;
>> +    mem_size_cells = dt_root_size_cells;
>> +
>>   #ifdef CONFIG_PPC_PSERIES
>>       if (depth == 1 &&
>>           strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) {
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index 058fee9a0835..77d41d9775d2 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -368,6 +368,7 @@ static void __init get_n_mem_cells(int 
>> *n_addr_cells, int *n_size_cells)
>>       of_node_put(memory);
>>   }
>> +/*  dt_mem_next_cell is __init  */
>>   static unsigned long read_n_cells(int n, const __be32 **buf)
>>   {
>>       unsigned long result = 0;
>>


^ permalink raw reply

* Re: [PATCH] powerpc/book3s64/radix: Fix boot failure with large amount of guest memory
From: Aneesh Kumar K.V @ 2020-08-17 15:58 UTC (permalink / raw)
  To: Hari Bathini, linuxppc-dev, mpe; +Cc: Shirisha Ganta, Sandipan Das, npiggin
In-Reply-To: <87c4ab91-b0a0-bd79-40fd-df93aedd98f3@linux.ibm.com>

On 8/17/20 9:13 PM, Hari Bathini wrote:
> 
> 
> On 13/08/20 9:50 pm, Aneesh Kumar K.V wrote:
>> If the hypervisor doesn't support hugepages, the kernel ends up 
>> allocating a large
>> number of page table pages. The early page table allocation was wrongly
>> setting the max memblock limit to ppc64_rma_size with radix translation
>> which resulted in boot failure as shown below.
>>
>> Kernel panic - not syncing:
>> early_alloc_pgtable: Failed to allocate 16777216 bytes align=0x1000000 
>> nid=-1 from=0x0000000000000000 max_addr=0xffffffffffffffff
>>   CPU: 0 PID: 0 Comm: swapper Not tainted 5.8.0-24.9-default+ #2
>>   Call Trace:
>>   [c0000000016f3d00] [c0000000007c6470] dump_stack+0xc4/0x114 
>> (unreliable)
>>   [c0000000016f3d40] [c00000000014c78c] panic+0x164/0x418
>>   [c0000000016f3dd0] [c000000000098890] early_alloc_pgtable+0xe0/0xec
>>   [c0000000016f3e60] [c0000000010a5440] radix__early_init_mmu+0x360/0x4b4
>>   [c0000000016f3ef0] [c000000001099bac] early_init_mmu+0x1c/0x3c
>>   [c0000000016f3f10] [c00000000109a320] early_setup+0x134/0x170
>>
>> This was because the kernel was checking for the radix feature before 
>> we enable the
>> feature via mmu_features. This resulted in the kernel using hash 
>> restrictions on
>> radix.
>>
>> Rework the early init code such that the kernel boots with memblock 
>> restrictions
>> as imposed by hash. At that point, the kernel still hasn't finalized the
>> translation the kernel will end up using.
>>
>> We have three different ways of detecting radix.
>>
>> 1. dt_cpu_ftrs_scan -> used only in case of PowerNV
>> 2. ibm,pa-features -> Used when we don't use dt_cpu_ftrs_scan
>> 3. CAS -> Where we negotiate with hypervisor about the supported 
>> translation.
>>
>> We look at 1 or 2 early in the boot and after that, we look at the CAS 
>> vector to
>> finalize the translation the kernel will use. We also support a kernel 
>> command
>> line option (disable_radix) to switch to hash.
>>
>> Update the memblock limit after mmu_early_init_devtree() if the kernel 
>> is going
>> to use radix translation. This forces some of the memblock allocations 
>> we do before
>> mmu_early_init_devtree() to be within the RMA limit.
> 
> Minor comments below. Nonetheless...
> 
> Reviewed-by: Hari Bathini <hbathini@linux.ibm.com>
> 
>>
>> Fixes: 2bfd65e45e87 ("powerpc/mm/radix: Add radix callbacks for early 
>> init routines")
>> Reported-by: Shirisha Ganta <shiganta@in.ibm.com>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>   arch/powerpc/include/asm/book3s/64/mmu.h | 8 +++++---
>>   arch/powerpc/kernel/prom.c               | 6 ++++++
>>   arch/powerpc/mm/book3s64/radix_pgtable.c | 2 ++
>>   3 files changed, 13 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
>> b/arch/powerpc/include/asm/book3s/64/mmu.h
>> index 55442d45c597..4245f99453f5 100644
>> --- a/arch/powerpc/include/asm/book3s/64/mmu.h
>> +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
>> @@ -244,9 +244,11 @@ extern void 
>> radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
>>   static inline void setup_initial_memory_limit(phys_addr_t 
>> first_memblock_base,
>>                             phys_addr_t first_memblock_size)
>>   {
>> -    if (early_radix_enabled())
>> -        return radix__setup_initial_memory_limit(first_memblock_base,
>> -                           first_memblock_size);
>> +    /*
>> +     * Hash has more strict restrictions. At this point we don't
>> +     * know which translations we will pick. Hence got with hash
> 
> :s/got with/go with/
> 
>> +     * restrictions.
>> +     */
>>       return hash__setup_initial_memory_limit(first_memblock_base,
>>                          first_memblock_size);
>>   }
>> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
>> index d8a2fb87ba0c..340900ae95a4 100644
>> --- a/arch/powerpc/kernel/prom.c
>> +++ b/arch/powerpc/kernel/prom.c
>> @@ -811,6 +811,12 @@ void __init early_init_devtree(void *params)
>>       mmu_early_init_devtree();
>> +    /*
>> +     * Reset ppc64_rma_size and memblock memory limit
>> +     */
>> +    if (early_radix_enabled())
>> +        radix__setup_initial_memory_limit(memstart_addr, 
>> first_memblock_size);
>> +
>>   #ifdef CONFIG_PPC_POWERNV
>>       /* Scan and build the list of machine check recoverable ranges */
>>       of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL);
>> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
>> b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> index 28c784976bed..094daf16acac 100644
>> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
>> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> @@ -747,6 +747,8 @@ void radix__setup_initial_memory_limit(phys_addr_t 
>> first_memblock_base,
>>        * Radix mode is not limited by RMA / VRMA addressing.
>>        */
>>       ppc64_rma_size = ULONG_MAX;
> 
>> +
>> +    memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
> 
> Probably the same thing but I would prefer the below instead:
> 
> memblock_set_current_limit(ppc64_rma_size);

This is not really related to ppc64_rma_size, right? On radix what we 
actually want is memblock alloc from anywhere. Actually what we want is

memblock_set_current_limit(memblock_limit_from_rma(ppc64_rma_size))


But that is an unnecessary complication?

-aneesh


^ permalink raw reply

* Re: [RFC PATCH] powerpc/drmem: use global variable instead of fetching again
From: Hari Bathini @ 2020-08-17 17:31 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linuxppc-dev, mpe; +Cc: Nathan Lynch
In-Reply-To: <20200806125200.252403-1-aneesh.kumar@linux.ibm.com>



On 06/08/20 6:22 pm, Aneesh Kumar K.V wrote:
> use mem_addr_cells/mem_size_cells instead of fetching the values
> again from device tree.
> 

Looks good to me.

> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>   arch/powerpc/mm/drmem.c | 24 ++++++------------------
>   1 file changed, 6 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
> index b2eeea39684c..f533a7b04ab9 100644
> --- a/arch/powerpc/mm/drmem.c
> +++ b/arch/powerpc/mm/drmem.c
> @@ -14,8 +14,6 @@
>   #include <asm/prom.h>
>   #include <asm/drmem.h>
>   
> -static int n_root_addr_cells, n_root_size_cells;
> -
>   static struct drmem_lmb_info __drmem_info;
>   struct drmem_lmb_info *drmem_info = &__drmem_info;
>   
> @@ -196,8 +194,8 @@ static void read_drconf_v1_cell(struct drmem_lmb *lmb,
>   {
>   	const __be32 *p = *prop;
>   
> -	lmb->base_addr = of_read_number(p, n_root_addr_cells);
> -	p += n_root_addr_cells;
> +	lmb->base_addr = of_read_number(p, mem_addr_cells);
> +	p += mem_addr_cells;
>   	lmb->drc_index = of_read_number(p++, 1);
>   
>   	p++; /* skip reserved field */
> @@ -233,8 +231,8 @@ static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
>   	const __be32 *p = *prop;
>   
>   	dr_cell->seq_lmbs = of_read_number(p++, 1);
> -	dr_cell->base_addr = of_read_number(p, n_root_addr_cells);
> -	p += n_root_addr_cells;
> +	dr_cell->base_addr = of_read_number(p, mem_addr_cells);
> +	p += mem_addr_cells;
>   	dr_cell->drc_index = of_read_number(p++, 1);
>   	dr_cell->aa_index = of_read_number(p++, 1);
>   	dr_cell->flags = of_read_number(p++, 1);
> @@ -285,10 +283,6 @@ int __init walk_drmem_lmbs_early(unsigned long node, void *data,
>   	if (!prop || len < dt_root_size_cells * sizeof(__be32))
>   		return ret;
>   
> -	/* Get the address & size cells */
> -	n_root_addr_cells = dt_root_addr_cells;
> -	n_root_size_cells = dt_root_size_cells;
> -
>   	drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &prop);
>   
>   	usm = of_get_flat_dt_prop(node, "linux,drconf-usable-memory", &len);
> @@ -318,12 +312,12 @@ static int init_drmem_lmb_size(struct device_node *dn)
>   		return 0;
>   
>   	prop = of_get_property(dn, "ibm,lmb-size", &len);
> -	if (!prop || len < n_root_size_cells * sizeof(__be32)) {
> +	if (!prop || len < mem_size_cells * sizeof(__be32)) {
>   		pr_info("Could not determine LMB size\n");
>   		return -1;
>   	}
>   
> -	drmem_info->lmb_size = of_read_number(prop, n_root_size_cells);
> +	drmem_info->lmb_size = of_read_number(prop, mem_size_cells);
>   	return 0;
>   }
>   
> @@ -353,12 +347,6 @@ int walk_drmem_lmbs(struct device_node *dn, void *data,
>   	if (!of_root)
>   		return ret;
>   
> -	/* Get the address & size cells */
> -	of_node_get(of_root);
> -	n_root_addr_cells = of_n_addr_cells(of_root);
> -	n_root_size_cells = of_n_size_cells(of_root);
> -	of_node_put(of_root);
> -
>   	if (init_drmem_lmb_size(dn))
>   		return ret;
>   
> 

Thanks
Hari

^ permalink raw reply

* Re: [PATCH 0/8] scsi: convert tasklets to use new tasklet_setup()
From: Kees Cook @ 2020-08-17 19:28 UTC (permalink / raw)
  To: James Bottomley
  Cc: martin.petersen, linux-scsi, shivasharan.srikanteshwara,
	linux-kernel, kashyap.desai, sumit.saxena, Allen Pais,
	target-devel, Allen Pais, linuxppc-dev, megaraidlinux.pdl
In-Reply-To: <1597675318.4475.11.camel@linux.ibm.com>

On Mon, Aug 17, 2020 at 07:41:58AM -0700, James Bottomley wrote:
> On Mon, 2020-08-17 at 14:24 +0530, Allen Pais wrote:
> > From: Allen Pais <allen.lkml@gmail.com>
> > 
> > Commit 12cc923f1ccc ("tasklet: Introduce new initialization API")
> > introduced a new tasklet initialization API. This series converts 
> > all the scsi drivers to use the new tasklet_setup() API
> 
> I've got to say I agree with Jens, this was a silly obfuscation:
> 
> +#define from_tasklet(var, callback_tasklet, tasklet_fieldname) \
> +       container_of(callback_tasklet, typeof(*var), tasklet_fieldname)
> 
> Just use container_of directly since we all understand what it does.

But then the lines get really long, wrapped, etc. This is what the
timer_struct conversion did too (added a container_of wrapper), so I
think it makes sense here too.

-- 
Kees Cook

^ permalink raw reply

* [PATCH v2] powerpc/pseries/svm: Allocate SWIOTLB buffer anywhere in memory
From: Thiago Jung Bauermann @ 2020-08-17 21:46 UTC (permalink / raw)
  To: iommu
  Cc: Konrad Rzeszutek Wilk, linuxppc-dev, Ram Pai, linux-kernel,
	Satheesh Rajendran, Robin Murphy, Christoph Hellwig,
	Thiago Jung Bauermann, Marek Szyprowski

POWER secure guests (i.e., guests which use the Protected Execution
Facility) need to use SWIOTLB to be able to do I/O with the hypervisor, but
they don't need the SWIOTLB memory to be in low addresses since the
hypervisor doesn't have any addressing limitation.

This solves a SWIOTLB initialization problem we are seeing in secure guests
with 128 GB of RAM: they are configured with 4 GB of crashkernel reserved
memory, which leaves no space for SWIOTLB in low addresses.

To do this, we use mostly the same code as swiotlb_init(), but allocate the
buffer using memblock_alloc() instead of memblock_alloc_low().

We also need to add swiotlb_set_no_iotlb_memory() in order to set the
no_iotlb_memory flag if initialization fails.

Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---
 arch/powerpc/include/asm/svm.h       |  4 ++++
 arch/powerpc/mm/mem.c                |  6 +++++-
 arch/powerpc/platforms/pseries/svm.c | 27 +++++++++++++++++++++++++++
 include/linux/swiotlb.h              |  1 +
 kernel/dma/swiotlb.c                 |  5 +++++
 5 files changed, 42 insertions(+), 1 deletion(-)

Changes from v1:
- Open-code swiotlb_init() in arch-specific code, as suggested by
  Christoph.

diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h
index 85580b30aba4..7546402d796a 100644
--- a/arch/powerpc/include/asm/svm.h
+++ b/arch/powerpc/include/asm/svm.h
@@ -15,6 +15,8 @@ static inline bool is_secure_guest(void)
 	return mfmsr() & MSR_S;
 }
 
+void __init svm_swiotlb_init(void);
+
 void dtl_cache_ctor(void *addr);
 #define get_dtl_cache_ctor()	(is_secure_guest() ? dtl_cache_ctor : NULL)
 
@@ -25,6 +27,8 @@ static inline bool is_secure_guest(void)
 	return false;
 }
 
+static inline void svm_swiotlb_init(void) {}
+
 #define get_dtl_cache_ctor() NULL
 
 #endif /* CONFIG_PPC_SVM */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index c2c11eb8dcfc..0f21bcb16405 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -50,6 +50,7 @@
 #include <asm/swiotlb.h>
 #include <asm/rtas.h>
 #include <asm/kasan.h>
+#include <asm/svm.h>
 
 #include <mm/mmu_decl.h>
 
@@ -290,7 +291,10 @@ void __init mem_init(void)
 	 * back to to-down.
 	 */
 	memblock_set_bottom_up(true);
-	swiotlb_init(0);
+	if (is_secure_guest())
+		svm_swiotlb_init();
+	else
+		swiotlb_init(0);
 #endif
 
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
index 40c0637203d5..d592e663a8d6 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/memblock.h>
 #include <asm/machdep.h>
 #include <asm/svm.h>
 #include <asm/swiotlb.h>
@@ -34,6 +35,32 @@ static int __init init_svm(void)
 }
 machine_early_initcall(pseries, init_svm);
 
+/*
+ * Initialize SWIOTLB. Essentially the same as swiotlb_init(), except that it
+ * can allocate the buffer anywhere in memory. Since the hypervisor doesn't have
+ * any addressing limitation, we don't need to allocate it in low addresses.
+ */
+void __init svm_swiotlb_init(void)
+{
+	unsigned char *vstart;
+	unsigned long bytes, io_tlb_nslabs;
+
+	io_tlb_nslabs = (swiotlb_size_or_default() >> IO_TLB_SHIFT);
+	io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+
+	bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+
+	vstart = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE);
+	if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, false))
+		return;
+
+	if (io_tlb_start)
+		memblock_free_early(io_tlb_start,
+				    PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+	pr_warn("Cannot allocate SWIOTLB buffer");
+	swiotlb_set_no_iotlb_memory(true);
+}
+
 int set_memory_encrypted(unsigned long addr, int numpages)
 {
 	if (!PAGE_ALIGNED(addr))
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 046bb94bd4d6..991e9f13e663 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -35,6 +35,7 @@ extern unsigned long swiotlb_nr_tbl(void);
 unsigned long swiotlb_size_or_default(void);
 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
 extern void __init swiotlb_update_mem_attributes(void);
+void __init swiotlb_set_no_iotlb_memory(bool value);
 
 /*
  * Enumeration for sync targets
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index c19379fabd20..ed2b8818ff67 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -132,6 +132,11 @@ early_param("swiotlb", setup_io_tlb_npages);
 
 static bool no_iotlb_memory;
 
+void __init swiotlb_set_no_iotlb_memory(bool value)
+{
+	no_iotlb_memory = value;
+}
+
 unsigned long swiotlb_nr_tbl(void)
 {
 	return unlikely(no_iotlb_memory) ? 0 : io_tlb_nslabs;

^ permalink raw reply related

* Re: [PATCH] swiotlb: Allow allocating buffer anywhere in memory
From: Thiago Jung Bauermann @ 2020-08-17 21:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Konrad Rzeszutek Wilk, linuxppc-dev, Ram Pai, linux-kernel, iommu,
	Satheesh Rajendran, Robin Murphy, Mike Rapoport, Marek Szyprowski
In-Reply-To: <20200817102020.GD25336@lst.de>


Hello Christoph,

Christoph Hellwig <hch@lst.de> writes:

> On Sat, Aug 15, 2020 at 05:45:36PM -0300, Thiago Jung Bauermann wrote:
>> POWER secure guests (i.e., guests which use the Protection Execution
>> Facility) need to use SWIOTLB to be able to do I/O with the hypervisor, but
>> they don't need the SWIOTLB memory to be in low addresses since the
>> hypervisor doesn't have any addressing limitation.
>> 
>> This solves a SWIOTLB initialization problem we are seeing in secure guests
>> with 128 GB of RAM: they are configured with 4 GB of crashkernel reserved
>> memory, which leaves no space for SWIOTLB in low addresses.
>
> What about just open coding the allocation and using
> swiotlb_init_with_tbl?

Yes, that works too. I just sent a v2 implementing that change. I just
had to add a small accessor function so that I could set no_iotlb_memory
from outside swiotlb.c.

Thank you for the quick review.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH v4 6/8] mm: Move vmap_range from lib/ioremap.c to mm/vmalloc.c
From: Tang Yizhou @ 2020-08-17 13:23 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm
  Cc: linux-arch, Zefan Li, Will Deacon, Catalin Marinas, x86,
	linux-kernel, john.wanghui, Ingo Molnar, Borislav Petkov,
	Jonathan Cameron, H. Peter Anvin, Thomas Gleixner, linuxppc-dev,
	linux-arm-kernel
In-Reply-To: <20200816090904.83947-7-npiggin@gmail.com>

Hi Nicholas,

We may change the title as follows:
mm: Move vmap_range from mm/ioremap.c to mm/vmalloc.c

Yizhou


^ permalink raw reply

* Re: [PATCH 0/8] scsi: convert tasklets to use new tasklet_setup()
From: James Bottomley @ 2020-08-17 14:41 UTC (permalink / raw)
  To: Allen Pais, martin.petersen, kashyap.desai, sumit.saxena,
	shivasharan.srikanteshwara
  Cc: keescook, linux-scsi, linux-kernel, Allen Pais, target-devel,
	linuxppc-dev, megaraidlinux.pdl
In-Reply-To: <20200817085409.25268-1-allen.cryptic@gmail.com>

On Mon, 2020-08-17 at 14:24 +0530, Allen Pais wrote:
> From: Allen Pais <allen.lkml@gmail.com>
> 
> Commit 12cc923f1ccc ("tasklet: Introduce new initialization API")'
> introduced a new tasklet initialization API. This series converts 
> all the scsi drivers to use the new tasklet_setup() API

I've got to say I agree with Jens, this was a silly obfuscation:

+#define from_tasklet(var, callback_tasklet, tasklet_fieldname) \
+       container_of(callback_tasklet, typeof(*var), tasklet_fieldname)

Just use container_of directly since we all understand what it does.

James


^ permalink raw reply

* Re: [PATCH 0/8] scsi: convert tasklets to use new tasklet_setup()
From: James Bottomley @ 2020-08-17 19:57 UTC (permalink / raw)
  To: Kees Cook
  Cc: martin.petersen, linux-scsi, shivasharan.srikanteshwara,
	linux-kernel, kashyap.desai, sumit.saxena, Allen Pais,
	target-devel, Allen Pais, linuxppc-dev, megaraidlinux.pdl
In-Reply-To: <202008171227.D3A4F454D8@keescook>

On Mon, 2020-08-17 at 12:28 -0700, Kees Cook wrote:
> On Mon, Aug 17, 2020 at 07:41:58AM -0700, James Bottomley wrote:
> > On Mon, 2020-08-17 at 14:24 +0530, Allen Pais wrote:
> > > From: Allen Pais <allen.lkml@gmail.com>
> > > 
> > > Commit 12cc923f1ccc ("tasklet: Introduce new initialization
> > > API")' introduced a new tasklet initialization API. This series
> > > converts all the scsi drivers to use the new tasklet_setup() API
> > 
> > I've got to say I agree with Jens, this was a silly obfuscation:
> > 
> > +#define from_tasklet(var, callback_tasklet, tasklet_fieldname) \
> > +       container_of(callback_tasklet, typeof(*var),
> > tasklet_fieldname)
> > 
> > Just use container_of directly since we all understand what it
> > does.
> 
> But then the lines get really long, wrapped, etc.

I really don't think that's a problem but if you want to add a new
generic container_of that does typeof instead of insisting on the type,
I'd be sort of OK with that ... provided you don't gratuitously alter
the argument order.

The thing I object to is that this encourages everyone to roll their
own unnecessary container_of type macros in spite of the fact that its
function is wholly generic.  It's fine if you're eliminating one of the
arguments, or actually making the macro specific to the type, but in
this case you're not, you're making a completely generic macro where
the name is the only thing that's specific to this case.

>  This is what the timer_struct conversion did too (added a
> container_of wrapper), so I think it makes sense here too.

I didn't see that one to object to it ...

James

^ permalink raw reply

* Re: [oss-drivers] [PATCH 16/20] ethernet: netronome: convert tasklets to use new tasklet_setup() API
From: Simon Horman @ 2020-08-17 14:15 UTC (permalink / raw)
  To: Allen Pais
  Cc: jes, borisp, keescook, linux-rdma, netdev, kda, cooldavid,
	dougmill, linux-kernel, linux-acenic, oss-drivers, kuba,
	Romain Perier, linuxppc-dev, davem, linux-arm-kernel, mlindner
In-Reply-To: <20200817082434.21176-18-allen.lkml@gmail.com>

On Mon, Aug 17, 2020 at 01:54:30PM +0530, Allen Pais wrote:
> In preparation for unconditionally passing the
> struct tasklet_struct pointer to all tasklet
> callbacks, switch to using the new tasklet_setup()
> and from_tasklet() to pass the tasklet pointer explicitly.
> 
> Signed-off-by: Romain Perier <romain.perier@gmail.com>
> Signed-off-by: Allen Pais <allen.lkml@gmail.com>

Reviewed-by: Simon Horman <simon.horman@netronome.com>

But:

This series should be targeted at net-next, and thus have net-next in its
subject

	[PATCH net-next x/y] ...

And it should be posted when net-next is open: it is currently closed.

	http://vger.kernel.org/~davem/net-next.html

> ---
>  drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
> index 39ee23e8c0bf..1dcd24d899f5 100644
> --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
> +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
> @@ -2287,9 +2287,9 @@ static bool nfp_ctrl_rx(struct nfp_net_r_vector *r_vec)
>  	return budget;
>  }
>  
> -static void nfp_ctrl_poll(unsigned long arg)
> +static void nfp_ctrl_poll(struct tasklet_struct *t)
>  {
> -	struct nfp_net_r_vector *r_vec = (void *)arg;
> +	struct nfp_net_r_vector *r_vec = from_tasklet(r_vec, t, tasklet);
>  
>  	spin_lock(&r_vec->lock);
>  	nfp_net_tx_complete(r_vec->tx_ring, 0);
> @@ -2337,8 +2337,7 @@ static void nfp_net_vecs_init(struct nfp_net *nn)
>  
>  			__skb_queue_head_init(&r_vec->queue);
>  			spin_lock_init(&r_vec->lock);
> -			tasklet_init(&r_vec->tasklet, nfp_ctrl_poll,
> -				     (unsigned long)r_vec);
> +			tasklet_setup(&r_vec->tasklet, nfp_ctrl_poll);
>  			tasklet_disable(&r_vec->tasklet);
>  		}
>  
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH 1/2] powerpc/64s: remove PROT_SAO support
From: Shawn Anastasio @ 2020-08-17 19:14 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev
In-Reply-To: <20200607120209.463501-1-npiggin@gmail.com>

I'm a bit concerned about the removal of PROT_SAO.

 From what I can see, a feature like this would be extremely useful for
emulating architectures with stronger memory models. QEMU's multi-
threaded TCG project in particular looks like it would be a good
candidate, since as far as I'm aware it is currently completely
unable to perform strong-on-weak emulation.

Without hardware support like SAO provides, the only way I could see
to achieve this would be by emitting tons of unnecessary and costly
memory barrier instructions.

I understand that ISA 3.1 and POWER10 have dropped SAO, but as a POWER9
user it seems a bit silly to have a potentially useful feature dropped
from the kernel just because a future processor doesn't support it.

Curious to hear more thoughts on this.

Regards,
Shawn

On 6/7/20 7:02 AM, Nicholas Piggin wrote:
> ISA v3.1 does not support the SAO storage control attribute required to
> implement PROT_SAO. PROT_SAO was used by specialised system software
> (Lx86) that has been discontinued for about 7 years, and is not thought
> to be used elsewhere, so removal should not cause problems.
> 
> We rather remove it than keep support for older processors, because
> live migrating guest partitions to newer processors may not be possible
> if SAO is in use.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>   arch/powerpc/include/asm/book3s/64/pgtable.h  |  8 ++--
>   arch/powerpc/include/asm/cputable.h           |  9 ++--
>   arch/powerpc/include/asm/kvm_book3s_64.h      |  3 +-
>   arch/powerpc/include/asm/mman.h               | 24 +++--------
>   arch/powerpc/include/asm/nohash/64/pgtable.h  |  2 -
>   arch/powerpc/kernel/dt_cpu_ftrs.c             |  2 +-
>   arch/powerpc/mm/book3s64/hash_utils.c         |  2 -
>   include/linux/mm.h                            |  2 -
>   include/trace/events/mmflags.h                |  2 -
>   mm/ksm.c                                      |  4 --
>   tools/testing/selftests/powerpc/mm/.gitignore |  1 -
>   tools/testing/selftests/powerpc/mm/Makefile   |  4 +-
>   tools/testing/selftests/powerpc/mm/prot_sao.c | 42 -------------------
>   13 files changed, 18 insertions(+), 87 deletions(-)
>   delete mode 100644 tools/testing/selftests/powerpc/mm/prot_sao.c
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index f17442c3a092..d9e92586f8dc 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -20,9 +20,13 @@
>   #define _PAGE_RW		(_PAGE_READ | _PAGE_WRITE)
>   #define _PAGE_RWX		(_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)
>   #define _PAGE_PRIVILEGED	0x00008 /* kernel access only */
> -#define _PAGE_SAO		0x00010 /* Strong access order */
> +
> +#define _PAGE_CACHE_CTL		0x00030 /* Bits for the folowing cache modes */
> +			/*	No bits set is normal cacheable memory */
> +			/*	0x00010 unused, is SAO bit on radix POWER9 */
>   #define _PAGE_NON_IDEMPOTENT	0x00020 /* non idempotent memory */
>   #define _PAGE_TOLERANT		0x00030 /* tolerant memory, cache inhibited */
> +
>   #define _PAGE_DIRTY		0x00080 /* C: page changed */
>   #define _PAGE_ACCESSED		0x00100 /* R: page referenced */
>   /*
> @@ -825,8 +829,6 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
>   	return hash__set_pte_at(mm, addr, ptep, pte, percpu);
>   }
>   
> -#define _PAGE_CACHE_CTL	(_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT)
> -
>   #define pgprot_noncached pgprot_noncached
>   static inline pgprot_t pgprot_noncached(pgprot_t prot)
>   {
> diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
> index bac2252c839e..c7e923b0000a 100644
> --- a/arch/powerpc/include/asm/cputable.h
> +++ b/arch/powerpc/include/asm/cputable.h
> @@ -191,7 +191,6 @@ static inline void cpu_feature_keys_init(void) { }
>   #define CPU_FTR_SPURR			LONG_ASM_CONST(0x0000000001000000)
>   #define CPU_FTR_DSCR			LONG_ASM_CONST(0x0000000002000000)
>   #define CPU_FTR_VSX			LONG_ASM_CONST(0x0000000004000000)
> -#define CPU_FTR_SAO			LONG_ASM_CONST(0x0000000008000000)
>   #define CPU_FTR_CP_USE_DCBTZ		LONG_ASM_CONST(0x0000000010000000)
>   #define CPU_FTR_UNALIGNED_LD_STD	LONG_ASM_CONST(0x0000000020000000)
>   #define CPU_FTR_ASYM_SMT		LONG_ASM_CONST(0x0000000040000000)
> @@ -435,7 +434,7 @@ static inline void cpu_feature_keys_init(void) { }
>   	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
>   	    CPU_FTR_COHERENT_ICACHE | \
>   	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
> -	    CPU_FTR_DSCR | CPU_FTR_SAO  | CPU_FTR_ASYM_SMT | \
> +	    CPU_FTR_DSCR | CPU_FTR_ASYM_SMT | \
>   	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
>   	    CPU_FTR_CFAR | CPU_FTR_HVMODE | \
>   	    CPU_FTR_VMX_COPY | CPU_FTR_HAS_PPR | CPU_FTR_DABRX | CPU_FTR_PKEY)
> @@ -444,7 +443,7 @@ static inline void cpu_feature_keys_init(void) { }
>   	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
>   	    CPU_FTR_COHERENT_ICACHE | \
>   	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
> -	    CPU_FTR_DSCR | CPU_FTR_SAO  | \
> +	    CPU_FTR_DSCR | \
>   	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
>   	    CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
>   	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
> @@ -455,7 +454,7 @@ static inline void cpu_feature_keys_init(void) { }
>   	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
>   	    CPU_FTR_COHERENT_ICACHE | \
>   	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
> -	    CPU_FTR_DSCR | CPU_FTR_SAO  | \
> +	    CPU_FTR_DSCR | \
>   	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
>   	    CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
>   	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \
> @@ -473,7 +472,7 @@ static inline void cpu_feature_keys_init(void) { }
>   	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
>   	    CPU_FTR_COHERENT_ICACHE | \
>   	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
> -	    CPU_FTR_DSCR | CPU_FTR_SAO  | \
> +	    CPU_FTR_DSCR | \
>   	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
>   	    CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
>   	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 9bb9bb370b53..579c9229124b 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -400,7 +400,8 @@ static inline bool hpte_cache_flags_ok(unsigned long hptel, bool is_ci)
>   
>   	/* Handle SAO */
>   	if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) &&
> -	    cpu_has_feature(CPU_FTR_ARCH_206))
> +	    cpu_has_feature(CPU_FTR_ARCH_206) &&
> +	    !cpu_has_feature(CPU_FTR_ARCH_31))
>   		wimg = HPTE_R_M;
>   
>   	if (!is_ci)
> diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h
> index d610c2e07b28..43a62f3e21a0 100644
> --- a/arch/powerpc/include/asm/mman.h
> +++ b/arch/powerpc/include/asm/mman.h
> @@ -13,38 +13,24 @@
>   #include <linux/pkeys.h>
>   #include <asm/cpu_has_feature.h>
>   
> -/*
> - * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits()
> - * here.  How important is the optimization?
> - */
> +#ifdef CONFIG_PPC_MEM_KEYS
>   static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
>   		unsigned long pkey)
>   {
> -#ifdef CONFIG_PPC_MEM_KEYS
> -	return (((prot & PROT_SAO) ? VM_SAO : 0) | pkey_to_vmflag_bits(pkey));
> -#else
> -	return ((prot & PROT_SAO) ? VM_SAO : 0);
> -#endif
> +	return pkey_to_vmflag_bits(pkey);
>   }
>   #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
>   
>   static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
>   {
> -#ifdef CONFIG_PPC_MEM_KEYS
> -	return (vm_flags & VM_SAO) ?
> -		__pgprot(_PAGE_SAO | vmflag_to_pte_pkey_bits(vm_flags)) :
> -		__pgprot(0 | vmflag_to_pte_pkey_bits(vm_flags));
> -#else
> -	return (vm_flags & VM_SAO) ? __pgprot(_PAGE_SAO) : __pgprot(0);
> -#endif
> +	return __pgprot(vmflag_to_pte_pkey_bits(vm_flags));
>   }
>   #define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags)
> +#endif
>   
>   static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
>   {
> -	if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM | PROT_SAO))
> -		return false;
> -	if ((prot & PROT_SAO) && !cpu_has_feature(CPU_FTR_SAO))
> +	if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM))
>   		return false;
>   	return true;
>   }
> diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
> index 3424381b81da..2fd528ef48e0 100644
> --- a/arch/powerpc/include/asm/nohash/64/pgtable.h
> +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
> @@ -82,8 +82,6 @@
>    */
>   #include <asm/nohash/pte-book3e.h>
>   
> -#define _PAGE_SAO	0
> -
>   #define PTE_RPN_MASK	(~((1UL << PTE_RPN_SHIFT) - 1))
>   
>   /*
> diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
> index 3a409517c031..8d2e4043702f 100644
> --- a/arch/powerpc/kernel/dt_cpu_ftrs.c
> +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
> @@ -622,7 +622,7 @@ static struct dt_cpu_feature_match __initdata
>   	{"processor-control-facility-v3", feat_enable_dbell, CPU_FTR_DBELL},
>   	{"processor-utilization-of-resources-register", feat_enable_purr, 0},
>   	{"no-execute", feat_enable, 0},
> -	{"strong-access-ordering", feat_enable, CPU_FTR_SAO},
> +	{"strong-access-ordering", feat_enable, 0},
>   	{"cache-inhibited-large-page", feat_enable_large_ci, 0},
>   	{"coprocessor-icswx", feat_enable, 0},
>   	{"hypervisor-virtualization-interrupt", feat_enable_hvi, 0},
> diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
> index 0124003e60d0..14b6abdc3bd8 100644
> --- a/arch/powerpc/mm/book3s64/hash_utils.c
> +++ b/arch/powerpc/mm/book3s64/hash_utils.c
> @@ -232,8 +232,6 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
>   		rflags |= HPTE_R_I;
>   	else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)
>   		rflags |= (HPTE_R_I | HPTE_R_G);
> -	else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
> -		rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M);
>   	else
>   		/*
>   		 * Add memory coherence if cache inhibited is not set
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 86adc71a972f..bdcaae914120 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -316,8 +316,6 @@ extern unsigned int kobjsize(const void *objp);
>   
>   #if defined(CONFIG_X86)
>   # define VM_PAT		VM_ARCH_1	/* PAT reserves whole VMA at once (x86) */
> -#elif defined(CONFIG_PPC)
> -# define VM_SAO		VM_ARCH_1	/* Strong Access Ordering (powerpc) */
>   #elif defined(CONFIG_PARISC)
>   # define VM_GROWSUP	VM_ARCH_1
>   #elif defined(CONFIG_IA64)
> diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
> index 5fb752034386..939092dbcb8b 100644
> --- a/include/trace/events/mmflags.h
> +++ b/include/trace/events/mmflags.h
> @@ -114,8 +114,6 @@ IF_HAVE_PG_IDLE(PG_idle,		"idle"		)
>   
>   #if defined(CONFIG_X86)
>   #define __VM_ARCH_SPECIFIC_1 {VM_PAT,     "pat"           }
> -#elif defined(CONFIG_PPC)
> -#define __VM_ARCH_SPECIFIC_1 {VM_SAO,     "sao"           }
>   #elif defined(CONFIG_PARISC) || defined(CONFIG_IA64)
>   #define __VM_ARCH_SPECIFIC_1 {VM_GROWSUP,	"growsup"	}
>   #elif !defined(CONFIG_MMU)
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 18c5d005bd01..b225b0e16111 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -2452,10 +2452,6 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
>   		if (vma_is_dax(vma))
>   			return 0;
>   
> -#ifdef VM_SAO
> -		if (*vm_flags & VM_SAO)
> -			return 0;
> -#endif
>   #ifdef VM_SPARC_ADI
>   		if (*vm_flags & VM_SPARC_ADI)
>   			return 0;
> diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore
> index 2ca523255b1b..ff296c94f627 100644
> --- a/tools/testing/selftests/powerpc/mm/.gitignore
> +++ b/tools/testing/selftests/powerpc/mm/.gitignore
> @@ -2,7 +2,6 @@
>   hugetlb_vs_thp_test
>   subpage_prot
>   tempfile
> -prot_sao
>   segv_errors
>   wild_bctr
>   large_vm_fork_separation
> diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile
> index b9103c4bb414..9b8a7b3069c5 100644
> --- a/tools/testing/selftests/powerpc/mm/Makefile
> +++ b/tools/testing/selftests/powerpc/mm/Makefile
> @@ -2,7 +2,7 @@
>   noarg:
>   	$(MAKE) -C ../
>   
> -TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr \
> +TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot segv_errors wild_bctr \
>   		  large_vm_fork_separation bad_accesses
>   TEST_GEN_PROGS_EXTENDED := tlbie_test
>   TEST_GEN_FILES := tempfile
> @@ -12,8 +12,6 @@ include ../../lib.mk
>   
>   $(TEST_GEN_PROGS): ../harness.c
>   
> -$(OUTPUT)/prot_sao: ../utils.c
> -
>   $(OUTPUT)/wild_bctr: CFLAGS += -m64
>   $(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64
>   $(OUTPUT)/bad_accesses: CFLAGS += -m64
> diff --git a/tools/testing/selftests/powerpc/mm/prot_sao.c b/tools/testing/selftests/powerpc/mm/prot_sao.c
> deleted file mode 100644
> index e2eed65b7735..000000000000
> --- a/tools/testing/selftests/powerpc/mm/prot_sao.c
> +++ /dev/null
> @@ -1,42 +0,0 @@
> -// SPDX-License-Identifier: GPL-2.0-only
> -/*
> - * Copyright 2016, Michael Ellerman, IBM Corp.
> - */
> -
> -#include <stdio.h>
> -#include <stdlib.h>
> -#include <string.h>
> -#include <sys/mman.h>
> -
> -#include <asm/cputable.h>
> -
> -#include "utils.h"
> -
> -#define SIZE (64 * 1024)
> -
> -int test_prot_sao(void)
> -{
> -	char *p;
> -
> -	/* 2.06 or later should support SAO */
> -	SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06));
> -
> -	/*
> -	 * Ensure we can ask for PROT_SAO.
> -	 * We can't really verify that it does the right thing, but at least we
> -	 * confirm the kernel will accept it.
> -	 */
> -	p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE | PROT_SAO,
> -		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> -	FAIL_IF(p == MAP_FAILED);
> -
> -	/* Write to the mapping, to at least cause a fault */
> -	memset(p, 0xaa, SIZE);
> -
> -	return 0;
> -}
> -
> -int main(void)
> -{
> -	return test_harness(test_prot_sao, "prot-sao");
> -}
> 

^ permalink raw reply

* [PATCH v1 00/10] DDW indirect mapping
From: Leonardo Bras @ 2020-08-17 23:40 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Christophe Leroy, Leonardo Bras,
	Joel Stanley, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel

This patchset must be applied on top of:
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=194179&state=%2A&archive=both

As of today, if the biggest DDW that can be created can't map the whole
partition, its creation is skipped and the default DMA window
"ibm,dma-window" is used instead.

Usually, the available DDW will be 16x bigger than the default DMA window,
as it keeps the same page count and raises the page size from 4k to 64k.
Besides the increased window size, it performs better on allocations
bigger than 4k, so it would be nice to use it instead.

Patch #1 replaces hard-coded 4K page size with a variable containing the
correct page size for the window.

Patch #2 makes sure alignment is correct in iommu_*_coherent().

Patch #3 lets small allocations use largepool if there is no more space
left in the other pools, thus allowing the whole DMA window to be used by
smaller allocations.

Patch #4 introduces iommu_table_in_use(), and replaces manual bit-field
checking where it's used. It will be used for aborting enable_ddw() if
there is any current iommu allocation and we are trying single window
indirect mapping.

Patch #5 introduces iommu_pseries_alloc_table() that will be helpful
when indirect mapping needs to replace the iommu_table.

Patch #6 adds helpers for adding and removing DDWs in the list.

Patch #7 refactors enable_ddw() so it returns whether direct mapping is
possible, instead of DMA offset. It helps for next patches on
indirect DMA mapping and also allows DMA windows starting at 0x00.

Patch #8 brings a new helper to simplify enable_ddw(), allowing
some reorganization for introducing indirect mapping DDW.

Patch #9:
Instead of destroying the created DDW if it doesn't map the whole
partition, make use of it instead of the default DMA window as it improves
performance. Also, update the iommu_table and re-generate the pools.

Patch #10:
Does some renaming of 'direct window' to 'dma window', given the DDW
created can now be also used in indirect mapping if direct mapping is not
available.

All patches were tested in an LPAR with an Ethernet VF:
4005:01:00.0 Ethernet controller: Mellanox Technologies MT27700 Family
[ConnectX-4 Virtual Function]

Patchset was tested with a 64GB DDW which did not map the whole
partition (128G).

Leonardo Bras (10):
  powerpc/pseries/iommu: Replace hard-coded page shift
  powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on
    iommu_*_coherent()
  powerpc/kernel/iommu: Use largepool as a last resort when !largealloc
  powerpc/kernel/iommu: Add new iommu_table_in_use() helper
  powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper
  powerpc/pseries/iommu: Add ddw_list_add() helper
  powerpc/pseries/iommu: Allow DDW windows starting at 0x00
  powerpc/pseries/iommu: Add ddw_property_create() and refactor
    enable_ddw()
  powerpc/pseries/iommu: Make use of DDW even if it does not map the
    partition
  powerpc/pseries/iommu: Rename "direct window" to "dma window"

 arch/powerpc/include/asm/iommu.h       |   1 +
 arch/powerpc/include/asm/tce.h         |  10 +-
 arch/powerpc/kernel/iommu.c            |  88 +++---
 arch/powerpc/platforms/pseries/iommu.c | 394 ++++++++++++++++---------
 4 files changed, 305 insertions(+), 188 deletions(-)

-- 
2.25.4

^ permalink raw reply

* [PATCH v1 01/10] powerpc/pseries/iommu: Replace hard-coded page shift
From: Leonardo Bras @ 2020-08-17 23:40 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Christophe Leroy, Leonardo Bras,
	Joel Stanley, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200817234033.442511-1-leobras.c@gmail.com>

Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.

In the process, some defines like TCE_SHIFT were made obsolete, and then
removed. TCE_RPN_MASK was updated to generate a mask according to
the pageshift used.

Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers do not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/include/asm/tce.h         | 10 ++----
 arch/powerpc/platforms/pseries/iommu.c | 42 ++++++++++++++++----------
 2 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h
index db5fc2f2262d..971cba2d87cc 100644
--- a/arch/powerpc/include/asm/tce.h
+++ b/arch/powerpc/include/asm/tce.h
@@ -19,15 +19,9 @@
 #define TCE_VB			0
 #define TCE_PCI			1
 
-/* TCE page size is 4096 bytes (1 << 12) */
-
-#define TCE_SHIFT	12
-#define TCE_PAGE_SIZE	(1 << TCE_SHIFT)
-
 #define TCE_ENTRY_SIZE		8		/* each TCE is 64 bits */
-
-#define TCE_RPN_MASK		0xfffffffffful  /* 40-bit RPN (4K pages) */
-#define TCE_RPN_SHIFT		12
+#define TCE_RPN_BITS		52		/* Bits 0-51 represent RPN on TCE */
+#define TCE_RPN_MASK(ps)	((1ul << (TCE_RPN_BITS - (ps))) - 1)
 #define TCE_VALID		0x800		/* TCE valid */
 #define TCE_ALLIO		0x400		/* TCE valid for all lpars */
 #define TCE_PCI_WRITE		0x2		/* write from PCI allowed */
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index e4198700ed1a..8fe23b7dff3a 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -107,6 +107,9 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index,
 	u64 proto_tce;
 	__be64 *tcep;
 	u64 rpn;
+	const unsigned long tceshift = tbl->it_page_shift;
+	const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);
+	const u64 rpn_mask = TCE_RPN_MASK(tceshift);
 
 	proto_tce = TCE_PCI_READ; // Read allowed
 
@@ -117,10 +120,10 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index,
 
 	while (npages--) {
 		/* can't move this out since we might cross MEMBLOCK boundary */
-		rpn = __pa(uaddr) >> TCE_SHIFT;
-		*tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT);
+		rpn = __pa(uaddr) >> tceshift;
+		*tcep = cpu_to_be64(proto_tce | (rpn & rpn_mask) << tceshift);
 
-		uaddr += TCE_PAGE_SIZE;
+		uaddr += pagesize;
 		tcep++;
 	}
 	return 0;
@@ -146,7 +149,7 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
 	return be64_to_cpu(*tcep);
 }
 
-static void tce_free_pSeriesLP(unsigned long liobn, long, long);
+static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);
 static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
 
 static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
@@ -159,6 +162,7 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
 	u64 rpn;
 	int ret = 0;
 	long tcenum_start = tcenum, npages_start = npages;
+	const u64 rpn_mask = TCE_RPN_MASK(tceshift);
 
 	rpn = __pa(uaddr) >> tceshift;
 	proto_tce = TCE_PCI_READ;
@@ -166,12 +170,12 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
 		proto_tce |= TCE_PCI_WRITE;
 
 	while (npages--) {
-		tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift;
+		tce = proto_tce | (rpn & rpn_mask) << tceshift;
 		rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);
 
 		if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
 			ret = (int)rc;
-			tce_free_pSeriesLP(liobn, tcenum_start,
+			tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
 			                   (npages_start - (npages + 1)));
 			break;
 		}
@@ -205,10 +209,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 	long tcenum_start = tcenum, npages_start = npages;
 	int ret = 0;
 	unsigned long flags;
+	const unsigned long tceshift = tbl->it_page_shift;
+	const u64 rpn_mask = TCE_RPN_MASK(tceshift);
 
 	if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
 		return tce_build_pSeriesLP(tbl->it_index, tcenum,
-					   tbl->it_page_shift, npages, uaddr,
+					   tceshift, npages, uaddr,
 		                           direction, attrs);
 	}
 
@@ -225,13 +231,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		if (!tcep) {
 			local_irq_restore(flags);
 			return tce_build_pSeriesLP(tbl->it_index, tcenum,
-					tbl->it_page_shift,
+					tceshift,
 					npages, uaddr, direction, attrs);
 		}
 		__this_cpu_write(tce_page, tcep);
 	}
 
-	rpn = __pa(uaddr) >> TCE_SHIFT;
+	rpn = __pa(uaddr) >> tceshift;
 	proto_tce = TCE_PCI_READ;
 	if (direction != DMA_TO_DEVICE)
 		proto_tce |= TCE_PCI_WRITE;
@@ -245,12 +251,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE);
 
 		for (l = 0; l < limit; l++) {
-			tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT);
+			tcep[l] = cpu_to_be64(proto_tce | (rpn & rpn_mask) << tceshift);
 			rpn++;
 		}
 
 		rc = plpar_tce_put_indirect((u64)tbl->it_index,
-					    (u64)tcenum << 12,
+					    (u64)tcenum << tceshift,
 					    (u64)__pa(tcep),
 					    limit);
 
@@ -277,12 +283,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 	return ret;
 }
 
-static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long npages)
+static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
+			       long npages)
 {
 	u64 rc;
 
 	while (npages--) {
-		rc = plpar_tce_put((u64)liobn, (u64)tcenum << 12, 0);
+		rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0);
 
 		if (rc && printk_ratelimit()) {
 			printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
@@ -301,9 +308,11 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n
 	u64 rc;
 
 	if (!firmware_has_feature(FW_FEATURE_STUFF_TCE))
-		return tce_free_pSeriesLP(tbl->it_index, tcenum, npages);
+		return tce_free_pSeriesLP(tbl->it_index, tcenum,
+					  tbl->it_page_shift, npages);
 
-	rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
+	rc = plpar_tce_stuff((u64)tbl->it_index,
+			     (u64)tcenum << tbl->it_page_shift, 0, npages);
 
 	if (rc && printk_ratelimit()) {
 		printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
@@ -319,7 +328,8 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
 	u64 rc;
 	unsigned long tce_ret;
 
-	rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret);
+	rc = plpar_tce_get((u64)tbl->it_index,
+			   (u64)tcenum << tbl->it_page_shift, &tce_ret);
 
 	if (rc && printk_ratelimit()) {
 		printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
-- 
2.25.4


^ permalink raw reply related

* [PATCH v1 02/10] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on iommu_*_coherent()
From: Leonardo Bras @ 2020-08-17 23:40 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Christophe Leroy, Leonardo Bras,
	Joel Stanley, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200817234033.442511-1-leobras.c@gmail.com>

Both iommu_alloc_coherent() and iommu_free_coherent() assume that once
size is aligned to PAGE_SIZE it will be aligned to IOMMU_PAGE_SIZE.

Update those functions to guarantee alignment with requested size
using IOMMU_PAGE_ALIGN() before doing iommu_alloc() / iommu_free().

Also, on iommu_range_alloc(), replace ALIGN(n, 1 << tbl->it_page_shift)
with IOMMU_PAGE_ALIGN(n, tbl), which seems easier to read.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/kernel/iommu.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 9704f3f76e63..d7086087830f 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -237,10 +237,9 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	}
 
 	if (dev)
-		boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
-				      1 << tbl->it_page_shift);
+		boundary_size = IOMMU_PAGE_ALIGN(dma_get_seg_boundary(dev) + 1, tbl);
 	else
-		boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift);
+		boundary_size = IOMMU_PAGE_ALIGN(1UL << 32, tbl);
 	/* 4GB boundary for iseries_hv_alloc and iseries_hv_map */
 
 	n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
@@ -858,6 +857,7 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
 	unsigned int order;
 	unsigned int nio_pages, io_order;
 	struct page *page;
+	size_t size_io = size;
 
 	size = PAGE_ALIGN(size);
 	order = get_order(size);
@@ -884,8 +884,9 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
 	memset(ret, 0, size);
 
 	/* Set up tces to cover the allocated range */
-	nio_pages = size >> tbl->it_page_shift;
-	io_order = get_iommu_order(size, tbl);
+	size_io = IOMMU_PAGE_ALIGN(size_io, tbl);
+	nio_pages = size_io >> tbl->it_page_shift;
+	io_order = get_iommu_order(size_io, tbl);
 	mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
 			      mask >> tbl->it_page_shift, io_order, 0);
 	if (mapping == DMA_MAPPING_ERROR) {
@@ -900,11 +901,11 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 			 void *vaddr, dma_addr_t dma_handle)
 {
 	if (tbl) {
-		unsigned int nio_pages;
+		size_t size_io = IOMMU_PAGE_ALIGN(size, tbl);
+		unsigned int nio_pages = size_io >> tbl->it_page_shift;
 
-		size = PAGE_ALIGN(size);
-		nio_pages = size >> tbl->it_page_shift;
 		iommu_free(tbl, dma_handle, nio_pages);
+
 		size = PAGE_ALIGN(size);
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
-- 
2.25.4


^ permalink raw reply related

* [PATCH v1 03/10] powerpc/kernel/iommu: Use largepool as a last resort when !largealloc
From: Leonardo Bras @ 2020-08-17 23:40 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Christophe Leroy, Leonardo Bras,
	Joel Stanley, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200817234033.442511-1-leobras.c@gmail.com>

As of today, a workload doing iommu_range_alloc() only for !largealloc
(npages <= 15) is only able to use 3/4 of the available pages, because
pages in the largepool are not available for !largealloc.

This could mean some drivers not being able to fully use all the available
pages for the DMA window.

Add pages on largepool as a last resort for !largealloc, making all pages
of the DMA window available.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/kernel/iommu.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index d7086087830f..7f603d4e62d4 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -261,6 +261,15 @@ static unsigned long iommu_range_alloc(struct device *dev,
 			pass++;
 			goto again;
 
+		} else if (pass == tbl->nr_pools + 1) {
+			/* Last resort: try largepool */
+			spin_unlock(&pool->lock);
+			pool = &tbl->large_pool;
+			spin_lock(&pool->lock);
+			pool->hint = pool->start;
+			pass++;
+			goto again;
+
 		} else {
 			/* Give up */
 			spin_unlock_irqrestore(&(pool->lock), flags);
-- 
2.25.4


^ permalink raw reply related

* [PATCH v1 04/10] powerpc/kernel/iommu: Add new iommu_table_in_use() helper
From: Leonardo Bras @ 2020-08-17 23:40 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Christophe Leroy, Leonardo Bras,
	Joel Stanley, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200817234033.442511-1-leobras.c@gmail.com>

Having a function to check whether the iommu table has any allocation helps
decide whether a tbl can be reset to use a new DMA window.

It should be enough to replace all instances of !bitmap_empty(tbl...).

iommu_table_in_use() skips reserved memory, so we don't need to worry about
releasing it before testing. This causes iommu_table_release_pages() to
become unnecessary, given it is only used to remove reserved memory for
testing.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/iommu.c      | 62 ++++++++++++++++++--------------
 2 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 5032f1593299..2913e5c8b1f8 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -154,6 +154,7 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
 		int nid, unsigned long res_start, unsigned long res_end);
+bool iommu_table_in_use(struct iommu_table *tbl);
 
 #define IOMMU_TABLE_GROUP_MAX_TABLES	2
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 7f603d4e62d4..c5d5d36ab65e 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -668,21 +668,6 @@ static void iommu_table_reserve_pages(struct iommu_table *tbl,
 		set_bit(i - tbl->it_offset, tbl->it_map);
 }
 
-static void iommu_table_release_pages(struct iommu_table *tbl)
-{
-	int i;
-
-	/*
-	 * In case we have reserved the first bit, we should not emit
-	 * the warning below.
-	 */
-	if (tbl->it_offset == 0)
-		clear_bit(0, tbl->it_map);
-
-	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
-		clear_bit(i - tbl->it_offset, tbl->it_map);
-}
-
 /*
  * Build a iommu_table structure.  This contains a bit map which
  * is used to manage allocation of the tce space.
@@ -743,6 +728,38 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
 	return tbl;
 }
 
+bool iommu_table_in_use(struct iommu_table *tbl)
+{
+	bool in_use;
+	unsigned long p1_start = 0, p1_end, p2_start, p2_end;
+
+	/*ignore reserved bit0*/
+	if (tbl->it_offset == 0)
+		p1_start = 1;
+
+	/* Check if reserved memory is valid*/
+	if (tbl->it_reserved_start >= tbl->it_offset &&
+	    tbl->it_reserved_start <= (tbl->it_offset + tbl->it_size) &&
+	    tbl->it_reserved_end   >= tbl->it_offset &&
+	    tbl->it_reserved_end   <= (tbl->it_offset + tbl->it_size)) {
+		p1_end = tbl->it_reserved_start - tbl->it_offset;
+		p2_start = tbl->it_reserved_end - tbl->it_offset + 1;
+		p2_end = tbl->it_size;
+	} else {
+		p1_end = tbl->it_size;
+		p2_start = 0;
+		p2_end = 0;
+	}
+
+	in_use = (find_next_bit(tbl->it_map, p1_end, p1_start) != p1_end);
+	if (in_use || p2_start == 0)
+		return in_use;
+
+	in_use = (find_next_bit(tbl->it_map, p2_end, p2_start) != p2_end);
+
+	return in_use;
+}
+
 static void iommu_table_free(struct kref *kref)
 {
 	unsigned long bitmap_sz;
@@ -759,10 +776,8 @@ static void iommu_table_free(struct kref *kref)
 		return;
 	}
 
-	iommu_table_release_pages(tbl);
-
 	/* verify that table contains no entries */
-	if (!bitmap_empty(tbl->it_map, tbl->it_size))
+	if (iommu_table_in_use(tbl))
 		pr_warn("%s: Unexpected TCEs\n", __func__);
 
 	/* calculate bitmap size in bytes */
@@ -1069,18 +1084,13 @@ int iommu_take_ownership(struct iommu_table *tbl)
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_lock(&tbl->pools[i].lock);
 
-	iommu_table_release_pages(tbl);
-
-	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+	if (iommu_table_in_use(tbl)) {
 		pr_err("iommu_tce: it_map is not empty");
 		ret = -EBUSY;
-		/* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
-		iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
-				tbl->it_reserved_end);
-	} else {
-		memset(tbl->it_map, 0xff, sz);
 	}
 
+	memset(tbl->it_map, 0xff, sz);
+
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_unlock(&tbl->pools[i].lock);
 	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
-- 
2.25.4


^ permalink raw reply related

* [PATCH v1 05/10] powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper
From: Leonardo Bras @ 2020-08-17 23:40 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Christophe Leroy, Leonardo Bras,
	Joel Stanley, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200817234033.442511-1-leobras.c@gmail.com>

Creates a helper to allow allocating a new iommu_table without the need
to reallocate the iommu_group.

This will be helpful for replacing the iommu_table for the new DMA window,
after we remove the old one with iommu_tce_table_put().

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 8fe23b7dff3a..39617ce0ec83 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,28 +53,31 @@ enum {
 	DDW_EXT_QUERY_OUT_SIZE = 2
 };
 
-static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+static struct iommu_table *iommu_pseries_alloc_table(int node)
 {
-	struct iommu_table_group *table_group;
 	struct iommu_table *tbl;
 
-	table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
-			   node);
-	if (!table_group)
-		return NULL;
-
 	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
 	if (!tbl)
-		goto free_group;
+		return NULL;
 
 	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
 	kref_init(&tbl->it_kref);
+	return tbl;
+}
 
-	table_group->tables[0] = tbl;
+static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+{
+	struct iommu_table_group *table_group;
+
+	table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
+	if (!table_group)
+		return NULL;
 
-	return table_group;
+	table_group->tables[0] = iommu_pseries_alloc_table(node);
+	if (table_group->tables[0])
+		return table_group;
 
-free_group:
 	kfree(table_group);
 	return NULL;
 }
-- 
2.25.4


^ permalink raw reply related

* [PATCH v1 06/10] powerpc/pseries/iommu: Add ddw_list_add() helper
From: Leonardo Bras @ 2020-08-17 23:40 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Christophe Leroy, Leonardo Bras,
	Joel Stanley, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200817234033.442511-1-leobras.c@gmail.com>

There are two functions adding DDW to the direct_window_list in a
similar way, so create a ddw_list_add() to avoid duplication and
simplify those functions.

Also, on enable_ddw(), add list_del() on out_free_window to allow
removing the window from list if any error occurs.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 42 ++++++++++++++++----------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 39617ce0ec83..fcdefcc0f365 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -872,6 +872,24 @@ static u64 find_existing_ddw(struct device_node *pdn)
 	return dma_addr;
 }
 
+static struct direct_window *ddw_list_add(struct device_node *pdn,
+					  const struct dynamic_dma_window_prop *dma64)
+{
+	struct direct_window *window;
+
+	window = kzalloc(sizeof(*window), GFP_KERNEL);
+	if (!window)
+		return NULL;
+
+	window->device = pdn;
+	window->prop = dma64;
+	spin_lock(&direct_window_list_lock);
+	list_add(&window->list, &direct_window_list);
+	spin_unlock(&direct_window_list_lock);
+
+	return window;
+}
+
 static int find_existing_ddw_windows(void)
 {
 	int len;
@@ -887,18 +905,11 @@ static int find_existing_ddw_windows(void)
 		if (!direct64)
 			continue;
 
-		window = kzalloc(sizeof(*window), GFP_KERNEL);
-		if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
+		window = ddw_list_add(pdn, direct64);
+		if (!window || len < sizeof(*direct64)) {
 			kfree(window);
 			remove_ddw(pdn, true);
-			continue;
 		}
-
-		window->device = pdn;
-		window->prop = direct64;
-		spin_lock(&direct_window_list_lock);
-		list_add(&window->list, &direct_window_list);
-		spin_unlock(&direct_window_list_lock);
 	}
 
 	return 0;
@@ -1261,7 +1272,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
 		  create.liobn, dn);
 
-	window = kzalloc(sizeof(*window), GFP_KERNEL);
+	/* Add new window to existing DDW list */
+	window = ddw_list_add(pdn, ddwprop);
 	if (!window)
 		goto out_clear_window;
 
@@ -1280,16 +1292,14 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 		goto out_free_window;
 	}
 
-	window->device = pdn;
-	window->prop = ddwprop;
-	spin_lock(&direct_window_list_lock);
-	list_add(&window->list, &direct_window_list);
-	spin_unlock(&direct_window_list_lock);
-
 	dma_addr = be64_to_cpu(ddwprop->dma_base);
 	goto out_unlock;
 
 out_free_window:
+	spin_lock(&direct_window_list_lock);
+	list_del(&window->list);
+	spin_unlock(&direct_window_list_lock);
+
 	kfree(window);
 
 out_clear_window:
-- 
2.25.4


^ permalink raw reply related

* [PATCH v1 07/10] powerpc/pseries/iommu: Allow DDW windows starting at 0x00
From: Leonardo Bras @ 2020-08-17 23:40 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Christophe Leroy, Leonardo Bras,
	Joel Stanley, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200817234033.442511-1-leobras.c@gmail.com>

enable_ddw() currently returns the address of the DMA window, which is
considered invalid if it has the value 0x00.

Also, it only considers valid an address returned from find_existing_ddw
if it's not 0x00.

Changing this behavior makes sense, given the users of enable_ddw() only
need to know if direct mapping is possible. It can also allow a DMA window
starting at 0x00 to be used.

This will be helpful for using a DDW with indirect mapping, as the window
address will be different than 0x00, but it will not map the whole
partition.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 30 ++++++++++++--------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index fcdefcc0f365..4031127c9537 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -852,24 +852,25 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
 			np, ret);
 }
 
-static u64 find_existing_ddw(struct device_node *pdn)
+static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)
 {
 	struct direct_window *window;
 	const struct dynamic_dma_window_prop *direct64;
-	u64 dma_addr = 0;
+	bool found = false;
 
 	spin_lock(&direct_window_list_lock);
 	/* check if we already created a window and dupe that config if so */
 	list_for_each_entry(window, &direct_window_list, list) {
 		if (window->device == pdn) {
 			direct64 = window->prop;
-			dma_addr = be64_to_cpu(direct64->dma_base);
+			*dma_addr = be64_to_cpu(direct64->dma_base);
+			found = true;
 			break;
 		}
 	}
 	spin_unlock(&direct_window_list_lock);
 
-	return dma_addr;
+	return found;
 }
 
 static struct direct_window *ddw_list_add(struct device_node *pdn,
@@ -1131,15 +1132,15 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
  * pdn: the parent pe node with the ibm,dma_window property
  * Future: also check if we can remap the base window for our base page size
  *
- * returns the dma offset for use by the direct mapped DMA code.
+ * returns true if can map all pages (direct mapping), false otherwise..
  */
-static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
+static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
 	int len, ret;
 	struct ddw_query_response query;
 	struct ddw_create_response create;
 	int page_shift;
-	u64 dma_addr, max_addr;
+	u64 max_addr;
 	struct device_node *dn;
 	u32 ddw_avail[DDW_APPLICABLE_SIZE];
 	struct direct_window *window;
@@ -1150,8 +1151,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 
 	mutex_lock(&direct_window_init_mutex);
 
-	dma_addr = find_existing_ddw(pdn);
-	if (dma_addr != 0)
+	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset))
 		goto out_unlock;
 
 	/*
@@ -1292,7 +1292,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 		goto out_free_window;
 	}
 
-	dma_addr = be64_to_cpu(ddwprop->dma_base);
+	dev->dev.archdata.dma_offset = be64_to_cpu(ddwprop->dma_base);
 	goto out_unlock;
 
 out_free_window:
@@ -1309,6 +1309,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	kfree(win64->name);
 	kfree(win64->value);
 	kfree(win64);
+	win64 = NULL;
 
 out_failed:
 	if (default_win_removed)
@@ -1322,7 +1323,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 
 out_unlock:
 	mutex_unlock(&direct_window_init_mutex);
-	return dma_addr;
+	return win64;
 }
 
 static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
@@ -1401,11 +1402,8 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
 			break;
 	}
 
-	if (pdn && PCI_DN(pdn)) {
-		pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn);
-		if (pdev->dev.archdata.dma_offset)
-			return true;
-	}
+	if (pdn && PCI_DN(pdn))
+		return enable_ddw(pdev, pdn);
 
 	return false;
 }
-- 
2.25.4


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox