LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH v2 10/10] cxl: Remove abandonned capi support for the Mellanox CX4, final cleanup
From: Andrew Donnellan @ 2018-06-28 23:53 UTC (permalink / raw)
  To: Frederic Barrat, alastair, vaibhav, clombard, felix, linuxppc-dev; +Cc: huyn
In-Reply-To: <20180628100509.17413-11-fbarrat@linux.ibm.com>

On 28/06/18 20:05, Frederic Barrat wrote:
> Remove a few XSL/CX4 oddities which are no longer needed. A simple
> revert of the initial commits was not possible (or not worth it) due
> to the history of the code.
> 
> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>

Acked-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

> ---
>   drivers/misc/cxl/context.c |  2 +-
>   drivers/misc/cxl/cxl.h     | 12 ------
>   drivers/misc/cxl/debugfs.c |  5 ---
>   drivers/misc/cxl/pci.c     | 75 +++-----------------------------------
>   4 files changed, 7 insertions(+), 87 deletions(-)
> 
> diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
> index 0355d42d367f..5fe529b43ebe 100644
> --- a/drivers/misc/cxl/context.c
> +++ b/drivers/misc/cxl/context.c
> @@ -95,7 +95,7 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master)
>   	 */
>   	mutex_lock(&afu->contexts_lock);
>   	idr_preload(GFP_KERNEL);
> -	i = idr_alloc(&ctx->afu->contexts_idr, ctx, ctx->afu->adapter->min_pe,
> +	i = idr_alloc(&ctx->afu->contexts_idr, ctx, 0,
>   		      ctx->afu->num_procs, GFP_NOWAIT);
>   	idr_preload_end();
>   	mutex_unlock(&afu->contexts_lock);
> diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
> index aa453448201d..44bcfafbb579 100644
> --- a/drivers/misc/cxl/cxl.h
> +++ b/drivers/misc/cxl/cxl.h
> @@ -93,11 +93,6 @@ static const cxl_p1_reg_t CXL_PSL_FIR_CNTL  = {0x0148};
>   static const cxl_p1_reg_t CXL_PSL_DSNDCTL   = {0x0150};
>   static const cxl_p1_reg_t CXL_PSL_SNWRALLOC = {0x0158};
>   static const cxl_p1_reg_t CXL_PSL_TRACE     = {0x0170};
> -/* XSL registers (Mellanox CX4) */
> -static const cxl_p1_reg_t CXL_XSL_Timebase  = {0x0100};
> -static const cxl_p1_reg_t CXL_XSL_TB_CTLSTAT = {0x0108};
> -static const cxl_p1_reg_t CXL_XSL_FEC       = {0x0158};
> -static const cxl_p1_reg_t CXL_XSL_DSNCTL    = {0x0168};
>   /* PSL registers - CAIA 2 */
>   static const cxl_p1_reg_t CXL_PSL9_CONTROL  = {0x0020};
>   static const cxl_p1_reg_t CXL_XSL9_INV      = {0x0110};
> @@ -695,7 +690,6 @@ struct cxl {
>   	struct bin_attribute cxl_attr;
>   	int adapter_num;
>   	int user_irqs;
> -	int min_pe;
>   	u64 ps_size;
>   	u16 psl_rev;
>   	u16 base_image;
> @@ -934,7 +928,6 @@ int cxl_debugfs_afu_add(struct cxl_afu *afu);
>   void cxl_debugfs_afu_remove(struct cxl_afu *afu);
>   void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir);
>   void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir);
> -void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir);
>   void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir);
>   void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir);
>   
> @@ -977,11 +970,6 @@ static inline void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter,
>   {
>   }
>   
> -static inline void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter,
> -						    struct dentry *dir)
> -{
> -}
> -
>   static inline void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir)
>   {
>   }
> diff --git a/drivers/misc/cxl/debugfs.c b/drivers/misc/cxl/debugfs.c
> index 1643850d2302..a1921d81593a 100644
> --- a/drivers/misc/cxl/debugfs.c
> +++ b/drivers/misc/cxl/debugfs.c
> @@ -58,11 +58,6 @@ void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir)
>   	debugfs_create_io_x64("trace", S_IRUSR | S_IWUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_TRACE));
>   }
>   
> -void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir)
> -{
> -	debugfs_create_io_x64("fec", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_XSL_FEC));
> -}
> -
>   int cxl_debugfs_adapter_add(struct cxl *adapter)
>   {
>   	struct dentry *dir;
> diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
> index 0ca818396524..6dfb4ed345d3 100644
> --- a/drivers/misc/cxl/pci.c
> +++ b/drivers/misc/cxl/pci.c
> @@ -593,27 +593,7 @@ static int init_implementation_adapter_regs_psl8(struct cxl *adapter, struct pci
>   	return 0;
>   }
>   
> -static int init_implementation_adapter_regs_xsl(struct cxl *adapter, struct pci_dev *dev)
> -{
> -	u64 xsl_dsnctl;
> -	u64 chipid;
> -	u32 phb_index;
> -	u64 capp_unit_id;
> -	int rc;
> -
> -	rc = cxl_calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id);
> -	if (rc)
> -		return rc;
> -
> -	/* Tell XSL where to route data to */
> -	xsl_dsnctl = 0x0000600000000000ULL | (chipid << (63-5));
> -	xsl_dsnctl |= (capp_unit_id << (63-13));
> -	cxl_p1_write(adapter, CXL_XSL_DSNCTL, xsl_dsnctl);
> -
> -	return 0;
> -}
> -
> -/* PSL & XSL */
> +/* PSL */
>   #define TBSYNC_CAL(n) (((u64)n & 0x7) << (63-3))
>   #define TBSYNC_CNT(n) (((u64)n & 0x7) << (63-6))
>   /* For the PSL this is a multiple for 0 < n <= 7: */
> @@ -625,21 +605,6 @@ static void write_timebase_ctrl_psl8(struct cxl *adapter)
>   		     TBSYNC_CNT(2 * PSL_2048_250MHZ_CYCLES));
>   }
>   
> -/* XSL */
> -#define TBSYNC_ENA (1ULL << 63)
> -/* For the XSL this is 2**n * 2000 clocks for 0 < n <= 6: */
> -#define XSL_2000_CLOCKS 1
> -#define XSL_4000_CLOCKS 2
> -#define XSL_8000_CLOCKS 3
> -
> -static void write_timebase_ctrl_xsl(struct cxl *adapter)
> -{
> -	cxl_p1_write(adapter, CXL_XSL_TB_CTLSTAT,
> -		     TBSYNC_ENA |
> -		     TBSYNC_CAL(3) |
> -		     TBSYNC_CNT(XSL_4000_CLOCKS));
> -}
> -
>   static u64 timebase_read_psl9(struct cxl *adapter)
>   {
>   	return cxl_p1_read(adapter, CXL_PSL9_Timebase);
> @@ -650,11 +615,6 @@ static u64 timebase_read_psl8(struct cxl *adapter)
>   	return cxl_p1_read(adapter, CXL_PSL_Timebase);
>   }
>   
> -static u64 timebase_read_xsl(struct cxl *adapter)
> -{
> -	return cxl_p1_read(adapter, CXL_XSL_Timebase);
> -}
> -
>   static void cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev)
>   {
>   	struct device_node *np;
> @@ -1671,37 +1631,14 @@ static const struct cxl_service_layer_ops psl8_ops = {
>   	.needs_reset_before_disable = true,
>   };
>   
> -static const struct cxl_service_layer_ops xsl_ops = {
> -	.adapter_regs_init = init_implementation_adapter_regs_xsl,
> -	.invalidate_all = cxl_invalidate_all_psl8,
> -	.sanitise_afu_regs = sanitise_afu_regs_psl8,
> -	.handle_interrupt = cxl_irq_psl8,
> -	.fail_irq = cxl_fail_irq_psl,
> -	.activate_dedicated_process = cxl_activate_dedicated_process_psl8,
> -	.attach_afu_directed = cxl_attach_afu_directed_psl8,
> -	.attach_dedicated_process = cxl_attach_dedicated_process_psl8,
> -	.update_dedicated_ivtes = cxl_update_dedicated_ivtes_psl8,
> -	.debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_xsl,
> -	.write_timebase_ctrl = write_timebase_ctrl_xsl,
> -	.timebase_read = timebase_read_xsl,
> -	.capi_mode = OPAL_PHB_CAPI_MODE_DMA,
> -};
> -
>   static void set_sl_ops(struct cxl *adapter, struct pci_dev *dev)
>   {
> -	if (dev->vendor == PCI_VENDOR_ID_MELLANOX && dev->device == 0x1013) {
> -		/* Mellanox CX-4 */
> -		dev_info(&dev->dev, "Device uses an XSL\n");
> -		adapter->native->sl_ops = &xsl_ops;
> -		adapter->min_pe = 1; /* Workaround for CX-4 hardware bug */
> +	if (cxl_is_power8()) {
> +		dev_info(&dev->dev, "Device uses a PSL8\n");
> +		adapter->native->sl_ops = &psl8_ops;
>   	} else {
> -		if (cxl_is_power8()) {
> -			dev_info(&dev->dev, "Device uses a PSL8\n");
> -			adapter->native->sl_ops = &psl8_ops;
> -		} else {
> -			dev_info(&dev->dev, "Device uses a PSL9\n");
> -			adapter->native->sl_ops = &psl9_ops;
> -		}
> +		dev_info(&dev->dev, "Device uses a PSL9\n");
> +		adapter->native->sl_ops = &psl9_ops;
>   	}
>   }
>   
> 

-- 
Andrew Donnellan              OzLabs, ADL Canberra
andrew.donnellan@au1.ibm.com  IBM Australia Limited

^ permalink raw reply

* Re: [PATCH 65/65] powerpc/mm/radix: Cputable update for radix
From: Benjamin Herrenschmidt @ 2018-06-28 23:55 UTC (permalink / raw)
  To: Aneesh Kumar K.V, Michael Ellerman, paulus, distroguy; +Cc: linuxppc-dev
In-Reply-To: <8760w165aj.fsf@linux.vnet.ibm.com>

On Fri, 2016-04-01 at 15:04 +0530, Aneesh Kumar K.V wrote:
> 
> commit 9c9d8b4f6a2c2210c90cbb3f5c6d33b2a642e8d2
> Author: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> Date:   Mon Feb 15 13:44:01 2016 +0530
> 
>     powerpc/mm/radix: Cputable update for radix
>     
>     With P9 Radix we need to do
>     
>     * set UPRT = 1
>     * set different TLB set count
>     
>     In this patch we delay the UPRT=1 to early mmu init. We also update
>     other cpu_spec callback there. The restore cpu callback is used to
>     init secondary cpus and also during opal init. So we do a full
>     radix variant for that, even though the only difference is UPRT=1
>     
>     Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

How are things working in absence of cputable/PVR match ?

Remember we have a requirement to be able to boot existing OSes on
future chips, so Nick's new cpu-features node needs to be what we
test against.

> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index b546e6f28d44..3400ed884f10 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -347,6 +347,10 @@
>  #define   LPCR_LPES_SH	2
>  #define   LPCR_RMI     0x00000002      /* real mode is cache inhibit */
>  #define   LPCR_HDICE   0x00000001      /* Hyp Decr enable (HV,PR,EE) */
> +/*
> + * Used in asm code, hence we don't want to use PPC_BITCOUNT
> + */
> +#define	  LPCR_UPRT	(ASM_CONST(0x1) << 22)
>  #ifndef SPRN_LPID
>  #define SPRN_LPID	0x13F	/* Logical Partition Identifier */
>  #endif
> diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
> index 584e119fa8b0..8d717954d0ca 100644
> --- a/arch/powerpc/kernel/cpu_setup_power.S
> +++ b/arch/powerpc/kernel/cpu_setup_power.S
> @@ -117,6 +117,24 @@ _GLOBAL(__restore_cpu_power9)
>  	mtlr	r11
>  	blr
>  
> +_GLOBAL(__restore_cpu_power9_uprt)
> +	mflr	r11
> +	bl	__init_FSCR
> +	mfmsr	r3
> +	rldicl.	r0,r3,4,63
> +	mtlr	r11
> +	beqlr
> +	li	r0,0
> +	mtspr	SPRN_LPID,r0
> +	mfspr   r3,SPRN_LPCR
> +	ori	r3, r3, LPCR_PECEDH
> +	oris	r3,r3,LPCR_UPRT@h
> +	bl	__init_LPCR
> +	bl	__init_HFSCR
> +	bl	__init_tlb_power7
> +	mtlr	r11
> +	blr
> +
>  __init_hvmode_206:
>  	/* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */
>  	mfmsr	r3
> diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
> index 6c662b8de90d..e009722d5914 100644
> --- a/arch/powerpc/kernel/cputable.c
> +++ b/arch/powerpc/kernel/cputable.c
> @@ -514,7 +514,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
>  		.cpu_features		= CPU_FTRS_POWER9,
>  		.cpu_user_features	= COMMON_USER_POWER9,
>  		.cpu_user_features2	= COMMON_USER2_POWER9,
> -		.mmu_features		= MMU_FTRS_POWER9,
> +		.mmu_features		= MMU_FTRS_POWER9 | MMU_FTR_RADIX,
>  		.icache_bsize		= 128,
>  		.dcache_bsize		= 128,
>  		.num_pmcs		= 6,
> diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
> index 92a66a2a9b85..f902ede263ab 100644
> --- a/arch/powerpc/kernel/mce_power.c
> +++ b/arch/powerpc/kernel/mce_power.c
> @@ -75,6 +75,10 @@ void __flush_tlb_power9(unsigned int action)
>  	flush_tlb_206(POWER9_TLB_SETS_HASH, action);
>  }
>  
> +void __flush_tlb_power9_radix(unsigned int action)
> +{
> +	flush_tlb_206(POWER9_TLB_SETS_RADIX, action);
> +}
>  
>  /* flush SLBs and reload */
>  #ifdef CONFIG_PPC_MMU_STD_64
> diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
> index bb1eb7d0911c..6e56051bf825 100644
> --- a/arch/powerpc/mm/pgtable-radix.c
> +++ b/arch/powerpc/mm/pgtable-radix.c
> @@ -294,8 +294,20 @@ found:
>  	return;
>  }
>  
> +extern void __restore_cpu_power9_uprt(void);
> +extern void __flush_tlb_power9_radix(unsigned int action);
>  void __init rearly_init_mmu(void)
>  {
> +	unsigned long lpcr;
> +	/*
> +	 * setup LPCR UPRT based on mmu_features
> +	 */
> +	lpcr = mfspr(SPRN_LPCR);
> +	mtspr(SPRN_LPCR, lpcr | LPCR_UPRT);
> +	/* update cpu_spec to point to radix enabled callbacks */
> +	cur_cpu_spec->cpu_restore = __restore_cpu_power9_uprt;
> +	cur_cpu_spec->flush_tlb   = __flush_tlb_power9_radix;
> +
>  #ifdef CONFIG_PPC_64K_PAGES
>  	/* PAGE_SIZE mappings */
>  	mmu_virtual_psize = MMU_PAGE_64K;

^ permalink raw reply

* [PATCH] powerpc: Remove memtrace mmap
From: Michael Neuling @ 2018-06-29  0:12 UTC (permalink / raw)
  To: mpe; +Cc: linuxppc-dev, mikey, rashmica.g, anton

debugfs doesn't support mmap, so this code is never used.

Signed-off-by: Michael Neuling <mikey@neuling.org>
---
 arch/powerpc/platforms/powernv/memtrace.c | 29 -----------------------
 1 file changed, 29 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index b99283df85..f73101119e 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -47,38 +47,9 @@ static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
 	return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size);
 }
 
-static bool valid_memtrace_range(struct memtrace_entry *dev,
-				 unsigned long start, unsigned long size)
-{
-	if ((start >= dev->start) &&
-	    ((start + size) <= (dev->start + dev->size)))
-		return true;
-
-	return false;
-}
-
-static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma)
-{
-	unsigned long size = vma->vm_end - vma->vm_start;
-	struct memtrace_entry *dev = filp->private_data;
-
-	if (!valid_memtrace_range(dev, vma->vm_pgoff << PAGE_SHIFT, size))
-		return -EINVAL;
-
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-	if (remap_pfn_range(vma, vma->vm_start,
-			    vma->vm_pgoff + (dev->start >> PAGE_SHIFT),
-			    size, vma->vm_page_prot))
-		return -EAGAIN;
-
-	return 0;
-}
-
 static const struct file_operations memtrace_fops = {
 	.llseek = default_llseek,
 	.read	= memtrace_read,
-	.mmap	= memtrace_mmap,
 	.open	= simple_open,
 };
 
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH kernel v2 0/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page
From: Michael Ellerman @ 2018-06-29  1:55 UTC (permalink / raw)
  To: Alexey Kardashevskiy, linuxppc-dev
  Cc: Alexey Kardashevskiy, Alex Williamson, kvm-ppc, David Gibson
In-Reply-To: <20180626055926.27703-1-aik@ozlabs.ru>

Alexey Kardashevskiy <aik@ozlabs.ru> writes:

> This is to improve page boundaries checking and should probably
> be cc:stable. I came accross this while debugging nvlink2 passthrough
> but the lack of checking might be exploited by the existing userspace.

Do you really mean "exploited" ? As in there's a security issue?

Your change log for patch 2 sort of suggests that but then says that
without the fix you just hit an error in vfio code.

So I'm not clear on what the exposure is.

cheers

^ permalink raw reply

* Re: [PATCH kernel] powerpc/powernv/ioda2: Reduce upper limit for DMA window size
From: Michael Ellerman @ 2018-06-29  1:57 UTC (permalink / raw)
  To: Alexey Kardashevskiy, linuxppc-dev; +Cc: Alexey Kardashevskiy
In-Reply-To: <20180601080616.29279-1-aik@ozlabs.ru>

Alexey Kardashevskiy <aik@ozlabs.ru> writes:

> We use PHB in mode1 which uses bit 59 to select a correct DMA window.
> However there is mode2 which uses bits 59:55 and allows up to 32 DMA
> windows per a PE.

Do we ever use mode2?

> Even though documentation does not clearly specify that, it seems that
> the actual hardware does not support bits 59:55 even in mode1, in other
> words we can create a window as big as 1<<58 but DMA simply won't work.

Can we get anything more solid than "seems that" ?

Is this documented somewhere to not work or you just found this by
testing?

> This reduces the upper limit from 59 to 55 bits to let the userspace know
> about the hardware limits.
>
> Fixes: 7aafac11e3 "powerpc/powernv/ioda2: Gracefully fail if too many TCE levels requested"

Stable?

cheers

> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 92ca662..50e21d7 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -2839,7 +2839,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>  	level_shift = entries_shift + 3;
>  	level_shift = max_t(unsigned, level_shift, PAGE_SHIFT);
>  
> -	if ((level_shift - 3) * levels + page_shift >= 60)
> +	if ((level_shift - 3) * levels + page_shift >= 55)
>  		return -EINVAL;
>  
>  	/* Allocate TCE table */
> -- 
> 2.11.0

^ permalink raw reply

* Re: [PATCH] powerpc/mm: fix always true/false warning in slice.c
From: Michael Ellerman @ 2018-06-29  2:55 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras, malat,
	aneesh.kumar
  Cc: linux-kernel, linuxppc-dev
In-Reply-To: <63b696ab7be8b941fa1e1589f28260320d12a32a.1529589640.git.christophe.leroy@c-s.fr>

Christophe Leroy <christophe.leroy@c-s.fr> writes:

> This patch fixes the following warnings (obtained with make W=1).
>
> arch/powerpc/mm/slice.c: In function 'slice_range_to_mask':
> arch/powerpc/mm/slice.c:73:12: error: comparison is always true due to limited range of data type [-Werror=type-limits]
>   if (start < SLICE_LOW_TOP) {

Presumably only on 32-bit ?

> diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
> index 9530c6db406a..17c57760e06c 100644
> --- a/arch/powerpc/mm/slice.c
> +++ b/arch/powerpc/mm/slice.c
> @@ -79,7 +86,7 @@ static void slice_range_to_mask(unsigned long start, unsigned long len,
>  			- (1u << GET_LOW_SLICE_INDEX(start));
>  	}
>  
> -	if ((start + len) > SLICE_LOW_TOP) {
> +	if (!slice_addr_is_low(end)) {
>  		unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
>  		unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
>  		unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;

This worries me.

By casting before the comparison in the helper you squash the compiler
warning, but the code is still broken if (start + len) overflows.

Presumably that "never happens", but it just seems fishy.

The other similar check in that file does:

  if (SLICE_NUM_HIGH && ((start + len) > SLICE_LOW_TOP)) {

Where SLICE_NUM_HIGH == 0 on 32-bit.


Could we fix the less than comparisons with SLICE_LOW_TOP with something
similar, eg:

	if (!SLICE_NUM_HIGH || start < SLICE_LOW_TOP) {

ie. limit them to the 64-bit code?

cheers

^ permalink raw reply

* Re: [PATCH 1/2] powerpc/pkeys: preallocate execute_only key only if the key is available.
From: Thiago Jung Bauermann @ 2018-06-29  2:56 UTC (permalink / raw)
  To: Ram Pai
  Cc: mpe, linuxppc-dev, hbabu, mhocko, bauerman, Ulrich.Weigand,
	fweimer, msuchanek
In-Reply-To: <1529979376-7292-1-git-send-email-linuxram@us.ibm.com>


Hello,

Ram Pai <linuxram@us.ibm.com> writes:

> Key 2 is preallocated and reserved for execute-only key. In rare
> cases if key-2 is unavailable, mprotect(PROT_EXEC) will behave
> incorrectly. NOTE: mprotect(PROT_EXEC) uses execute-only key.
>
> Ensure key 2 is available for preallocation before reserving it for
> execute_only purpose.  Problem noticed by Michael Ellermen.

Since "powerpc/pkeys: Preallocate execute-only key" isn't upstream yet,
this patch could be squashed into it.

> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> ---
>  arch/powerpc/mm/pkeys.c |   14 +++++++++-----
>  1 files changed, 9 insertions(+), 5 deletions(-)
>
> diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
> index cec990c..0b03914 100644
> --- a/arch/powerpc/mm/pkeys.c
> +++ b/arch/powerpc/mm/pkeys.c
> @@ -19,6 +19,7 @@
>  u64  pkey_amr_mask;		/* Bits in AMR not to be touched */
>  u64  pkey_iamr_mask;		/* Bits in AMR not to be touched */
>  u64  pkey_uamor_mask;		/* Bits in UMOR not to be touched */
> +int  execute_only_key = 2;
>
>  #define AMR_BITS_PER_PKEY 2
>  #define AMR_RD_BIT 0x1UL
> @@ -26,7 +27,6 @@
>  #define IAMR_EX_BIT 0x1UL
>  #define PKEY_REG_BITS (sizeof(u64)*8)
>  #define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
> -#define EXECUTE_ONLY_KEY 2
>
>  static void scan_pkey_feature(void)
>  {
> @@ -122,8 +122,12 @@ int pkey_initialize(void)
>  #else
>  	os_reserved = 0;
>  #endif
> +
> +	if ((pkeys_total - os_reserved) <= execute_only_key)
> +		execute_only_key = -1;
> +
>  	/* Bits are in LE format. */
> -	reserved_allocation_mask = (0x1 << 1) | (0x1 << EXECUTE_ONLY_KEY);
> +	reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key);

My understanding is that left-shifting by a negative amount is undefined
behavior in C. A quick test tells me that at least on the couple of
machines I tested, 1 < -1 = 0. Does GCC guarantee that behavior? If so,
a comment pointing this out would make this less confusing.

>  	initial_allocation_mask  = reserved_allocation_mask | (0x1 << PKEY_0);
>
>  	/* register mask is in BE format */
> @@ -132,11 +136,11 @@ int pkey_initialize(void)
>
>  	pkey_iamr_mask = ~0x0ul;
>  	pkey_iamr_mask &= ~(0x3ul << pkeyshift(PKEY_0));
> -	pkey_iamr_mask &= ~(0x3ul << pkeyshift(EXECUTE_ONLY_KEY));
> +	pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key));
>
>  	pkey_uamor_mask = ~0x0ul;
>  	pkey_uamor_mask &= ~(0x3ul << pkeyshift(PKEY_0));
> -	pkey_uamor_mask &= ~(0x3ul << pkeyshift(EXECUTE_ONLY_KEY));
> +	pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key));

Here the behaviour is undefined in C as well, given that pkeyshift(-1) =
64, which is the total number of bits in the left operand. Does GCC
guarantee that the result will be 0 here as well?

--
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH kernel v2 0/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page
From: Alexey Kardashevskiy @ 2018-06-29  3:00 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, Alex Williamson, kvm-ppc, David Gibson
In-Reply-To: <877emiwe3n.fsf@concordia.ellerman.id.au>

On Fri, 29 Jun 2018 11:55:40 +1000
Michael Ellerman <mpe@ellerman.id.au> wrote:

> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> 
> > This is to improve page boundaries checking and should probably
> > be cc:stable. I came accross this while debugging nvlink2 passthrough
> > but the lack of checking might be exploited by the existing userspace.  
> 
> Do you really mean "exploited" ? As in there's a security issue?
> 
> Your change log for patch 2 sort of suggests that but then says that
> without the fix you just hit an error in vfio code.

The bug is that I can easily make unmodified guest use 16MB IOMMU pages
while guest RAM is backed with system 64K pages so unless the guest RAM
is allocated contigously (which is unlikely), a 16MB TCE will provide
the hardware access to the host physical memory it is not supposed to
have access to, which is 16MB minus first 64K.

There is a fast path for H_PUT_TCE - via KVM - there is no contained
test.

There is a slow path for H_PUT_TCE - via VFIO ioctl() - there is a
contained test.

Because of a different feature of VFIO on sPAPR (it stores an array of
userspace addresses which we received from QEMU and translated to host
physical addresses and programmed those to the TCE table) we do not take
the fast path on the very first H_PUT_TCE (because I allocate the
array when the slow path is taken the very first time), fail there,
pass the failure to the guest the guest decides that is over.

But a modified guest could ignore that initial H_PUT_TCE failure and
simply repeat H_PUT_TCE again - this time it will take the fast path
and allow the bad mapping.

> So I'm not clear on what the exposure is.
> 
> cheers

--
Alexey

^ permalink raw reply

* Re: [PATCH v2 4/6] powerpc/pkeys: Preallocate execute-only key
From: Thiago Jung Bauermann @ 2018-06-29  3:02 UTC (permalink / raw)
  To: Ram Pai
  Cc: mpe, linuxppc-dev, dave.hansen, aneesh.kumar, bsingharora, hbabu,
	mhocko, bauerman, Ulrich.Weigand, fweimer, luto, msuchanek
In-Reply-To: <1528936144-6696-5-git-send-email-linuxram@us.ibm.com>


Hello,

My understanding is that this patch isn't upstream yet and it's not too
late for bikeshedding. Please ignore if this is not the case.

Ram Pai <linuxram@us.ibm.com> writes:

> @@ -326,48 +330,7 @@ static inline bool pkey_allows_readwrite(int pkey)
>
>  int __execute_only_pkey(struct mm_struct *mm)
>  {
> -	bool need_to_set_mm_pkey = false;
> -	int execute_only_pkey = mm->context.execute_only_pkey;
> -	int ret;
> -
> -	/* Do we need to assign a pkey for mm's execute-only maps? */
> -	if (execute_only_pkey == -1) {
> -		/* Go allocate one to use, which might fail */
> -		execute_only_pkey = mm_pkey_alloc(mm);
> -		if (execute_only_pkey < 0)
> -			return -1;
> -		need_to_set_mm_pkey = true;
> -	}
> -
> -	/*
> -	 * We do not want to go through the relatively costly dance to set AMR
> -	 * if we do not need to. Check it first and assume that if the
> -	 * execute-only pkey is readwrite-disabled than we do not have to set it
> -	 * ourselves.
> -	 */
> -	if (!need_to_set_mm_pkey && !pkey_allows_readwrite(execute_only_pkey))
> -		return execute_only_pkey;
> -
> -	/*
> -	 * Set up AMR so that it denies access for everything other than
> -	 * execution.
> -	 */
> -	ret = __arch_set_user_pkey_access(current, execute_only_pkey,
> -					  PKEY_DISABLE_ACCESS |
> -					  PKEY_DISABLE_WRITE);
> -	/*
> -	 * If the AMR-set operation failed somehow, just return 0 and
> -	 * effectively disable execute-only support.
> -	 */
> -	if (ret) {
> -		mm_pkey_free(mm, execute_only_pkey);
> -		return -1;
> -	}
> -
> -	/* We got one, store it and use it from here on out */
> -	if (need_to_set_mm_pkey)
> -		mm->context.execute_only_pkey = execute_only_pkey;
> -	return execute_only_pkey;
> +	return mm->context.execute_only_pkey;
>  }

There's no reason to have a separate  __execute_only_pkey() function
anymore. Its single line can go directly in execute_only_pkey(), defined
in <asm/pkeys.h>.
-- 
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page
From: David Gibson @ 2018-06-29  4:12 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linuxppc-dev, kvm-ppc, Alex Williamson, Paul Mackerras
In-Reply-To: <20180626055926.27703-3-aik@ozlabs.ru>

[-- Attachment #1: Type: text/plain, Size: 5241 bytes --]

On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy wrote:
> We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> an IOMMU page is contained in the physical page so the PCI hardware won't
> get access to unassigned host memory.
> 
> However we do not have this check in KVM fastpath (H_PUT_TCE accelerated
> code) so the user space can pin memory backed with 64k pages and create
> a hardware TCE table with a bigger page size. We were lucky so far and
> did not hit this yet as the very first time the mapping happens
> we do not have tbl::it_userspace allocated yet and fall back to
> the userspace which in turn calls VFIO IOMMU driver and that fails
> because of the check in vfio_iommu_spapr_tce.c which is really
> sustainable solution.
> 
> This stores the smallest preregistered page size in the preregistered
> region descriptor and changes the mm_iommu_xxx API to check this against
> the IOMMU page size.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v2:
> * explicitly check for compound pages before calling compound_order()
> 
> ---
> The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to
> advertise 16MB pages to the guest; a typical pseries guest will use 16MB
> for IOMMU pages without checking the mmu pagesize and this will fail
> at https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> 
> With the change, mapping will fail in KVM and the guest will print:
> 
> mlx5_core 0000:00:00.0: ibm,create-pe-dma-window(2027) 0 8000000 20000000 18 1f returned 0 (liobn = 0x80000001 starting addr = 8000000 0)
> mlx5_core 0000:00:00.0: created tce table LIOBN 0x80000001 for /pci@800000020000000/ethernet@0
> mlx5_core 0000:00:00.0: failed to map direct window for
> /pci@800000020000000/ethernet@0: -1

[snip]
> @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>  		struct mm_iommu_table_group_mem_t **pmem)
>  {
>  	struct mm_iommu_table_group_mem_t *mem;
> -	long i, j, ret = 0, locked_entries = 0;
> +	long i, j, ret = 0, locked_entries = 0, pageshift;
>  	struct page *page = NULL;
>  
>  	mutex_lock(&mem_list_mutex);
> @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>  		goto unlock_exit;
>  	}
>  
> +	mem->pageshift = 30; /* start from 1G pages - the biggest we have */

What about 16G pages on an HPT system?

>  	for (i = 0; i < entries; ++i) {
>  		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
>  					1/* pages */, 1/* iswrite */, &page)) {
> @@ -199,6 +202,11 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>  			}
>  		}
>  populate:
> +		pageshift = PAGE_SHIFT;
> +		if (PageCompound(page))
> +			pageshift += compound_order(compound_head(page));
> +		mem->pageshift = min_t(unsigned int, mem->pageshift, pageshift);

Why not make mem->pageshift and pageshift local the same type to avoid
the min_t() ?

> +
>  		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
>  	}
>  
> @@ -349,7 +357,7 @@ struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
>  EXPORT_SYMBOL_GPL(mm_iommu_find);
>  
>  long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
> -		unsigned long ua, unsigned long *hpa)
> +		unsigned long ua, unsigned int pageshift, unsigned long *hpa)
>  {
>  	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
>  	u64 *va = &mem->hpas[entry];
> @@ -357,6 +365,9 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>  	if (entry >= mem->entries)
>  		return -EFAULT;
>  
> +	if (pageshift > mem->pageshift)
> +		return -EFAULT;
> +
>  	*hpa = *va | (ua & ~PAGE_MASK);
>  
>  	return 0;
> @@ -364,7 +375,7 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>  EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
>  
>  long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
> -		unsigned long ua, unsigned long *hpa)
> +		unsigned long ua, unsigned int pageshift, unsigned long *hpa)
>  {
>  	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
>  	void *va = &mem->hpas[entry];
> @@ -373,6 +384,9 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
>  	if (entry >= mem->entries)
>  		return -EFAULT;
>  
> +	if (pageshift > mem->pageshift)
> +		return -EFAULT;
> +
>  	pa = (void *) vmalloc_to_phys(va);
>  	if (!pa)
>  		return -EFAULT;
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 2da5f05..7cd63b0 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -467,7 +467,7 @@ static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
>  	if (!mem)
>  		return -EINVAL;
>  
> -	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
> +	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
>  	if (ret)
>  		return -EINVAL;
>  

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH kernel v2 0/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page
From: David Gibson @ 2018-06-29  4:14 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Michael Ellerman, linuxppc-dev, Alex Williamson, kvm-ppc
In-Reply-To: <20180629130007.1b78e168@aik.ozlabs.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 2181 bytes --]

On Fri, Jun 29, 2018 at 01:00:07PM +1000, Alexey Kardashevskiy wrote:
> On Fri, 29 Jun 2018 11:55:40 +1000
> Michael Ellerman <mpe@ellerman.id.au> wrote:
> 
> > Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> > 
> > > This is to improve page boundaries checking and should probably
> > > be cc:stable. I came accross this while debugging nvlink2 passthrough
> > > but the lack of checking might be exploited by the existing userspace.  
> > 
> > Do you really mean "exploited" ? As in there's a security issue?
> > 
> > Your change log for patch 2 sort of suggests that but then says that
> > without the fix you just hit an error in vfio code.
> 
> 
> The bug is that I can easily make unmodified guest use 16MB IOMMU pages
> while guest RAM is backed with system 64K pages so unless the guest RAM
> is allocated contigously (which is unlikely), a 16MB TCE will provide
> the hardware access to the host physical memory it is not supposed to
> have access to, which is 16MB minus first 64K.
> 
> There is a fast path for H_PUT_TCE - via KVM - there is no contained
> test.
> 
> There is a slow path for H_PUT_TCE - via VFIO ioctl() - there is a
> contained test.
> 
> Because of a different feature of VFIO on sPAPR (it stores an array of
> userspace addresses which we received from QEMU and translated to host
> physical addresses and programmed those to the TCE table) we do not take
> the fast path on the very first H_PUT_TCE (because I allocate the
> array when the slow path is taken the very first time), fail there,
> pass the failure to the guest the guest decides that is over.
> 
> But a modified guest could ignore that initial H_PUT_TCE failure and
> simply repeat H_PUT_TCE again - this time it will take the fast path
> and allow the bad mapping.

In short, yes, it's an exploitable security hole in the host.  An
unmodified Linux guest kernel just doesn't happen to exploit it, even
if the guest userspace tries to get it to.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH v4 1/6] powerpc/pseries: Defer the logging of rtas error to irq work queue.
From: Mahesh Jagannath Salgaonkar @ 2018-06-29  4:41 UTC (permalink / raw)
  To: Laurent Dufour, linuxppc-dev; +Cc: stable, Nicholas Piggin, Aneesh Kumar K.V
In-Reply-To: <b26f14a3-9f08-798d-1bf9-eb7530f040c0@linux.vnet.ibm.com>

On 06/28/2018 06:49 PM, Laurent Dufour wrote:
> On 28/06/2018 13:10, Mahesh J Salgaonkar wrote:
>> From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>>
>> rtas_log_buf is a buffer to hold RTAS event data that are communicated
>> to kernel by hypervisor. This buffer is then used to pass RTAS event
>> data to user through proc fs. This buffer is allocated from vmalloc
>> (non-linear mapping) area.
>>
>> On Machine check interrupt, register r3 points to RTAS extended event
>> log passed by hypervisor that contains the MCE event. The pseries
>> machine check handler then logs this error into rtas_log_buf. The
>> rtas_log_buf is a vmalloc-ed (non-linear) buffer we end up taking up a
>> page fault (vector 0x300) while accessing it. Since machine check
>> interrupt handler runs in NMI context we can not afford to take any
>> page fault. Page faults are not honored in NMI context and causes
>> kernel panic. Apart from that, as Nick pointed out, pSeries_log_error()
>> also takes a spin_lock while logging error which is not safe in NMI
>> context. It may endup in deadlock if we get another MCE before releasing
>> the lock. Fix this by deferring the logging of rtas error to irq work queue.
>>
>> Current implementation uses two different buffers to hold rtas error log
>> depending on whether extended log is provided or not. This makes bit
>> difficult to identify which buffer has valid data that needs to logged
>> later in irq work. Simplify this using single buffer, one per paca, and
>> copy rtas log to it irrespective of whether extended log is provided or
>> not. Allocate this buffer below RMA region so that it can be accessed
>> in real mode mce handler.
>>
>> Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable interrupt")
>> Cc: stable@vger.kernel.org
>> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/include/asm/paca.h        |    3 ++
>>  arch/powerpc/platforms/pseries/ras.c   |   39 +++++++++++++++++++++-----------
>>  arch/powerpc/platforms/pseries/setup.c |   16 +++++++++++++
>>  3 files changed, 45 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>> index 3f109a3e3edb..b441fef53077 100644
>> --- a/arch/powerpc/include/asm/paca.h
>> +++ b/arch/powerpc/include/asm/paca.h
>> @@ -251,6 +251,9 @@ struct paca_struct {
>>  	void *rfi_flush_fallback_area;
>>  	u64 l1d_flush_size;
>>  #endif
>> +#ifdef CONFIG_PPC_PSERIES
>> +	u8 *mce_data_buf;		/* buffer to hold per cpu rtas errlog */
>> +#endif /* CONFIG_PPC_PSERIES */
>>  } ____cacheline_aligned;
>>
>>  extern void copy_mm_to_paca(struct mm_struct *mm);
>> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
>> index 5e1ef9150182..f6ba9a2a4f84 100644
>> --- a/arch/powerpc/platforms/pseries/ras.c
>> +++ b/arch/powerpc/platforms/pseries/ras.c
>> @@ -22,6 +22,7 @@
>>  #include <linux/of.h>
>>  #include <linux/fs.h>
>>  #include <linux/reboot.h>
>> +#include <linux/irq_work.h>
>>
>>  #include <asm/machdep.h>
>>  #include <asm/rtas.h>
>> @@ -32,11 +33,13 @@
>>  static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
>>  static DEFINE_SPINLOCK(ras_log_buf_lock);
>>
>> -static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
>> -static DEFINE_PER_CPU(__u64, mce_data_buf);
>> -
>>  static int ras_check_exception_token;
>>
>> +static void mce_process_errlog_event(struct irq_work *work);
>> +static struct irq_work mce_errlog_process_work = {
>> +	.func = mce_process_errlog_event,
>> +};
>> +
>>  #define EPOW_SENSOR_TOKEN	9
>>  #define EPOW_SENSOR_INDEX	0
>>
>> @@ -336,10 +339,9 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
>>   * the actual r3 if possible, and a ptr to the error log entry
>>   * will be returned if found.
>>   *
>> - * If the RTAS error is not of the extended type, then we put it in a per
>> - * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf.
>> + * Use one buffer mce_data_buf per cpu to store RTAS error.
>>   *
>> - * The global_mce_data_buf does not have any locks or protection around it,
>> + * The mce_data_buf does not have any locks or protection around it,
>>   * if a second machine check comes in, or a system reset is done
>>   * before we have logged the error, then we will get corruption in the
>>   * error log.  This is preferable over holding off on calling
>> @@ -362,20 +364,19 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
>>  	savep = __va(regs->gpr[3]);
>>  	regs->gpr[3] = savep[0];	/* restore original r3 */
>>
>> -	/* If it isn't an extended log we can use the per cpu 64bit buffer */
>>  	h = (struct rtas_error_log *)&savep[1];
>> +	/* Use the per cpu buffer from paca to store rtas error log */
>> +	memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
>>  	if (!rtas_error_extended(h)) {
>> -		memcpy(this_cpu_ptr(&mce_data_buf), h, sizeof(__u64));
>> -		errhdr = (struct rtas_error_log *)this_cpu_ptr(&mce_data_buf);
>> +		memcpy(local_paca->mce_data_buf, h, sizeof(__u64));
>>  	} else {
>>  		int len, error_log_length;
>>
>>  		error_log_length = 8 + rtas_error_extended_log_length(h);
>>  		len = max_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
>> -		memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
>> -		memcpy(global_mce_data_buf, h, len);
>> -		errhdr = (struct rtas_error_log *)global_mce_data_buf;
>> +		memcpy(local_paca->mce_data_buf, h, len);
>>  	}
>> +	errhdr = (struct rtas_error_log *)local_paca->mce_data_buf;
> 
> You should drop errhdr in this function and simply do
> 	return (struct rtas_error_log *)local_paca->mce_data_buf;

sure, will do that in next revision.

Thanks,
-Mahesh.

^ permalink raw reply

* Re: [PATCH v4 1/6] powerpc/pseries: Defer the logging of rtas error to irq work queue.
From: Mahesh Jagannath Salgaonkar @ 2018-06-29  4:49 UTC (permalink / raw)
  To: kbuild test robot
  Cc: kbuild-all, linuxppc-dev, Laurent Dufour, Nicholas Piggin, stable,
	Aneesh Kumar K.V
In-Reply-To: <201806290421.wmb85tYm%fengguang.wu@intel.com>

On 06/29/2018 02:35 AM, kbuild test robot wrote:
> Hi Mahesh,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on powerpc/next]
> [also build test ERROR on v4.18-rc2 next-20180628]
> [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
> 
> url:    https://github.com/0day-ci/linux/commits/Mahesh-J-Salgaonkar/powerpc-pseries-Defer-the-logging-of-rtas-error-to-irq-work-queue/20180628-224101
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
> config: powerpc-defconfig (attached as .config)
> compiler: powerpc64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
> reproduce:
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # save the attached .config to linux build tree
>         GCC_VERSION=7.2.0 make.cross ARCH=powerpc 
> 
> Note: the linux-review/Mahesh-J-Salgaonkar/powerpc-pseries-Defer-the-logging-of-rtas-error-to-irq-work-queue/20180628-224101 HEAD 3496ae1afd6528103d508528e25bfca82c60f4ee builds fine.
>       It only hurts bisectibility.
> 
> All errors (new ones prefixed by >>):
> 
>    arch/powerpc/platforms/pseries/ras.c: In function 'mce_process_errlog_event':
>>> arch/powerpc/platforms/pseries/ras.c:433:8: error: implicit declaration of function 'fwnmi_get_errlog'; did you mean 'fwnmi_get_errinfo'? [-Werror=implicit-function-declaration]
>      err = fwnmi_get_errlog();
>            ^~~~~~~~~~~~~~~~
>            fwnmi_get_errinfo
>>> arch/powerpc/platforms/pseries/ras.c:433:6: error: assignment makes pointer from integer without a cast [-Werror=int-conversion]
>      err = fwnmi_get_errlog();
>          ^
>    cc1: all warnings being treated as errors

Ouch... Looks like I pushed down the function definition while
rearranging the hunks. Will fix it in next revision. Thanks for catching
this.

Thanks,
-Mahesh.

^ permalink raw reply

* Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page
From: Alexey Kardashevskiy @ 2018-06-29  4:51 UTC (permalink / raw)
  To: David Gibson; +Cc: linuxppc-dev, kvm-ppc, Alex Williamson, Paul Mackerras
In-Reply-To: <20180629041241.GC3422@umbus.fritz.box>

[-- Attachment #1: Type: text/plain, Size: 5771 bytes --]

On Fri, 29 Jun 2018 14:12:41 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy wrote:
> > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> > an IOMMU page is contained in the physical page so the PCI hardware won't
> > get access to unassigned host memory.
> > 
> > However we do not have this check in KVM fastpath (H_PUT_TCE accelerated
> > code) so the user space can pin memory backed with 64k pages and create
> > a hardware TCE table with a bigger page size. We were lucky so far and
> > did not hit this yet as the very first time the mapping happens
> > we do not have tbl::it_userspace allocated yet and fall back to
> > the userspace which in turn calls VFIO IOMMU driver and that fails
> > because of the check in vfio_iommu_spapr_tce.c which is really
> > sustainable solution.
> > 
> > This stores the smallest preregistered page size in the preregistered
> > region descriptor and changes the mm_iommu_xxx API to check this against
> > the IOMMU page size.
> > 
> > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > ---
> > Changes:
> > v2:
> > * explicitly check for compound pages before calling compound_order()
> > 
> > ---
> > The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to
> > advertise 16MB pages to the guest; a typical pseries guest will use 16MB
> > for IOMMU pages without checking the mmu pagesize and this will fail
> > at https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > 
> > With the change, mapping will fail in KVM and the guest will print:
> > 
> > mlx5_core 0000:00:00.0: ibm,create-pe-dma-window(2027) 0 8000000 20000000 18 1f returned 0 (liobn = 0x80000001 starting addr = 8000000 0)
> > mlx5_core 0000:00:00.0: created tce table LIOBN 0x80000001 for /pci@800000020000000/ethernet@0
> > mlx5_core 0000:00:00.0: failed to map direct window for
> > /pci@800000020000000/ethernet@0: -1  
> 
> [snip]
> > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> >  		struct mm_iommu_table_group_mem_t **pmem)
> >  {
> >  	struct mm_iommu_table_group_mem_t *mem;
> > -	long i, j, ret = 0, locked_entries = 0;
> > +	long i, j, ret = 0, locked_entries = 0, pageshift;
> >  	struct page *page = NULL;
> >  
> >  	mutex_lock(&mem_list_mutex);
> > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> >  		goto unlock_exit;
> >  	}
> >  
 > > +	mem->pageshift = 30; /* start from 1G pages - the biggest we have */  
> 
> What about 16G pages on an HPT system?


Below in the loop mem->pageshift will reduce to the biggest actual size
which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
pinned, no loss there.


> 
> >  	for (i = 0; i < entries; ++i) {
> >  		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
> >  					1/* pages */, 1/* iswrite */, &page)) {
> > @@ -199,6 +202,11 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> >  			}
> >  		}
> >  populate:
> > +		pageshift = PAGE_SHIFT;
> > +		if (PageCompound(page))
> > +			pageshift += compound_order(compound_head(page));
> > +		mem->pageshift = min_t(unsigned int, mem->pageshift, pageshift);  
> 
> Why not make mem->pageshift and pageshift local the same type to avoid
> the min_t() ?

I was under impression min() is deprecated (misinterpret checkpatch.pl
may be) and therefore did not pay attention to it. I can fix this and
repost if there is no other question.


> 
> > +
> >  		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
> >  	}
> >  
> > @@ -349,7 +357,7 @@ struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
> >  EXPORT_SYMBOL_GPL(mm_iommu_find);
> >  
> >  long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
> > -		unsigned long ua, unsigned long *hpa)
> > +		unsigned long ua, unsigned int pageshift, unsigned long *hpa)
> >  {
> >  	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
> >  	u64 *va = &mem->hpas[entry];
> > @@ -357,6 +365,9 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
> >  	if (entry >= mem->entries)
> >  		return -EFAULT;
> >  
> > +	if (pageshift > mem->pageshift)
> > +		return -EFAULT;
> > +
> >  	*hpa = *va | (ua & ~PAGE_MASK);
> >  
> >  	return 0;
> > @@ -364,7 +375,7 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
> >  EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
> >  
> >  long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
> > -		unsigned long ua, unsigned long *hpa)
> > +		unsigned long ua, unsigned int pageshift, unsigned long *hpa)
> >  {
> >  	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
> >  	void *va = &mem->hpas[entry];
> > @@ -373,6 +384,9 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
> >  	if (entry >= mem->entries)
> >  		return -EFAULT;
> >  
> > +	if (pageshift > mem->pageshift)
> > +		return -EFAULT;
> > +
> >  	pa = (void *) vmalloc_to_phys(va);
> >  	if (!pa)
> >  		return -EFAULT;
> > diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> > index 2da5f05..7cd63b0 100644
> > --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> > +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> > @@ -467,7 +467,7 @@ static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
> >  	if (!mem)
> >  		return -EINVAL;
> >  
> > -	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
> > +	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
> >  	if (ret)
> >  		return -EINVAL;
> >    



--
Alexey

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page
From: David Gibson @ 2018-06-29  4:57 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linuxppc-dev, kvm-ppc, Alex Williamson, Paul Mackerras
In-Reply-To: <20180629145121.5d03e067@aik.ozlabs.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 4251 bytes --]

On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy wrote:
> On Fri, 29 Jun 2018 14:12:41 +1000
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy wrote:
> > > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> > > an IOMMU page is contained in the physical page so the PCI hardware won't
> > > get access to unassigned host memory.
> > > 
> > > However we do not have this check in KVM fastpath (H_PUT_TCE accelerated
> > > code) so the user space can pin memory backed with 64k pages and create
> > > a hardware TCE table with a bigger page size. We were lucky so far and
> > > did not hit this yet as the very first time the mapping happens
> > > we do not have tbl::it_userspace allocated yet and fall back to
> > > the userspace which in turn calls VFIO IOMMU driver and that fails
> > > because of the check in vfio_iommu_spapr_tce.c which is really
> > > sustainable solution.
> > > 
> > > This stores the smallest preregistered page size in the preregistered
> > > region descriptor and changes the mm_iommu_xxx API to check this against
> > > the IOMMU page size.
> > > 
> > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > ---
> > > Changes:
> > > v2:
> > > * explicitly check for compound pages before calling compound_order()
> > > 
> > > ---
> > > The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to
> > > advertise 16MB pages to the guest; a typical pseries guest will use 16MB
> > > for IOMMU pages without checking the mmu pagesize and this will fail
> > > at https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > 
> > > With the change, mapping will fail in KVM and the guest will print:
> > > 
> > > mlx5_core 0000:00:00.0: ibm,create-pe-dma-window(2027) 0 8000000 20000000 18 1f returned 0 (liobn = 0x80000001 starting addr = 8000000 0)
> > > mlx5_core 0000:00:00.0: created tce table LIOBN 0x80000001 for /pci@800000020000000/ethernet@0
> > > mlx5_core 0000:00:00.0: failed to map direct window for
> > > /pci@800000020000000/ethernet@0: -1  
> > 
> > [snip]
> > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> > >  		struct mm_iommu_table_group_mem_t **pmem)
> > >  {
> > >  	struct mm_iommu_table_group_mem_t *mem;
> > > -	long i, j, ret = 0, locked_entries = 0;
> > > +	long i, j, ret = 0, locked_entries = 0, pageshift;
> > >  	struct page *page = NULL;
> > >  
> > >  	mutex_lock(&mem_list_mutex);
> > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> > >  		goto unlock_exit;
> > >  	}
> > >  
>  > > +	mem->pageshift = 30; /* start from 1G pages - the biggest we have */  
> > 
> > What about 16G pages on an HPT system?
> 
> 
> Below in the loop mem->pageshift will reduce to the biggest actual size
> which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> pinned, no loss there.

Are you saying that 16G IOMMU pages aren't supported?  Or that there's
some reason a guest can never use them?

> > >  	for (i = 0; i < entries; ++i) {
> > >  		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
> > >  					1/* pages */, 1/* iswrite */, &page)) {
> > > @@ -199,6 +202,11 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> > >  			}
> > >  		}
> > >  populate:
> > > +		pageshift = PAGE_SHIFT;
> > > +		if (PageCompound(page))
> > > +			pageshift += compound_order(compound_head(page));
> > > +		mem->pageshift = min_t(unsigned int, mem->pageshift, pageshift);  
> > 
> > Why not make mem->pageshift and pageshift local the same type to avoid
> > the min_t() ?
> 
> I was under impression min() is deprecated (misinterpret checkpatch.pl
> may be) and therefore did not pay attention to it. I can fix this and
> repost if there is no other question.

Hm, it's possible.


-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page
From: Alexey Kardashevskiy @ 2018-06-29  5:18 UTC (permalink / raw)
  To: David Gibson; +Cc: linuxppc-dev, kvm-ppc, Alex Williamson, Paul Mackerras
In-Reply-To: <20180629045702.GI3422@umbus.fritz.box>

[-- Attachment #1: Type: text/plain, Size: 4518 bytes --]

On Fri, 29 Jun 2018 14:57:02 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy wrote:
> > On Fri, 29 Jun 2018 14:12:41 +1000
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy wrote:  
> > > > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> > > > an IOMMU page is contained in the physical page so the PCI hardware won't
> > > > get access to unassigned host memory.
> > > > 
> > > > However we do not have this check in KVM fastpath (H_PUT_TCE accelerated
> > > > code) so the user space can pin memory backed with 64k pages and create
> > > > a hardware TCE table with a bigger page size. We were lucky so far and
> > > > did not hit this yet as the very first time the mapping happens
> > > > we do not have tbl::it_userspace allocated yet and fall back to
> > > > the userspace which in turn calls VFIO IOMMU driver and that fails
> > > > because of the check in vfio_iommu_spapr_tce.c which is really
> > > > sustainable solution.
> > > > 
> > > > This stores the smallest preregistered page size in the preregistered
> > > > region descriptor and changes the mm_iommu_xxx API to check this against
> > > > the IOMMU page size.
> > > > 
> > > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > > ---
> > > > Changes:
> > > > v2:
> > > > * explicitly check for compound pages before calling compound_order()
> > > > 
> > > > ---
> > > > The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to
> > > > advertise 16MB pages to the guest; a typical pseries guest will use 16MB
> > > > for IOMMU pages without checking the mmu pagesize and this will fail
> > > > at https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > 
> > > > With the change, mapping will fail in KVM and the guest will print:
> > > > 
> > > > mlx5_core 0000:00:00.0: ibm,create-pe-dma-window(2027) 0 8000000 20000000 18 1f returned 0 (liobn = 0x80000001 starting addr = 8000000 0)
> > > > mlx5_core 0000:00:00.0: created tce table LIOBN 0x80000001 for /pci@800000020000000/ethernet@0
> > > > mlx5_core 0000:00:00.0: failed to map direct window for
> > > > /pci@800000020000000/ethernet@0: -1    
> > > 
> > > [snip]  
> > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> > > >  		struct mm_iommu_table_group_mem_t **pmem)
> > > >  {
> > > >  	struct mm_iommu_table_group_mem_t *mem;
> > > > -	long i, j, ret = 0, locked_entries = 0;
> > > > +	long i, j, ret = 0, locked_entries = 0, pageshift;
> > > >  	struct page *page = NULL;
> > > >  
> > > >  	mutex_lock(&mem_list_mutex);
> > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> > > >  		goto unlock_exit;
> > > >  	}
> > > >  
> >  > > +	mem->pageshift = 30; /* start from 1G pages - the biggest we have */    
> > > 
> > > What about 16G pages on an HPT system?  
> > 
> > 
> > Below in the loop mem->pageshift will reduce to the biggest actual size
> > which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> > pinned, no loss there.  
> 
> Are you saying that 16G IOMMU pages aren't supported?  Or that there's
> some reason a guest can never use them?


ah, 16_G_, not _M_. My bad. I just never tried such huge pages, I will
lift the limit up to 64 then, easier this way.

> 
> > > >  	for (i = 0; i < entries; ++i) {
> > > >  		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
> > > >  					1/* pages */, 1/* iswrite */, &page)) {
> > > > @@ -199,6 +202,11 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> > > >  			}
> > > >  		}
> > > >  populate:
> > > > +		pageshift = PAGE_SHIFT;
> > > > +		if (PageCompound(page))
> > > > +			pageshift += compound_order(compound_head(page));
> > > > +		mem->pageshift = min_t(unsigned int, mem->pageshift, pageshift);    
> > > 
> > > Why not make mem->pageshift and pageshift local the same type to avoid
> > > the min_t() ?  
> > 
> > I was under impression min() is deprecated (misinterpret checkpatch.pl
> > may be) and therefore did not pay attention to it. I can fix this and
> > repost if there is no other question.  
> 
> Hm, it's possible.


Nah, tried min(), compiles fine.



--
Alexey

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH 1/2] powerpc/pkeys: preallocate execute_only key only if the key is available.
From: Gabriel Paubert @ 2018-06-29  6:07 UTC (permalink / raw)
  To: Thiago Jung Bauermann
  Cc: Ram Pai, fweimer, mhocko, Ulrich.Weigand, bauerman, msuchanek,
	linuxppc-dev
In-Reply-To: <87fu16xpul.fsf@morokweng.localdomain>

On Thu, Jun 28, 2018 at 11:56:34PM -0300, Thiago Jung Bauermann wrote:
> 
> Hello,
> 
> Ram Pai <linuxram@us.ibm.com> writes:
> 
> > Key 2 is preallocated and reserved for execute-only key. In rare
> > cases if key-2 is unavailable, mprotect(PROT_EXEC) will behave
> > incorrectly. NOTE: mprotect(PROT_EXEC) uses execute-only key.
> >
> > Ensure key 2 is available for preallocation before reserving it for
> > execute_only purpose.  Problem noticed by Michael Ellermen.
> 
> Since "powerpc/pkeys: Preallocate execute-only key" isn't upstream yet,
> this patch could be squashed into it.
> 
> > Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> > ---
> >  arch/powerpc/mm/pkeys.c |   14 +++++++++-----
> >  1 files changed, 9 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
> > index cec990c..0b03914 100644
> > --- a/arch/powerpc/mm/pkeys.c
> > +++ b/arch/powerpc/mm/pkeys.c
> > @@ -19,6 +19,7 @@
> >  u64  pkey_amr_mask;		/* Bits in AMR not to be touched */
> >  u64  pkey_iamr_mask;		/* Bits in AMR not to be touched */
> >  u64  pkey_uamor_mask;		/* Bits in UMOR not to be touched */
> > +int  execute_only_key = 2;
> >
> >  #define AMR_BITS_PER_PKEY 2
> >  #define AMR_RD_BIT 0x1UL
> > @@ -26,7 +27,6 @@
> >  #define IAMR_EX_BIT 0x1UL
> >  #define PKEY_REG_BITS (sizeof(u64)*8)
> >  #define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
> > -#define EXECUTE_ONLY_KEY 2
> >
> >  static void scan_pkey_feature(void)
> >  {
> > @@ -122,8 +122,12 @@ int pkey_initialize(void)
> >  #else
> >  	os_reserved = 0;
> >  #endif
> > +
> > +	if ((pkeys_total - os_reserved) <= execute_only_key)
> > +		execute_only_key = -1;
> > +
> >  	/* Bits are in LE format. */
> > -	reserved_allocation_mask = (0x1 << 1) | (0x1 << EXECUTE_ONLY_KEY);
> > +	reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key);
> 
> My understanding is that left-shifting by a negative amount is undefined
> behavior in C. A quick test tells me that at least on the couple of
> machines I tested, 1 < -1 = 0. Does GCC guarantee that behavior? 

Not in general. It probably always works on Power because of the definition 
of the machine instruction for shifts with variable amount (consider the 
shift amount unsigned and take it modulo twice the width of the operand), 
but for example it fails on x86 (1<<-1 gives 0x80000000).

> If so, a comment pointing this out would make this less confusing.

Unless I miss something, this code is run once at boot, so its
performance is irrelevant.

In this case simply rewrite it as:

	reserved_allocation_mask = 0x1 << 1;
	if ( (pkeys_total - os_reserved) <= execute_only_key) {
		execute_only_key = -1;
	} else {
		reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key);
	}

Caveat, I have assumed that the code will either:
- only run once,
- pkeys_total and os_reserved are int, not unsigned

> 
> >  	initial_allocation_mask  = reserved_allocation_mask | (0x1 << PKEY_0);
> >
> >  	/* register mask is in BE format */
> > @@ -132,11 +136,11 @@ int pkey_initialize(void)
> >
> >  	pkey_iamr_mask = ~0x0ul;
> >  	pkey_iamr_mask &= ~(0x3ul << pkeyshift(PKEY_0));
> > -	pkey_iamr_mask &= ~(0x3ul << pkeyshift(EXECUTE_ONLY_KEY));
> > +	pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key));
> >
> >  	pkey_uamor_mask = ~0x0ul;
> >  	pkey_uamor_mask &= ~(0x3ul << pkeyshift(PKEY_0));
> > -	pkey_uamor_mask &= ~(0x3ul << pkeyshift(EXECUTE_ONLY_KEY));
> > +	pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key));
> 
> Here the behaviour is undefined in C as well, given that pkeyshift(-1) =
> 64, which is the total number of bits in the left operand. Does GCC
> guarantee that the result will be 0 here as well?

Same answer: very likely on Power, not portable.

	Gabriel

^ permalink raw reply

* Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page
From: Alexey Kardashevskiy @ 2018-06-29  7:07 UTC (permalink / raw)
  To: David Gibson; +Cc: linuxppc-dev, kvm-ppc, Alex Williamson, Paul Mackerras
In-Reply-To: <20180629151820.461ae112@aik.ozlabs.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 5000 bytes --]

On Fri, 29 Jun 2018 15:18:20 +1000
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:

> On Fri, 29 Jun 2018 14:57:02 +1000
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy wrote:  
> > > On Fri, 29 Jun 2018 14:12:41 +1000
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > >     
> > > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy wrote:    
> > > > > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> > > > > an IOMMU page is contained in the physical page so the PCI hardware won't
> > > > > get access to unassigned host memory.
> > > > > 
> > > > > However we do not have this check in KVM fastpath (H_PUT_TCE accelerated
> > > > > code) so the user space can pin memory backed with 64k pages and create
> > > > > a hardware TCE table with a bigger page size. We were lucky so far and
> > > > > did not hit this yet as the very first time the mapping happens
> > > > > we do not have tbl::it_userspace allocated yet and fall back to
> > > > > the userspace which in turn calls VFIO IOMMU driver and that fails
> > > > > because of the check in vfio_iommu_spapr_tce.c which is really
> > > > > sustainable solution.
> > > > > 
> > > > > This stores the smallest preregistered page size in the preregistered
> > > > > region descriptor and changes the mm_iommu_xxx API to check this against
> > > > > the IOMMU page size.
> > > > > 
> > > > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > > > ---
> > > > > Changes:
> > > > > v2:
> > > > > * explicitly check for compound pages before calling compound_order()
> > > > > 
> > > > > ---
> > > > > The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to
> > > > > advertise 16MB pages to the guest; a typical pseries guest will use 16MB
> > > > > for IOMMU pages without checking the mmu pagesize and this will fail
> > > > > at https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > > 
> > > > > With the change, mapping will fail in KVM and the guest will print:
> > > > > 
> > > > > mlx5_core 0000:00:00.0: ibm,create-pe-dma-window(2027) 0 8000000 20000000 18 1f returned 0 (liobn = 0x80000001 starting addr = 8000000 0)
> > > > > mlx5_core 0000:00:00.0: created tce table LIOBN 0x80000001 for /pci@800000020000000/ethernet@0
> > > > > mlx5_core 0000:00:00.0: failed to map direct window for
> > > > > /pci@800000020000000/ethernet@0: -1      
> > > > 
> > > > [snip]    
> > > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> > > > >  		struct mm_iommu_table_group_mem_t **pmem)
> > > > >  {
> > > > >  	struct mm_iommu_table_group_mem_t *mem;
> > > > > -	long i, j, ret = 0, locked_entries = 0;
> > > > > +	long i, j, ret = 0, locked_entries = 0, pageshift;
> > > > >  	struct page *page = NULL;
> > > > >  
> > > > >  	mutex_lock(&mem_list_mutex);
> > > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> > > > >  		goto unlock_exit;
> > > > >  	}
> > > > >  
> > >  > > +	mem->pageshift = 30; /* start from 1G pages - the biggest we have */      
> > > > 
> > > > What about 16G pages on an HPT system?    
> > > 
> > > 
> > > Below in the loop mem->pageshift will reduce to the biggest actual size
> > > which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> > > pinned, no loss there.    
> > 
> > Are you saying that 16G IOMMU pages aren't supported?  Or that there's
> > some reason a guest can never use them?  
> 
> 
> ah, 16_G_, not _M_. My bad. I just never tried such huge pages, I will
> lift the limit up to 64 then, easier this way.


Ah, no, rather this as the upper limit:

mem->pageshift = ilog2(entries) + PAGE_SHIFT;


@entries here is a number of system pages being pinned in that
function.



> 
> >   
> > > > >  	for (i = 0; i < entries; ++i) {
> > > > >  		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
> > > > >  					1/* pages */, 1/* iswrite */, &page)) {
> > > > > @@ -199,6 +202,11 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> > > > >  			}
> > > > >  		}
> > > > >  populate:
> > > > > +		pageshift = PAGE_SHIFT;
> > > > > +		if (PageCompound(page))
> > > > > +			pageshift += compound_order(compound_head(page));
> > > > > +		mem->pageshift = min_t(unsigned int, mem->pageshift, pageshift);      
> > > > 
> > > > Why not make mem->pageshift and pageshift local the same type to avoid
> > > > the min_t() ?    
> > > 
> > > I was under impression min() is deprecated (misinterpret checkpatch.pl
> > > may be) and therefore did not pay attention to it. I can fix this and
> > > repost if there is no other question.    
> > 
> > Hm, it's possible.  
> 
> Nah, tried min(), compiles fine.



--
Alexey

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* [PATCH 0/3] PCI DMA pseudo-bypass for powernv
From: Russell Currey @ 2018-06-29  7:34 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: benh, alistair, aik, tpearson, Russell Currey

These patches implement a new set of DMA operations designed to allow
devices that cannot address bit 59 to use more than 32-bit DMA.

The previous implementation for PHB3 assumed contiguous memory, which
is no longer the case on POWER9 and possibly later.  As a result, a new
approach was necessary, which is what this new implementation does.

These patches aren't perfect yet - most notably, there is no
implementation of unmap() - once a TCE is created it is never freed.
For the most part this isn't an issue, but could potentially be a
problem for a device that can't address many bits and maps *a lot* of
memory.

I have a much more complex implementation in the back that I will post
in future, but until that's ready and fully tested, I believe the
drastic performance improvement is worth getting this upstream sooner
rather than later.

Thanks to Benjamin Herrenschmidt, Alistair Popple and Timothy Pearson
for their help.

Russell Currey (3):
  powerpc/powernv/pci: Track largest available TCE order per PHB
  powerpc/powernv: DMA operations for discontiguous allocation
  powerpc/powernv/pci: Track TCE tables in debugfs

 arch/powerpc/include/asm/dma-mapping.h    |   1 +
 arch/powerpc/platforms/powernv/Makefile   |   2 +-
 arch/powerpc/platforms/powernv/pci-dma.c  | 243 ++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci-ioda.c | 132 +++++++-----
 arch/powerpc/platforms/powernv/pci.h      |  10 +
 5 files changed, 334 insertions(+), 54 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/pci-dma.c

-- 
2.17.1

^ permalink raw reply

* [PATCH 1/3] powerpc/powernv/pci: Track largest available TCE order per PHB
From: Russell Currey @ 2018-06-29  7:34 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: benh, alistair, aik, tpearson, Russell Currey
In-Reply-To: <20180629073437.4060-1-ruscur@russell.cc>

Knowing the largest possible TCE size of a PHB is useful, so get it out
of the device tree.  This relies on the property being added in OPAL.

It is assumed that any PHB4 or later machine would be running firmware
that implemented this property, and otherwise assumed to be PHB3, which
has a maximum TCE order of 28 bits or 256MB TCEs.

This is used later in the series.

Signed-off-by: Russell Currey <ruscur@russell.cc>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 16 ++++++++++++++++
 arch/powerpc/platforms/powernv/pci.h      |  3 +++
 2 files changed, 19 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 5bd0eb6681bc..17c590087279 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3873,11 +3873,13 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	struct resource r;
 	const __be64 *prop64;
 	const __be32 *prop32;
+	struct property *prop;
 	int len;
 	unsigned int segno;
 	u64 phb_id;
 	void *aux;
 	long rc;
+	u32 val;
 
 	if (!of_device_is_available(np))
 		return;
@@ -4016,6 +4018,20 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	}
 	phb->ioda.pe_array = aux + pemap_off;
 
+	phb->ioda.max_tce_order = 0;
+	/* Get TCE order from the DT.  If it's not present, assume P8 */
+	if (!of_get_property(np, "ibm,supported-tce-sizes", NULL)) {
+		phb->ioda.max_tce_order = 28; /* assume P8 256mb TCEs */
+	} else {
+		of_property_for_each_u32(np, "ibm,supported-tce-sizes", prop,
+					 prop32, val) {
+			if (val > phb->ioda.max_tce_order)
+				phb->ioda.max_tce_order = val;
+		}
+		pr_debug("PHB%llx Found max TCE order of %d bits\n",
+			 phb->opal_id, phb->ioda.max_tce_order);
+	}
+
 	/*
 	 * Choose PE number for root bus, which shouldn't have
 	 * M64 resources consumed by its child devices. To pick
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index eada4b6068cb..c9952def5e93 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -173,6 +173,9 @@ struct pnv_phb {
 		struct list_head	pe_list;
 		struct mutex            pe_list_mutex;
 
+		/* Largest supported TCE order bits */
+		uint8_t			max_tce_order;
+
 		/* Reverse map of PEs, indexed by {bus, devfn} */
 		unsigned int		pe_rmap[0x10000];
 	} ioda;
-- 
2.17.1

^ permalink raw reply related

* [PATCH 2/3] powerpc/powernv: DMA operations for discontiguous allocation
From: Russell Currey @ 2018-06-29  7:34 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: benh, alistair, aik, tpearson, Russell Currey
In-Reply-To: <20180629073437.4060-1-ruscur@russell.cc>

DMA pseudo-bypass is a new set of DMA operations that solve some issues for
devices that want to address more than 32 bits but can't address the 59
bits required to enable direct DMA.

The previous implementation for POWER8/PHB3 worked around this by
configuring a bypass from the default 32-bit address space into 64-bit
address space.  This approach does not work for POWER9/PHB4 because
regions of memory are discontiguous and many devices will be unable to
address memory beyond the first node.

Instead, implement a new set of DMA operations that allocate TCEs as DMA
mappings are requested so that all memory is addressable even when a
one-to-one mapping between real addresses and DMA addresses isn't
possible.  These TCEs are the maximum size available on the platform,
which is 256M on PHB3 and 1G on PHB4.

Devices can now map any region of memory up to the maximum amount they can
address according to the DMA mask set, in chunks of the largest available
TCE size.

This implementation replaces the need for the existing PHB3 solution and
should be compatible with future PHB versions.

It is, however, rather naive.  There is no unmapping, and as a result
devices can eventually run out of space if they address their entire
DMA mask worth of TCEs.  An implementation with unmap() will come in
future (and requires a much more complex implementation), but this is a
good start due to the drastic performance improvement.

Signed-off-by: Russell Currey <ruscur@russell.cc>
---
 arch/powerpc/include/asm/dma-mapping.h    |   1 +
 arch/powerpc/platforms/powernv/Makefile   |   2 +-
 arch/powerpc/platforms/powernv/pci-dma.c  | 243 ++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci-ioda.c |  82 +++-----
 arch/powerpc/platforms/powernv/pci.h      |   7 +
 5 files changed, 281 insertions(+), 54 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/pci-dma.c

diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 8fa394520af6..354f435160f3 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -74,6 +74,7 @@ static inline unsigned long device_to_mask(struct device *dev)
 extern struct dma_map_ops dma_iommu_ops;
 #endif
 extern const struct dma_map_ops dma_nommu_ops;
+extern const struct dma_map_ops dma_pseudo_bypass_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 703a350a7f4e..2467bdab3c13 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -6,7 +6,7 @@ obj-y			+= opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 obj-y			+= opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
 
 obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
-obj-$(CONFIG_PCI)	+= pci.o pci-ioda.o npu-dma.o
+obj-$(CONFIG_PCI)	+= pci.o pci-ioda.o npu-dma.o pci-dma.o
 obj-$(CONFIG_CXL_BASE)	+= pci-cxl.o
 obj-$(CONFIG_EEH)	+= eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c
new file mode 100644
index 000000000000..79382627c7be
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-dma.c
@@ -0,0 +1,243 @@
+/*
+ * DMA operations supporting pseudo-bypass for PHB3+
+ *
+ * Author: Russell Currey <ruscur@russell.cc>
+ *
+ * Copyright 2018 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/memblock.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/hash.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/pnv-pci.h>
+#include <asm/tce.h>
+
+#include "pci.h"
+
+/*
+ * This is a naive implementation that directly operates on TCEs, allocating
+ * on demand.  There is no locking or refcounts since no TCEs are ever removed
+ * and unmap does nothing.
+ */
+static dma_addr_t dma_pseudo_bypass_get_address(struct device *dev,
+					    phys_addr_t addr)
+{
+	struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pnv_ioda_pe *pe;
+	u64 i, tce, ret, offset;
+	__be64 entry;
+
+	offset = addr & ((1 << phb->ioda.max_tce_order) - 1);
+
+	pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number];
+
+	/* look through the tracking table for a free entry */
+	for (i = 0; i < pe->tce_count; i++) {
+		/* skip between 2GB and 4GB */
+		if ((i << phb->ioda.max_tce_order) >= 0x80000000 &&
+		    (i << phb->ioda.max_tce_order) < 0x100000000)
+			continue;
+
+		tce = be64_to_cpu(pe->tces[i]);
+
+		/* if the TCE is already valid (read + write) */
+		if ((tce & 3) == 3) {
+			/* check if we're already allocated, if not move on */
+			if (tce >> phb->ioda.max_tce_order ==
+			    addr >> phb->ioda.max_tce_order) {
+				/* wait for the lock bit to clear */
+				while (be64_to_cpu(pe->tces[i]) & 4)
+					cpu_relax();
+
+				return (i << phb->ioda.max_tce_order) | offset;
+			}
+
+			continue;
+		}
+
+		/*
+		 * The TCE isn't being used, so let's try and allocate it.
+		 * Bits 0 and 1 are read/write, and we use bit 2 as a "lock"
+		 * bit.  This is to prevent any race where the value is set in
+		 * the TCE table but the invalidate/mb() hasn't finished yet.
+		 */
+		entry = cpu_to_be64((addr - offset) | 7);
+		ret = cmpxchg(&pe->tces[i], tce, entry);
+		if (ret != tce) {
+			/* conflict, start looking again just in case */
+			i--;
+			continue;
+		}
+		pnv_pci_phb3_tce_invalidate(pe, 0, 0, addr - offset, 1);
+		mb();
+		/* clear the lock bit now that we know it's active */
+		ret = cmpxchg(&pe->tces[i], entry, cpu_to_be64((addr - offset) | 3));
+		if (ret != entry) {
+			/* conflict, start looking again just in case */
+			i--;
+			continue;
+		}
+
+		return (i << phb->ioda.max_tce_order) | offset;
+	}
+	/* If we get here, the table must be full, so error out. */
+	return -1ULL;
+}
+
+/*
+ * For now, don't actually do anything on unmap.
+ */
+static void dma_pseudo_bypass_unmap_address(struct device *dev, dma_addr_t dma_addr)
+{
+}
+
+static int dma_pseudo_bypass_dma_supported(struct device *dev, u64 mask)
+{
+	/*
+	 * Normally dma_supported() checks if the mask is capable of addressing
+	 * all of memory.  Since we map physical memory in chunks that the
+	 * device can address, the device will be able to address whatever it
+	 * wants - just not all at once.
+	 */
+	return 1;
+}
+
+static void *dma_pseudo_bypass_alloc_coherent(struct device *dev,
+					  size_t size,
+					  dma_addr_t *dma_handle,
+					  gfp_t flag,
+					  unsigned long attrs)
+{
+	void *ret;
+	struct page *page;
+	int node = dev_to_node(dev);
+
+	/* ignore region specifiers */
+	flag &= ~(__GFP_HIGHMEM);
+
+	page = alloc_pages_node(node, flag, get_order(size));
+	if (page == NULL)
+		return NULL;
+	ret = page_address(page);
+	memset(ret, 0, size);
+	*dma_handle = dma_pseudo_bypass_get_address(dev, __pa(ret));
+
+	return ret;
+}
+
+static void dma_pseudo_bypass_free_coherent(struct device *dev,
+					 size_t size,
+					 void *vaddr,
+					 dma_addr_t dma_handle,
+					 unsigned long attrs)
+{
+	free_pages((unsigned long)vaddr, get_order(size));
+}
+
+static int dma_pseudo_bypass_mmap_coherent(struct device *dev,
+				       struct vm_area_struct *vma,
+				       void *cpu_addr,
+				       dma_addr_t handle,
+				       size_t size,
+				       unsigned long attrs)
+{
+	unsigned long pfn = page_to_pfn(virt_to_page(cpu_addr));
+
+	return remap_pfn_range(vma, vma->vm_start,
+			       pfn + vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot);
+}
+
+static inline dma_addr_t dma_pseudo_bypass_map_page(struct device *dev,
+						struct page *page,
+						unsigned long offset,
+						size_t size,
+						enum dma_data_direction dir,
+						unsigned long attrs)
+{
+	BUG_ON(dir == DMA_NONE);
+
+	return dma_pseudo_bypass_get_address(dev, page_to_phys(page) + offset);
+}
+
+static inline void dma_pseudo_bypass_unmap_page(struct device *dev,
+					 dma_addr_t dma_address,
+					 size_t size,
+					 enum dma_data_direction direction,
+					 unsigned long attrs)
+{
+	dma_pseudo_bypass_unmap_address(dev, dma_address);
+}
+
+
+static int dma_pseudo_bypass_map_sg(struct device *dev, struct scatterlist *sgl,
+			     int nents, enum dma_data_direction direction,
+			     unsigned long attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+
+	for_each_sg(sgl, sg, nents, i) {
+		sg->dma_address = dma_pseudo_bypass_get_address(dev, sg_phys(sg));
+		sg->dma_length = sg->length;
+
+		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
+	}
+
+	return nents;
+}
+
+static void dma_pseudo_bypass_unmap_sg(struct device *dev, struct scatterlist *sgl,
+				int nents, enum dma_data_direction direction,
+				unsigned long attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i) {
+		dma_pseudo_bypass_unmap_address(dev, sg->dma_address);
+	}
+}
+
+static u64 dma_pseudo_bypass_get_required_mask(struct device *dev)
+{
+	/*
+	 * there's no limitation on our end, the driver should just call
+	 * set_mask() with as many bits as the device can address.
+	 */
+	return -1ULL;
+}
+
+static int dma_pseudo_bypass_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return dma_addr == -1ULL;
+}
+
+
+const struct dma_map_ops dma_pseudo_bypass_ops = {
+	.alloc				= dma_pseudo_bypass_alloc_coherent,
+	.free				= dma_pseudo_bypass_free_coherent,
+	.mmap				= dma_pseudo_bypass_mmap_coherent,
+	.map_sg				= dma_pseudo_bypass_map_sg,
+	.unmap_sg			= dma_pseudo_bypass_unmap_sg,
+	.dma_supported			= dma_pseudo_bypass_dma_supported,
+	.map_page			= dma_pseudo_bypass_map_page,
+	.unmap_page			= dma_pseudo_bypass_unmap_page,
+	.get_required_mask		= dma_pseudo_bypass_get_required_mask,
+	.mapping_error			= dma_pseudo_bypass_mapping_error,
+};
+EXPORT_SYMBOL(dma_pseudo_bypass_ops);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 17c590087279..d2ca214610fd 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1088,6 +1088,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 	pe->pbus = NULL;
 	pe->mve_number = -1;
 	pe->rid = dev->bus->number << 8 | pdn->devfn;
+	pe->tces = NULL;
 
 	pe_info(pe, "Associated device to PE\n");
 
@@ -1569,6 +1570,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 		pe->mve_number = -1;
 		pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
 			   pci_iov_virtfn_devfn(pdev, vf_index);
+		pe->tces = NULL;
 
 		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
 			hose->global_number, pdev->bus->number,
@@ -1774,43 +1776,22 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
 	return true;
 }
 
-/*
- * Reconfigure TVE#0 to be usable as 64-bit DMA space.
- *
- * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
- * Devices can only access more than that if bit 59 of the PCI address is set
- * by hardware, which indicates TVE#1 should be used instead of TVE#0.
- * Many PCI devices are not capable of addressing that many bits, and as a
- * result are limited to the 4GB of virtual memory made available to 32-bit
- * devices in TVE#0.
- *
- * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit
- * devices by configuring the virtual memory past the first 4GB inaccessible
- * by 64-bit DMAs.  This should only be used by devices that want more than
- * 4GB, and only on PEs that have no 32-bit devices.
- *
- * Currently this will only work on PHB3 (POWER8).
- */
-static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
+static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe)
 {
-	u64 window_size, table_size, tce_count, addr;
+	u64 tce_count, table_size, window_size;
+	struct pnv_phb *p = pe->phb;
 	struct page *table_pages;
-	u64 tce_order = 28; /* 256MB TCEs */
 	__be64 *tces;
-	s64 rc;
+	int rc = -ENOMEM;
 
-	/*
-	 * Window size needs to be a power of two, but needs to account for
-	 * shifting memory by the 4GB offset required to skip 32bit space.
-	 */
-	window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
-	tce_count = window_size >> tce_order;
+	window_size = roundup_pow_of_two(memory_hotplug_max());
+	tce_count = window_size >> p->ioda.max_tce_order;
 	table_size = tce_count << 3;
 
 	if (table_size < PAGE_SIZE)
 		table_size = PAGE_SIZE;
 
-	table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
+	table_pages = alloc_pages_node(p->hose->node, GFP_KERNEL,
 				       get_order(table_size));
 	if (!table_pages)
 		goto err;
@@ -1821,26 +1802,23 @@ static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
 
 	memset(tces, 0, table_size);
 
-	for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
-		tces[(addr + (1ULL << 32)) >> tce_order] =
-			cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
-	}
+	pe->tces = tces;
+	pe->tce_count = tce_count;
 
 	rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
 					pe->pe_number,
-					/* reconfigure window 0 */
 					(pe->pe_number << 1) + 0,
 					1,
 					__pa(tces),
 					table_size,
-					1 << tce_order);
+					1 << p->ioda.max_tce_order);
 	if (rc == OPAL_SUCCESS) {
-		pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
+		pe_info(pe, "TCE tables configured for pseudo-bypass\n");
 		return 0;
 	}
 err:
-	pe_err(pe, "Error configuring 64-bit DMA bypass\n");
-	return -EIO;
+	pe_err(pe, "error configuring pseudo-bypass\n");
+	return rc;
 }
 
 static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
@@ -1851,7 +1829,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 	struct pnv_ioda_pe *pe;
 	uint64_t top;
 	bool bypass = false;
-	s64 rc;
 
 	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
 		return -ENODEV;
@@ -1868,21 +1845,15 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 	} else {
 		/*
 		 * If the device can't set the TCE bypass bit but still wants
-		 * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
-		 * bypass the 32-bit region and be usable for 64-bit DMAs.
-		 * The device needs to be able to address all of this space.
+		 * to access 4GB or more, we need to use a different set of DMA
+		 * operations with an indirect mapping.
 		 */
 		if (dma_mask >> 32 &&
-		    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
-		    pnv_pci_ioda_pe_single_vendor(pe) &&
-		    phb->model == PNV_PHB_MODEL_PHB3) {
-			/* Configure the bypass mode */
-			rc = pnv_pci_ioda_dma_64bit_bypass(pe);
-			if (rc)
-				return rc;
-			/* 4GB offset bypasses 32-bit space */
-			set_dma_offset(&pdev->dev, (1ULL << 32));
-			set_dma_ops(&pdev->dev, &dma_nommu_ops);
+		    phb->model != PNV_PHB_MODEL_P7IOC &&
+		    pnv_pci_ioda_pe_single_vendor(pe)) {
+			if (!pe->tces)
+				pnv_pci_pseudo_bypass_setup(pe);
+			set_dma_ops(&pdev->dev, &dma_pseudo_bypass_ops);
 		} else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
 			/*
 			 * Fail the request if a DMA mask between 32 and 64 bits
@@ -2071,7 +2042,7 @@ static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
 	__raw_writeq_be(val, invalidate);
 }
 
-static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
+void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
 					unsigned shift, unsigned long index,
 					unsigned long npages)
 {
@@ -2611,10 +2582,15 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
 static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 {
 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
-						table_group);
+					      table_group);
+
 	/* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
 	struct iommu_table *tbl = pe->table_group.tables[0];
 
+	if (pe->tces)
+		free_pages((unsigned long)pe->tces,
+			   get_order(pe->tce_count << 3));
+
 	pnv_pci_ioda2_set_bypass(pe, false);
 	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
 	if (pe->pbus)
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index c9952def5e93..56846ddc76a2 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -70,6 +70,10 @@ struct pnv_ioda_pe {
 	bool			tce_bypass_enabled;
 	uint64_t		tce_bypass_base;
 
+	/* TCE tables for DMA pseudo-bypass */
+	__be64			*tces;
+	u64			tce_count;
+
 	/* MSIs. MVE index is identical for for 32 and 64 bit MSI
 	 * and -1 if not supported. (It's actually identical to the
 	 * PE number)
@@ -211,6 +215,9 @@ extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
 extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
 		unsigned long *hpa, enum dma_data_direction *direction);
 extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
+extern void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
+					unsigned shift, unsigned long index,
+					unsigned long npages);
 
 void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
 				unsigned char *log_buff);
-- 
2.17.1

^ permalink raw reply related

* [PATCH 3/3] powerpc/powernv/pci: Track TCE tables in debugfs
From: Russell Currey @ 2018-06-29  7:34 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: benh, alistair, aik, tpearson, Russell Currey
In-Reply-To: <20180629073437.4060-1-ruscur@russell.cc>

Add a new debugfs entry to trigger dumping out the TCEs for a
given PE using pseudo-bypass, for example PE 0x4 of PHB 2:

echo 0x4 > /sys/kernel/debug/powerpc/PCI0002/dump_tces

This will result in the table being dumped out in dmesg.

Signed-off-by: Russell Currey <ruscur@russell.cc>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 34 +++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index d2ca214610fd..91316d3c26dc 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3314,6 +3314,38 @@ static int pnv_pci_diag_data_set(void *data, u64 val)
 DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_diag_data_fops, NULL,
 			pnv_pci_diag_data_set, "%llu\n");
 
+static int pnv_pci_dump_tces_set(void *data, u64 val)
+{
+	struct pci_controller *hose;
+	struct pnv_ioda_pe *pe;
+	struct pnv_phb *phb;
+	int i;
+
+	hose = (struct pci_controller *)data;
+	if (!hose || !hose->private_data)
+		return -ENODEV;
+
+	phb = hose->private_data;
+	pe = &phb->ioda.pe_array[val];
+
+	if (!pe)
+		return -EINVAL;
+
+	if (!pe->tces)
+		return -EIO;
+
+	for (i = 0; i < pe->tce_count; i++) {
+		if (i > 16 && pe->tces[i] == 0)
+			break;
+		pr_info("TCE %3d: %016llx\n", i, be64_to_cpu(pe->tces[i]));
+	}
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_dump_tces_fops, NULL,
+			pnv_pci_dump_tces_set, "%llu\n");
+
+
 #endif /* CONFIG_DEBUG_FS */
 
 static void pnv_pci_ioda_create_dbgfs(void)
@@ -3339,6 +3371,8 @@ static void pnv_pci_ioda_create_dbgfs(void)
 
 		debugfs_create_file("dump_diag_regs", 0200, phb->dbgfs, hose,
 				    &pnv_pci_diag_data_fops);
+		debugfs_create_file("dump_tces", 0200, phb->dbgfs, hose,
+				    &pnv_pci_dump_tces_fops);
 	}
 #endif /* CONFIG_DEBUG_FS */
 }
-- 
2.17.1

^ permalink raw reply related

* [PATCH 1/3] powerpc/mm/hash: Remove the superfluous bitwise operation when find hpte group
From: Aneesh Kumar K.V @ 2018-06-29  8:36 UTC (permalink / raw)
  To: npiggin, benh, paulus, mpe; +Cc: linuxppc-dev, Aneesh Kumar K.V

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

When computing the starting slot number for a hash page table group we used
to do this
hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;

Multiplying with 8 (HPTES_PER_GROUP) imply the last three bits are 0. Hence we
really don't need to clear then separately.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/mm/dump_hashpagetable.c |  2 +-
 arch/powerpc/mm/hash64_4k.c          |  8 ++++----
 arch/powerpc/mm/hash64_64k.c         | 15 +++++++--------
 arch/powerpc/mm/hash_utils_64.c      | 10 ++++------
 arch/powerpc/mm/hugepage-hash64.c    |  9 ++++-----
 5 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c
index 14cfb11b09d0..d241cb6518da 100644
--- a/arch/powerpc/mm/dump_hashpagetable.c
+++ b/arch/powerpc/mm/dump_hashpagetable.c
@@ -260,7 +260,7 @@ static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *
 	/* to check in the secondary hash table, we invert the hash */
 	if (!primary)
 		hash = ~hash;
-	hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+	hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 	/* see if we can find an entry in the hpte with this hash */
 	for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
 		lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
index d573d7d07f25..6fa6765a10eb 100644
--- a/arch/powerpc/mm/hash64_4k.c
+++ b/arch/powerpc/mm/hash64_4k.c
@@ -80,7 +80,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 		hash = hpt_hash(vpn, shift, ssize);
 
 repeat:
-		hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+		hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 
 		/* Insert into the hash table, primary slot */
 		slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
@@ -89,7 +89,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 		 * Primary is full, try the secondary
 		 */
 		if (unlikely(slot == -1)) {
-			hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+			hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
 			slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
 							rflags,
 							HPTE_V_SECONDARY,
@@ -97,8 +97,8 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 							MMU_PAGE_4K, ssize);
 			if (slot == -1) {
 				if (mftb() & 0x1)
-					hpte_group = ((hash & htab_hash_mask) *
-						      HPTES_PER_GROUP) & ~0x7UL;
+					hpte_group = (hash & htab_hash_mask) *
+							HPTES_PER_GROUP;
 				mmu_hash_ops.hpte_remove(hpte_group);
 				/*
 				 * FIXME!! Should be try the group from which we removed ?
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index e601d95c3b20..3afa253d7f52 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -154,7 +154,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 	}
 	hash = hpt_hash(vpn, shift, ssize);
 repeat:
-	hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+	hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 
 	/* Insert into the hash table, primary slot */
 	slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
@@ -165,7 +165,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 	if (unlikely(slot == -1)) {
 		bool soft_invalid;
 
-		hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+		hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
 		slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
 						rflags, HPTE_V_SECONDARY,
 						MMU_PAGE_4K, MMU_PAGE_4K,
@@ -193,8 +193,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 			 * that we do not get the same soft-invalid slot.
 			 */
 			if (soft_invalid || (mftb() & 0x1))
-				hpte_group = ((hash & htab_hash_mask) *
-					      HPTES_PER_GROUP) & ~0x7UL;
+				hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 
 			mmu_hash_ops.hpte_remove(hpte_group);
 			/*
@@ -288,7 +287,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 		hash = hpt_hash(vpn, shift, ssize);
 
 repeat:
-		hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+		hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 
 		/* Insert into the hash table, primary slot */
 		slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
@@ -298,7 +297,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 		 * Primary is full, try the secondary
 		 */
 		if (unlikely(slot == -1)) {
-			hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+			hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
 			slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
 							rflags,
 							HPTE_V_SECONDARY,
@@ -306,8 +305,8 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 							MMU_PAGE_64K, ssize);
 			if (slot == -1) {
 				if (mftb() & 0x1)
-					hpte_group = ((hash & htab_hash_mask) *
-						      HPTES_PER_GROUP) & ~0x7UL;
+					hpte_group = (hash & htab_hash_mask) *
+							HPTES_PER_GROUP;
 				mmu_hash_ops.hpte_remove(hpte_group);
 				/*
 				 * FIXME!! Should be try the group from which we removed ?
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 8318716e5075..4ba901223000 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1783,8 +1783,7 @@ long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
 	long slot;
 
 repeat:
-	hpte_group = ((hash & htab_hash_mask) *
-		       HPTES_PER_GROUP) & ~0x7UL;
+	hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 
 	/* Insert into the hash table, primary slot */
 	slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
@@ -1792,15 +1791,14 @@ long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
 
 	/* Primary is full, try the secondary */
 	if (unlikely(slot == -1)) {
-		hpte_group = ((~hash & htab_hash_mask) *
-			      HPTES_PER_GROUP) & ~0x7UL;
+		hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
 		slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags,
 						vflags | HPTE_V_SECONDARY,
 						psize, psize, ssize);
 		if (slot == -1) {
 			if (mftb() & 0x1)
-				hpte_group = ((hash & htab_hash_mask) *
-					      HPTES_PER_GROUP)&~0x7UL;
+				hpte_group = (hash & htab_hash_mask) *
+						HPTES_PER_GROUP;
 
 			mmu_hash_ops.hpte_remove(hpte_group);
 			goto repeat;
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index f20d16f849c5..01f213d2bcb9 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -128,7 +128,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 		new_pmd |= H_PAGE_HASHPTE;
 
 repeat:
-		hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+		hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 
 		/* Insert into the hash table, primary slot */
 		slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
@@ -137,16 +137,15 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 		 * Primary is full, try the secondary
 		 */
 		if (unlikely(slot == -1)) {
-			hpte_group = ((~hash & htab_hash_mask) *
-				      HPTES_PER_GROUP) & ~0x7UL;
+			hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
 			slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
 							rflags,
 							HPTE_V_SECONDARY,
 							psize, lpsize, ssize);
 			if (slot == -1) {
 				if (mftb() & 0x1)
-					hpte_group = ((hash & htab_hash_mask) *
-						      HPTES_PER_GROUP) & ~0x7UL;
+					hpte_group = (hash & htab_hash_mask) *
+							HPTES_PER_GROUP;
 
 				mmu_hash_ops.hpte_remove(hpte_group);
 				goto repeat;
-- 
2.17.1

^ permalink raw reply related

* [PATCH 2/3] powerpc/mm/hash: Add hpte_get_old_v and use that instead of opencoding
From: Aneesh Kumar K.V @ 2018-06-29  8:36 UTC (permalink / raw)
  To: npiggin, benh, paulus, mpe; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <20180629083631.28017-1-aneesh.kumar@linux.ibm.com>

No functional change

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 10 +++++++
 arch/powerpc/mm/hash_native_64.c              | 27 +++++--------------
 2 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 50ed64fba4ae..eee0b5b8a23f 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -364,6 +364,16 @@ static inline unsigned long hpte_new_to_old_r(unsigned long r)
 	return r & ~HPTE_R_3_0_SSIZE_MASK;
 }
 
+static inline unsigned long hpte_get_old_v(struct hash_pte *hptep)
+{
+	unsigned long hpte_v;
+
+	hpte_v = be64_to_cpu(hptep->v);
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
+	return hpte_v;
+}
+
 /*
  * This function sets the AVPN and L fields of the HPTE  appropriately
  * using the base page size and actual page size.
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 1d049c78c82a..68e6eaf41bb9 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -423,9 +423,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	DBG_LOW("    update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
 		vpn, want_v & HPTE_V_AVPN, slot, newpp);
 
-	hpte_v = be64_to_cpu(hptep->v);
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
-		hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
+	hpte_v = hpte_get_old_v(hptep);
 	/*
 	 * We need to invalidate the TLB always because hpte_remove doesn't do
 	 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -439,9 +437,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	} else {
 		native_lock_hpte(hptep);
 		/* recheck with locks held */
-		hpte_v = be64_to_cpu(hptep->v);
-		if (cpu_has_feature(CPU_FTR_ARCH_300))
-			hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
+		hpte_v = hpte_get_old_v(hptep);
 		if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) ||
 			     !(hpte_v & HPTE_V_VALID))) {
 			ret = -1;
@@ -481,11 +477,9 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
 	/* Bolted mappings are only ever in the primary group */
 	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 	for (i = 0; i < HPTES_PER_GROUP; i++) {
-		hptep = htab_address + slot;
-		hpte_v = be64_to_cpu(hptep->v);
-		if (cpu_has_feature(CPU_FTR_ARCH_300))
-			hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
 
+		hptep = htab_address + slot;
+		hpte_v = hpte_get_old_v(hptep);
 		if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
 			/* HPTE matches */
 			return slot;
@@ -575,9 +569,7 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 
 	want_v = hpte_encode_avpn(vpn, bpsize, ssize);
 	native_lock_hpte(hptep);
-	hpte_v = be64_to_cpu(hptep->v);
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
-		hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
+	hpte_v = hpte_get_old_v(hptep);
 
 	/*
 	 * We need to invalidate the TLB always because hpte_remove doesn't do
@@ -635,9 +627,7 @@ static void native_hugepage_invalidate(unsigned long vsid,
 		hptep = htab_address + slot;
 		want_v = hpte_encode_avpn(vpn, psize, ssize);
 		native_lock_hpte(hptep);
-		hpte_v = be64_to_cpu(hptep->v);
-		if (cpu_has_feature(CPU_FTR_ARCH_300))
-			hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
+		hpte_v = hpte_get_old_v(hptep);
 
 		/* Even if we miss, we need to invalidate the TLB */
 		if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
@@ -813,10 +803,7 @@ static void native_flush_hash_range(unsigned long number, int local)
 			hptep = htab_address + slot;
 			want_v = hpte_encode_avpn(vpn, psize, ssize);
 			native_lock_hpte(hptep);
-			hpte_v = be64_to_cpu(hptep->v);
-			if (cpu_has_feature(CPU_FTR_ARCH_300))
-				hpte_v = hpte_new_to_old_v(hpte_v,
-						be64_to_cpu(hptep->r));
+			hpte_v = hpte_get_old_v(hptep);
 			if (!HPTE_V_COMPARE(hpte_v, want_v) ||
 			    !(hpte_v & HPTE_V_VALID))
 				native_unlock_hpte(hptep);
-- 
2.17.1

^ permalink raw reply related

* [PATCH 3/3] powerpc/mm/nv/hash: Reduce contention on hpte lock
From: Aneesh Kumar K.V @ 2018-06-29  8:36 UTC (permalink / raw)
  To: npiggin, benh, paulus, mpe
  Cc: linuxppc-dev, Aneesh Kumar K.V, Aneesh Kumar K . V
In-Reply-To: <20180629083631.28017-1-aneesh.kumar@linux.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

We do this in some part. This patch make sure we always try to search for
hpte without holding lock and redo the compare with lock held once match found.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/hash_native_64.c | 49 +++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 68e6eaf41bb9..ffbd5ed4e8de 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -568,9 +568,19 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	DBG_LOW("    invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
 
 	want_v = hpte_encode_avpn(vpn, bpsize, ssize);
-	native_lock_hpte(hptep);
 	hpte_v = hpte_get_old_v(hptep);
 
+	if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+		native_lock_hpte(hptep);
+		/* recheck with locks held */
+		hpte_v = hpte_get_old_v(hptep);
+
+		if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
+			/* Invalidate the hpte. NOTE: this also unlocks it */
+			hptep->v = 0;
+		else
+			native_unlock_hpte(hptep);
+	}
 	/*
 	 * We need to invalidate the TLB always because hpte_remove doesn't do
 	 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -578,13 +588,6 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	 * (hpte_remove) because we assume the old translation is still
 	 * technically "valid".
 	 */
-	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
-		native_unlock_hpte(hptep);
-	else
-		/* Invalidate the hpte. NOTE: this also unlocks it */
-		hptep->v = 0;
-
-	/* Invalidate the TLB */
 	tlbie(vpn, bpsize, apsize, ssize, local);
 
 	local_irq_restore(flags);
@@ -626,15 +629,23 @@ static void native_hugepage_invalidate(unsigned long vsid,
 
 		hptep = htab_address + slot;
 		want_v = hpte_encode_avpn(vpn, psize, ssize);
-		native_lock_hpte(hptep);
 		hpte_v = hpte_get_old_v(hptep);
 
 		/* Even if we miss, we need to invalidate the TLB */
-		if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
-			native_unlock_hpte(hptep);
-		else
-			/* Invalidate the hpte. NOTE: this also unlocks it */
-			hptep->v = 0;
+		if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+			/* recheck with locks held */
+			native_lock_hpte(hptep);
+			hpte_v = hpte_get_old_v(hptep);
+
+			if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+				/*
+				 * Invalidate the hpte. NOTE: this also unlocks it
+				 */
+
+				hptep->v = 0;
+			} else
+				native_unlock_hpte(hptep);
+		}
 		/*
 		 * We need to do tlb invalidate for all the address, tlbie
 		 * instruction compares entry_VA in tlb with the VA specified
@@ -802,13 +813,19 @@ static void native_flush_hash_range(unsigned long number, int local)
 			slot += hidx & _PTEIDX_GROUP_IX;
 			hptep = htab_address + slot;
 			want_v = hpte_encode_avpn(vpn, psize, ssize);
+			hpte_v = hpte_get_old_v(hptep);
+
+			if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+				continue;
+			/* lock and try again */
 			native_lock_hpte(hptep);
 			hpte_v = hpte_get_old_v(hptep);
-			if (!HPTE_V_COMPARE(hpte_v, want_v) ||
-			    !(hpte_v & HPTE_V_VALID))
+
+			if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
 				native_unlock_hpte(hptep);
 			else
 				hptep->v = 0;
+
 		} pte_iterate_hashed_end();
 	}
 
-- 
2.17.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox