LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH -V5 24/25] powerpc: Optimize hugepage invalidate
From: Aneesh Kumar K.V @ 2013-04-14 10:02 UTC (permalink / raw)
  To: David Gibson; +Cc: paulus, linuxppc-dev, linux-mm
In-Reply-To: <20130412042104.GH5065@truffula.fritz.box>

David Gibson <dwg@au1.ibm.com> writes:

> On Thu, Apr 04, 2013 at 11:28:02AM +0530, Aneesh Kumar K.V wrote:
>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>> 
>> Hugepage invalidate involves invalidating multiple hpte entries.
>> Optimize the operation using H_BULK_REMOVE on lpar platforms.
>> On native, reduce the number of tlb flush.
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/include/asm/machdep.h    |    3 +
>>  arch/powerpc/mm/hash_native_64.c      |   78 ++++++++++++++++++++
>>  arch/powerpc/mm/pgtable.c             |   13 +++-
>>  arch/powerpc/platforms/pseries/lpar.c |  126 +++++++++++++++++++++++++++++++--
>>  4 files changed, 210 insertions(+), 10 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
>> index 6cee6e0..3bc7816 100644
>> --- a/arch/powerpc/include/asm/machdep.h
>> +++ b/arch/powerpc/include/asm/machdep.h
>> @@ -56,6 +56,9 @@ struct machdep_calls {
>>  	void            (*hpte_removebolted)(unsigned long ea,
>>  					     int psize, int ssize);
>>  	void		(*flush_hash_range)(unsigned long number, int local);
>> +	void		(*hugepage_invalidate)(struct mm_struct *mm,
>> +					       unsigned char *hpte_slot_array,
>> +					       unsigned long addr, int psize);
>>  
>>  	/* special for kexec, to be called in real mode, linear mapping is
>>  	 * destroyed as well */
>> diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
>> index ac84fa6..59f29bf 100644
>> --- a/arch/powerpc/mm/hash_native_64.c
>> +++ b/arch/powerpc/mm/hash_native_64.c
>> @@ -450,6 +450,83 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
>>  	local_irq_restore(flags);
>>  }
>>  
>> +static void native_hugepage_invalidate(struct mm_struct *mm,
>> +				       unsigned char *hpte_slot_array,
>> +				       unsigned long addr, int psize)
>> +{
>> +	int ssize = 0, i;
>> +	int lock_tlbie;
>> +	struct hash_pte *hptep;
>> +	int actual_psize = MMU_PAGE_16M;
>> +	unsigned int max_hpte_count, valid;
>> +	unsigned long flags, s_addr = addr;
>> +	unsigned long hpte_v, want_v, shift;
>> +	unsigned long hidx, vpn = 0, vsid, hash, slot;
>> +
>> +	shift = mmu_psize_defs[psize].shift;
>> +	max_hpte_count = HUGE_PAGE_SIZE/(1ul << shift);
>> +
>> +	local_irq_save(flags);
>> +	for (i = 0; i < max_hpte_count; i++) {
>> +		/*
>> +		 * 8 bits per each hpte entries
>> +		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
>> +		 */
>> +		valid = hpte_slot_array[i] & 0x1;
>> +		if (!valid)
>> +			continue;
>> +		hidx =  hpte_slot_array[i]  >> 1;
>> +
>> +		/* get the vpn */
>> +		addr = s_addr + (i * (1ul << shift));
>> +		if (!is_kernel_addr(addr)) {
>> +			ssize = user_segment_size(addr);
>> +			vsid = get_vsid(mm->context.id, addr, ssize);
>> +			WARN_ON(vsid == 0);
>> +		} else {
>> +			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
>> +			ssize = mmu_kernel_ssize;
>> +		}
>> +
>> +		vpn = hpt_vpn(addr, vsid, ssize);
>> +		hash = hpt_hash(vpn, shift, ssize);
>> +		if (hidx & _PTEIDX_SECONDARY)
>> +			hash = ~hash;
>> +
>> +		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
>> +		slot += hidx & _PTEIDX_GROUP_IX;
>> +
>> +		hptep = htab_address + slot;
>> +		want_v = hpte_encode_avpn(vpn, psize, ssize);
>> +		native_lock_hpte(hptep);
>> +		hpte_v = hptep->v;
>> +
>> +		/* Even if we miss, we need to invalidate the TLB */
>> +		if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
>> +			native_unlock_hpte(hptep);
>> +		else
>> +			/* Invalidate the hpte. NOTE: this also unlocks it */
>> +			hptep->v = 0;
>
> Shouldn't you be clearing the entry from the slot_array once it is
> invalidated in the hash table?

We don't need to do that. We should be fine even if hptes get
invalidated under us. Also, in order to update slot_array I will have to
mark the corresponding hpte busy, so that we can ensure nobody is
looking at the slot array.

>
>> +	}
>> +	/*
>> +	 * Since this is a hugepage, we just need a single tlbie.
>> +	 * use the last vpn.
>> +	 */
>> +	lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
>> +	if (lock_tlbie)
>> +		raw_spin_lock(&native_tlbie_lock);
>> +
>> +	asm volatile("ptesync":::"memory");
>> +	__tlbie(vpn, psize, actual_psize, ssize);
>> +	asm volatile("eieio; tlbsync; ptesync":::"memory");
>> +
>> +	if (lock_tlbie)
>> +		raw_spin_unlock(&native_tlbie_lock);
>> +
>> +	local_irq_restore(flags);
>> +}
>> +
>> +
>>  static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
>>  			int *psize, int *apsize, int *ssize, unsigned long *vpn)
>>  {
>> @@ -678,4 +755,5 @@ void __init hpte_init_native(void)
>>  	ppc_md.hpte_remove	= native_hpte_remove;
>>  	ppc_md.hpte_clear_all	= native_hpte_clear;
>>  	ppc_md.flush_hash_range = native_flush_hash_range;
>> +	ppc_md.hugepage_invalidate   = native_hugepage_invalidate;
>>  }
>> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
>> index fbff062..386cab8 100644
>> --- a/arch/powerpc/mm/pgtable.c
>> +++ b/arch/powerpc/mm/pgtable.c
>> @@ -433,6 +433,7 @@ void hpte_need_hugepage_flush(struct mm_struct *mm, unsigned long addr,
>>  {
>>  	int ssize, i;
>>  	unsigned long s_addr;
>> +	int max_hpte_count;
>>  	unsigned int psize, valid;
>>  	unsigned char *hpte_slot_array;
>>  	unsigned long hidx, vpn, vsid, hash, shift, slot;
>> @@ -446,12 +447,18 @@ void hpte_need_hugepage_flush(struct mm_struct *mm, unsigned long addr,
>>  	 * second half of the PMD
>>  	 */
>>  	hpte_slot_array = *(char **)(pmdp + PTRS_PER_PMD);
>> -
>>  	/* get the base page size */
>>  	psize = get_slice_psize(mm, s_addr);
>> -	shift = mmu_psize_defs[psize].shift;
>>  
>> -	for (i = 0; i < HUGE_PAGE_SIZE/(1ul << shift); i++) {
>> +	if (ppc_md.hugepage_invalidate)
>> +		return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
>> +						  s_addr, psize);
>> +	/*
>> +	 * No bluk hpte removal support, invalidate each entry
>> +	 */
>> +	shift = mmu_psize_defs[psize].shift;
>> +	max_hpte_count = HUGE_PAGE_SIZE/(1ul << shift);
>> +	for (i = 0; i < max_hpte_count; i++) {
>>  		/*
>>  		 * 8 bits per each hpte entries
>>  		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
>> diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
>> index 3daced3..5fcc621 100644
>> --- a/arch/powerpc/platforms/pseries/lpar.c
>> +++ b/arch/powerpc/platforms/pseries/lpar.c
>> @@ -45,6 +45,13 @@
>>  #include "plpar_wrappers.h"
>>  #include "pseries.h"
>>  
>> +/* Flag bits for H_BULK_REMOVE */
>> +#define HBR_REQUEST	0x4000000000000000UL
>> +#define HBR_RESPONSE	0x8000000000000000UL
>> +#define HBR_END		0xc000000000000000UL
>> +#define HBR_AVPN	0x0200000000000000UL
>> +#define HBR_ANDCOND	0x0100000000000000UL
>> +
>>  
>>  /* in hvCall.S */
>>  EXPORT_SYMBOL(plpar_hcall);
>> @@ -339,6 +346,117 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
>>  	BUG_ON(lpar_rc != H_SUCCESS);
>>  }
>>  
>> +/*
>> + * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
>> + * to make sure that we avoid bouncing the hypervisor tlbie lock.
>> + */
>> +#define PPC64_HUGE_HPTE_BATCH 12
>> +
>> +static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
>> +					     unsigned long *vpn, int count,
>> +					     int psize, int ssize)
>> +{
>> +	unsigned long param[9];
>> +	int i = 0, pix = 0, rc;
>> +	unsigned long flags = 0;
>> +	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
>> +
>> +	if (lock_tlbie)
>> +		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
>> +
>> +	for (i = 0; i < count; i++) {
>> +
>> +		if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
>> +			pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize,
>> +						     ssize, 0);
>> +		} else {
>> +			param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
>> +			param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
>> +			pix += 2;
>> +			if (pix == 8) {
>> +				rc = plpar_hcall9(H_BULK_REMOVE, param,
>> +						  param[0], param[1], param[2],
>> +						  param[3], param[4], param[5],
>> +						  param[6], param[7]);
>> +				BUG_ON(rc != H_SUCCESS);
>> +				pix = 0;
>> +			}
>> +		}
>> +	}
>> +	if (pix) {
>> +		param[pix] = HBR_END;
>> +		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
>> +				  param[2], param[3], param[4], param[5],
>> +				  param[6], param[7]);
>> +		BUG_ON(rc != H_SUCCESS);
>> +	}
>> +
>> +	if (lock_tlbie)
>> +		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
>> +}
>> +
>> +static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
>> +				       unsigned char *hpte_slot_array,
>> +				       unsigned long addr, int psize)
>> +{
>> +	int ssize = 0, i, index = 0;
>> +	unsigned long s_addr = addr;
>> +	unsigned int max_hpte_count, valid;
>> +	unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
>> +	unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
>
> These are really too big to be allocating on the stack.  You'd be
> better off going direct from the char slot array to the data structure
> for H_BULK_REMOVE, rather than introducing this intermediate
> structure.

The reason I wanted to do that was to make sure I don't lock/unlock
pSeries_lpar_tlbie_lock that frequently, i.e., for every H_BULK_REMOVE.
The total size taken by both arrays is only 192 bytes. Is that big
enough to create trouble?

>
>> +	unsigned long shift, hidx, vpn = 0, vsid, hash, slot;
>> +
>> +	shift = mmu_psize_defs[psize].shift;
>> +	max_hpte_count = HUGE_PAGE_SIZE/(1ul << shift);
>> +
>> +	for (i = 0; i < max_hpte_count; i++) {
>> +		/*
>> +		 * 8 bits per each hpte entries
>> +		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
>> +		 */
>> +		valid = hpte_slot_array[i] & 0x1;
>> +		if (!valid)
>> +			continue;
>> +		hidx =  hpte_slot_array[i]  >> 1;
>> +
>> +		/* get the vpn */
>> +		addr = s_addr + (i * (1ul << shift));
>> +		if (!is_kernel_addr(addr)) {
>> +			ssize = user_segment_size(addr);
>> +			vsid = get_vsid(mm->context.id, addr, ssize);
>> +			WARN_ON(vsid == 0);
>> +		} else {
>> +			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
>> +			ssize = mmu_kernel_ssize;
>> +		}
>> +
>> +		vpn = hpt_vpn(addr, vsid, ssize);
>> +		hash = hpt_hash(vpn, shift, ssize);
>> +		if (hidx & _PTEIDX_SECONDARY)
>> +			hash = ~hash;
>> +
>> +		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
>> +		slot += hidx & _PTEIDX_GROUP_IX;
>> +
>> +		slot_array[index] = slot;
>> +		vpn_array[index] = vpn;
>> +		if (index == PPC64_HUGE_HPTE_BATCH - 1) {
>> +			/*
>> +			 * Now do a bluk invalidate
>> +			 */
>> +			__pSeries_lpar_hugepage_invalidate(slot_array,
>> +							   vpn_array,
>> +							   PPC64_HUGE_HPTE_BATCH,
>> +							   psize, ssize);
>> +			index = 0;
>> +		} else
>> +			index++;
>> +	}
>> +	if (index)
>> +		__pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
>> +						   index, psize, ssize);
>> +}
>> +
>>  static void pSeries_lpar_hpte_removebolted(unsigned long ea,
>>  					   int psize, int ssize)
>>  {
>> @@ -354,13 +472,6 @@ static void pSeries_lpar_hpte_removebolted(unsigned long ea,
>>  	pSeries_lpar_hpte_invalidate(slot, vpn, psize, ssize, 0);
>>  }
>>  
>> -/* Flag bits for H_BULK_REMOVE */
>> -#define HBR_REQUEST	0x4000000000000000UL
>> -#define HBR_RESPONSE	0x8000000000000000UL
>> -#define HBR_END		0xc000000000000000UL
>> -#define HBR_AVPN	0x0200000000000000UL
>> -#define HBR_ANDCOND	0x0100000000000000UL
>> -
>>  /*
>>   * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
>>   * lock.
>> @@ -446,6 +557,7 @@ void __init hpte_init_lpar(void)
>>  	ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted;
>>  	ppc_md.flush_hash_range	= pSeries_lpar_flush_hash_range;
>>  	ppc_md.hpte_clear_all   = pSeries_lpar_hptab_clear;
>> +	ppc_md.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
>>  }
>>  
>>  #ifdef CONFIG_PPC_SMLPAR

-aneesh

^ permalink raw reply

* [PATCH 1/3] iommu: Move swap_pci_ref function to pci.h.
From: Varun Sethi @ 2013-04-14 19:12 UTC (permalink / raw)
  To: joro, stuart.yoder, scottwood, iommu, linuxppc-dev, linux-kernel,
	galak, benh
  Cc: Varun Sethi

The swap_pci_ref function is used by the IOMMU API code for swapping pci
device pointers while determining the iommu group for the device.
Currently this function is implemented separately in several IOMMU drivers.
This patch moves the function to pci.h so that the implementation can be
shared across various IOMMU drivers.

Signed-off-by: Varun Sethi <Varun.Sethi@freescale.com>
---
 drivers/iommu/amd_iommu.c   |    6 ------
 drivers/iommu/intel-iommu.c |    6 ------
 include/linux/pci.h         |    8 ++++++++
 3 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index a7f6b04..c36c046 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -263,12 +263,6 @@ static bool check_device(struct device *dev)
 	return true;
 }
 
-static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
-{
-	pci_dev_put(*from);
-	*from = to;
-}
-
 static struct pci_bus *find_hosted_bus(struct pci_bus *bus)
 {
 	while (!bus->self) {
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 6e0b9ff..8d7c979 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -4137,12 +4137,6 @@ static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
 	return 0;
 }
 
-static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
-{
-	pci_dev_put(*from);
-	*from = to;
-}
-
 #define REQ_ACS_FLAGS	(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
 
 static int intel_iommu_add_device(struct device *dev)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 2461033a..41511de 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1850,6 +1850,14 @@ static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev)
 }
 #endif
 
+#ifdef CONFIG_IOMMU_API
+static inline void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
+{
+	pci_dev_put(*from);
+	*from = to;
+}
+#endif
+
 /**
  * pci_find_upstream_pcie_bridge - find upstream PCIe-to-PCI bridge of a device
  * @pdev: the PCI device
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 2/3 v12] iommu/fsl: Add additional iommu attributes required by the PAMU driver.
From: Varun Sethi @ 2013-04-14 19:12 UTC (permalink / raw)
  To: joro, stuart.yoder, scottwood, iommu, linuxppc-dev, linux-kernel,
	galak, benh
  Cc: Varun Sethi
In-Reply-To: <1365966722-1804-1-git-send-email-Varun.Sethi@freescale.com>

Added the following domain attributes for the FSL PAMU driver:
1. Added new iommu stash attribute, which allows setting of the
   LIODN specific stash id parameter through IOMMU API.
2. Added an attribute for enabling/disabling DMA to a particular
   memory window.
3. Added domain attribute to check for PAMUV1 specific constraints.

Signed-off-by: Varun Sethi <Varun.Sethi@freescale.com>
---
-v12 changes:
- Moved PAMU specific stash ids and structures to PAMU header file.
- no change in v11.
- no change in v10.
 include/linux/iommu.h |   16 ++++++++++++++++
 1 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 2727810..c5dc2b9 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -57,10 +57,26 @@ struct iommu_domain {
 #define IOMMU_CAP_CACHE_COHERENCY	0x1
 #define IOMMU_CAP_INTR_REMAP		0x2	/* isolates device intrs */
 
+/*
+ * Following constraints are specific to PAMUV1:
+ *  -aperture must be power of 2, and naturally aligned
+ *  -number of windows must be power of 2, and address space size
+ *   of each window is determined by aperture size / # of windows
+ *  -the actual size of the mapped region of a window must be power
+ *   of 2 starting with 4KB and physical address must be naturally
+ *   aligned.
+ * DOMAIN_ATTR_FSL_PAMUV1 corresponds to the above mentioned constraints.
+ * The caller can invoke iommu_domain_get_attr to check if the underlying
+ * iommu implementation supports these constraints.
+ */
+
 enum iommu_attr {
 	DOMAIN_ATTR_GEOMETRY,
 	DOMAIN_ATTR_PAGING,
 	DOMAIN_ATTR_WINDOWS,
+	DOMAIN_ATTR_PAMU_STASH,
+	DOMAIN_ATTR_PAMU_ENABLE,
+	DOMAIN_ATTR_FSL_PAMUV1,
 	DOMAIN_ATTR_MAX,
 };
 
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 3/3 v12] iommu/fsl: Freescale PAMU driver and iommu implementation.
From: Varun Sethi @ 2013-04-14 19:12 UTC (permalink / raw)
  To: joro, stuart.yoder, scottwood, iommu, linuxppc-dev, linux-kernel,
	galak, benh
  Cc: Varun Sethi
In-Reply-To: <1365966722-1804-1-git-send-email-Varun.Sethi@freescale.com>

Following is a brief description of the PAMU hardware:
PAMU determines what action to take and whether to authorize the action on
the basis of the memory address, a Logical IO Device Number (LIODN), and
PAACT table (logically) indexed by LIODN and address. Hardware devices which
need to access memory must provide an LIODN in addition to the memory address.

Peripheral Access Authorization and Control Tables (PAACTs) are the primary
data structures used by PAMU. A PAACT is a table of peripheral access
authorization and control entries (PAACE). Each PAACE defines the range of
I/O bus address space that is accessible by the LIOD and the associated access
capabilities.

There are two types of PAACTs: primary PAACT (PPAACT) and secondary PAACT
(SPAACT). A given physical I/O device may be able to act as one or more
independent logical I/O devices (LIODs). Each such logical I/O device is
assigned an identifier called logical I/O device number (LIODN). A LIODN is
allocated a contiguous portion of the I/O bus address space called the DSA window
for performing DSA operations. The DSA window may optionally be divided into
multiple sub-windows, each of which may be used to map to a region in system
storage space. The first sub-window is referred to as the primary sub-window
and the remaining are called secondary sub-windows.

This patch provides the PAMU driver (fsl_pamu.c) and the corresponding IOMMU
API implementation (fsl_pamu_domain.c). The PAMU hardware driver (fsl_pamu.c)
has been derived from the work done by Ashish Kalra and Timur Tabi.

Signed-off-by: Timur Tabi <timur@tabi.org>
Signed-off-by: Varun Sethi <Varun.Sethi@freescale.com>
---
changes in v12:
- Use is_power_of_2 for checking alignment.
- Check for multifunction PCI device ACS flags for determining device groups.
- Fix get_stash_id function.
- Don't crash in case of access violations, disable the LIODN.
- Don't use list_empty while traversing list using list for each entry.
- Move stash structure and ids to PAMU header files.
- Fix geometry size calculation.
changes in v11:
- changed iova to dma_addr_t in iova_to_phys API.
changes in v10:
- Support for new guts compatible string for T4 & B4 devices.
- Modified comment about port ID and mentioned the errata number.
- Fixed the issue where data pointer was not freed in case of an error.
- Pass data pointer while freeing irq.
- While initializing the SPAACE entry clear the valid bit.
changes in v9:
- Merged and created a single function to delete
a device from domain list.
- Refactored the add_device API code.
- Renamed the paace and spaace init functions.
- Renamed functions for mapping windows and subwindows.
- Changed the MAX LIODN value to MAX value u-boot can
program.
- Hard coded maximum number of subwindows.
changes in v8:
- implemented the new API for window based IOMMUs.
changes in v7:
- Set max_subwindows in the geometry attribute.
- Add checking for maximum supported LIODN value.
- Use upper_32_bits and lower_32_bits macros while
  initializing PAMU data structures.
changes in v6:
- Simplified complex conditional statements.
- Fixed indentation issues.
- Added comments for IOMMU API implementation.
changes in v5:
- Addressed comments from Timur.
changes in v4:
- Addressed comments from Timur and Scott.
changes in v3:
- Addressed comments by Kumar Gala
- dynamic fspi allocation
- fixed alignment check in map and unmap

 arch/powerpc/sysdev/fsl_pci.h   |    5 +
 drivers/iommu/Kconfig           |   10 +
 drivers/iommu/Makefile          |    1 +
 drivers/iommu/fsl_pamu.c        | 1309 +++++++++++++++++++++++++++++++++++++++
 drivers/iommu/fsl_pamu.h        |  426 +++++++++++++
 drivers/iommu/fsl_pamu_domain.c | 1143 ++++++++++++++++++++++++++++++++++
 drivers/iommu/fsl_pamu_domain.h |   85 +++
 7 files changed, 2979 insertions(+), 0 deletions(-)
 create mode 100644 drivers/iommu/fsl_pamu.c
 create mode 100644 drivers/iommu/fsl_pamu.h
 create mode 100644 drivers/iommu/fsl_pamu_domain.c
 create mode 100644 drivers/iommu/fsl_pamu_domain.h

diff --git a/arch/powerpc/sysdev/fsl_pci.h b/arch/powerpc/sysdev/fsl_pci.h
index c495c00..feb34f6 100644
--- a/arch/powerpc/sysdev/fsl_pci.h
+++ b/arch/powerpc/sysdev/fsl_pci.h
@@ -14,6 +14,11 @@
 #ifndef __POWERPC_FSL_PCI_H
 #define __POWERPC_FSL_PCI_H
 
+
+/* FSL PCI controller BRR1 register */
+#define PCI_FSL_BRR1      0xbf8
+#define PCI_FSL_BRR1_VER 0xffff
+
 #define PCIE_LTSSM	0x0404		/* PCIE Link Training and Status */
 #define PCIE_LTSSM_L0	0x16		/* L0 state */
 #define PCIE_IP_REV_2_2		0x02080202 /* PCIE IP block version Rev2.2 */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index c332fb9..f97db88 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -17,6 +17,16 @@ config OF_IOMMU
        def_bool y
        depends on OF
 
+config FSL_PAMU
+	bool "Freescale IOMMU support"
+	depends on PPC_E500MC
+	select IOMMU_API
+	select GENERIC_ALLOCATOR
+	help
+	  Freescale PAMU support. PAMU is the IOMMU present on Freescale QorIQ platforms.
+	  PAMU can authorize memory access, remap the memory address, and remap I/O
+	  transaction types.
+
 # MSM IOMMU support
 config MSM_IOMMU
 	bool "MSM IOMMU Support"
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index ef0e520..027d1af 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -15,3 +15,4 @@ obj-$(CONFIG_TEGRA_IOMMU_SMMU) += tegra-smmu.o
 obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
 obj-$(CONFIG_SHMOBILE_IOMMU) += shmobile-iommu.o
 obj-$(CONFIG_SHMOBILE_IPMMU) += shmobile-ipmmu.o
+obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
diff --git a/drivers/iommu/fsl_pamu.c b/drivers/iommu/fsl_pamu.c
new file mode 100644
index 0000000..ebfa4da
--- /dev/null
+++ b/drivers/iommu/fsl_pamu.c
@@ -0,0 +1,1309 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ *
+ */
+
+#define pr_fmt(fmt)    "fsl-pamu: %s: " fmt, __func__
+
+#include <linux/init.h>
+#include <linux/iommu.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/device.h>
+#include <linux/of_platform.h>
+#include <linux/bootmem.h>
+#include <linux/genalloc.h>
+#include <asm/io.h>
+#include <asm/bitops.h>
+#include <asm/fsl_guts.h>
+
+#include "fsl_pamu.h"
+
+/* define indexes for each operation mapping scenario */
+#define OMI_QMAN        0x00
+#define OMI_FMAN        0x01
+#define OMI_QMAN_PRIV   0x02
+#define OMI_CAAM        0x03
+
+#define make64(high, low) (((u64)(high) << 32) | (low))
+
+struct pamu_isr_data {
+	void __iomem *pamu_reg_base;	/* Base address of PAMU regs*/
+	unsigned int count;		/* The number of PAMUs */
+};
+
+static struct paace *ppaact;
+static struct paace *spaact;
+static struct ome *omt;
+
+/*
+ * Table for matching compatible strings, for device tree
+ * guts node, for QorIQ SOCs.
+ * "fsl,qoriq-device-config-2.0" corresponds to T4 & B4
+ * SOCs. For the older SOCs "fsl,qoriq-device-config-1.0"
+ * string would be used.
+*/
+static const struct of_device_id guts_device_ids[] = {
+	{ .compatible = "fsl,qoriq-device-config-1.0", },
+	{ .compatible = "fsl,qoriq-device-config-2.0", },
+	{}
+};
+
+
+/*
+ * Table for matching compatible strings, for device tree
+ * L3 cache controller node.
+ * "fsl,t4240-l3-cache-controller" corresponds to T4,
+ * "fsl,b4860-l3-cache-controller" corresponds to B4 &
+ * "fsl,p4080-l3-cache-controller" corresponds to other,
+ * SOCs.
+*/
+static const struct of_device_id l3_device_ids[] = {
+	{ .compatible = "fsl,t4240-l3-cache-controller", },
+	{ .compatible = "fsl,b4860-l3-cache-controller", },
+	{ .compatible = "fsl,p4080-l3-cache-controller", },
+	{}
+};
+
+/* maximum subwindows permitted per liodn */
+static u32 max_subwindow_count;
+
+/* Pool for fspi allocation */
+struct gen_pool *spaace_pool;
+
+/**
+ * pamu_get_max_subwin_cnt() - Return the maximum supported
+ * subwindow count per liodn.
+ *
+ */
+u32 pamu_get_max_subwin_cnt()
+{
+	return max_subwindow_count;
+}
+
+/**
+ * pamu_get_ppaace() - Return the primary PAACE
+ * @liodn: liodn PAACT index for desired PAACE
+ *
+ * Returns the ppaace pointer upon success else return
+ * null.
+ */
+static struct paace *pamu_get_ppaace(int liodn)
+{
+	if (!ppaact || liodn >= PAACE_NUMBER_ENTRIES) {
+		pr_err("PPAACT doesn't exist\n");
+		return NULL;
+	}
+
+	return &ppaact[liodn];
+}
+
+/**
+ * pamu_enable_liodn() - Set valid bit of PAACE
+ * @liodn: liodn PAACT index for desired PAACE
+ *
+ * Returns 0 upon success else error code < 0 returned
+ */
+int pamu_enable_liodn(int liodn)
+{
+	struct paace *ppaace;
+
+	ppaace = pamu_get_ppaace(liodn);
+	if (!ppaace) {
+		pr_err("Invalid primary paace entry\n");
+		return -ENOENT;
+	}
+
+	if (!get_bf(ppaace->addr_bitfields, PPAACE_AF_WSE)) {
+		pr_err("liodn %d not configured\n", liodn);
+		return -EINVAL;
+	}
+
+	/* Ensure that all other stores to the ppaace complete first */
+	mb();
+
+	set_bf(ppaace->addr_bitfields, PAACE_AF_V, PAACE_V_VALID);
+	mb();
+
+	return 0;
+}
+
+/**
+ * pamu_disable_liodn() - Clears valid bit of PAACE
+ * @liodn: liodn PAACT index for desired PAACE
+ *
+ * Returns 0 upon success else error code < 0 returned
+ */
+int pamu_disable_liodn(int liodn)
+{
+	struct paace *ppaace;
+
+	ppaace = pamu_get_ppaace(liodn);
+	if (!ppaace) {
+		pr_err("Invalid primary paace entry\n");
+		return -ENOENT;
+	}
+
+	set_bf(ppaace->addr_bitfields, PAACE_AF_V, PAACE_V_INVALID);
+	mb();
+
+	return 0;
+}
+
+/* Derive the window size encoding for a particular PAACE entry */
+static unsigned int map_addrspace_size_to_wse(phys_addr_t addrspace_size)
+{
+	/* Bug if not a power of 2 */
+	BUG_ON(!is_power_of_2(addrspace_size));
+
+	/* window size is 2^(WSE+1) bytes */
+	return __ffs(addrspace_size) - 1;
+}
+
+/* Derive the PAACE window count encoding for the subwindow count */
+static unsigned int map_subwindow_cnt_to_wce(u32 subwindow_cnt)
+{
+       /* window count is 2^(WCE+1) bytes */
+       return __ffs(subwindow_cnt) - 1;
+}
+
+/*
+ * Set the PAACE type as primary and set the coherency required domain
+ * attribute
+ */
+static void pamu_init_ppaace(struct paace *ppaace)
+{
+	set_bf(ppaace->addr_bitfields, PAACE_AF_PT, PAACE_PT_PRIMARY);
+
+	set_bf(ppaace->domain_attr.to_host.coherency_required, PAACE_DA_HOST_CR,
+	       PAACE_M_COHERENCE_REQ);
+}
+
+/*
+ * Set the PAACE type as secondary and set the coherency required domain
+ * attribute.
+ */
+static void pamu_init_spaace(struct paace *spaace)
+{
+	set_bf(spaace->addr_bitfields, PAACE_AF_PT, PAACE_PT_SECONDARY);
+	set_bf(spaace->domain_attr.to_host.coherency_required, PAACE_DA_HOST_CR,
+	       PAACE_M_COHERENCE_REQ);
+}
+
+/*
+ * Return the spaace (corresponding to the secondary window index)
+ * for a particular ppaace.
+ */
+static struct paace *pamu_get_spaace(struct paace *paace, u32 wnum)
+{
+	u32 subwin_cnt;
+	struct paace *spaace = NULL;
+
+	subwin_cnt = 1UL << (get_bf(paace->impl_attr, PAACE_IA_WCE) + 1);
+
+	if (wnum < subwin_cnt)
+		spaace = &spaact[paace->fspi + wnum];
+	else
+		pr_err("secondary paace out of bounds\n");
+
+	return spaace;
+}
+
+/**
+ * pamu_get_fspi_and_allocate() - Allocates fspi index and reserves subwindows
+ *                                required for primary PAACE in the secondary
+ *                                PAACE table.
+ * @subwin_cnt: Number of subwindows to be reserved.
+ *
+ * A PPAACE entry may have a number of associated subwindows. A subwindow
+ * corresponds to a SPAACE entry in the SPAACT table. Each PAACE entry stores
+ * the index (fspi) of the first SPAACE entry in the SPAACT table. This
+ * function returns the index of the first SPAACE entry. The remaining
+ * SPAACE entries are reserved contiguously from that index.
+ *
+ * Returns a valid fspi index in the range of 0 - SPAACE_NUMBER_ENTRIES on success.
+ * If no SPAACE entry is available or the allocator can not reserve the required
+ * number of contiguous entries function returns ULONG_MAX indicating a failure.
+ *
+*/
+static unsigned long pamu_get_fspi_and_allocate(u32 subwin_cnt)
+{
+	unsigned long spaace_addr;
+
+	spaace_addr = gen_pool_alloc(spaace_pool, subwin_cnt * sizeof(struct paace));
+	if (!spaace_addr)
+		return ULONG_MAX;
+
+	return (spaace_addr - (unsigned long)spaact) / (sizeof(struct paace));
+}
+
+/* Release the subwindows reserved for a particular LIODN */
+void pamu_free_subwins(int liodn)
+{
+	struct paace *ppaace;
+	u32 subwin_cnt, size;
+
+	ppaace = pamu_get_ppaace(liodn);
+	if (!ppaace) {
+		pr_err("Invalid liodn entry\n");
+		return;
+	}
+
+	if (get_bf(ppaace->addr_bitfields, PPAACE_AF_MW)) {
+		subwin_cnt = 1UL << (get_bf(ppaace->impl_attr, PAACE_IA_WCE) + 1);
+		size = (subwin_cnt - 1) * sizeof(struct paace);
+		gen_pool_free(spaace_pool, (unsigned long)&spaact[ppaace->fspi], size);
+		set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0);
+	}
+}
+
+/*
+ * Function used for updating stash destination for the corresponding
+ * LIODN.
+ */
+int  pamu_update_paace_stash(int liodn, u32 subwin, u32 value)
+{
+	struct paace *paace;
+
+	paace = pamu_get_ppaace(liodn);
+	if (!paace) {
+		pr_err("Invalid liodn entry\n");
+		return -ENOENT;
+	}
+	if (subwin) {
+		paace = pamu_get_spaace(paace, subwin - 1);
+		if (!paace) {
+			return -ENOENT;
+		}
+	}
+	set_bf(paace->impl_attr, PAACE_IA_CID, value);
+
+	mb();
+
+	return 0;
+}
+
+/* Disable a subwindow corresponding to the LIODN */
+int pamu_disable_spaace(int liodn, u32 subwin)
+{
+	struct paace *paace;
+
+	paace = pamu_get_ppaace(liodn);
+	if (!paace) {
+		pr_err("Invalid liodn entry\n");
+		return -ENOENT;
+	}
+	if (subwin) {
+		paace = pamu_get_spaace(paace, subwin - 1);
+		if (!paace) {
+			return -ENOENT;
+		}
+		set_bf(paace->addr_bitfields, PAACE_AF_V,
+			 PAACE_V_INVALID);
+	} else {
+		set_bf(paace->addr_bitfields, PAACE_AF_AP,
+			 PAACE_AP_PERMS_DENIED);
+	}
+
+	mb();
+
+	return 0;
+}
+
+
+/**
+ * pamu_config_ppaace() - Sets up PPAACE entry for specified liodn
+ *
+ * @liodn: Logical IO device number
+ * @win_addr: starting address of DSA window
+ * @win_size: size of DSA window
+ * @omi: Operation mapping index -- if ~omi == 0 then omi not defined
+ * @rpn: real (true physical) page number
+ * @stashid: cache stash id for associated cpu -- if ~stashid == 0 then
+ *	     stashid not defined
+ * @snoopid: snoop id for hardware coherency -- if ~snoopid == 0 then
+ *	     snoopid not defined
+ * @subwin_cnt: number of sub-windows
+ * @prot: window permissions
+ *
+ * Returns 0 upon success else error code < 0 returned
+ */
+int pamu_config_ppaace(int liodn, phys_addr_t win_addr, phys_addr_t win_size,
+		       u32 omi, unsigned long rpn, u32 snoopid, u32 stashid,
+		       u32 subwin_cnt, int prot)
+{
+	struct paace *ppaace;
+	unsigned long fspi;
+
+	if (!is_power_of_2(win_size) || win_size < PAMU_PAGE_SIZE) {
+		pr_err("window size too small or not a power of two %llx\n", win_size);
+		return -EINVAL;
+	}
+
+	if (win_addr & (win_size - 1)) {
+		pr_err("window address is not aligned with window size\n");
+		return -EINVAL;
+	}
+
+	ppaace = pamu_get_ppaace(liodn);
+	if (!ppaace) {
+		return -ENOENT;
+	}
+
+	/* window size is 2^(WSE+1) bytes */
+	set_bf(ppaace->addr_bitfields, PPAACE_AF_WSE,
+		map_addrspace_size_to_wse(win_size));
+
+	pamu_init_ppaace(ppaace);
+
+	ppaace->wbah = win_addr >> (PAMU_PAGE_SHIFT + 20);
+	set_bf(ppaace->addr_bitfields, PPAACE_AF_WBAL,
+	       (win_addr >> PAMU_PAGE_SHIFT));
+
+	/* set up operation mapping if it's configured */
+	if (omi < OME_NUMBER_ENTRIES) {
+		set_bf(ppaace->impl_attr, PAACE_IA_OTM, PAACE_OTM_INDEXED);
+		ppaace->op_encode.index_ot.omi = omi;
+	} else if (~omi != 0) {
+		pr_err("bad operation mapping index: %d\n", omi);
+		return -EINVAL;
+	}
+
+	/* configure stash id */
+	if (~stashid != 0)
+		set_bf(ppaace->impl_attr, PAACE_IA_CID, stashid);
+
+	/* configure snoop id */
+	if (~snoopid != 0)
+		ppaace->domain_attr.to_host.snpid = snoopid;
+
+	if (subwin_cnt) {
+		/* The first entry is in the primary PAACE instead */
+		fspi = pamu_get_fspi_and_allocate(subwin_cnt - 1);
+		if (fspi == ULONG_MAX) {
+			pr_err("spaace indexes exhausted\n");
+			return -EINVAL;
+		}
+
+		/* window count is 2^(WCE+1) bytes */
+		set_bf(ppaace->impl_attr, PAACE_IA_WCE,
+		       map_subwindow_cnt_to_wce(subwin_cnt));
+		set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0x1);
+		ppaace->fspi = fspi;
+	} else {
+		set_bf(ppaace->impl_attr, PAACE_IA_ATM, PAACE_ATM_WINDOW_XLATE);
+		ppaace->twbah = rpn >> 20;
+		set_bf(ppaace->win_bitfields, PAACE_WIN_TWBAL, rpn);
+		set_bf(ppaace->addr_bitfields, PAACE_AF_AP, prot);
+		set_bf(ppaace->impl_attr, PAACE_IA_WCE, 0);
+		set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0);
+	}
+	mb();
+
+	return 0;
+}
+
+/**
+ * pamu_config_spaace() - Sets up SPAACE entry for specified subwindow
+ *
+ * @liodn:  Logical IO device number
+ * @subwin_cnt:  number of sub-windows associated with dma-window; must be
+ *		 non-zero
+ * @subwin: subwindow index; for 0 < subwin < subwin_cnt the secondary
+ *	    entry subwin - 1 is configured, otherwise the primary PAACE of
+ *	    @liodn is the entry that gets configured
+ * @subwin_size: size of subwindow; must be a power of two and at least
+ *		 PAMU_PAGE_SIZE
+ * @omi: Operation mapping index -- if ~omi == 0 then omi not defined
+ * @rpn: real (true physical) page number; ULONG_MAX is rejected
+ * @snoopid: snoop id for hardware coherency -- if ~snoopid == 0 then
+ *			  snoopid not defined
+ * @stashid: cache stash id for associated cpu -- if ~stashid == 0 then
+ *	     stashid not defined
+ * @enable: non-zero sets the valid bit after reconfiguration; zero leaves
+ *	    the valid bit as it was
+ * @prot: sub window permissions
+ *
+ * Returns 0 upon success else error code < 0 returned: -EINVAL for a zero
+ * subwindow count, a bad subwindow size, rpn or operation mapping index;
+ * -ENOENT when the primary or secondary entry cannot be found.
+ *
+ * NOTE(review): a subwin value >= subwin_cnt is not rejected; it ends up
+ * configuring the primary PAACE -- confirm that this is intended.
+ */
+int pamu_config_spaace(int liodn, u32 subwin_cnt, u32 subwin,
+		       phys_addr_t subwin_size, u32 omi, unsigned long rpn,
+		       u32 snoopid, u32 stashid, int enable, int prot)
+{
+	struct paace *paace;
+
+
+	/* setup sub-windows */
+	if (!subwin_cnt) {
+		pr_err("Invalid subwindow count\n");
+		return -EINVAL;
+	}
+
+	paace = pamu_get_ppaace(liodn);
+	if (subwin > 0 && subwin < subwin_cnt && paace) {
+		paace = pamu_get_spaace(paace, subwin - 1);
+
+		if (paace && !(paace->addr_bitfields & PAACE_V_VALID)) {
+			pamu_init_spaace(paace);
+			set_bf(paace->addr_bitfields, SPAACE_AF_LIODN, liodn);
+		}
+	}
+
+	if (!paace) {
+		pr_err("Invalid liodn entry\n");
+		return -ENOENT;
+	}
+
+	if (!is_power_of_2(subwin_size) || subwin_size < PAMU_PAGE_SIZE) {
+		pr_err("subwindow size out of range, or not a power of 2\n");
+		return -EINVAL;
+	}
+
+	if (rpn == ULONG_MAX) {
+		pr_err("real page number out of range\n");
+		return -EINVAL;
+	}
+
+	/* window size is 2^(WSE+1) bytes */
+	set_bf(paace->win_bitfields, PAACE_WIN_SWSE,
+	       map_addrspace_size_to_wse(subwin_size));
+
+	set_bf(paace->impl_attr, PAACE_IA_ATM, PAACE_ATM_WINDOW_XLATE);
+	paace->twbah = rpn >> 20;
+	set_bf(paace->win_bitfields, PAACE_WIN_TWBAL, rpn);
+	set_bf(paace->addr_bitfields, PAACE_AF_AP, prot);
+
+	/* configure snoop id */
+	if (~snoopid != 0)
+		paace->domain_attr.to_host.snpid = snoopid;
+
+	/* set up operation mapping if it's configured */
+	if (omi < OME_NUMBER_ENTRIES) {
+		set_bf(paace->impl_attr, PAACE_IA_OTM, PAACE_OTM_INDEXED);
+		paace->op_encode.index_ot.omi = omi;
+	} else if (~omi != 0) {
+		pr_err("bad operation mapping index: %d\n", omi);
+		return -EINVAL;
+	}
+
+	if (~stashid != 0)
+		set_bf(paace->impl_attr, PAACE_IA_CID, stashid);
+
+	smp_wmb();	/* order the field writes above before the valid bit */
+
+	if (enable)
+		set_bf(paace->addr_bitfields, PAACE_AF_V, PAACE_V_VALID);
+
+	mb();
+
+	return 0;
+}
+
+/**
+ * get_ome_index() - look up the operation mapping table index for a device
+ * @omi_index: output pointer; written only when the device node matches
+ *	       one of the compatible strings tested below, otherwise left
+ *	       untouched
+ * @dev: device whose of_node is tested
+ */
+void get_ome_index(u32 *omi_index, struct device *dev)
+{
+	struct device_node *np = dev->of_node;
+
+	if (of_device_is_compatible(np, "fsl,qman-portal"))
+		*omi_index = OMI_QMAN;
+
+	/* tested second, so it wins if a node matches both strings */
+	if (of_device_is_compatible(np, "fsl,qman"))
+		*omi_index = OMI_QMAN_PRIV;
+}
+
+/**
+ * get_stash_id - Returns stash destination id corresponding to a
+ *                cache type and vcpu.
+ * @stash_dest_hint: L1, L2 or L3
+ * @vcpu: vcpu target for a particular cache type.
+ *
+ * For an L3 hint the "cache-stash-id" property of the first node matching
+ * l3_device_ids is returned.  Otherwise the "cpu" node whose "reg" property
+ * contains @vcpu is located and the "next-level-cache" phandles are followed
+ * from it until the requested cache level is reached.
+ *
+ * Returns stash id on success or ~(u32)0 on failure.
+ */
+u32 get_stash_id(u32 stash_dest_hint, u32 vcpu)
+{
+	const u32 *prop;
+	struct device_node *node;
+	u32 cache_level;
+	u32 stash_id;
+	int len, found = 0;
+	int i;
+
+	/* Fastpath, exit early if L3/CPC cache is target for stashing */
+	if (stash_dest_hint == PAMU_ATTR_CACHE_L3) {
+		node = of_find_matching_node(NULL, l3_device_ids);
+		if (!node)
+			return ~(u32)0;
+
+		prop = of_get_property(node, "cache-stash-id", NULL);
+		if (!prop) {
+			pr_err("missing cache-stash-id at %s\n", node->full_name);
+			of_node_put(node);
+			return ~(u32)0;
+		}
+		/* read the value while the node reference is still held */
+		stash_id = be32_to_cpup(prop);
+		of_node_put(node);
+		return stash_id;
+	}
+
+	for_each_node_by_type(node, "cpu") {
+		prop = of_get_property(node, "reg", &len);
+		/* no "reg" property: nothing to compare and len is unset */
+		if (!prop)
+			continue;
+		for (i = 0; i < len / sizeof(u32); i++) {
+			if (be32_to_cpup(&prop[i]) == vcpu) {
+				found = 1;
+				/* leave the loop holding a reference on node */
+				goto found_cpu_node;
+			}
+		}
+	}
+found_cpu_node:
+
+	/* find the hwnode that represents the cache */
+	for (cache_level = PAMU_ATTR_CACHE_L1; (cache_level < PAMU_ATTR_CACHE_L3) && found; cache_level++) {
+		if (stash_dest_hint == cache_level) {
+			prop = of_get_property(node, "cache-stash-id", NULL);
+			if (!prop) {
+				pr_err("missing cache-stash-id at %s\n", node->full_name);
+				of_node_put(node);
+				return ~(u32)0;
+			}
+			/* read the value while the node reference is still held */
+			stash_id = be32_to_cpup(prop);
+			of_node_put(node);
+			return stash_id;
+		}
+
+		prop = of_get_property(node, "next-level-cache", NULL);
+		if (!prop) {
+			pr_err("can't find next-level-cache at %s\n",
+				node->full_name);
+			of_node_put(node);
+			return ~(u32)0;  /* can't traverse any further */
+		}
+		of_node_put(node);
+
+		/* advance to next node in cache hierarchy */
+		node = of_find_node_by_phandle(*prop);
+		if (!node) {
+			/* node is NULL here, so it must not be dereferenced */
+			pr_err("Invalid node for cache hierarchy\n");
+			return ~(u32)0;
+		}
+	}
+
+	/* the walk ended without a match: drop the reference still held */
+	if (found)
+		of_node_put(node);
+
+	pr_err("stash dest not found for %d on vcpu %d\n",
+	          stash_dest_hint, vcpu);
+	return ~(u32)0;
+}
+
+/* Identify if the PAACT table entry belongs to QMAN, BMAN or QMAN Portal */
+#define QMAN_PAACE 1
+#define QMAN_PORTAL_PAACE 2
+#define BMAN_PAACE 3
+
+/**
+ * Apply the QMAN/BMAN specific settings to a PAACE entry.
+ *
+ * QMAN and QMAN portal entries get indexed operation translation (with
+ * OMI_QMAN_PRIV and OMI_QMAN respectively) and the stash id returned by
+ * get_stash_id() for the L3 cache.  QMAN and BMAN entries additionally have
+ * the coherency-required attribute cleared, as accesses to their private
+ * memory need not be coherent.  Any other @paace_type leaves the entry
+ * untouched.
+ */
+static void setup_qbman_paace(struct paace *ppaace, int  paace_type)
+{
+	const int is_qman = (paace_type == QMAN_PAACE);
+	const int is_portal = (paace_type == QMAN_PORTAL_PAACE);
+	const int is_bman = (paace_type == BMAN_PAACE);
+
+	if (is_qman || is_portal) {
+		set_bf(ppaace->impl_attr, PAACE_IA_OTM, PAACE_OTM_INDEXED);
+		if (is_qman)
+			ppaace->op_encode.index_ot.omi = OMI_QMAN_PRIV;
+		else
+			ppaace->op_encode.index_ot.omi = OMI_QMAN;
+		/* stash QMAN private data / DQRR and frames in the L3 cache */
+		set_bf(ppaace->impl_attr, PAACE_IA_CID, get_stash_id(PAMU_ATTR_CACHE_L3, 0));
+	}
+
+	if (is_qman || is_bman)
+		set_bf(ppaace->domain_attr.to_host.coherency_required, PAACE_DA_HOST_CR,
+		       0);
+}
+
+/**
+ * Fill in the operation mapping table.  The table is static: every index
+ * belongs to one particular device, and PAMU uses the selected entry to
+ * translate a device transaction into the appropriate corenet transaction.
+ * Only the QMAN, FMAN, QMAN private and CAAM entries are populated here.
+ */
+static void __init setup_omt(struct ome *omt)
+{
+	struct ome *qman = &omt[OMI_QMAN];
+	struct ome *fman = &omt[OMI_FMAN];
+	struct ome *qman_priv = &omt[OMI_QMAN_PRIV];
+	struct ome *caam = &omt[OMI_CAAM];
+
+	/* OMI_QMAN: reads, writes and the two direct operations */
+	qman->moe[IOE_READ_IDX] = EOE_VALID | EOE_READ;
+	qman->moe[IOE_EREAD0_IDX] = EOE_VALID | EOE_RSA;
+	qman->moe[IOE_WRITE_IDX] = EOE_VALID | EOE_WRITE;
+	qman->moe[IOE_EWRITE0_IDX] = EOE_VALID | EOE_WWSAO;
+	qman->moe[IOE_DIRECT0_IDX] = EOE_VALID | EOE_LDEC;
+	qman->moe[IOE_DIRECT1_IDX] = EOE_VALID | EOE_LDECPE;
+
+	/* OMI_FMAN */
+	fman->moe[IOE_READ_IDX] = EOE_VALID | EOE_READI;
+	fman->moe[IOE_WRITE_IDX] = EOE_VALID | EOE_WRITE;
+
+	/* OMI_QMAN private */
+	qman_priv->moe[IOE_READ_IDX] = EOE_VALID | EOE_READ;
+	qman_priv->moe[IOE_WRITE_IDX] = EOE_VALID | EOE_WRITE;
+	qman_priv->moe[IOE_EREAD0_IDX] = EOE_VALID | EOE_RSA;
+	qman_priv->moe[IOE_EWRITE0_IDX] = EOE_VALID | EOE_WWSA;
+
+	/* OMI_CAAM */
+	caam->moe[IOE_READ_IDX] = EOE_VALID | EOE_READI;
+	caam->moe[IOE_WRITE_IDX] = EOE_VALID | EOE_WRITE;
+}
+
+/*
+ * Read the PAMU_PC3 capability register and record the maximum number of
+ * subwindows per LIODN, 2^(MWCE + 1), in max_subwindow_count.
+ */
+static void get_pamu_cap_values(unsigned long pamu_reg_base)
+{
+	u32 *pc3_reg = (u32 *)(pamu_reg_base + PAMU_PC3);
+	u32 pc3 = in_be32(pc3_reg);
+
+	max_subwindow_count = 1 << (1 + PAMU_PC3_MWCE(pc3));
+}
+
+/*
+ * Setup PAMU registers pointing to PAACT, SPAACT and OMT
+ *
+ * @pamu_reg_base: address of the register block of the PAMU to program
+ * @pamu_reg_size: not used by this function
+ * @ppaact_phys: physical address of the primary PAACE table
+ * @spaact_phys: physical address of the secondary PAACE table
+ * @omt_phys: physical address of the operation mapping table
+ *
+ * For each table the base registers receive the physical address and the
+ * limit registers receive that address plus the table size.  Afterwards
+ * access violation interrupts and the PAMU itself are enabled.
+ *
+ * Always returns 0.
+ */
+int setup_one_pamu(unsigned long pamu_reg_base, unsigned long pamu_reg_size,
+	           phys_addr_t ppaact_phys, phys_addr_t spaact_phys,
+		   phys_addr_t omt_phys)
+{
+	u32 *pc;
+	struct pamu_mmap_regs *pamu_regs;
+
+	pc = (u32 *) (pamu_reg_base + PAMU_PC);
+	pamu_regs = (struct pamu_mmap_regs *)
+		(pamu_reg_base + PAMU_MMAP_REGS_BASE);
+
+	/* set up pointers to corenet control blocks: base, then limit */
+
+	out_be32(&pamu_regs->ppbah, upper_32_bits(ppaact_phys));
+	out_be32(&pamu_regs->ppbal, lower_32_bits(ppaact_phys));
+	ppaact_phys = ppaact_phys + PAACT_SIZE;
+	out_be32(&pamu_regs->pplah, upper_32_bits(ppaact_phys));
+	out_be32(&pamu_regs->pplal, lower_32_bits(ppaact_phys));
+
+	out_be32(&pamu_regs->spbah, upper_32_bits(spaact_phys));
+	out_be32(&pamu_regs->spbal, lower_32_bits(spaact_phys));
+	spaact_phys = spaact_phys + SPAACT_SIZE;
+	out_be32(&pamu_regs->splah, upper_32_bits(spaact_phys));
+	out_be32(&pamu_regs->splal, lower_32_bits(spaact_phys));
+
+	out_be32(&pamu_regs->obah, upper_32_bits(omt_phys));
+	out_be32(&pamu_regs->obal, lower_32_bits(omt_phys));
+	omt_phys = omt_phys + OMT_SIZE;
+	out_be32(&pamu_regs->olah, upper_32_bits(omt_phys));
+	out_be32(&pamu_regs->olal, lower_32_bits(omt_phys));
+
+	/*
+	 * set PAMU enable bit,
+	 * allow ppaact & omt to be cached
+	 * & enable PAMU access violation interrupts.
+	 * The interrupt enable is written first, the control register
+	 * (including the enable bit) last.
+	 */
+
+	out_be32((u32 *)(pamu_reg_base + PAMU_PICS),
+			PAMU_ACCESS_VIOLATION_ENABLE);
+	out_be32(pc, PAMU_PC_PE | PAMU_PC_OCE | PAMU_PC_SPCC | PAMU_PC_PPCC);
+	return 0;
+}
+
+/*
+ * Enable all device LIODNS
+ *
+ * Walks every device tree node that has an "fsl,liodn" property and, for
+ * each LIODN value listed there, initialises the matching primary PAACE
+ * with window base 0, WSE 35, no address translation and all access
+ * permissions.  QMAN, QMAN portal and BMAN nodes get their extra settings
+ * from setup_qbman_paace().  The LIODN is enabled only after the entry has
+ * been written and a barrier issued.  LIODN values that are not below
+ * PAACE_NUMBER_ENTRIES are reported and skipped.
+ */
+static void __init setup_liodns(void)
+{
+	int i, len;
+	struct paace *ppaace;
+	struct device_node *node = NULL;
+	const u32 *prop;
+
+	for_each_node_with_property(node, "fsl,liodn") {
+		prop = of_get_property(node, "fsl,liodn", &len);
+		for (i = 0; i < len / sizeof(u32); i++) {
+			int liodn;
+
+			liodn = be32_to_cpup(&prop[i]);
+			if (liodn >= PAACE_NUMBER_ENTRIES) {
+				pr_err("Invalid LIODN value %d\n", liodn);
+				continue;
+			}
+			ppaace = pamu_get_ppaace(liodn);
+			pamu_init_ppaace(ppaace);
+			/* window size is 2^(WSE+1) bytes */
+			set_bf(ppaace->addr_bitfields, PPAACE_AF_WSE, 35);
+			ppaace->wbah = 0;
+			set_bf(ppaace->addr_bitfields, PPAACE_AF_WBAL, 0);
+			set_bf(ppaace->impl_attr, PAACE_IA_ATM,
+				PAACE_ATM_NO_XLATE);
+			set_bf(ppaace->addr_bitfields, PAACE_AF_AP,
+				PAACE_AP_PERMS_ALL);
+			if (of_device_is_compatible(node, "fsl,qman-portal"))
+				setup_qbman_paace(ppaace, QMAN_PORTAL_PAACE);
+			if (of_device_is_compatible(node, "fsl,qman"))
+				setup_qbman_paace(ppaace, QMAN_PAACE);
+			if (of_device_is_compatible(node, "fsl,bman"))
+				setup_qbman_paace(ppaace, BMAN_PAACE);
+			mb();	/* entry fully written before it is enabled */
+			pamu_enable_liodn(liodn);
+		}
+	}
+}
+
+/*
+ * PAMU access violation interrupt handler.
+ *
+ * @arg is the struct pamu_isr_data registered with the interrupt.  Every
+ * PAMU instance is polled; for each one reporting a violation the error
+ * registers (and, if POEA is non-zero, the first four words it points to)
+ * are dumped, the violation status is cleared, and then either the
+ * offending LIODN is disabled or, when the LIODN was already disabled,
+ * access violation reporting is switched off for that PAMU.
+ *
+ * Always returns IRQ_HANDLED.
+ */
+irqreturn_t pamu_av_isr(int irq, void *arg)
+{
+	struct pamu_isr_data *data = arg;
+	phys_addr_t phys;
+	unsigned int i, j, ret;
+
+	pr_emerg("fsl-pamu: access violation interrupt\n");
+
+	for (i = 0; i < data->count; i++) {
+		void __iomem *p = data->pamu_reg_base + i * PAMU_OFFSET;
+		u32 pics = in_be32(p + PAMU_PICS);
+		struct paace *paace;
+		u32 avs1, liodn;
+
+		/* nothing reported by this PAMU instance */
+		if (!(pics & PAMU_ACCESS_VIOLATION_STAT))
+			continue;
+
+		avs1 = in_be32(p + PAMU_AVS1);
+		liodn = avs1 >> PAMU_AVS1_LIODN_SHIFT;
+
+		pr_emerg("POES1=%08x\n", in_be32(p + PAMU_POES1));
+		pr_emerg("POES2=%08x\n", in_be32(p + PAMU_POES2));
+		pr_emerg("AVS1=%08x\n", avs1);
+		pr_emerg("AVS2=%08x\n", in_be32(p + PAMU_AVS2));
+		pr_emerg("AVA=%016llx\n", make64(in_be32(p + PAMU_AVAH),
+			in_be32(p + PAMU_AVAL)));
+		pr_emerg("UDAD=%08x\n", in_be32(p + PAMU_UDAD));
+		pr_emerg("POEA=%016llx\n", make64(in_be32(p + PAMU_POEAH),
+			in_be32(p + PAMU_POEAL)));
+
+		phys = make64(in_be32(p + PAMU_POEAH),
+			in_be32(p + PAMU_POEAL));
+
+		/* Assume that POEA points to a PAACE */
+		if (phys) {
+			u32 *words = phys_to_virt(phys);
+
+			/* Only the first four words are relevant */
+			for (j = 0; j < 4; j++)
+				pr_emerg("PAACE[%u]=%08x\n", j, in_be32(words + j));
+		}
+
+		/* clear access violation condition */
+		out_be32((p + PAMU_AVS1), avs1 & PAMU_AV_MASK);
+		paace = pamu_get_ppaace(liodn);
+		BUG_ON(!paace);
+
+		if (get_bf(paace->addr_bitfields, PAACE_AF_V)) {
+			/* The LIODN is live: disable it */
+			ret = pamu_disable_liodn(liodn);
+			BUG_ON(ret);
+			pr_emerg("Disabling liodn %x\n", liodn);
+		} else {
+			/*
+			 * Violation reported for a disabled LIODN.  As per
+			 * hardware erratum A-003638 this can happen; when it
+			 * does, disable access violation reporting.
+			 */
+			pics &= ~PAMU_ACCESS_VIOLATION_ENABLE;
+		}
+		out_be32((p + PAMU_PICS), pics);
+	}
+
+	return IRQ_HANDLED;
+}
+
+#define LAWAR_EN		0x80000000
+#define LAWAR_TARGET_MASK	0x0FF00000
+#define LAWAR_TARGET_SHIFT	20
+#define LAWAR_SIZE_MASK		0x0000003F
+#define LAWAR_CSDID_MASK	0x000FF000
+#define LAWAR_CSDID_SHIFT	12
+
+#define LAW_SIZE_4K		0xb
+
+struct ccsr_law {	/* register layout of one local access window (LAW) */
+	u32	lawbarh;	/* LAWn base address high */
+	u32	lawbarl;	/* LAWn base address low */
+	u32	lawar;		/* LAWn attributes (see the LAWAR_* masks) */
+	u32	reserved;
+};
+
+/*
+ * Create a coherence subdomain for a given memory block.
+ *
+ * @phys: physical start address of the memory block
+ * @size: size of the memory block in bytes
+ * @csd_port_id: port id value stored in the CSD id register that is claimed
+ *
+ * An unused coherence subdomain id is claimed by storing @csd_port_id in its
+ * register.  Then the enabled LAW that covers @phys is located, and a free
+ * LAW slot with a lower index is programmed to cover the block with the same
+ * target and the new subdomain id.
+ *
+ * Returns 0 on success, -ENODEV when a required device tree node or property
+ * is missing or the LAW registers cannot be mapped, -ENOMEM when the
+ * coherency fabric registers cannot be mapped, and -ENOENT when no subdomain
+ * id, covering LAW or free LAW slot is available.
+ */
+static int __init create_csd(phys_addr_t phys, size_t size, u32 csd_port_id)
+{
+	struct device_node *np;
+	const __be32 *iprop;
+	void __iomem *lac = NULL;	/* Local Access Control registers */
+	struct ccsr_law __iomem *law;
+	void __iomem *ccm = NULL;
+	u32 __iomem *csdids;
+	unsigned int i, num_laws, num_csds;
+	u32 law_target = 0;
+	u32 csd_id = 0;
+	int ret = 0;
+
+	np = of_find_compatible_node(NULL, NULL, "fsl,corenet-law");
+	if (!np)
+		return -ENODEV;
+
+	iprop = of_get_property(np, "fsl,num-laws", NULL);
+	if (!iprop) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	num_laws = be32_to_cpup(iprop);
+	if (!num_laws) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	lac = of_iomap(np, 0);
+	if (!lac) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	/* LAW registers are at offset 0xC00 */
+	law = lac + 0xC00;
+
+	of_node_put(np);
+
+	np = of_find_compatible_node(NULL, NULL, "fsl,corenet-cf");
+	if (!np) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	iprop = of_get_property(np, "fsl,ccf-num-csdids", NULL);
+	if (!iprop) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	num_csds = be32_to_cpup(iprop);
+	if (!num_csds) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	ccm = of_iomap(np, 0);
+	if (!ccm) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	/* The undocumented CSDID registers are at offset 0x600 */
+	csdids = ccm + 0x600;
+
+	of_node_put(np);
+	np = NULL;
+
+	/* Find an unused coherence subdomain ID */
+	for (csd_id = 0; csd_id < num_csds; csd_id++) {
+		if (!csdids[csd_id])
+			break;
+	}
+
+	if (csd_id == num_csds) {
+		/* Every ID is taken; do not write past the last register */
+		ret = -ENOENT;
+		goto error;
+	}
+
+	/* Store the Port ID in the (undocumented) proper CIDMRxx register */
+	csdids[csd_id] = csd_port_id;
+
+	/* Find the DDR LAW that maps to our buffer. */
+	for (i = 0; i < num_laws; i++) {
+		if (law[i].lawar & LAWAR_EN) {
+			phys_addr_t law_start, law_end;
+
+			law_start = make64(law[i].lawbarh, law[i].lawbarl);
+			law_end = law_start +
+				(2ULL << (law[i].lawar & LAWAR_SIZE_MASK));
+
+			if (law_start <= phys && phys < law_end) {
+				law_target = law[i].lawar & LAWAR_TARGET_MASK;
+				break;
+			}
+		}
+	}
+
+	/*
+	 * i == num_laws: no enabled LAW covers the buffer.
+	 * i == 0: the covering LAW is the first one, so there is no
+	 * lower-indexed slot left to search below.
+	 */
+	if (i == 0 || i == num_laws) {
+		/* This should never happen*/
+		ret = -ENOENT;
+		goto error;
+	}
+
+	/* Find a free LAW entry */
+	while (law[--i].lawar & LAWAR_EN) {
+		if (i == 0) {
+			/* No higher priority LAW slots available */
+			ret = -ENOENT;
+			goto error;
+		}
+	}
+
+	/* Program the base address first, then enable the window */
+	law[i].lawbarh = upper_32_bits(phys);
+	law[i].lawbarl = lower_32_bits(phys);
+	wmb();
+	law[i].lawar = LAWAR_EN | law_target | (csd_id << LAWAR_CSDID_SHIFT) |
+		(LAW_SIZE_4K + get_order(size));
+	wmb();
+
+error:
+	/* Shared exit path: also taken on success to drop the mappings */
+	if (ccm)
+		iounmap(ccm);
+
+	if (lac)
+		iounmap(lac);
+
+	if (np)
+		of_node_put(np);
+
+	return ret;
+}
+
+/*
+ * Table of SVRs and the corresponding PORT_ID values. Port ID corresponds to
+ * a bit map of snoopers for a given range of memory mapped by a LAW.
+ *
+ * All future CoreNet-enabled SOCs will have this erratum (A-004510) fixed, so
+ * this table should never need to be updated.  SVRs are guaranteed to be
+ * unique, so there is no worry that a future SOC will inadvertently have one
+ * of these values.
+ *
+ * The svr values are stored with the SVR_SECURITY bit clear; the lookup in
+ * fsl_pamu_probe() masks that bit out of the running SOC's SVR before
+ * comparing.
+ */
+static const struct {
+	u32 svr;
+	u32 port_id;
+} port_id_map[] = {
+	{0x82100010, 0xFF000000},	/* P2040 1.0 */
+	{0x82100011, 0xFF000000},	/* P2040 1.1 */
+	{0x82100110, 0xFF000000},	/* P2041 1.0 */
+	{0x82100111, 0xFF000000},	/* P2041 1.1 */
+	{0x82110310, 0xFF000000},	/* P3041 1.0 */
+	{0x82110311, 0xFF000000},	/* P3041 1.1 */
+	{0x82010020, 0xFFF80000},	/* P4040 2.0 */
+	{0x82000020, 0xFFF80000},	/* P4080 2.0 */
+	{0x82210010, 0xFC000000},       /* P5010 1.0 */
+	{0x82210020, 0xFC000000},       /* P5010 2.0 */
+	{0x82200010, 0xFC000000},	/* P5020 1.0 */
+	{0x82050010, 0xFF800000},	/* P5021 1.0 */
+	{0x82040010, 0xFF800000},	/* P5040 1.0 */
+};
+
+#define SVR_SECURITY	0x80000	/* The Security (E) bit */
+
+/*
+ * Probe the PAMU node: map its registers, install the access violation
+ * interrupt handler, allocate the PAACT/SPAACT/OMT block (creating a
+ * coherence subdomain for it on SOCs listed in port_id_map), program every
+ * PAMU instance, clear their bypass bits and finally enable the LIODNs found
+ * in the device tree.
+ *
+ * Returns 0 on success or a negative error code; on failure everything
+ * acquired up to that point is released again.
+ */
+static int __init fsl_pamu_probe(struct platform_device *pdev)
+{
+	void __iomem *pamu_regs = NULL;
+	struct ccsr_guts __iomem *guts_regs = NULL;
+	u32 pamubypenr, pamu_counter;
+	unsigned long pamu_reg_off;
+	unsigned long pamu_reg_base;
+	struct pamu_isr_data *data = NULL;
+	struct device_node *guts_node;
+	u64 size;
+	struct page *p;
+	int ret = 0;
+	int irq;
+	int irq_requested = 0;	/* set once request_irq() has succeeded */
+	phys_addr_t ppaact_phys;
+	phys_addr_t spaact_phys;
+	phys_addr_t omt_phys;
+	size_t mem_size = 0;
+	unsigned int order = 0;
+	u32 csd_port_id = 0;
+	unsigned i;
+	/*
+	 * enumerate all PAMUs and allocate and setup PAMU tables
+	 * for each of them,
+	 * NOTE : All PAMUs share the same LIODN tables.
+	 */
+
+	pamu_regs = of_iomap(pdev->dev.of_node, 0);
+	if (!pamu_regs) {
+		dev_err(&pdev->dev, "ioremap of PAMU node failed\n");
+		return -ENOMEM;
+	}
+	of_get_address(pdev->dev.of_node, 0, &size, NULL);
+
+	irq = irq_of_parse_and_map(pdev->dev.of_node, 0);
+	if (irq == NO_IRQ) {
+		dev_warn(&pdev->dev, "no interrupts listed in PAMU node\n");
+		/* everything is torn down below, so do not report success */
+		ret = -ENODEV;
+		goto error;
+	}
+
+	data = kzalloc(sizeof(struct pamu_isr_data), GFP_KERNEL);
+	if (!data) {
+		dev_err(&pdev->dev, "PAMU isr data memory allocation failed\n");
+		ret = -ENOMEM;
+		goto error;
+	}
+	data->pamu_reg_base = pamu_regs;
+	data->count = size / PAMU_OFFSET;
+
+	/* The ISR needs access to the regs, so we won't iounmap them */
+	ret = request_irq(irq, pamu_av_isr, 0, "pamu", data);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "error %i installing ISR for irq %i\n",
+			ret, irq);
+		goto error;
+	}
+	irq_requested = 1;
+
+	guts_node = of_find_matching_node(NULL, guts_device_ids);
+	if (!guts_node) {
+		dev_err(&pdev->dev, "could not find GUTS node %s\n",
+			pdev->dev.of_node->full_name);
+		ret = -ENODEV;
+		goto error;
+	}
+
+	guts_regs = of_iomap(guts_node, 0);
+	of_node_put(guts_node);
+	if (!guts_regs) {
+		dev_err(&pdev->dev, "ioremap of GUTS node failed\n");
+		ret = -ENODEV;
+		goto error;
+	}
+
+	/* read in the PAMU capability registers */
+	get_pamu_cap_values((unsigned long)pamu_regs);
+	/*
+	 * To simplify the allocation of a coherency domain, we allocate the
+	 * PAACT and the OMT in the same memory buffer.  Unfortunately, this
+	 * wastes more memory compared to allocating the buffers separately.
+	 */
+	/* Determine how much memory we need */
+	mem_size = (PAGE_SIZE << get_order(PAACT_SIZE)) +
+		(PAGE_SIZE << get_order(SPAACT_SIZE)) +
+		(PAGE_SIZE << get_order(OMT_SIZE));
+	order = get_order(mem_size);
+
+	p = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!p) {
+		dev_err(&pdev->dev, "unable to allocate PAACT/SPAACT/OMT block\n");
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	ppaact = page_address(p);
+	ppaact_phys = page_to_phys(p);
+
+	/* Make sure the memory is naturally aligned */
+	if (ppaact_phys & ((PAGE_SIZE << order) - 1)) {
+		dev_err(&pdev->dev, "PAACT/OMT block is unaligned\n");
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	/* SPAACT and OMT follow the PAACT inside the same allocation */
+	spaact = (void *)ppaact + (PAGE_SIZE << get_order(PAACT_SIZE));
+	omt = (void *)spaact + (PAGE_SIZE << get_order(SPAACT_SIZE));
+
+	dev_dbg(&pdev->dev, "ppaact virt=%p phys=0x%llx\n", ppaact,
+		(unsigned long long) ppaact_phys);
+
+	/* Check to see if we need to implement the work-around on this SOC */
+
+	/* Determine the Port ID for our coherence subdomain */
+	for (i = 0; i < ARRAY_SIZE(port_id_map); i++) {
+		if (port_id_map[i].svr == (mfspr(SPRN_SVR) & ~SVR_SECURITY)) {
+			csd_port_id = port_id_map[i].port_id;
+			dev_dbg(&pdev->dev, "found matching SVR %08x\n",
+				port_id_map[i].svr);
+			break;
+		}
+	}
+
+	if (csd_port_id) {
+		/* phys_addr_t may be 32 bits wide: cast it for %llx */
+		dev_dbg(&pdev->dev, "creating coherency subdomain at address "
+			"0x%llx, size %zu, port id 0x%08x",
+			(unsigned long long) ppaact_phys,
+			mem_size, csd_port_id);
+
+		ret = create_csd(ppaact_phys, mem_size, csd_port_id);
+		if (ret) {
+			dev_err(&pdev->dev, "could not create coherence "
+				"subdomain\n");
+			/* release the IRQ, mappings and table memory */
+			goto error;
+		}
+	}
+
+	spaact_phys = virt_to_phys(spaact);
+	omt_phys = virt_to_phys(omt);
+
+	spaace_pool = gen_pool_create(ilog2(sizeof(struct paace)), -1);
+	if (!spaace_pool) {
+		ret = -ENOMEM;
+		dev_err(&pdev->dev, "PAMU : failed to allocate spaace gen pool\n");
+		goto error;
+	}
+
+	ret = gen_pool_add(spaace_pool, (unsigned long)spaact, SPAACT_SIZE, -1);
+	if (ret)
+		goto error_genpool;
+
+	pamubypenr = in_be32(&guts_regs->pamubypenr);
+
+	/* one bypass bit per PAMU instance, starting at the top bit */
+	for (pamu_reg_off = 0, pamu_counter = 0x80000000; pamu_reg_off < size;
+	     pamu_reg_off += PAMU_OFFSET, pamu_counter >>= 1) {
+
+		pamu_reg_base = (unsigned long) pamu_regs + pamu_reg_off;
+		setup_one_pamu(pamu_reg_base, pamu_reg_off, ppaact_phys,
+				 spaact_phys, omt_phys);
+		/* Disable PAMU bypass for this PAMU */
+		pamubypenr &= ~pamu_counter;
+	}
+
+	setup_omt(omt);
+
+	/* Enable all relevant PAMU(s) */
+	out_be32(&guts_regs->pamubypenr, pamubypenr);
+
+	iounmap(guts_regs);
+
+	/* Enable DMA for the LIODNs in the device tree*/
+
+	setup_liodns();
+
+	return 0;
+
+error_genpool:
+	gen_pool_destroy(spaace_pool);
+
+error:
+	/* only free the IRQ if request_irq() actually succeeded */
+	if (irq_requested)
+		free_irq(irq, data);
+
+	if (data) {
+		memset(data, 0, sizeof(struct pamu_isr_data));
+		kfree(data);
+	}
+
+	if (pamu_regs)
+		iounmap(pamu_regs);
+
+	if (guts_regs)
+		iounmap(guts_regs);
+
+	if (ppaact)
+		free_pages((unsigned long)ppaact, order);
+
+	ppaact = NULL;
+
+	return ret;
+}
+
+static const struct of_device_id fsl_of_pamu_ids[] = {	/* PAMU compatibles */
+	{
+		.compatible = "fsl,p4080-pamu",
+	},
+	{
+		.compatible = "fsl,pamu",
+	},
+	{},	/* sentinel */
+};
+
+static struct platform_driver fsl_of_pamu_driver = {
+	.driver = {
+		.name = "fsl-of-pamu",	/* same name fsl_pamu_init() gives its device */
+		.owner = THIS_MODULE,
+	},
+	.probe = fsl_pamu_probe,	/* no of_match_table is set in this struct */
+};
+
+/*
+ * Early initialisation of the PAMU driver.
+ *
+ * Registers the platform driver and then creates and adds the matching
+ * platform device by hand, with its of_node pointing at the "fsl,pamu" device
+ * tree node, so that the probe function runs from this initcall.
+ *
+ * Returns 0 on success, -ENODEV when there is no PAMU node, -ENOMEM when the
+ * platform device cannot be allocated, or the error returned by
+ * platform_driver_register(), pamu_domain_init() or platform_device_add().
+ */
+static __init int fsl_pamu_init(void)
+{
+	struct platform_device *pdev = NULL;
+	struct device_node *np;
+	int ret;
+
+	/*
+	 * The normal OF process calls the probe function at some
+	 * indeterminate later time, after most drivers have loaded.  This is
+	 * too late for us, because PAMU clients (like the Qman driver)
+	 * depend on PAMU being initialized early.
+	 *
+	 * So instead, we "manually" call our probe function by creating the
+	 * platform devices ourselves.
+	 */
+
+	/*
+	 * We assume that there is only one PAMU node in the device tree.  A
+	 * single PAMU node represents all of the PAMU devices in the SOC
+	 * already.   Everything else already makes that assumption, and the
+	 * binding for the PAMU nodes doesn't allow for any parent-child
+	 * relationships anyway.  In other words, support for more than one
+	 * PAMU node would require significant changes to a lot of code.
+	 */
+
+	np = of_find_compatible_node(NULL, NULL, "fsl,pamu");
+	if (!np) {
+		pr_err("fsl-pamu: could not find a PAMU node\n");
+		return -ENODEV;
+	}
+
+	ret = platform_driver_register(&fsl_of_pamu_driver);
+	if (ret) {
+		pr_err("fsl-pamu: could not register driver (err=%i)\n", ret);
+		goto error_driver_register;
+	}
+
+	pdev = platform_device_alloc("fsl-of-pamu", 0);
+	if (!pdev) {
+		pr_err("fsl-pamu: could not allocate device %s\n",
+		       np->full_name);
+		ret = -ENOMEM;
+		goto error_device_alloc;
+	}
+	/* the device holds its own reference on the node */
+	pdev->dev.of_node = of_node_get(np);
+
+	ret = pamu_domain_init();
+	if (ret)
+		goto error_device_add;
+
+	ret = platform_device_add(pdev);
+	if (ret) {
+		pr_err("fsl-pamu: could not add device %s (err=%i)\n",
+		       np->full_name, ret);
+		goto error_device_add;
+	}
+
+	/*
+	 * Drop the reference taken by of_find_compatible_node(); the one
+	 * taken for pdev->dev.of_node above stays with the device.
+	 */
+	of_node_put(np);
+
+	return 0;
+
+error_device_add:
+	/* clear of_node after the put so it is not released a second time */
+	of_node_put(pdev->dev.of_node);
+	pdev->dev.of_node = NULL;
+
+	platform_device_put(pdev);
+
+error_device_alloc:
+	platform_driver_unregister(&fsl_of_pamu_driver);
+
+error_driver_register:
+	of_node_put(np);
+
+	return ret;
+}
+arch_initcall(fsl_pamu_init);
diff --git a/drivers/iommu/fsl_pamu.h b/drivers/iommu/fsl_pamu.h
new file mode 100644
index 0000000..b919e83
--- /dev/null
+++ b/drivers/iommu/fsl_pamu.h
@@ -0,0 +1,426 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ *
+ */
+
+#ifndef __FSL_PAMU_H
+#define __FSL_PAMU_H
+
+/* Bit Field macros
+ *	v = bit field variable; m = mask, m##_SHIFT = shift, x = value to load
+ */
+#define set_bf(v, m, x)		(v = ((v) & ~(m)) | (((x) << (m##_SHIFT)) & (m)))
+#define get_bf(v, m)		(((v) & (m)) >> (m##_SHIFT))
+
+/* PAMU CCSR space */
+#define PAMU_PGC 0x00000000     /* Allows all peripheral accesses */
+#define PAMU_PE 0x40000000      /* enable PAMU                    */
+
+/* PAMU_OFFSET to the next pamu space in ccsr */
+#define PAMU_OFFSET 0x1000
+
+#define PAMU_MMAP_REGS_BASE 0
+
+struct pamu_mmap_regs {
+	u32 ppbah;
+	u32 ppbal;
+	u32 pplah;
+	u32 pplal;
+	u32 spbah;
+	u32 spbal;
+	u32 splah;
+	u32 splal;
+	u32 obah;
+	u32 obal;
+	u32 olah;
+	u32 olal;
+};
+
+/* PAMU Error Registers */
+#define PAMU_POES1 0x0040
+#define PAMU_POES2 0x0044
+#define PAMU_POEAH 0x0048
+#define PAMU_POEAL 0x004C
+#define PAMU_AVS1  0x0050
+#define PAMU_AVS1_AV    0x1
+#define PAMU_AVS1_OTV   0x6
+#define PAMU_AVS1_APV   0x78
+#define PAMU_AVS1_WAV   0x380
+#define PAMU_AVS1_LAV   0x1c00
+#define PAMU_AVS1_GCV   0x2000
+#define PAMU_AVS1_PDV   0x4000
+#define PAMU_AV_MASK    (PAMU_AVS1_AV | PAMU_AVS1_OTV | PAMU_AVS1_APV | PAMU_AVS1_WAV \
+			| PAMU_AVS1_LAV | PAMU_AVS1_GCV | PAMU_AVS1_PDV)
+#define PAMU_AVS1_LIODN_SHIFT 16
+#define PAMU_LAV_LIODN_NOT_IN_PPAACT 0x400
+
+#define PAMU_AVS2  0x0054
+#define PAMU_AVAH  0x0058
+#define PAMU_AVAL  0x005C
+#define PAMU_EECTL 0x0060
+#define PAMU_EEDIS 0x0064
+#define PAMU_EEINTEN 0x0068
+#define PAMU_EEDET 0x006C
+#define PAMU_EEATTR 0x0070
+#define PAMU_EEAHI 0x0074
+#define PAMU_EEALO 0x0078
+#define PAMU_EEDHI 0X007C
+#define PAMU_EEDLO 0x0080
+#define PAMU_EECC  0x0084
+#define PAMU_UDAD  0x0090
+
+/* PAMU Revision Registers */
+#define PAMU_PR1 0x0BF8
+#define PAMU_PR2 0x0BFC
+
+/* PAMU version mask */
+#define PAMU_PR1_MASK 0xffff
+
+/* PAMU Capabilities Registers */
+#define PAMU_PC1 0x0C00
+#define PAMU_PC2 0x0C04
+#define PAMU_PC3 0x0C08
+#define PAMU_PC4 0x0C0C
+
+/* PAMU Control Register */
+#define PAMU_PC 0x0C10
+
+/* PAMU control defs */
+#define PAMU_CONTROL 0x0C10
+#define PAMU_PC_PGC 0x80000000  /* PAMU gate closed bit */
+#define PAMU_PC_PE   0x40000000 /* PAMU enable bit */
+#define PAMU_PC_SPCC 0x00000010 /* sPAACE cache enable */
+#define PAMU_PC_PPCC 0x00000001 /* pPAACE cache enable */
+#define PAMU_PC_OCE  0x00001000 /* OMT cache enable */
+
+#define PAMU_PFA1 0x0C14
+#define PAMU_PFA2 0x0C18
+
+#define PAMU_PC2_MLIODN(X) ((X) >> 16)
+#define PAMU_PC3_MWCE(X) (((X) >> 21) & 0xf)
+
+/* PAMU Interrupt control and Status Register */
+#define PAMU_PICS 0x0C1C
+#define PAMU_ACCESS_VIOLATION_STAT   0x8
+#define PAMU_ACCESS_VIOLATION_ENABLE 0x4
+
+/* PAMU Debug Registers */
+#define PAMU_PD1 0x0F00
+#define PAMU_PD2 0x0F04
+#define PAMU_PD3 0x0F08
+#define PAMU_PD4 0x0F0C
+
+#define PAACE_AP_PERMS_DENIED  0x0
+#define PAACE_AP_PERMS_QUERY   0x1
+#define PAACE_AP_PERMS_UPDATE  0x2
+#define PAACE_AP_PERMS_ALL     0x3
+
+#define PAACE_DD_TO_HOST       0x0
+#define PAACE_DD_TO_IO         0x1
+#define PAACE_PT_PRIMARY       0x0
+#define PAACE_PT_SECONDARY     0x1
+#define PAACE_V_INVALID        0x0
+#define PAACE_V_VALID          0x1
+#define PAACE_MW_SUBWINDOWS    0x1
+
+#define PAACE_WSE_4K           0xB
+#define PAACE_WSE_8K           0xC
+#define PAACE_WSE_16K          0xD
+#define PAACE_WSE_32K          0xE
+#define PAACE_WSE_64K          0xF
+#define PAACE_WSE_128K         0x10
+#define PAACE_WSE_256K         0x11
+#define PAACE_WSE_512K         0x12
+#define PAACE_WSE_1M           0x13
+#define PAACE_WSE_2M           0x14
+#define PAACE_WSE_4M           0x15
+#define PAACE_WSE_8M           0x16
+#define PAACE_WSE_16M          0x17
+#define PAACE_WSE_32M          0x18
+#define PAACE_WSE_64M          0x19
+#define PAACE_WSE_128M         0x1A
+#define PAACE_WSE_256M         0x1B
+#define PAACE_WSE_512M         0x1C
+#define PAACE_WSE_1G           0x1D
+#define PAACE_WSE_2G           0x1E
+#define PAACE_WSE_4G           0x1F
+
+#define PAACE_DID_PCI_EXPRESS_1 0x00
+#define PAACE_DID_PCI_EXPRESS_2 0x01
+#define PAACE_DID_PCI_EXPRESS_3 0x02
+#define PAACE_DID_PCI_EXPRESS_4 0x03
+#define PAACE_DID_LOCAL_BUS     0x04
+#define PAACE_DID_SRIO          0x0C
+#define PAACE_DID_MEM_1         0x10
+#define PAACE_DID_MEM_2         0x11
+#define PAACE_DID_MEM_3         0x12
+#define PAACE_DID_MEM_4         0x13
+#define PAACE_DID_MEM_1_2       0x14
+#define PAACE_DID_MEM_3_4       0x15
+#define PAACE_DID_MEM_1_4       0x16
+#define PAACE_DID_BM_SW_PORTAL  0x18
+#define PAACE_DID_PAMU          0x1C
+#define PAACE_DID_CAAM          0x21
+#define PAACE_DID_QM_SW_PORTAL  0x3C
+#define PAACE_DID_CORE0_INST    0x80
+#define PAACE_DID_CORE0_DATA    0x81
+#define PAACE_DID_CORE1_INST    0x82
+#define PAACE_DID_CORE1_DATA    0x83
+#define PAACE_DID_CORE2_INST    0x84
+#define PAACE_DID_CORE2_DATA    0x85
+#define PAACE_DID_CORE3_INST    0x86
+#define PAACE_DID_CORE3_DATA    0x87
+#define PAACE_DID_CORE4_INST    0x88
+#define PAACE_DID_CORE4_DATA    0x89
+#define PAACE_DID_CORE5_INST    0x8A
+#define PAACE_DID_CORE5_DATA    0x8B
+#define PAACE_DID_CORE6_INST    0x8C
+#define PAACE_DID_CORE6_DATA    0x8D
+#define PAACE_DID_CORE7_INST    0x8E
+#define PAACE_DID_CORE7_DATA    0x8F
+#define PAACE_DID_BROADCAST     0xFF
+
+#define PAACE_ATM_NO_XLATE      0x00
+#define PAACE_ATM_WINDOW_XLATE  0x01
+#define PAACE_ATM_PAGE_XLATE    0x02
+#define PAACE_ATM_WIN_PG_XLATE  \
+                (PAACE_ATM_WINDOW_XLATE | PAACE_ATM_PAGE_XLATE)
+#define PAACE_OTM_NO_XLATE      0x00
+#define PAACE_OTM_IMMEDIATE     0x01
+#define PAACE_OTM_INDEXED       0x02
+#define PAACE_OTM_RESERVED      0x03
+
+#define PAACE_M_COHERENCE_REQ   0x01
+
+#define PAACE_PID_0             0x0
+#define PAACE_PID_1             0x1
+#define PAACE_PID_2             0x2
+#define PAACE_PID_3             0x3
+#define PAACE_PID_4             0x4
+#define PAACE_PID_5             0x5
+#define PAACE_PID_6             0x6
+#define PAACE_PID_7             0x7
+
+#define PAACE_TCEF_FORMAT0_8B   0x00
+#define PAACE_TCEF_FORMAT1_RSVD 0x01
+/*
+ * Hard coded value for the PAACT size to accommodate
+ * maximum LIODN value generated by u-boot.
+ */
+#define PAACE_NUMBER_ENTRIES    0x500
+/* Hard coded value for the SPAACT size */
+#define SPAACE_NUMBER_ENTRIES	0x800
+
+#define	OME_NUMBER_ENTRIES      16
+
+/* PAACE Bit Field Defines */
+#define PPAACE_AF_WBAL			0xfffff000
+#define PPAACE_AF_WBAL_SHIFT		12
+#define PPAACE_AF_WSE			0x00000fc0
+#define PPAACE_AF_WSE_SHIFT		6
+#define PPAACE_AF_MW			0x00000020
+#define PPAACE_AF_MW_SHIFT		5
+
+#define SPAACE_AF_LIODN			0xffff0000
+#define SPAACE_AF_LIODN_SHIFT		16
+
+#define PAACE_AF_AP			0x00000018
+#define PAACE_AF_AP_SHIFT		3
+#define PAACE_AF_DD			0x00000004
+#define PAACE_AF_DD_SHIFT		2
+#define PAACE_AF_PT			0x00000002
+#define PAACE_AF_PT_SHIFT		1
+#define PAACE_AF_V			0x00000001
+#define PAACE_AF_V_SHIFT		0
+
+#define PAACE_DA_HOST_CR		0x80
+#define PAACE_DA_HOST_CR_SHIFT		7
+
+#define PAACE_IA_CID			0x00FF0000
+#define PAACE_IA_CID_SHIFT		16
+#define PAACE_IA_WCE			0x000000F0
+#define PAACE_IA_WCE_SHIFT		4
+#define PAACE_IA_ATM			0x0000000C
+#define PAACE_IA_ATM_SHIFT		2
+#define PAACE_IA_OTM			0x00000003
+#define PAACE_IA_OTM_SHIFT		0
+
+#define PAACE_WIN_TWBAL			0xfffff000
+#define PAACE_WIN_TWBAL_SHIFT		12
+#define PAACE_WIN_SWSE			0x00000fc0
+#define PAACE_WIN_SWSE_SHIFT		6
+
+/* PAMU Data Structures */
+/* primary / secondary paact structure */
+struct paace {
+	/* PAACE Offset 0x00 */
+	u32 wbah;				/* only valid for Primary PAACE */
+	u32 addr_bitfields;		/* See P/S PAACE_AF_* */
+
+	/* PAACE Offset 0x08 */
+	/* Interpretation of first 32 bits dependent on DD above */
+	union {
+		struct {
+			/* Destination ID, see PAACE_DID_* defines */
+			u8 did;
+			/* Partition ID */
+			u8 pid;
+			/* Snoop ID */
+			u8 snpid;
+			/* coherency_required : 1 reserved : 7 */
+			u8 coherency_required; /* See PAACE_DA_* */
+		} to_host;
+		struct {
+			/* Destination ID, see PAACE_DID_* defines */
+			u8  did;
+			u8  reserved1;
+			u16 reserved2;
+		} to_io;
+	} domain_attr;
+
+	/* Implementation attributes + window count + address & operation translation modes */
+	u32 impl_attr;			/* See PAACE_IA_* */
+
+	/* PAACE Offset 0x10 */
+	/* Translated window base address */
+	u32 twbah;
+	u32 win_bitfields;			/* See PAACE_WIN_* */
+
+	/* PAACE Offset 0x18 */
+	/* first secondary paace entry */
+	u32 fspi;				/* only valid for Primary PAACE */
+	union {
+		struct {
+			u8 ioea;
+			u8 moea;
+			u8 ioeb;
+			u8 moeb;
+		} immed_ot;
+		struct {
+			u16 reserved;
+			u16 omi;
+		} index_ot;
+	} op_encode;
+
+	/* PAACE Offsets 0x20-0x38 */
+	u32 reserved[8];			/* not currently implemented */
+};
+
+/* OME : Operation mapping entry
+ * MOE : Mapped Operation Encodings
+ * The operation mapping table (OMT) is a table containing operation mapping entries (OME).
+ * The index of a particular OME is programmed in the PAACE entry for translation
+ * of inbound I/O operations corresponding to an LIODN. The OMT is used for translation
+ * specifically in case of the indexed translation mode. Each OME contains a 128
+ * byte mapped operation encoding (MOE), where each byte represents an MOE.
+ */
+#define NUM_MOE 128
+struct ome {
+	u8 moe[NUM_MOE];
+} __attribute__((packed));
+
+#define PAACT_SIZE              (sizeof(struct paace) * PAACE_NUMBER_ENTRIES)
+#define SPAACT_SIZE              (sizeof(struct paace) * SPAACE_NUMBER_ENTRIES)
+#define OMT_SIZE                (sizeof(struct ome) * OME_NUMBER_ENTRIES)
+
+#define PAMU_PAGE_SHIFT 12
+#define PAMU_PAGE_SIZE  4096ULL
+
+#define IOE_READ        0x00
+#define IOE_READ_IDX    0x00
+#define IOE_WRITE       0x81
+#define IOE_WRITE_IDX   0x01
+#define IOE_EREAD0      0x82    /* Enhanced read type 0 */
+#define IOE_EREAD0_IDX  0x02    /* Enhanced read type 0 */
+#define IOE_EWRITE0     0x83    /* Enhanced write type 0 */
+#define IOE_EWRITE0_IDX 0x03    /* Enhanced write type 0 */
+#define IOE_DIRECT0     0x84    /* Directive type 0 */
+#define IOE_DIRECT0_IDX 0x04    /* Directive type 0 */
+#define IOE_EREAD1      0x85    /* Enhanced read type 1 */
+#define IOE_EREAD1_IDX  0x05    /* Enhanced read type 1 */
+#define IOE_EWRITE1     0x86    /* Enhanced write type 1 */
+#define IOE_EWRITE1_IDX 0x06    /* Enhanced write type 1 */
+#define IOE_DIRECT1     0x87    /* Directive type 1 */
+#define IOE_DIRECT1_IDX 0x07    /* Directive type 1 */
+#define IOE_RAC         0x8c    /* Read with Atomic clear */
+#define IOE_RAC_IDX     0x0c    /* Read with Atomic clear */
+#define IOE_RAS         0x8d    /* Read with Atomic set */
+#define IOE_RAS_IDX     0x0d    /* Read with Atomic set */
+#define IOE_RAD         0x8e    /* Read with Atomic decrement */
+#define IOE_RAD_IDX     0x0e    /* Read with Atomic decrement */
+#define IOE_RAI         0x8f    /* Read with Atomic increment */
+#define IOE_RAI_IDX     0x0f    /* Read with Atomic increment */
+
+#define EOE_READ        0x00
+#define EOE_WRITE       0x01
+#define EOE_RAC         0x0c    /* Read with Atomic clear */
+#define EOE_RAS         0x0d    /* Read with Atomic set */
+#define EOE_RAD         0x0e    /* Read with Atomic decrement */
+#define EOE_RAI         0x0f    /* Read with Atomic increment */
+#define EOE_LDEC        0x10    /* Load external cache */
+#define EOE_LDECL       0x11    /* Load external cache with stash lock */
+#define EOE_LDECPE      0x12    /* Load external cache with preferred exclusive */
+#define EOE_LDECPEL     0x13    /* Load external cache with preferred exclusive and lock */
+#define EOE_LDECFE      0x14    /* Load external cache with forced exclusive */
+#define EOE_LDECFEL     0x15    /* Load external cache with forced exclusive and lock */
+#define EOE_RSA         0x16    /* Read with stash allocate */
+#define EOE_RSAU        0x17    /* Read with stash allocate and unlock */
+#define EOE_READI       0x18    /* Read with invalidate */
+#define EOE_RWNITC      0x19    /* Read with no intention to cache */
+#define EOE_WCI         0x1a    /* Write cache inhibited */
+#define EOE_WWSA        0x1b    /* Write with stash allocate */
+#define EOE_WWSAL       0x1c    /* Write with stash allocate and lock */
+#define EOE_WWSAO       0x1d    /* Write with stash allocate only */
+#define EOE_WWSAOL      0x1e    /* Write with stash allocate only and lock */
+#define EOE_VALID       0x80
+
+/* Function prototypes */
+int pamu_domain_init(void);
+int pamu_enable_liodn(int liodn);
+int pamu_disable_liodn(int liodn);
+void pamu_free_subwins(int liodn);
+int pamu_config_ppaace(int liodn, phys_addr_t win_addr, phys_addr_t win_size,
+		       u32 omi, unsigned long rpn, u32 snoopid, uint32_t stashid,
+		       u32 subwin_cnt, int prot);
+int pamu_config_spaace(int liodn, u32 subwin_cnt, u32 subwin_addr,
+		       phys_addr_t subwin_size, u32 omi, unsigned long rpn,
+		       uint32_t snoopid, u32 stashid, int enable, int prot);
+
+u32 get_stash_id(u32 stash_dest_hint, u32 vcpu);
+void get_ome_index(u32 *omi_index, struct device *dev);
+int  pamu_update_paace_stash(int liodn, u32 subwin, u32 value);
+int pamu_disable_spaace(int liodn, u32 subwin);
+u32 pamu_get_max_subwin_cnt(void);
+
+
+/* cache stash targets */
+enum pamu_stash_target {
+	PAMU_ATTR_CACHE_L1 = 1,
+	PAMU_ATTR_CACHE_L2,
+	PAMU_ATTR_CACHE_L3,
+};
+
+/*
+ * This attribute allows configuring stashing-specific parameters
+ * in the PAMU hardware.
+ */
+
+struct pamu_stash_attribute {
+	u32 	cpu;	/* cpu number */
+	u32 	cache;	/* cache to stash to: L1,L2,L3 */
+};
+
+#endif  /* __FSL_PAMU_H */
diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
new file mode 100644
index 0000000..f72a98a
--- /dev/null
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -0,0 +1,1143 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ * Author: Varun Sethi <varun.sethi@freescale.com>
+ *
+ */
+
+#define pr_fmt(fmt)    "fsl-pamu-domain: %s: " fmt, __func__
+
+#include <linux/init.h>
+#include <linux/iommu.h>
+#include <linux/notifier.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/device.h>
+#include <linux/of_platform.h>
+#include <linux/bootmem.h>
+#include <linux/err.h>
+#include <asm/io.h>
+#include <asm/bitops.h>
+
+#include <asm/pci-bridge.h>
+#include <sysdev/fsl_pci.h>
+
+#include "fsl_pamu_domain.h"
+
+/*
+ * Global spinlock that needs to be held while
+ * configuring PAMU.
+ */
+static DEFINE_SPINLOCK(iommu_lock);
+
+static struct kmem_cache *fsl_pamu_domain_cache;
+static struct kmem_cache *iommu_devinfo_cache;
+static DEFINE_SPINLOCK(device_domain_lock);
+
+/*
+ * Create the slab caches used for struct fsl_dma_domain and
+ * struct device_domain_info objects.
+ *
+ * Returns 0 on success or -ENOMEM if either cache cannot be created.
+ * If the second cache fails, the first one is destroyed again.
+ */
+static int __init iommu_init_mempool(void)
+{
+	fsl_pamu_domain_cache = kmem_cache_create("fsl_pamu_domain",
+						  sizeof(struct fsl_dma_domain),
+						  0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!fsl_pamu_domain_cache) {
+		pr_err("Couldn't create fsl iommu_domain cache\n");
+		return -ENOMEM;
+	}
+
+	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
+						sizeof(struct device_domain_info),
+						0, SLAB_HWCACHE_ALIGN, NULL);
+	if (iommu_devinfo_cache)
+		return 0;
+
+	/* Second cache failed: undo the first one before bailing out */
+	pr_err("Couldn't create devinfo cache\n");
+	kmem_cache_destroy(fsl_pamu_domain_cache);
+
+	return -ENOMEM;
+}
+
+/*
+ * Translate an I/O virtual address into a physical address using the
+ * window (or subwindow) table of the domain.
+ *
+ * Returns 0 when the domain has no windows/geometry configured or when
+ * the window covering @iova is not valid.
+ */
+static phys_addr_t get_phys_addr(struct fsl_dma_domain *dma_domain, dma_addr_t iova)
+{
+	struct iommu_domain_geometry *geometry = &dma_domain->iommu_domain->geometry;
+	struct dma_window *window = dma_domain->win_arr;
+	u32 nr_windows = dma_domain->win_cnt;
+
+	if (!nr_windows || !dma_domain->geom_size) {
+		pr_err("Number of windows/geometry not configured for the domain\n");
+		return 0;
+	}
+
+	if (nr_windows > 1) {
+		/* The geometry is split evenly between the subwindows */
+		u64 subwin_size = dma_domain->geom_size >> ilog2(nr_windows);
+		dma_addr_t subwin_base = iova & ~(subwin_size - 1);
+		u32 idx = (subwin_base - geometry->aperture_start) >> ilog2(subwin_size);
+
+		window = &dma_domain->win_arr[idx];
+	}
+
+	if (!window->valid)
+		return 0;
+
+	return (window->paddr + (iova & (window->size - 1)));
+}
+
+/*
+ * Program every valid subwindow of the domain into the SPAACE entries
+ * of the given LIODN.
+ *
+ * @liodn:      LIODN whose subwindows are configured
+ * @dma_domain: domain supplying the window array, snoop id and stash id
+ *
+ * Subwindows that are not marked valid are skipped.  iommu_lock is taken
+ * around each individual pamu_config_spaace() call.
+ *
+ * Returns 0 on success (including the case where no subwindow is valid)
+ * or the error from pamu_config_spaace() for the first failing subwindow.
+ */
+static int map_subwins(int liodn, struct fsl_dma_domain *dma_domain)
+{
+	struct dma_window *sub_win_ptr =
+				&dma_domain->win_arr[0];
+	/* ret must start at 0: the loop may never call pamu_config_spaace() */
+	int i, ret = 0;
+	unsigned long rpn, flags;
+
+	for (i = 0; i < dma_domain->win_cnt; i++) {
+		if (!sub_win_ptr[i].valid)
+			continue;
+
+		rpn = sub_win_ptr[i].paddr >> PAMU_PAGE_SHIFT;
+
+		spin_lock_irqsave(&iommu_lock, flags);
+		ret = pamu_config_spaace(liodn, dma_domain->win_cnt, i,
+					 sub_win_ptr[i].size,
+					 ~(u32)0,
+					 rpn,
+					 dma_domain->snoop_id,
+					 dma_domain->stash_id,
+					 (i > 0) ? 1 : 0,
+					 sub_win_ptr[i].prot);
+		spin_unlock_irqrestore(&iommu_lock, flags);
+		if (ret) {
+			pr_err("PAMU SPAACE configuration failed for liodn %d\n",
+				 liodn);
+			return ret;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Program the single DMA window of the domain into the primary PAACE
+ * of the given LIODN.  Returns the result of pamu_config_ppaace().
+ */
+static int map_win(int liodn, struct fsl_dma_domain *dma_domain)
+{
+	struct dma_window *win = dma_domain->win_arr;
+	phys_addr_t base = dma_domain->iommu_domain->geometry.aperture_start;
+	unsigned long irq_flags;
+	int err;
+
+	spin_lock_irqsave(&iommu_lock, irq_flags);
+	err = pamu_config_ppaace(liodn, base, win->size, ~(u32)0,
+				 win->paddr >> PAMU_PAGE_SHIFT,
+				 dma_domain->snoop_id, dma_domain->stash_id,
+				 0, win->prot);
+	spin_unlock_irqrestore(&iommu_lock, irq_flags);
+
+	if (err)
+		pr_err("PAMU PAACE configuration failed for liodn %d\n",
+			liodn);
+
+	return err;
+}
+
+/*
+ * Map the DMA window(s) of the domain for the given LIODN: a domain with
+ * more than one window is mapped through its subwindows, otherwise the
+ * single primary window is mapped.
+ */
+static int map_liodn(int liodn, struct fsl_dma_domain *dma_domain)
+{
+	if (dma_domain->win_cnt <= 1)
+		return map_win(liodn, dma_domain);
+
+	return map_subwins(liodn, dma_domain);
+}
+
+/*
+ * Re-program the mapping of window @wnd_nr for the given LIODN.  The
+ * SPAACE is updated when the domain has more than one window, otherwise
+ * the primary PAACE is updated.  Runs with iommu_lock held.
+ */
+static int update_liodn(int liodn, struct fsl_dma_domain *dma_domain, u32 wnd_nr)
+{
+	struct dma_window *win = &dma_domain->win_arr[wnd_nr];
+	unsigned long irq_flags;
+	int err;
+
+	spin_lock_irqsave(&iommu_lock, irq_flags);
+
+	if (dma_domain->win_cnt <= 1) {
+		phys_addr_t base;
+
+		base = dma_domain->iommu_domain->geometry.aperture_start;
+
+		err = pamu_config_ppaace(liodn, base, win->size, ~(u32)0,
+					 win->paddr >> PAMU_PAGE_SHIFT,
+					 dma_domain->snoop_id,
+					 dma_domain->stash_id,
+					 0, win->prot);
+		if (err)
+			pr_err("Window reconfiguration failed for liodn %d\n", liodn);
+	} else {
+		err = pamu_config_spaace(liodn, dma_domain->win_cnt, wnd_nr,
+					 win->size, ~(u32)0,
+					 win->paddr >> PAMU_PAGE_SHIFT,
+					 dma_domain->snoop_id,
+					 dma_domain->stash_id,
+					 wnd_nr != 0,
+					 win->prot);
+		if (err)
+			pr_err("Subwindow reconfiguration failed for liodn %d\n", liodn);
+	}
+
+	spin_unlock_irqrestore(&iommu_lock, irq_flags);
+
+	return err;
+}
+
+/*
+ * Write the stash value @val into the PAACE of the LIODN when the domain
+ * window count is zero, or into each of its win_cnt (S)PAACE entries
+ * otherwise.  Stops at the first failure and returns its error code.
+ */
+static int update_liodn_stash(int liodn, struct fsl_dma_domain *dma_domain,
+				 u32 val)
+{
+	unsigned long irq_flags;
+	int err = 0;
+	int win;
+
+	spin_lock_irqsave(&iommu_lock, irq_flags);
+
+	if (!dma_domain->win_cnt) {
+		err = pamu_update_paace_stash(liodn, 0, val);
+		if (err)
+			pr_err("Failed to update PAACE field for liodn %d\n ", liodn);
+		goto out_unlock;
+	}
+
+	for (win = 0; win < dma_domain->win_cnt; win++) {
+		err = pamu_update_paace_stash(liodn, win, val);
+		if (err) {
+			pr_err("Failed to update SPAACE %d field for liodn %d\n ", win, liodn);
+			break;
+		}
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&iommu_lock, irq_flags);
+
+	return err;
+}
+
+/*
+ * Set the geometry parameters for a LIODN.
+ *
+ * @liodn:      LIODN to configure
+ * @dev:        device passed to get_ome_index() to look up the OMI
+ * @dma_domain: domain supplying geom_size, snoop_id and stash_id
+ * @geom_attr:  geometry whose aperture_start is used as the window base
+ * @win_cnt:    subwindow count handed to the PAACE configuration
+ *
+ * The LIODN is disabled and its primary PAACE is written with rpn 0 and
+ * prot 0.  When @win_cnt is greater than one, every subwindow is disabled
+ * and configured the same way with an equal share of the window size.
+ *
+ * Returns 0 on success or the first error reported by the pamu_* helpers.
+ */
+static int pamu_set_liodn(int liodn, struct device *dev,
+			   struct fsl_dma_domain *dma_domain,
+			   struct iommu_domain_geometry *geom_attr,
+			   u32 win_cnt)
+{
+	phys_addr_t window_addr, window_size;
+	phys_addr_t subwin_size;
+	int ret = 0, i;
+	u32 omi_index = ~(u32)0;
+	unsigned long flags;
+
+	/*
+	 * Configure the omi_index at the geometry setup time.
+	 * This is a static value which depends on the type of
+	 * device and would not change thereafter.
+	 */
+	get_ome_index(&omi_index, dev);
+
+	window_addr = geom_attr->aperture_start;
+	window_size = dma_domain->geom_size;
+
+	/* Disable the LIODN, then rewrite its primary PAACE, under iommu_lock */
+	spin_lock_irqsave(&iommu_lock, flags);
+	ret = pamu_disable_liodn(liodn);
+	if (!ret)
+		ret = pamu_config_ppaace(liodn, window_addr, window_size, omi_index,
+					 0, dma_domain->snoop_id,
+					 dma_domain->stash_id, win_cnt, 0);
+	spin_unlock_irqrestore(&iommu_lock, flags);
+	if (ret) {
+		pr_err("PAMU PAACE configuration failed for liodn %d, win_cnt =%d\n", liodn, win_cnt);
+		return ret;
+	}
+
+	if (win_cnt > 1) {
+		/* Each subwindow gets an equal share of the window */
+		subwin_size = window_size >> ilog2(win_cnt);
+		for (i = 0; i < win_cnt; i++) {
+			/* The lock is taken and dropped once per subwindow */
+			spin_lock_irqsave(&iommu_lock, flags);
+			ret = pamu_disable_spaace(liodn, i);
+			if (!ret)
+				ret = pamu_config_spaace(liodn, win_cnt, i,
+							 subwin_size, omi_index,
+							 0, dma_domain->snoop_id,
+							 dma_domain->stash_id,
+							 0, 0);
+			spin_unlock_irqrestore(&iommu_lock, flags);
+			if (ret) {
+				pr_err("PAMU SPAACE configuration failed for liodn %d\n", liodn);
+				return ret;
+			}
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Validate a window size and the alignment of its base address.
+ *
+ * @size: window size in bytes
+ * @iova: base I/O virtual address of the window
+ *
+ * Returns 0 if @size is a power of two no smaller than PAMU_PAGE_SIZE
+ * and @iova is aligned to @size, -EINVAL otherwise.
+ */
+static int check_size(u64 size, dma_addr_t iova)
+{
+	/*
+	 * Size must be a power of two and at least be equal
+	 * to PAMU page size.  The test is open coded because
+	 * is_power_of_2() takes an unsigned long, which would
+	 * truncate a 64-bit size on 32-bit builds.  A size of 0
+	 * is rejected by the minimum size comparison.
+	 */
+	if ((size & (size - 1)) || size < PAMU_PAGE_SIZE) {
+		/* pr_fmt already prefixes the function name */
+		pr_err("size too small or not a power of two\n");
+		return -EINVAL;
+	}
+
+	/* iova must be aligned to the window size */
+	if (iova & (size - 1)) {
+		pr_err("address is not aligned with window size\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate a zeroed fsl_dma_domain from its slab cache and set its
+ * defaults: snoop and stash ids of all ones, the window count reported
+ * by pamu_get_max_subwin_cnt() and a geometry size of 0.
+ *
+ * Returns the new domain, or NULL if the allocation fails.
+ */
+static struct fsl_dma_domain *iommu_alloc_dma_domain(void)
+{
+	struct fsl_dma_domain *dma_domain;
+
+	dma_domain = kmem_cache_zalloc(fsl_pamu_domain_cache, GFP_KERNEL);
+	if (dma_domain == NULL)
+		return NULL;
+
+	dma_domain->stash_id = ~(u32)0;
+	dma_domain->snoop_id = ~(u32)0;
+	dma_domain->win_cnt = pamu_get_max_subwin_cnt();
+	dma_domain->geom_size = 0;
+
+	/* No devices attached yet */
+	INIT_LIST_HEAD(&dma_domain->devices);
+	spin_lock_init(&dma_domain->domain_lock);
+
+	return dma_domain;
+}
+
+/* Return the domain info stored in the device's archdata (may be NULL) */
+static inline struct device_domain_info *find_domain(struct device *dev)
+{
+	struct device_domain_info *info = dev->archdata.iommu_domain;
+
+	return info;
+}
+
+/*
+ * Drop one device/LIODN reference from its domain.
+ *
+ * Unlinks @info from the list it is on, frees the subwindows of the
+ * LIODN (only when @win_cnt is greater than one) and disables the LIODN
+ * under iommu_lock, then clears the device's archdata pointer and frees
+ * @info under device_domain_lock.
+ *
+ * NOTE(review): list_del() runs before any lock is taken here; the
+ * visible caller, detach_device(), holds domain_lock around the call.
+ */
+static void remove_device_ref(struct device_domain_info *info, u32 win_cnt)
+{
+	unsigned long flags;
+
+	list_del(&info->link);
+	spin_lock_irqsave(&iommu_lock, flags);
+	if (win_cnt > 1)
+		pamu_free_subwins(info->liodn);
+	pamu_disable_liodn(info->liodn);
+	spin_unlock_irqrestore(&iommu_lock, flags);
+	/* flags is reused: the first critical section has already ended */
+	spin_lock_irqsave(&device_domain_lock, flags);
+	info->dev->archdata.iommu_domain = NULL;
+	kmem_cache_free(iommu_devinfo_cache, info);
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+/*
+ * Remove @dev from the device list of @dma_domain.  A NULL @dev removes
+ * every device attached to the domain.
+ */
+static void detach_device(struct device *dev, struct fsl_dma_domain *dma_domain)
+{
+	struct device_domain_info *cur, *next;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, irq_flags);
+	list_for_each_entry_safe(cur, next, &dma_domain->devices, link) {
+		/* Skip entries that belong to some other device */
+		if (dev && cur->dev != dev)
+			continue;
+
+		remove_device_ref(cur, dma_domain->win_cnt);
+	}
+	spin_unlock_irqrestore(&dma_domain->domain_lock, irq_flags);
+}
+
+/*
+ * Record that @liodn of @dev now belongs to @dma_domain.
+ *
+ * @dma_domain: domain the LIODN is being attached to
+ * @liodn:      LIODN being attached
+ * @dev:        device owning the LIODN
+ *
+ * If the device is currently attached to a different domain it is
+ * detached from it first.  A device_domain_info entry is then allocated
+ * and added to the domain's device list.  On allocation failure an error
+ * is logged and nothing is added.
+ */
+static void attach_device(struct fsl_dma_domain *dma_domain, int liodn, struct device *dev)
+{
+	struct device_domain_info *info, *old_domain_info;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	/*
+	 * Check here if the device is already attached to domain or not.
+	 * If the device is already attached to a domain detach it.
+	 */
+	old_domain_info = find_domain(dev);
+	if (old_domain_info && old_domain_info->domain != dma_domain) {
+		/* detach_device() takes device_domain_lock itself */
+		spin_unlock_irqrestore(&device_domain_lock, flags);
+		detach_device(dev, old_domain_info->domain);
+		spin_lock_irqsave(&device_domain_lock, flags);
+	}
+
+	/* A spinlock is held with interrupts off: the allocation must not sleep */
+	info = kmem_cache_zalloc(iommu_devinfo_cache, GFP_ATOMIC);
+	if (!info) {
+		pr_err("device info allocation failed for liodn %d\n", liodn);
+		spin_unlock_irqrestore(&device_domain_lock, flags);
+		return;
+	}
+
+	info->dev = dev;
+	info->liodn = liodn;
+	info->domain = dma_domain;
+
+	list_add(&info->link, &dma_domain->devices);
+	/*
+	 * In case of devices with multiple LIODNs just store
+	 * the info for the first LIODN as all
+	 * LIODNs share the same domain.
+	 *
+	 * Re-read the pointer instead of testing old_domain_info: after
+	 * the detach above the old info has been freed and the device's
+	 * pointer cleared, so the new info must be stored.
+	 */
+	if (!find_domain(dev))
+		dev->archdata.iommu_domain = info;
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+
+}
+
+/*
+ * Translate @iova to a physical address.  Addresses outside the domain
+ * aperture translate to 0.
+ */
+static phys_addr_t fsl_pamu_iova_to_phys(struct iommu_domain *domain,
+					    dma_addr_t iova)
+{
+	struct fsl_dma_domain *dma_domain = domain->priv;
+
+	if (iova < domain->geometry.aperture_start)
+		return 0;
+
+	if (iova > domain->geometry.aperture_end)
+		return 0;
+
+	return get_phys_addr(dma_domain, iova);
+}
+
+/* Report capabilities: only IOMMU_CAP_CACHE_COHERENCY yields 1 */
+static int fsl_pamu_domain_has_cap(struct iommu_domain *domain,
+				      unsigned long cap)
+{
+	if (cap == IOMMU_CAP_CACHE_COHERENCY)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Tear down a domain: detach every device still on its list and return
+ * the fsl_dma_domain to its slab cache.
+ */
+static void fsl_pamu_domain_destroy(struct iommu_domain *domain)
+{
+	struct fsl_dma_domain *pamu_dom = domain->priv;
+
+	domain->priv = NULL;
+
+	/* A NULL device detaches everything attached to the domain */
+	detach_device(NULL, pamu_dom);
+
+	pamu_dom->enabled = 0;
+	pamu_dom->mapped = 0;
+
+	kmem_cache_free(fsl_pamu_domain_cache, pamu_dom);
+}
+
+/*
+ * Allocate the PAMU private data for a new IOMMU domain and set up the
+ * default geometry.  Returns 0 on success, -ENOMEM on allocation failure.
+ */
+static int fsl_pamu_domain_init(struct iommu_domain *domain)
+{
+	struct iommu_domain_geometry *geom = &domain->geometry;
+	struct fsl_dma_domain *pamu_dom = iommu_alloc_dma_domain();
+
+	if (!pamu_dom) {
+		pr_err("dma_domain allocation failed\n");
+		return -ENOMEM;
+	}
+
+	/* Link the generic and the PAMU specific domain to each other */
+	domain->priv = pamu_dom;
+	pamu_dom->iommu_domain = domain;
+
+	/* default geometry 64 GB i.e. maximum system address */
+	geom->aperture_start = 0;
+	geom->aperture_end = (1ULL << 36) - 1;
+	geom->force_aperture = true;
+
+	return 0;
+}
+
+/*
+ * Apply the geometry settings to every LIODN attached to the domain.
+ * Stops at, and returns, the first error; returns 0 otherwise.
+ */
+static int pamu_set_domain_geometry(struct fsl_dma_domain *dma_domain,
+				    struct iommu_domain_geometry *geom_attr,
+				    u32 win_cnt)
+{
+	struct device_domain_info *entry;
+
+	list_for_each_entry(entry, &dma_domain->devices, link) {
+		int err = pamu_set_liodn(entry->liodn, entry->dev, dma_domain,
+					 geom_attr, win_cnt);
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Push the stash value @val to every LIODN attached to the domain.
+ * Stops at, and returns, the first error; returns 0 otherwise.
+ */
+static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val)
+{
+	struct device_domain_info *entry;
+
+	list_for_each_entry(entry, &dma_domain->devices, link) {
+		int err = update_liodn_stash(entry->liodn, dma_domain, val);
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Re-program window @wnd_nr for every LIODN attached to the domain.
+ * Stops at, and returns, the first error; returns 0 otherwise.
+ */
+static int update_domain_mapping(struct fsl_dma_domain *dma_domain, u32 wnd_nr)
+{
+	struct device_domain_info *entry;
+
+	list_for_each_entry(entry, &dma_domain->devices, link) {
+		int err = update_liodn(entry->liodn, dma_domain, wnd_nr);
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Disable window @wnd_nr for every LIODN attached to the domain.
+ *
+ * For a single-window domain that is currently enabled the whole LIODN
+ * is disabled; in every other case only the given subwindow is disabled.
+ *
+ * Returns the status of the LAST device processed (0 if the list is
+ * empty).
+ *
+ * NOTE(review): ret is overwritten on each iteration, so a failure for
+ * an earlier device is masked by a later success - confirm intended.
+ * NOTE(review): 'enabled' is cleared after the first successful
+ * pamu_disable_liodn(), so remaining devices of a single-window domain
+ * take the pamu_disable_spaace() branch - confirm intended.
+ * NOTE(review): iommu_lock is not taken here, unlike the other visible
+ * callers of the pamu_* helpers - confirm whether it is required.
+ */
+static int disable_domain_win(struct fsl_dma_domain *dma_domain, u32 wnd_nr)
+{
+	struct device_domain_info *info;
+	int ret = 0;
+
+	list_for_each_entry(info, &dma_domain->devices, link) {
+		if (dma_domain->win_cnt == 1 && dma_domain->enabled) {
+			ret = pamu_disable_liodn(info->liodn);
+			if (!ret)
+				dma_domain->enabled = 0;
+		} else {
+			ret = pamu_disable_spaace(info->liodn, wnd_nr);
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Disable window @wnd_nr of the domain.  The window is marked invalid
+ * and the mapped count is decremented only if the hardware update
+ * succeeds.  An unconfigured window array or an out of range index is
+ * logged and otherwise ignored.
+ */
+static void fsl_pamu_window_disable(struct iommu_domain *domain, u32 wnd_nr)
+{
+	struct fsl_dma_domain *dma_domain = domain->priv;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, irq_flags);
+
+	if (!dma_domain->win_arr) {
+		pr_err("Number of windows not configured\n");
+		goto out_unlock;
+	}
+
+	if (wnd_nr >= dma_domain->win_cnt) {
+		pr_err("Invalid window index\n");
+		goto out_unlock;
+	}
+
+	/* Nothing to do for a window that is not currently mapped */
+	if (!dma_domain->win_arr[wnd_nr].valid)
+		goto out_unlock;
+
+	if (!disable_domain_win(dma_domain, wnd_nr)) {
+		dma_domain->win_arr[wnd_nr].valid = 0;
+		dma_domain->mapped--;
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&dma_domain->domain_lock, irq_flags);
+
+}
+
+/*
+ * Map window @wnd_nr of the domain to @paddr with the given size and
+ * protection, and program it for every attached LIODN.
+ *
+ * Returns 0 on success, -ENODEV if the window array is not configured,
+ * -EINVAL for a bad index, size or alignment, -EBUSY if the window (or,
+ * for a single-window domain, the domain) is still enabled, or the error
+ * from update_domain_mapping().
+ */
+static int fsl_pamu_window_enable(struct iommu_domain *domain, u32 wnd_nr,
+				  phys_addr_t paddr, u64 size, int prot)
+{
+	struct fsl_dma_domain *dma_domain = domain->priv;
+	struct dma_window *win;
+	unsigned long irq_flags;
+	u64 max_size;
+	int access = 0;
+	int err;
+
+	/* Translate the generic IOMMU protection flags into PAACE access bits */
+	if (prot & IOMMU_READ)
+		access |= PAACE_AP_PERMS_QUERY;
+	if (prot & IOMMU_WRITE)
+		access |= PAACE_AP_PERMS_UPDATE;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, irq_flags);
+
+	if (!dma_domain->win_arr) {
+		pr_err("Number of windows not configured\n");
+		err = -ENODEV;
+		goto out_unlock;
+	}
+
+	if (wnd_nr >= dma_domain->win_cnt) {
+		pr_err("Invalid window index\n");
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* A window may not exceed its equal share of the geometry */
+	max_size = dma_domain->geom_size >> ilog2(dma_domain->win_cnt);
+	if (size > max_size) {
+		pr_err("Invalid window size \n");
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (dma_domain->win_cnt == 1) {
+		if (dma_domain->enabled) {
+			pr_err("Disable the window before updating the mapping\n");
+			err = -EBUSY;
+			goto out_unlock;
+		}
+
+		if (check_size(size, domain->geometry.aperture_start)) {
+			pr_err("Aperture start not aligned to the size\n");
+			err = -EINVAL;
+			goto out_unlock;
+		}
+	}
+
+	win = &dma_domain->win_arr[wnd_nr];
+	if (win->valid) {
+		pr_err("Disable the window before updating the mapping\n");
+		err = -EBUSY;
+		goto out_unlock;
+	}
+
+	win->paddr = paddr;
+	win->size = size;
+	win->prot = access;
+
+	err = update_domain_mapping(dma_domain, wnd_nr);
+	if (!err) {
+		win->valid = 1;
+		dma_domain->mapped++;
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&dma_domain->domain_lock, irq_flags);
+
+	return err;
+}
+
+/*
+ * Attach the LIODN to the DMA domain and configure the geometry
+ * and window mappings.
+ *
+ * @dma_domain: domain the LIODNs are attached to
+ * @dev:        device owning the LIODNs
+ * @liodn:      array of LIODN values
+ * @num:        number of entries in @liodn
+ *
+ * The whole loop runs with the domain lock held.  Processing stops at
+ * the first invalid LIODN or configuration failure.
+ *
+ * Returns 0 on success, -EINVAL for an out of range LIODN, or the error
+ * from pamu_set_liodn() / map_liodn().
+ *
+ * NOTE(review): on failure, LIODNs already handed to attach_device()
+ * are not removed again here - confirm the caller cleans up.
+ */
+static int handle_attach_device(struct fsl_dma_domain *dma_domain,
+				 struct device *dev, const u32 *liodn,
+				 int num)
+{
+	unsigned long flags;
+	struct iommu_domain *domain = dma_domain->iommu_domain;
+	int ret = 0;
+	int i;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, flags);
+	for (i = 0; i < num; i++) {
+
+		/* Ensure that LIODN value is valid */
+		if (liodn[i] >= PAACE_NUMBER_ENTRIES) {
+			pr_err("Invalid liodn %d, attach device failed for %s\n",
+				liodn[i], dev->of_node->full_name);
+			ret = -EINVAL;
+			break;
+		}
+
+		attach_device(dma_domain, liodn[i], dev);
+		/*
+		 * Check if geometry has already been configured
+		 * for the domain. If yes, set the geometry for
+		 * the LIODN.
+		 */
+		if (dma_domain->win_cnt) {
+			/* A single-window domain is passed on as a count of 0 */
+			u32 win_cnt = dma_domain->win_cnt > 1 ? dma_domain->win_cnt : 0;
+			ret = pamu_set_liodn(liodn[i], dev, dma_domain,
+					      &domain->geometry,
+					      win_cnt);
+			if (ret)
+				break;
+			if (dma_domain->mapped) {
+				/*
+				 * Create window/subwindow mapping for
+				 * the LIODN.
+				 */
+				ret = map_liodn(liodn[i], dma_domain);
+				if (ret)
+					break;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+	return ret;
+}
+
+/*
+ * attach_dev callback: read the "fsl,liodn" property of the device node
+ * and attach every LIODN listed there to @domain. For a PCI device the
+ * property is read from the PCI controller's device instead.
+ *
+ * Returns -EINVAL when the property is missing, otherwise the result of
+ * handle_attach_device().
+ */
+static int fsl_pamu_attach_device(struct iommu_domain *domain,
+				  struct device *dev)
+{
+	struct fsl_dma_domain *dma_domain = domain->priv;
+	const u32 *liodn;
+	u32 liodn_cnt;
+	int len;
+
+	if (dev->bus == &pci_bus_type) {
+		struct pci_dev *pdev = to_pci_dev(dev);
+		struct pci_controller *pci_ctl = pci_bus_to_host(pdev->bus);
+
+		/*
+		 * Use the PCI controller device from here on so that the
+		 * LIODN programmed by u-boot is the one looked up.
+		 */
+		dev = pci_ctl->parent;
+	}
+
+	liodn = of_get_property(dev->of_node, "fsl,liodn", &len);
+	if (!liodn) {
+		pr_err("missing fsl,liodn property at %s\n",
+		          dev->of_node->full_name);
+		return -EINVAL;
+	}
+
+	/* The property is an array of u32 cells, one per LIODN */
+	liodn_cnt = len / sizeof(u32);
+
+	return handle_attach_device(dma_domain, dev, liodn, liodn_cnt);
+}
+
+/*
+ * detach_dev callback: detach the device from @domain, provided its node
+ * carries an "fsl,liodn" property. For a PCI device the PCI controller's
+ * device is used instead. A missing property is only logged.
+ */
+static void fsl_pamu_detach_device(struct iommu_domain *domain,
+				      struct device *dev)
+{
+	struct fsl_dma_domain *dma_domain = domain->priv;
+	int len;
+
+	if (dev->bus == &pci_bus_type) {
+		struct pci_dev *pdev = to_pci_dev(dev);
+		struct pci_controller *pci_ctl = pci_bus_to_host(pdev->bus);
+
+		/*
+		 * Use the PCI controller device from here on so that the
+		 * LIODN programmed by u-boot is the one looked up.
+		 */
+		dev = pci_ctl->parent;
+	}
+
+	if (!of_get_property(dev->of_node, "fsl,liodn", &len)) {
+		pr_err("missing fsl,liodn property at %s\n",
+		          dev->of_node->full_name);
+		return;
+	}
+
+	detach_device(dev, dma_domain);
+}
+
+/*
+ * Store the geometry passed in @data (a struct iommu_domain_geometry) in
+ * @domain and record its size in the PAMU domain.
+ *
+ * Returns -EINVAL when check_size() rejects the aperture or when
+ * force_aperture is not set, -EBUSY when the domain is enabled, and 0 on
+ * success.
+ */
+static int configure_domain_geometry(struct iommu_domain *domain, void *data)
+{
+	struct fsl_dma_domain *dma_domain = domain->priv;
+	struct iommu_domain_geometry *geom_attr = data;
+	unsigned long flags;
+	dma_addr_t geom_size;
+	int ret = 0;
+
+	geom_size = geom_attr->aperture_end - geom_attr->aperture_start + 1;
+
+	/*
+	 * Sanity check the geometry size. Also, we do not support
+	 * DMA outside of the geometry.
+	 */
+	if (check_size(geom_size, geom_attr->aperture_start) ||
+	    !geom_attr->force_aperture) {
+		pr_err("Invalid PAMU geometry attributes\n");
+		return -EINVAL;
+	}
+
+	spin_lock_irqsave(&dma_domain->domain_lock, flags);
+	if (dma_domain->enabled) {
+		pr_err("Can't set geometry attributes as domain is active\n");
+		ret = -EBUSY;
+	} else {
+		/* Copy the domain geometry information */
+		memcpy(&domain->geometry, geom_attr,
+		       sizeof(struct iommu_domain_geometry));
+		dma_domain->geom_size = geom_size;
+	}
+	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+	return ret;
+}
+
+/*
+ * Set the domain stash attribute.
+ *
+ * @data points to a struct pamu_stash_attribute. The stash id is looked
+ * up and validated before anything is stored, so a rejected request
+ * leaves the previously configured stash settings of the domain intact.
+ *
+ * Returns -EINVAL when get_stash_id() reports an invalid id, otherwise
+ * the result of update_domain_stash().
+ */
+static int configure_domain_stash(struct fsl_dma_domain *dma_domain, void *data)
+{
+	struct pamu_stash_attribute *stash_attr = data;
+	unsigned long flags;
+	u32 stash_id;
+	int ret;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, flags);
+
+	stash_id = get_stash_id(stash_attr->cache, stash_attr->cpu);
+	if (stash_id == ~(u32)0) {
+		pr_err("Invalid stash attributes\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return -EINVAL;
+	}
+
+	/* Attributes are valid: commit them to the domain */
+	memcpy(&dma_domain->dma_stash, stash_attr,
+	       sizeof(struct pamu_stash_attribute));
+	dma_domain->stash_id = stash_id;
+
+	ret = update_domain_stash(dma_domain, dma_domain->stash_id);
+
+	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+	return ret;
+}
+
+/*
+ * Configure domain DMA state, i.e. enable/disable DMA.
+ *
+ * Enabling is refused with -ENODEV while the domain has no valid mapping.
+ * Otherwise the new state is recorded and applied to every LIODN on the
+ * domain's device list. A failure for an individual LIODN is logged but
+ * not propagated: the function still returns 0.
+ */
+static int configure_domain_dma_state(struct fsl_dma_domain *dma_domain, bool enable)
+{
+	struct device_domain_info *info;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, flags);
+
+	if (enable && !dma_domain->mapped) {
+		pr_err("Can't enable DMA domain without valid mapping\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return -ENODEV;
+	}
+
+	dma_domain->enabled = enable;
+	list_for_each_entry(info, &dma_domain->devices, link) {
+		ret = enable ? pamu_enable_liodn(info->liodn) :
+			       pamu_disable_liodn(info->liodn);
+		if (ret)
+			/* liodn is a u32; terminate the log line properly */
+			pr_err("Unable to set dma state for liodn %u\n",
+			       info->liodn);
+	}
+	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+	return 0;
+}
+
+/*
+ * domain_set_attr callback: dispatch @attr_type to the matching
+ * configure_domain_*() helper. Unsupported attributes yield -EINVAL.
+ */
+static int fsl_pamu_set_domain_attr(struct iommu_domain *domain,
+				 enum iommu_attr attr_type, void *data)
+{
+	struct fsl_dma_domain *dma_domain = domain->priv;
+
+	switch (attr_type) {
+	case DOMAIN_ATTR_GEOMETRY:
+		return configure_domain_geometry(domain, data);
+	case DOMAIN_ATTR_PAMU_STASH:
+		return configure_domain_stash(dma_domain, data);
+	case DOMAIN_ATTR_PAMU_ENABLE:
+		/* @data is read as an int and converted to the bool argument */
+		return configure_domain_dma_state(dma_domain, *(int *)data);
+	default:
+		pr_err("Unsupported attribute type\n");
+		return -EINVAL;
+	}
+}
+
+/*
+ * domain_get_attr callback: copy the requested attribute into @data.
+ * The stash attribute is copied as a struct pamu_stash_attribute; the
+ * other supported attributes are written as an int. Unsupported
+ * attributes yield -EINVAL.
+ */
+static int fsl_pamu_get_domain_attr(struct iommu_domain *domain,
+				 enum iommu_attr attr_type, void *data)
+{
+	struct fsl_dma_domain *dma_domain = domain->priv;
+
+	switch (attr_type) {
+	case DOMAIN_ATTR_PAMU_STASH:
+		memcpy(data, &dma_domain->dma_stash,
+		       sizeof(struct pamu_stash_attribute));
+		return 0;
+	case DOMAIN_ATTR_PAMU_ENABLE:
+		*(int *)data = dma_domain->enabled;
+		return 0;
+	case DOMAIN_ATTR_FSL_PAMUV1:
+		*(int *)data = DOMAIN_ATTR_FSL_PAMUV1;
+		return 0;
+	default:
+		pr_err("Unsupported attribute type\n");
+		return -EINVAL;
+	}
+}
+
+#define REQ_ACS_FLAGS	(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
+
+/*
+ * Return the iommu group obtained from iommu_group_get() for @dev, or
+ * the result of iommu_group_alloc() when the device has no group yet.
+ */
+static struct iommu_group *get_device_iommu_group(struct device *dev)
+{
+	struct iommu_group *group = iommu_group_get(dev);
+
+	return group ? group : iommu_group_alloc();
+}
+
+/*
+ * Report whether endpoints behind @pci_ctl can be partitioned, which is
+ * the case when the version field of the BRR1 register is >= 0x204.
+ */
+static bool check_pci_ctl_endpt_part(struct pci_controller *pci_ctl)
+{
+	u32 brr1;
+
+	/* Check the PCI controller version number by reading BRR1 register */
+	brr1 = in_be32(pci_ctl->cfg_addr + (PCI_FSL_BRR1 >> 2));
+
+	/* If PCI controller version is >= 0x204 we can partition endpoints */
+	return (brr1 & PCI_FSL_BRR1_VER) >= 0x204;
+}
+
+/*
+ * Look up the iommu group of another PCI device related to @pdev, either
+ * the previous entry on the same bus list or the first device found on
+ * an ancestor bus. Returns whatever iommu_group_get() yields for that
+ * device, or NULL when no ancestor bus with devices is found.
+ */
+static struct iommu_group *get_peer_pci_device_group(struct pci_dev *pdev)
+{
+	struct iommu_group *group = NULL;
+
+	/*
+	 * Check if this is the first device on the bus.
+	 * NOTE(review): for a circular list, next == prev looks like it
+	 * holds only when pdev is the sole entry on the bus list - confirm
+	 * that this is the intended meaning of "first device".
+	 */
+	if (pdev->bus_list.next == pdev->bus_list.prev) {
+		struct pci_bus *bus = pdev->bus->parent;
+		/*
+		 * Traverse the parent bus list to get
+		 * pdev & dev for the sibling device.
+		 */
+		while (bus) {
+			if (!list_empty(&bus->devices)) {
+				pdev = container_of(bus->devices.next,
+					            struct pci_dev, bus_list);
+				group = iommu_group_get(&pdev->dev);
+				break;
+			} else
+				bus = bus->parent;
+		}
+	} else {
+		/*
+		 * Get the pdev & dev for the sibling device.
+		 * NOTE(review): if pdev is the first entry of a multi-device
+		 * bus, bus_list.prev would presumably be the list head in
+		 * struct pci_bus rather than a pci_dev - verify that callers
+		 * guarantee a preceding sibling.
+		 */
+		pdev = container_of(pdev->bus_list.prev,
+				    struct pci_dev, bus_list);
+		group = iommu_group_get(&pdev->dev);
+	}
+
+	return group;
+}
+
+/*
+ * Determine the iommu group for PCI device @pdev.
+ *
+ * When the controller supports endpoint partitioning, the device that
+ * actually issues DMA on behalf of @pdev (dma_pdev) is resolved - taking
+ * upstream bridges, quirked DMA sources and multifunction devices without
+ * the required ACS flags into account - and the group of that device is
+ * used. Otherwise all devices behind the controller share one group,
+ * taken either from the controller device or from a peer PCI device.
+ *
+ * In both cases the controller device is removed from its own group if
+ * it still has one.
+ */
+static struct iommu_group *get_pci_device_group(struct pci_dev *pdev)
+{
+	struct iommu_group *group = NULL;
+	struct pci_dev *bridge, *dma_pdev = NULL;
+	struct pci_controller *pci_ctl;
+	bool pci_endpt_partioning;
+
+	pci_ctl = pci_bus_to_host(pdev->bus);
+	pci_endpt_partioning = check_pci_ctl_endpt_part(pci_ctl);
+	/* We can partition PCIe devices so assign device group to the device */
+	if (pci_endpt_partioning) {
+		bridge = pci_find_upstream_pcie_bridge(pdev);
+		if (bridge) {
+			if (pci_is_pcie(bridge))
+				dma_pdev = pci_get_domain_bus_and_slot(
+						pci_domain_nr(pdev->bus),
+						bridge->subordinate->number, 0);
+			if (!dma_pdev)
+				dma_pdev = pci_dev_get(bridge);
+		} else {
+			dma_pdev = pci_dev_get(pdev);
+		}
+
+		/* Account for quirked devices */
+		swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
+
+		/*
+		 * If it's a multifunction device that does not support our
+		 * required ACS flags, add to the same group as function 0.
+		 */
+		if (dma_pdev->multifunction &&
+		    !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
+			swap_pci_ref(&dma_pdev,
+				     pci_get_slot(dma_pdev->bus,
+						  PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
+						  0)));
+
+		/*
+		 * Use the resolved DMA source, and drop the reference that
+		 * was taken on it above. Operating on pdev here would leak
+		 * the dma_pdev reference and release one on pdev that this
+		 * function does not necessarily hold.
+		 */
+		group = get_device_iommu_group(&dma_pdev->dev);
+		pci_dev_put(dma_pdev);
+		/*
+		 * PCIe controller is not a partitionable entity
+		 * free the controller device iommu_group.
+		 */
+		if (pci_ctl->parent->iommu_group)
+			iommu_group_remove_device(pci_ctl->parent);
+	} else {
+		/*
+		 * All devices connected to the controller will share the
+		 * PCI controllers device group. If this is the first
+		 * device to be probed for the pci controller, copy the
+		 * device group information from the PCI controller device
+		 * node and remove the PCI controller iommu group.
+		 * For subsequent devices, the iommu group information can
+		 * be obtained from sibling devices (i.e. from the bus_devices
+		 * link list).
+		 */
+		if (pci_ctl->parent->iommu_group) {
+			group = get_device_iommu_group(pci_ctl->parent);
+			iommu_group_remove_device(pci_ctl->parent);
+		} else {
+			group = get_peer_pci_device_group(pdev);
+		}
+	}
+
+	return group;
+}
+
+/*
+ * add_device callback: place @dev into an iommu group.
+ *
+ * PCI devices get their group from get_pci_device_group(); virtual PCI
+ * bridges (devices with a subordinate bus) are skipped and report
+ * success. Other devices get a group of their own, but only if their
+ * node has an "fsl,liodn" property.
+ *
+ * Returns -ENODEV when no group could be determined, the encoded error
+ * when group lookup/allocation failed, otherwise the result of
+ * iommu_group_add_device().
+ */
+static int fsl_pamu_add_device(struct device *dev)
+{
+	struct iommu_group *group = NULL;
+	struct pci_dev *pdev;
+	const u32 *prop;
+	int ret, len;
+
+	/*
+	 * For platform devices we allocate a separate group for
+	 * each of the devices.
+	 */
+	if (dev->bus == &pci_bus_type) {
+		pdev = to_pci_dev(dev);
+		/* Don't create device groups for virtual PCI bridges */
+		if (pdev->subordinate)
+			return 0;
+
+		group = get_pci_device_group(pdev);
+
+	} else {
+		prop = of_get_property(dev->of_node, "fsl,liodn", &len);
+		if (prop)
+			group = get_device_iommu_group(dev);
+	}
+
+	/*
+	 * PTR_ERR(NULL) is 0, so a missing group must be reported
+	 * explicitly instead of being returned as success.
+	 */
+	if (!group)
+		return -ENODEV;
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	ret = iommu_group_add_device(group, dev);
+
+	iommu_group_put(group);
+	return ret;
+}
+
+/* remove_device callback: take @dev out of its iommu group. */
+static void fsl_pamu_remove_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+/*
+ * domain_set_windows callback: set the number of windows of @domain to
+ * @w_count and (re)allocate the per-window bookkeeping array.
+ *
+ * Returns -EBUSY while the domain is enabled, -EINVAL when the geometry
+ * has not been set or @w_count is not a power of two within the limit
+ * reported by pamu_get_max_subwin_cnt(), -ENOMEM when the window array
+ * cannot be allocated, otherwise the result of
+ * pamu_set_domain_geometry().
+ */
+static int fsl_pamu_set_windows(struct iommu_domain *domain, u32 w_count)
+{
+	struct fsl_dma_domain *dma_domain = domain->priv;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, flags);
+	/* Ensure domain is inactive i.e. DMA should be disabled for the domain */
+	if (dma_domain->enabled) {
+		pr_err("Can't set geometry attributes as domain is active\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return  -EBUSY;
+	}
+
+	/* Ensure that the geometry has been set for the domain */
+	if (!dma_domain->geom_size) {
+		pr_err("Please configure geometry before setting the number of windows\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return -EINVAL;
+	}
+
+	/*
+	 * Ensure we have valid window count i.e. it should be less than
+	 * maximum permissible limit and should be a power of two.
+	 */
+	if (w_count > pamu_get_max_subwin_cnt() || !is_power_of_2(w_count)) {
+		pr_err("Invalid window count\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return -EINVAL;
+	}
+
+	/* A single window is passed down as a count of zero */
+	ret = pamu_set_domain_geometry(dma_domain, &domain->geometry,
+				       (w_count > 1) ? w_count : 0);
+	if (!ret) {
+		/* kfree(NULL) is a no-op, so no guard is needed */
+		kfree(dma_domain->win_arr);
+		/*
+		 * domain_lock is held with interrupts disabled, so the
+		 * allocation must not sleep: use GFP_ATOMIC. kcalloc()
+		 * also checks the count * size multiplication for overflow.
+		 */
+		dma_domain->win_arr = kcalloc(w_count,
+					      sizeof(*dma_domain->win_arr),
+					      GFP_ATOMIC);
+		if (!dma_domain->win_arr) {
+			spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+			return -ENOMEM;
+		}
+		dma_domain->win_cnt = w_count;
+	}
+	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+	return ret;
+}
+
+/* domain_get_windows callback: return the window count stored for @domain. */
+static u32 fsl_pamu_get_windows(struct iommu_domain *domain)
+{
+	struct fsl_dma_domain *dma_domain = domain->priv;
+
+	return dma_domain->win_cnt;
+}
+
+/*
+ * IOMMU callbacks implemented by this driver. pamu_domain_init() installs
+ * this table for both the platform bus and the PCI bus.
+ */
+static struct iommu_ops fsl_pamu_ops = {
+	.domain_init	= fsl_pamu_domain_init,
+	.domain_destroy = fsl_pamu_domain_destroy,
+	.attach_dev	= fsl_pamu_attach_device,
+	.detach_dev	= fsl_pamu_detach_device,
+	.domain_window_enable = fsl_pamu_window_enable,
+	.domain_window_disable = fsl_pamu_window_disable,
+	.domain_get_windows = fsl_pamu_get_windows,
+	.domain_set_windows = fsl_pamu_set_windows,
+	.iova_to_phys	= fsl_pamu_iova_to_phys,
+	.domain_has_cap = fsl_pamu_domain_has_cap,
+	.domain_set_attr = fsl_pamu_set_domain_attr,
+	.domain_get_attr = fsl_pamu_get_domain_attr,
+	.add_device	= fsl_pamu_add_device,
+	.remove_device	= fsl_pamu_remove_device,
+};
+
+/*
+ * Initialise the driver's memory pool and install fsl_pamu_ops as the
+ * IOMMU operations for the platform and PCI buses.
+ *
+ * Returns the error from iommu_init_mempool() if it fails, otherwise 0.
+ */
+int pamu_domain_init(void)
+{
+	int ret;
+
+	ret = iommu_init_mempool();
+	if (ret)
+		return ret;
+
+	/*
+	 * NOTE(review): the results of bus_set_iommu() are not checked
+	 * here - confirm whether a failure needs to be handled.
+	 */
+	bus_set_iommu(&platform_bus_type, &fsl_pamu_ops);
+	bus_set_iommu(&pci_bus_type, &fsl_pamu_ops);
+
+	return 0;
+}
diff --git a/drivers/iommu/fsl_pamu_domain.h b/drivers/iommu/fsl_pamu_domain.h
new file mode 100644
index 0000000..c90293f
--- /dev/null
+++ b/drivers/iommu/fsl_pamu_domain.h
@@ -0,0 +1,85 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ *
+ */
+
+#ifndef __FSL_PAMU_DOMAIN_H
+#define __FSL_PAMU_DOMAIN_H
+
+#include "fsl_pamu.h"
+
+/* Per-window state kept in the win_arr array of a PAMU DMA domain. */
+struct dma_window {
+	/* physical address supplied when the window is enabled */
+	phys_addr_t paddr;
+	/* size supplied when the window is enabled */
+	u64 size;
+	/* set to 1 once the window mapping has been applied successfully */
+	int valid;
+	/* PAMU protection value supplied when the window is enabled */
+	int prot;
+};
+
+/* Driver-private state of a PAMU DMA domain. */
+struct fsl_dma_domain {
+	/*
+	 * Indicates the geometry size for the domain.
+	 * This would be set when the geometry is
+	 * configured for the domain.
+	 */
+	dma_addr_t			geom_size;
+	/*
+	 * Number of windows associated with this domain.
+	 * During domain initialization, it is set to
+	 * the maximum number of subwindows allowed for a LIODN.
+	 * Minimum value for this is 1 indicating a single PAMU
+	 * window, without any sub windows. Value can be set/
+	 * queried by set_attr/get_attr API for DOMAIN_ATTR_WINDOWS.
+	 * Value can only be set once the geometry has been configured.
+	 */
+	u32				win_cnt;
+	/*
+	 * win_arr contains information of the configured
+	 * windows for a domain. This is allocated only
+	 * when the number of windows for the domain are
+	 * set.
+	 */
+	struct dma_window		*win_arr;
+	/* list of devices associated with the domain */
+	struct list_head		devices;
+	/*
+	 * dma_domain states:
+	 * mapped - A particular mapping has been created
+	 * within the configured geometry.
+	 * enabled - DMA has been enabled for the given
+	 * domain. This translates to setting of the
+	 * valid bit for the primary PAACE in the PAMU
+	 * PAACT table. Domain geometry should be set and
+	 * it must have a valid mapping before DMA can be
+	 * enabled for it.
+	 */
+	int				mapped;
+	int				enabled;
+	/* stash_id obtained from the stash attribute details */
+	u32				stash_id;
+	/* copy of the stash attribute most recently set on the domain */
+	struct pamu_stash_attribute	dma_stash;
+	u32				snoop_id;
+	/* generic IOMMU domain this PAMU domain belongs to */
+	struct iommu_domain		*iommu_domain;
+	/* taken with interrupts disabled around updates of the fields above */
+	spinlock_t			domain_lock;
+};
+
+/* domain-device relationship: one entry per LIODN attached to a domain */
+struct device_domain_info {
+	struct list_head link;	/* link to domain siblings */
+	struct device *dev;	/* device the LIODN belongs to */
+	u32 liodn;		/* LIODN that is enabled/disabled with the domain */
+	struct fsl_dma_domain *domain; /* pointer to domain */
+};
+#endif  /* __FSL_PAMU_DOMAIN_H */
-- 
1.7.4.1

^ permalink raw reply related

* Re: [PATCH 0/3] freescale: Update logging style
From: David Miller @ 2013-04-14 19:43 UTC (permalink / raw)
  To: joe; +Cc: netdev, festevam, linuxppc-dev, linux-kernel
In-Reply-To: <cover.1365915548.git.joe@perches.com>

From: Joe Perches <joe@perches.com>
Date: Sat, 13 Apr 2013 22:03:16 -0700

> Convert various printk logging styles to current styles.
> 
> Uncompiled, untested.
> 
> Joe Perches (3):
>   fec: Convert printks to netdev_<level>
>   gianfar: Use netdev_<level> when possible
>   ucc_geth: Convert ugeth_<level> to pr_<level>

All applied.

^ permalink raw reply

* Re: [PATCH 03/17] powerpc/85xx: cache operations for Freescale SoCs based on BOOK3E
From: Zhao Chenhui @ 2013-04-15  8:38 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Scott Wood
In-Reply-To: <1364994565-16010-3-git-send-email-chenhui.zhao@freescale.com>

On Wed, Apr 03, 2013 at 09:09:11PM +0800, Zhao Chenhui wrote:
> These cache operations support Freescale SoCs based on BOOK3E.
> Move L1 cache operations to fsl_booke_cache.S in order to maintain
> easily. And, add cache operations for backside L2 cache and platform cache.
> 
> The backside L2 cache appears on e500mc and e5500 core. The platform cache
> supported by this patch is L2 Look-Aside Cache, which appears on SoCs
> with e500v1/e500v2 core, such as MPC8572, P1020, etc.
> 
> Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com>
> Signed-off-by: Li Yang <leoli@freescale.com>
> ---
>  arch/powerpc/include/asm/cacheflush.h |    8 ++
>  arch/powerpc/kernel/Makefile          |    1 +
>  arch/powerpc/kernel/fsl_booke_cache.S |  210 +++++++++++++++++++++++++++++++++
>  arch/powerpc/kernel/head_fsl_booke.S  |   74 ------------
>  4 files changed, 219 insertions(+), 74 deletions(-)
>  create mode 100644 arch/powerpc/kernel/fsl_booke_cache.S

Are there any comments about the set of patches?

-Chenhui

^ permalink raw reply

* Re: [PATCH -V5 24/25] powerpc: Optimize hugepage invalidate
From: David Gibson @ 2013-04-15  1:18 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, paulus, linux-mm
In-Reply-To: <8761zpqvkr.fsf@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 10497 bytes --]

On Sun, Apr 14, 2013 at 03:32:12PM +0530, Aneesh Kumar K.V wrote:
> David Gibson <dwg@au1.ibm.com> writes:
> 
> > On Thu, Apr 04, 2013 at 11:28:02AM +0530, Aneesh Kumar K.V wrote:
> >> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> >> 
> >> Hugepage invalidate involves invalidating multiple hpte entries.
> >> Optimize the operation using H_BULK_REMOVE on lpar platforms.
> >> On native, reduce the number of tlb flush.
> >> 
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> >> ---
> >>  arch/powerpc/include/asm/machdep.h    |    3 +
> >>  arch/powerpc/mm/hash_native_64.c      |   78 ++++++++++++++++++++
> >>  arch/powerpc/mm/pgtable.c             |   13 +++-
> >>  arch/powerpc/platforms/pseries/lpar.c |  126 +++++++++++++++++++++++++++++++--
> >>  4 files changed, 210 insertions(+), 10 deletions(-)
> >> 
> >> diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
> >> index 6cee6e0..3bc7816 100644
> >> --- a/arch/powerpc/include/asm/machdep.h
> >> +++ b/arch/powerpc/include/asm/machdep.h
> >> @@ -56,6 +56,9 @@ struct machdep_calls {
> >>  	void            (*hpte_removebolted)(unsigned long ea,
> >>  					     int psize, int ssize);
> >>  	void		(*flush_hash_range)(unsigned long number, int local);
> >> +	void		(*hugepage_invalidate)(struct mm_struct *mm,
> >> +					       unsigned char *hpte_slot_array,
> >> +					       unsigned long addr, int psize);
> >>  
> >>  	/* special for kexec, to be called in real mode, linear mapping is
> >>  	 * destroyed as well */
> >> diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
> >> index ac84fa6..59f29bf 100644
> >> --- a/arch/powerpc/mm/hash_native_64.c
> >> +++ b/arch/powerpc/mm/hash_native_64.c
> >> @@ -450,6 +450,83 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
> >>  	local_irq_restore(flags);
> >>  }
> >>  
> >> +static void native_hugepage_invalidate(struct mm_struct *mm,
> >> +				       unsigned char *hpte_slot_array,
> >> +				       unsigned long addr, int psize)
> >> +{
> >> +	int ssize = 0, i;
> >> +	int lock_tlbie;
> >> +	struct hash_pte *hptep;
> >> +	int actual_psize = MMU_PAGE_16M;
> >> +	unsigned int max_hpte_count, valid;
> >> +	unsigned long flags, s_addr = addr;
> >> +	unsigned long hpte_v, want_v, shift;
> >> +	unsigned long hidx, vpn = 0, vsid, hash, slot;
> >> +
> >> +	shift = mmu_psize_defs[psize].shift;
> >> +	max_hpte_count = HUGE_PAGE_SIZE/(1ul << shift);
> >> +
> >> +	local_irq_save(flags);
> >> +	for (i = 0; i < max_hpte_count; i++) {
> >> +		/*
> >> +		 * 8 bits per each hpte entries
> >> +		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
> >> +		 */
> >> +		valid = hpte_slot_array[i] & 0x1;
> >> +		if (!valid)
> >> +			continue;
> >> +		hidx =  hpte_slot_array[i]  >> 1;
> >> +
> >> +		/* get the vpn */
> >> +		addr = s_addr + (i * (1ul << shift));
> >> +		if (!is_kernel_addr(addr)) {
> >> +			ssize = user_segment_size(addr);
> >> +			vsid = get_vsid(mm->context.id, addr, ssize);
> >> +			WARN_ON(vsid == 0);
> >> +		} else {
> >> +			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
> >> +			ssize = mmu_kernel_ssize;
> >> +		}
> >> +
> >> +		vpn = hpt_vpn(addr, vsid, ssize);
> >> +		hash = hpt_hash(vpn, shift, ssize);
> >> +		if (hidx & _PTEIDX_SECONDARY)
> >> +			hash = ~hash;
> >> +
> >> +		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
> >> +		slot += hidx & _PTEIDX_GROUP_IX;
> >> +
> >> +		hptep = htab_address + slot;
> >> +		want_v = hpte_encode_avpn(vpn, psize, ssize);
> >> +		native_lock_hpte(hptep);
> >> +		hpte_v = hptep->v;
> >> +
> >> +		/* Even if we miss, we need to invalidate the TLB */
> >> +		if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
> >> +			native_unlock_hpte(hptep);
> >> +		else
> >> +			/* Invalidate the hpte. NOTE: this also unlocks it */
> >> +			hptep->v = 0;
> >
> > Shouldn't you be clearing the entry from the slot_array once it is
> > invalidated in the hash table?
> 
> We don't need to do that. We should be fine even if hptes get
> invalidated under us. Also inorder to update slot_array i will have to
> mark the corresponding hpte busy, so that we can ensure nobody is
> looking at the slot array.

Hm, ok.

> >> +	}
> >> +	/*
> >> +	 * Since this is a hugepage, we just need a single tlbie.
> >> +	 * use the last vpn.
> >> +	 */
> >> +	lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
> >> +	if (lock_tlbie)
> >> +		raw_spin_lock(&native_tlbie_lock);
> >> +
> >> +	asm volatile("ptesync":::"memory");
> >> +	__tlbie(vpn, psize, actual_psize, ssize);
> >> +	asm volatile("eieio; tlbsync; ptesync":::"memory");
> >> +
> >> +	if (lock_tlbie)
> >> +		raw_spin_unlock(&native_tlbie_lock);
> >> +
> >> +	local_irq_restore(flags);
> >> +}
> >> +
> >> +
> >>  static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
> >>  			int *psize, int *apsize, int *ssize, unsigned long *vpn)
> >>  {
> >> @@ -678,4 +755,5 @@ void __init hpte_init_native(void)
> >>  	ppc_md.hpte_remove	= native_hpte_remove;
> >>  	ppc_md.hpte_clear_all	= native_hpte_clear;
> >>  	ppc_md.flush_hash_range = native_flush_hash_range;
> >> +	ppc_md.hugepage_invalidate   = native_hugepage_invalidate;
> >>  }
> >> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> >> index fbff062..386cab8 100644
> >> --- a/arch/powerpc/mm/pgtable.c
> >> +++ b/arch/powerpc/mm/pgtable.c
> >> @@ -433,6 +433,7 @@ void hpte_need_hugepage_flush(struct mm_struct *mm, unsigned long addr,
> >>  {
> >>  	int ssize, i;
> >>  	unsigned long s_addr;
> >> +	int max_hpte_count;
> >>  	unsigned int psize, valid;
> >>  	unsigned char *hpte_slot_array;
> >>  	unsigned long hidx, vpn, vsid, hash, shift, slot;
> >> @@ -446,12 +447,18 @@ void hpte_need_hugepage_flush(struct mm_struct *mm, unsigned long addr,
> >>  	 * second half of the PMD
> >>  	 */
> >>  	hpte_slot_array = *(char **)(pmdp + PTRS_PER_PMD);
> >> -
> >>  	/* get the base page size */
> >>  	psize = get_slice_psize(mm, s_addr);
> >> -	shift = mmu_psize_defs[psize].shift;
> >>  
> >> -	for (i = 0; i < HUGE_PAGE_SIZE/(1ul << shift); i++) {
> >> +	if (ppc_md.hugepage_invalidate)
> >> +		return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
> >> +						  s_addr, psize);
> >> +	/*
> >> +	 * No bluk hpte removal support, invalidate each entry
> >> +	 */
> >> +	shift = mmu_psize_defs[psize].shift;
> >> +	max_hpte_count = HUGE_PAGE_SIZE/(1ul << shift);
> >> +	for (i = 0; i < max_hpte_count; i++) {
> >>  		/*
> >>  		 * 8 bits per each hpte entries
> >>  		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
> >> diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
> >> index 3daced3..5fcc621 100644
> >> --- a/arch/powerpc/platforms/pseries/lpar.c
> >> +++ b/arch/powerpc/platforms/pseries/lpar.c
> >> @@ -45,6 +45,13 @@
> >>  #include "plpar_wrappers.h"
> >>  #include "pseries.h"
> >>  
> >> +/* Flag bits for H_BULK_REMOVE */
> >> +#define HBR_REQUEST	0x4000000000000000UL
> >> +#define HBR_RESPONSE	0x8000000000000000UL
> >> +#define HBR_END		0xc000000000000000UL
> >> +#define HBR_AVPN	0x0200000000000000UL
> >> +#define HBR_ANDCOND	0x0100000000000000UL
> >> +
> >>  
> >>  /* in hvCall.S */
> >>  EXPORT_SYMBOL(plpar_hcall);
> >> @@ -339,6 +346,117 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
> >>  	BUG_ON(lpar_rc != H_SUCCESS);
> >>  }
> >>  
> >> +/*
> >> + * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
> >> + * to make sure that we avoid bouncing the hypervisor tlbie lock.
> >> + */
> >> +#define PPC64_HUGE_HPTE_BATCH 12
> >> +
> >> +static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
> >> +					     unsigned long *vpn, int count,
> >> +					     int psize, int ssize)
> >> +{
> >> +	unsigned long param[9];
> >> +	int i = 0, pix = 0, rc;
> >> +	unsigned long flags = 0;
> >> +	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
> >> +
> >> +	if (lock_tlbie)
> >> +		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
> >> +
> >> +	for (i = 0; i < count; i++) {
> >> +
> >> +		if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
> >> +			pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize,
> >> +						     ssize, 0);
> >> +		} else {
> >> +			param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
> >> +			param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
> >> +			pix += 2;
> >> +			if (pix == 8) {
> >> +				rc = plpar_hcall9(H_BULK_REMOVE, param,
> >> +						  param[0], param[1], param[2],
> >> +						  param[3], param[4], param[5],
> >> +						  param[6], param[7]);
> >> +				BUG_ON(rc != H_SUCCESS);
> >> +				pix = 0;
> >> +			}
> >> +		}
> >> +	}
> >> +	if (pix) {
> >> +		param[pix] = HBR_END;
> >> +		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
> >> +				  param[2], param[3], param[4], param[5],
> >> +				  param[6], param[7]);
> >> +		BUG_ON(rc != H_SUCCESS);
> >> +	}
> >> +
> >> +	if (lock_tlbie)
> >> +		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
> >> +}
> >> +
> >> +static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
> >> +				       unsigned char *hpte_slot_array,
> >> +				       unsigned long addr, int psize)
> >> +{
> >> +	int ssize = 0, i, index = 0;
> >> +	unsigned long s_addr = addr;
> >> +	unsigned int max_hpte_count, valid;
> >> +	unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
> >> +	unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
> >
> > These are really too big to be allocating on the stack.  You'd be
> > better off going direct from the char slot array to the data structure
> > for H_BULK_REMOVE, rather than introducing this intermediate
> > structure.
> 
> The reason i wanted to do that was to make sure i don't lock/unlock
> pSeries_lpar_tlbie_lock that frequently, ie, for ever H_BULK_REMOVE.
> The total size taken by both the array is only 192 bytes. Is that big
> enough to create trouble ?

Oh, sorry, I missed the batch invalidate.  I think 192 bytes is
borderline - Paul or Ben might know better.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* [PATCH] powerpc: Fix audit crash due to save/restore PPR changes
From: Alistair Popple @ 2013-04-15  1:44 UTC (permalink / raw)
  To: sfr, benh; +Cc: mikey, linuxppc-dev, haren

The current mainline crashes when hitting userspace with the following:

kernel BUG at /home/alistair/Source/linux-stable/kernel/auditsc.c:1769!
cpu 0x1: Vector: 700 (Program Check) at [c000000023883a60]
    pc: c0000000001047a8: .__audit_syscall_entry+0x38/0x130
    lr: c00000000000ed64: .do_syscall_trace_enter+0xc4/0x270
    sp: c000000023883ce0
   msr: 8000000000029032
  current = 0xc000000023800000
  paca    = 0xc00000000f080380   softe: 0        irq_happened: 0x01
    pid   = 1629, comm = start_udev
kernel BUG at /home/alistair/Source/linux-stable/kernel/auditsc.c:1769!
enter ? for help
[c000000023883d80] c00000000000ed64 .do_syscall_trace_enter+0xc4/0x270
[c000000023883e30] c000000000009b08 syscall_dotrace+0xc/0x38
--- Exception: c00 (System Call) at 0000008010ec50dc

Bisecting found the following patch caused it:

commit 44e9309f1f357794b7ae93d5f3e3e6f11d2b8a7f
Author: Haren Myneni <haren@linux.vnet.ibm.com>
powerpc: Implement PPR save/restore

It was found this patch corrupted r9 when calling
SET_DEFAULT_THREAD_PPR()

Using r10 as a scratch register instead of r9 solved the problem.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Acked-by: Michael Neuling <mikey@neuling.org>
---

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 256c5bf..3acb1a0 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -304,7 +304,7 @@ syscall_exit_work:
 	subi	r12,r12,TI_FLAGS

 4:	/* Anything else left to do? */
-	SET_DEFAULT_THREAD_PPR(r3, r9)		/* Set thread.ppr = 3 */
+	SET_DEFAULT_THREAD_PPR(r3, r10)		/* Set thread.ppr = 3 */
 	andi.	r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP)
 	beq	.ret_from_except_lite

^ permalink raw reply related

* Re: [RFC PATCH v2 3/4] powerpc: Don't bolt the hpte in kernel_map_linear_page()
From: Paul Mackerras @ 2013-04-15  3:50 UTC (permalink / raw)
  To: Li Zhong; +Cc: linuxppc-dev
In-Reply-To: <1365733021-28912-4-git-send-email-zhong@linux.vnet.ibm.com>

On Fri, Apr 12, 2013 at 10:16:59AM +0800, Li Zhong wrote:
> It seems that in kernel_unmap_linear_page(), it only checks whether there
> is a map in the linear_map_hash_slots array, so seems we don't need bolt
> the hpte.

I don't exactly understand your rationale here, but I don't think it's
safe not to have linear mapping pages bolted.  Basically, if a page
will be used in the process of calling hash_page to demand-fault an
HPTE into the hash table, then that page needs to be bolted, otherwise
we can get an infinite recursion of HPT misses.  That includes all
kernel stack pages, among other things, so I think we need to leave
the HPTE_V_BOLTED in there.

Paul.

^ permalink raw reply

* [PATCH] powerpc/perf: Power8 PMU support
From: Michael Ellerman @ 2013-04-15  4:17 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: acme, sukadev, Paul Mackerras, Anton Blanchard, linux-kernel

This patch adds preliminary support for the power8 PMU to perf.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
---
 arch/powerpc/perf/Makefile     |    3 +-
 arch/powerpc/perf/power8-pmu.c |  454 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 456 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/perf/power8-pmu.c

diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
index af3fac2..472db18 100644
--- a/arch/powerpc/perf/Makefile
+++ b/arch/powerpc/perf/Makefile
@@ -4,7 +4,8 @@ obj-$(CONFIG_PERF_EVENTS)	+= callchain.o
 
 obj-$(CONFIG_PPC_PERF_CTRS)	+= core-book3s.o
 obj64-$(CONFIG_PPC_PERF_CTRS)	+= power4-pmu.o ppc970-pmu.o power5-pmu.o \
-				   power5+-pmu.o power6-pmu.o power7-pmu.o
+				   power5+-pmu.o power6-pmu.o power7-pmu.o \
+				   power8-pmu.o
 obj32-$(CONFIG_PPC_PERF_CTRS)	+= mpc7450-pmu.o
 
 obj-$(CONFIG_FSL_EMB_PERF_EVENT) += core-fsl-emb.o
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
new file mode 100644
index 0000000..106ae0b
--- /dev/null
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -0,0 +1,454 @@
+/*
+ * Performance counter support for POWER8 processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ * Copyright 2013 Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <asm/firmware.h>
+
+
+/*
+ * Some power8 event codes.
+ */
+#define PM_CYC				0x0001e
+#define PM_GCT_NOSLOT_CYC		0x100f8
+#define PM_CMPLU_STALL			0x4000a	/* or 0x1e054 */
+#define PM_INST_CMPL			0x00002
+#define PM_BRU_FIN			0x10068
+#define PM_BR_MPRED_CMPL		0x400f6
+
+
+/*
+ * Raw event encoding for POWER8:
+ *
+ *        60        56        52        48        44        40        36        32
+ * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
+ *                                     [      thresh_cmp     ]   [  thresh_ctl   ]
+ *                                                                       |
+ *                                       thresh start/stop OR FAB match -*
+ *
+ *        28        24        20        16        12         8         4         0
+ * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
+ *   [   ] [  sample ]   [cache]   [ pmc ]   [unit ]   c     m   [    pmcxsel    ]
+ *     |        |           |                          |     |
+ *     |        |           |                          |     *- mark
+ *     |        |           *- L1/L2/L3 cache_sel      |
+ *     |        |                                      |
+ *     |        *- sampling mode for marked events     *- combine
+ *     |
+ *     *- thresh_sel
+ *
+ * Below uses IBM bit numbering.
+ *
+ * MMCR1[x:y] = unit    (PMCxUNIT)
+ * MMCR1[x]   = combine (PMCxCOMB)
+ *
+ * if pmc == 3 and unit == 0 and pmcxsel[0:6] == 0b0101011
+ *	# PM_MRK_FAB_RSP_MATCH
+ *	MMCR1[20:27] = thresh_ctl   (FAB_CRESP_MATCH / FAB_TYPE_MATCH)
+ * else if pmc == 4 and unit == 0xf and pmcxsel[0:6] == 0b0101001
+ *	# PM_MRK_FAB_RSP_MATCH_CYC
+ *	MMCR1[20:27] = thresh_ctl   (FAB_CRESP_MATCH / FAB_TYPE_MATCH)
+ * else
+ *	MMCRA[48:55] = thresh_ctl   (THRESH START/END)
+ *
+ * if thresh_sel:
+ *	MMCRA[45:47] = thresh_sel
+ *
+ * if thresh_cmp:
+ *	MMCRA[22:24] = thresh_cmp[0:2]
+ *	MMCRA[25:31] = thresh_cmp[3:9]
+ *
+ * if unit == 6 or unit == 7
+ *	MMCRC[53:55] = cache_sel[1:3]      (L2EVENT_SEL)
+ * else if unit == 8 or unit == 9:
+ *	if cache_sel[0] == 0: # L3 bank
+ *		MMCRC[47:49] = cache_sel[1:3]  (L3EVENT_SEL0)
+ *	else if cache_sel[0] == 1:
+ *		MMCRC[50:51] = cache_sel[2:3]  (L3EVENT_SEL1)
+ * else if cache_sel[1]: # L1 event
+ *	MMCR1[16] = cache_sel[2]
+ *	MMCR1[17] = cache_sel[3]
+ *
+ * if mark:
+ *	MMCRA[63]    = 1		(SAMPLE_ENABLE)
+ *	MMCRA[57:59] = sample[0:2]	(RAND_SAMP_ELIG)
+ *	MMCRA[61:62] = sample[3:4]	(RAND_SAMP_MODE)
+ *
+ */
+
+#define EVENT_THR_CMP_SHIFT	40	/* Threshold CMP value */
+#define EVENT_THR_CMP_MASK	0x3ff
+#define EVENT_THR_CTL_SHIFT	32	/* Threshold control value (start/stop) */
+#define EVENT_THR_CTL_MASK	0xffull
+#define EVENT_THR_SEL_SHIFT	29	/* Threshold select value */
+#define EVENT_THR_SEL_MASK	0x7
+#define EVENT_THRESH_SHIFT	29	/* All threshold bits */
+#define EVENT_THRESH_MASK	0x1fffffull
+#define EVENT_SAMPLE_SHIFT	24	/* Sampling mode & eligibility */
+#define EVENT_SAMPLE_MASK	0x1f
+#define EVENT_CACHE_SEL_SHIFT	20	/* L2/L3 cache select */
+#define EVENT_CACHE_SEL_MASK	0xf
+#define EVENT_IS_L1		(4 << EVENT_CACHE_SEL_SHIFT)
+#define EVENT_PMC_SHIFT		16	/* PMC number (1-based) */
+#define EVENT_PMC_MASK		0xf
+#define EVENT_UNIT_SHIFT	12	/* Unit */
+#define EVENT_UNIT_MASK		0xf
+#define EVENT_COMBINE_SHIFT	11	/* Combine bit */
+#define EVENT_COMBINE_MASK	0x1
+#define EVENT_MARKED_SHIFT	8	/* Marked bit */
+#define EVENT_MARKED_MASK	0x1
+#define EVENT_IS_MARKED		(EVENT_MARKED_MASK << EVENT_MARKED_SHIFT)
+#define EVENT_PSEL_MASK		0xff	/* PMCxSEL value */
+
+/*
+ * Layout of constraint bits:
+ *
+ *        60        56        52        48        44        40        36        32
+ * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
+ *   [   fab_match   ]         [       thresh_cmp      ] [   thresh_ctl    ] [   ]
+ *                                                                             |
+ *                                                                 thresh_sel -*
+ *
+ *        28        24        20        16        12         8         4         0
+ * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
+ *                       [ ]   [  sample ]   [     ]   [6] [5]   [4] [3]   [2] [1]
+ *                        |                     |
+ *      L1 I/D qualifier -*                     |      Count of events for each PMC.
+ *                                              |        p1, p2, p3, p4, p5, p6.
+ *                     nc - number of counters -*
+ *
+ * The PMC fields P1..P6, and NC, are adder fields. As we accumulate constraints
+ * we want the low bit of each field to be added to any existing value.
+ *
+ * Everything else is a value field.
+ */
+
+#define CNST_FAB_MATCH_VAL(v)	(((v) & EVENT_THR_CTL_MASK) << 56)
+#define CNST_FAB_MATCH_MASK	CNST_FAB_MATCH_VAL(EVENT_THR_CTL_MASK)
+
+/* We just throw all the threshold bits into the constraint */
+#define CNST_THRESH_VAL(v)	(((v) & EVENT_THRESH_MASK) << 32)
+#define CNST_THRESH_MASK	CNST_THRESH_VAL(EVENT_THRESH_MASK)
+
+#define CNST_L1_QUAL_VAL(v)	(((v) & 3) << 22)
+#define CNST_L1_QUAL_MASK	CNST_L1_QUAL_VAL(3)
+
+#define CNST_SAMPLE_VAL(v)	(((v) & EVENT_SAMPLE_MASK) << 16)
+#define CNST_SAMPLE_MASK	CNST_SAMPLE_VAL(EVENT_SAMPLE_MASK)
+
+/*
+ * For NC we are counting up to 4 events. This requires three bits, and we need
+ * the fifth event to overflow and set the 4th bit. To achieve that we bias the
+ * fields by 3 in test_adder.
+ */
+#define CNST_NC_SHIFT		12
+#define CNST_NC_VAL		(1 << CNST_NC_SHIFT)
+#define CNST_NC_MASK		(8 << CNST_NC_SHIFT)
+#define POWER8_TEST_ADDER	(3 << CNST_NC_SHIFT)
+
+/*
+ * For the per-PMC fields we have two bits. The low bit is added, so if two
+ * events ask for the same PMC the sum will overflow, setting the high bit,
+ * indicating an error. So our mask sets the high bit.
+ */
+#define CNST_PMC_SHIFT(pmc)	((pmc - 1) * 2)
+#define CNST_PMC_VAL(pmc)	(1 << CNST_PMC_SHIFT(pmc))
+#define CNST_PMC_MASK(pmc)	(2 << CNST_PMC_SHIFT(pmc))
+
+/* Our add_fields is defined as: */
+#define POWER8_ADD_FIELDS	\
+	CNST_PMC_VAL(1) | CNST_PMC_VAL(2) | CNST_PMC_VAL(3) | \
+	CNST_PMC_VAL(4) | CNST_PMC_VAL(5) | CNST_PMC_VAL(6) | CNST_NC_VAL
+
+
+/* Bits in MMCR1 for POWER8 */
+#define MMCR1_UNIT_SHIFT(pmc)		(60 - (4 * ((pmc) - 1)))
+#define MMCR1_COMBINE_SHIFT(pmc)	(35 - ((pmc) - 1))
+#define MMCR1_PMCSEL_SHIFT(pmc)		(24 - (((pmc) - 1)) * 8)
+#define MMCR1_DC_QUAL_SHIFT		47
+#define MMCR1_IC_QUAL_SHIFT		46
+
+/* Bits in MMCRA for POWER8 */
+#define MMCRA_SAMP_MODE_SHIFT		1
+#define MMCRA_SAMP_ELIG_SHIFT		4
+#define MMCRA_THR_CTL_SHIFT		8
+#define MMCRA_THR_SEL_SHIFT		16
+#define MMCRA_THR_CMP_SHIFT		32
+#define MMCRA_SDAR_MODE_TLB		(1ull << 42)
+
+
+static inline bool event_is_fab_match(u64 event)
+{
+	/* Only check pmc, unit and pmcxsel, ignore the edge bit (0) */
+	event &= 0xff0fe;
+
+	/* PM_MRK_FAB_RSP_MATCH & PM_MRK_FAB_RSP_MATCH_CYC */
+	return (event == 0x30056 || event == 0x4f052);
+}
+
+static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
+{
+	unsigned int unit, pmc, cache;
+	unsigned long mask, value;
+
+	mask = value = 0;
+
+	pmc   = (event >> EVENT_PMC_SHIFT)       & EVENT_PMC_MASK;
+	unit  = (event >> EVENT_UNIT_SHIFT)      & EVENT_UNIT_MASK;
+	cache = (event >> EVENT_CACHE_SEL_SHIFT) & EVENT_CACHE_SEL_MASK;
+
+	if (pmc) {
+		if (pmc > 6)
+			return -1;
+
+		mask  |= CNST_PMC_MASK(pmc);
+		value |= CNST_PMC_VAL(pmc);
+
+		if (pmc >= 5 && event != 0x500fa && event != 0x600f4)
+			return -1;
+	}
+
+	if (pmc <= 4) {
+		/*
+		 * Add to number of counters in use. Note this includes events with
+		 * a PMC of 0 - they still need a PMC, it's just assigned later.
+		 * Don't count events on PMC 5 & 6, there is only one valid event
+		 * on each of those counters, and they are handled above.
+		 */
+		mask  |= CNST_NC_MASK;
+		value |= CNST_NC_VAL;
+	}
+
+	if (unit >= 6 && unit <= 9) {
+		/*
+		 * L2/L3 events contain a cache selector field, which is
+		 * supposed to be programmed into MMCRC. However MMCRC is only
+		 * HV writable, and there is no API for guest kernels to modify
+		 * it. The solution is for the hypervisor to initialise the
+		 * field to zeroes, and for us to only ever allow events that
+		 * have a cache selector of zero.
+		 */
+		if (cache)
+			return -1;
+
+	} else if (event & EVENT_IS_L1) {
+		mask  |= CNST_L1_QUAL_MASK;
+		value |= CNST_L1_QUAL_VAL(cache);
+	}
+
+	if (event & EVENT_IS_MARKED) {
+		mask  |= CNST_SAMPLE_MASK;
+		value |= CNST_SAMPLE_VAL(event >> EVENT_SAMPLE_SHIFT);
+	}
+
+	/*
+	 * Special case for PM_MRK_FAB_RSP_MATCH and PM_MRK_FAB_RSP_MATCH_CYC,
+	 * the threshold control bits are used for the match value.
+	 */
+	if (event_is_fab_match(event)) {
+		mask  |= CNST_FAB_MATCH_MASK;
+		value |= CNST_FAB_MATCH_VAL(event >> EVENT_THR_CTL_SHIFT);
+	} else {
+		/*
+		 * Check the mantissa upper two bits are not zero, unless the
+		 * exponent is also zero. See the THRESH_CMP_MANTISSA doc.
+		 */
+		unsigned int cmp, exp;
+
+		cmp = (event >> EVENT_THR_CMP_SHIFT) & EVENT_THR_CMP_MASK;
+		exp = cmp >> 7;
+
+		if (exp && (cmp & 0x60) == 0)
+			return -1;
+
+		mask  |= CNST_THRESH_MASK;
+		value |= CNST_THRESH_VAL(event >> EVENT_THRESH_SHIFT);
+	}
+
+	*maskp = mask;
+	*valp = value;
+
+	return 0;
+}
+
+static int power8_compute_mmcr(u64 event[], int n_ev,
+			       unsigned int hwc[], unsigned long mmcr[])
+{
+	unsigned long mmcra, mmcr1, unit, combine, psel, cache, val;
+	unsigned int pmc, pmc_inuse;
+	int i;
+
+	pmc_inuse = 0;
+
+	/* First pass to count resource use */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK;
+		if (pmc)
+			pmc_inuse |= 1 << pmc;
+	}
+
+	/* In continuous sampling mode, update SDAR on TLB miss */
+	mmcra = MMCRA_SDAR_MODE_TLB;
+	mmcr1 = 0;
+
+	/* Second pass: assign PMCs, set all MMCR1 fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc     = (event[i] >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK;
+		unit    = (event[i] >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK;
+		combine = (event[i] >> EVENT_COMBINE_SHIFT) & EVENT_COMBINE_MASK;
+		psel    =  event[i] & EVENT_PSEL_MASK;
+
+		if (!pmc) {
+			for (pmc = 1; pmc <= 4; ++pmc) {
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			}
+
+			pmc_inuse |= 1 << pmc;
+		}
+
+		if (pmc <= 4) {
+			mmcr1 |= unit << MMCR1_UNIT_SHIFT(pmc);
+			mmcr1 |= combine << MMCR1_COMBINE_SHIFT(pmc);
+			mmcr1 |= psel << MMCR1_PMCSEL_SHIFT(pmc);
+		}
+
+		if (event[i] & EVENT_IS_L1) {
+			cache = event[i] >> EVENT_CACHE_SEL_SHIFT;
+			mmcr1 |= (cache & 1) << MMCR1_IC_QUAL_SHIFT;
+			cache >>= 1;
+			mmcr1 |= (cache & 1) << MMCR1_DC_QUAL_SHIFT;
+		}
+
+		if (event[i] & EVENT_IS_MARKED) {
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+
+			val = (event[i] >> EVENT_SAMPLE_SHIFT) & EVENT_SAMPLE_MASK;
+			if (val) {
+				mmcra |= (val &  3) << MMCRA_SAMP_MODE_SHIFT;
+				mmcra |= (val >> 2) << MMCRA_SAMP_ELIG_SHIFT;
+			}
+		}
+
+		/*
+		 * PM_MRK_FAB_RSP_MATCH and PM_MRK_FAB_RSP_MATCH_CYC,
+		 * the threshold bits are used for the match value.
+		 */
+		if (event_is_fab_match(event[i])) {
+			mmcr1 |= (event[i] >> EVENT_THR_CTL_SHIFT) &
+				  EVENT_THR_CTL_MASK;
+		} else {
+			val = (event[i] >> EVENT_THR_CTL_SHIFT) & EVENT_THR_CTL_MASK;
+			mmcra |= val << MMCRA_THR_CTL_SHIFT;
+			val = (event[i] >> EVENT_THR_SEL_SHIFT) & EVENT_THR_SEL_MASK;
+			mmcra |= val << MMCRA_THR_SEL_SHIFT;
+			val = (event[i] >> EVENT_THR_CMP_SHIFT) & EVENT_THR_CMP_MASK;
+			mmcra |= val << MMCRA_THR_CMP_SHIFT;
+		}
+
+		hwc[i] = pmc - 1;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+
+	/* pmc_inuse is 1-based */
+	if (pmc_inuse & 2)
+		mmcr[0] = MMCR0_PMC1CE;
+
+	if (pmc_inuse & 0x7c)
+		mmcr[0] |= MMCR0_PMCjCE;
+
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+
+	return 0;
+}
+
+static void power8_disable_pmc(unsigned int pmc, unsigned long mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SHIFT(pmc + 1));
+}
+
+PMU_FORMAT_ATTR(event,		"config:0-49");
+PMU_FORMAT_ATTR(pmcxsel,	"config:0-7");
+PMU_FORMAT_ATTR(mark,		"config:8");
+PMU_FORMAT_ATTR(combine,	"config:11");
+PMU_FORMAT_ATTR(unit,		"config:12-15");
+PMU_FORMAT_ATTR(pmc,		"config:16-19");
+PMU_FORMAT_ATTR(cache_sel,	"config:20-23");
+PMU_FORMAT_ATTR(sample_mode,	"config:24-28");
+PMU_FORMAT_ATTR(thresh_sel,	"config:29-31");
+PMU_FORMAT_ATTR(thresh_stop,	"config:32-35");
+PMU_FORMAT_ATTR(thresh_start,	"config:36-39");
+PMU_FORMAT_ATTR(thresh_cmp,	"config:40-49");
+
+static struct attribute *power8_pmu_format_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_pmcxsel.attr,
+	&format_attr_mark.attr,
+	&format_attr_combine.attr,
+	&format_attr_unit.attr,
+	&format_attr_pmc.attr,
+	&format_attr_cache_sel.attr,
+	&format_attr_sample_mode.attr,
+	&format_attr_thresh_sel.attr,
+	&format_attr_thresh_stop.attr,
+	&format_attr_thresh_start.attr,
+	&format_attr_thresh_cmp.attr,
+	NULL,
+};
+
+struct attribute_group power8_pmu_format_group = {
+	.name = "format",
+	.attrs = power8_pmu_format_attr,
+};
+
+static const struct attribute_group *power8_pmu_attr_groups[] = {
+	&power8_pmu_format_group,
+	NULL,
+};
+
+static int power8_generic_events[] = {
+	[PERF_COUNT_HW_CPU_CYCLES] =			PM_CYC,
+	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =	PM_GCT_NOSLOT_CYC,
+	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =	PM_CMPLU_STALL,
+	[PERF_COUNT_HW_INSTRUCTIONS] =			PM_INST_CMPL,
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =		PM_BRU_FIN,
+	[PERF_COUNT_HW_BRANCH_MISSES] =			PM_BR_MPRED_CMPL,
+};
+
+static struct power_pmu power8_pmu = {
+	.name			= "POWER8",
+	.n_counter		= 6,
+	.max_alternatives	= 0,
+	.add_fields		= POWER8_ADD_FIELDS,
+	.test_adder		= POWER8_TEST_ADDER,
+	.compute_mmcr		= power8_compute_mmcr,
+	.get_constraint		= power8_get_constraint,
+	.disable_pmc		= power8_disable_pmc,
+	.flags			= PPMU_HAS_SSLOT | PPMU_HAS_SIER,
+	.n_generic		= ARRAY_SIZE(power8_generic_events),
+	.generic_events		= power8_generic_events,
+	.attr_groups		= power8_pmu_attr_groups,
+};
+
+static int __init init_power8_pmu(void)
+{
+	if (!cur_cpu_spec->oprofile_cpu_type ||
+	    strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8"))
+		return -ENODEV;
+
+	return register_power_pmu(&power8_pmu);
+}
+early_initcall(init_power8_pmu);
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCHv3 1/2] ppc64: perform proper max_bus_speed detection
From: Michael Ellerman @ 2013-04-15  5:00 UTC (permalink / raw)
  To: Lucas Kannebley Tavares
  Cc: David Airlie, Brian King, dri-devel, Kleber Sacilotto de Souza,
	Alex Deucher, Jerome Glisse, Thadeu Lima de Souza Cascardo,
	Bjorn Helgaas, linuxppc-dev
In-Reply-To: <1365685994-32603-2-git-send-email-lucaskt@linux.vnet.ibm.com>

On Thu, Apr 11, 2013 at 10:13:13AM -0300, Lucas Kannebley Tavares wrote:
> On pseries machines the detection for max_bus_speed should be done
> through an OpenFirmware property. This patch adds a function to perform this
> detection and a hook to perform dynamic adding of the function only for
> pseries.

The crucial detail you didn't mention is that pcibios_root_bridge_prepare()
already exists as a weak function in the PCI code and is called from
pci_create_root_bus().

> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> index 8bcc9ca..15796b5 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -430,6 +430,8 @@ static void pSeries_machine_kexec(struct kimage *image)
>  }
>  #endif
>  
> +int pseries_root_bridge_prepare(struct pci_host_bridge *bridge);
> +

Don't do that, put it in a header where it belongs.

cheers

^ permalink raw reply

* Re: [PATCH] Enable CONFIG_DEVTMPFS_MOUNT to ensure /dev can be mounted correctly
From: Michael Ellerman @ 2013-04-15  5:01 UTC (permalink / raw)
  To: Zhenhua Luo; +Cc: linuxppc-dev
In-Reply-To: <1365688591-5323-1-git-send-email-zhenhua.luo@freescale.com>

On Thu, Apr 11, 2013 at 09:56:30PM +0800, Zhenhua Luo wrote:
> When using recent udev, the /dev node mount requires CONFIG_DEVTMPFS_MOUNT
> is enabled in Kernel.

Really?

I know it makes life easier when you don't have an initramfs, but is it
actually required now?

cheers

^ permalink raw reply

* [PATCH] powerpc: fix usage of setup_pci_atmu()
From: Michael Neuling @ 2013-04-15  5:42 UTC (permalink / raw)
  To: Kumar Gala, sfr; +Cc: linuxppc-dev, linux-next
In-Reply-To: <1363201636-7318-1-git-send-email-galak@kernel.crashing.org>

Linux next is currently failing to compile mpc85xx_defconfig with:
  arch/powerpc/sysdev/fsl_pci.c:944:2: error: too many arguments to function 'setup_pci_atmu'

This is caused by (from Kumar's next branch):
  commit 34642bbb3d12121333efcf4ea7dfe66685e403a1
  Author: Kumar Gala <galak@kernel.crashing.org>
  powerpc/fsl-pci: Keep PCI SoC controller registers in pci_controller

Which changed definition of setup_pci_atmu() but didn't update one of
the callers.  Below fixes this.

Signed-off-by: Michael Neuling <mikey@neuling.org>
---
Kumar: this is for your next tree

diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index 83918c3..a10a036 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -941,7 +941,7 @@ static int fsl_pci_resume(struct device *dev)
 		return -ENODEV;
 	}
 
-	setup_pci_atmu(hose, &pci_rsrc);
+	setup_pci_atmu(hose);
 
 	return 0;
 }

^ permalink raw reply related

* Re: [PATCH] perf: Power7: Make CPI stack events available in sysfs
From: Michael Ellerman @ 2013-04-15  6:15 UTC (permalink / raw)
  To: Sukadev Bhattiprolu
  Cc: Paul Mackerras, linux-kernel, Arnaldo Carvalho de Melo,
	linuxppc-dev
In-Reply-To: <20130406164803.GA408@us.ibm.com>

On Sat, Apr 06, 2013 at 09:48:03AM -0700, Sukadev Bhattiprolu wrote:
> From bdeacf7175241f6c79b5b2be0fa6b20b0d0b7d1c Mon Sep 17 00:00:00 2001
> From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
> Date: Sat, 6 Apr 2013 08:48:26 -0700
> Subject: [PATCH] perf: Power7: Make CPI stack events available in sysfs
> 
> A set of Power7 events are often used for Cycles Per Instruction (CPI) stack
> analysis. Make these events available in sysfs (/sys/devices/cpu/events/) so
> they can be identified using their symbolic names:
> 
> 	perf stat -e 'cpu/PM_CMPLU_STALL_DCACHE_MISS/' /bin/ls

Should we take these two via the powerpc tree? Or do you want to take
them Arnaldo?

cheers

^ permalink raw reply

* Re: [RFC PATCH v2 1/4] powerpc: Move the setting of rflags out of loop in __hash_page_huge
From: Michael Ellerman @ 2013-04-15  6:32 UTC (permalink / raw)
  To: Li Zhong; +Cc: linuxppc-dev, paulus
In-Reply-To: <1365733021-28912-2-git-send-email-zhong@linux.vnet.ibm.com>

On Fri, Apr 12, 2013 at 10:16:57AM +0800, Li Zhong wrote:
> It seems that rflags don't get changed in  the repeating loop, so move
> it out of the loop.

You've also changed the way new_pte is handled on repeat, but I think
that's OK too.

cheers

> diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
> index cecad34..edb4129 100644
> --- a/arch/powerpc/mm/hugetlbpage-hash64.c
> +++ b/arch/powerpc/mm/hugetlbpage-hash64.c
> @@ -87,10 +87,6 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
>  
>  		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
>  
> -repeat:
> -		hpte_group = ((hash & htab_hash_mask) *
> -			      HPTES_PER_GROUP) & ~0x7UL;
> -
>  		/* clear HPTE slot informations in new PTE */
>  #ifdef CONFIG_PPC_64K_PAGES
>  		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;

ie. here new_pte was updated on repeat, but now it's not.

> @@ -101,6 +97,10 @@ repeat:
>  		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
>  				      _PAGE_COHERENT | _PAGE_GUARDED));
>  
> +repeat:
> +		hpte_group = ((hash & htab_hash_mask) *
> +			      HPTES_PER_GROUP) & ~0x7UL;
> +
>  		/* Insert into the hash table, primary slot */
>  		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
>  					  mmu_psize, ssize);

^ permalink raw reply

* Re: [RFC PATCH v2 4/4] powerpc: Try to insert the hptes repeatedly in kernel_map_linear_page()
From: Michael Ellerman @ 2013-04-15  6:36 UTC (permalink / raw)
  To: Li Zhong; +Cc: linuxppc-dev, paulus
In-Reply-To: <1365733021-28912-5-git-send-email-zhong@linux.vnet.ibm.com>

On Fri, Apr 12, 2013 at 10:17:00AM +0800, Li Zhong wrote:
> This patch tries to fix following issue when CONFIG_DEBUG_PAGEALLOC
> is enabled:

Please include a better changelog.

This patch does fix (I hope?) the following oops, caused by xxx. Reproducible
by doing yyy.

cheers

> 
> [  543.075675] ------------[ cut here ]------------
> [  543.075701] kernel BUG at arch/powerpc/mm/hash_utils_64.c:1239!
> [  543.075714] Oops: Exception in kernel mode, sig: 5 [#1]
> [  543.075722] PREEMPT SMP NR_CPUS=16 DEBUG_PAGEALLOC NUMA pSeries
> [  543.075741] Modules linked in: binfmt_misc ehea
> [  543.075759] NIP: c000000000036eb0 LR: c000000000036ea4 CTR: c00000000005a594
> [  543.075771] REGS: c0000000a90832c0 TRAP: 0700   Not tainted  (3.8.0-next-20130222)
> [  543.075781] MSR: 8000000000029032 <SF,EE,ME,IR,DR,RI>  CR: 22224482  XER: 00000000
> [  543.075816] SOFTE: 0
> [  543.075823] CFAR: c00000000004c200
> [  543.075830] TASK = c0000000e506b750[23934] 'cc1' THREAD: c0000000a9080000 CPU: 1
> GPR00: 0000000000000001 c0000000a9083540 c000000000c600a8 ffffffffffffffff
> GPR04: 0000000000000050 fffffffffffffffa c0000000a90834e0 00000000004ff594
> GPR08: 0000000000000001 0000000000000000 000000009592d4d8 c000000000c86854
> GPR12: 0000000000000002 c000000006ead300 0000000000a51000 0000000000000001
> GPR16: f000000003354380 ffffffffffffffff ffffffffffffff80 0000000000000000
> GPR20: 0000000000000001 c000000000c600a8 0000000000000001 0000000000000001
> GPR24: 0000000003354380 c000000000000000 0000000000000000 c000000000b65950
> GPR28: 0000002000000000 00000000000cd50e 0000000000bf50d9 c000000000c7c230
> [  543.076005] NIP [c000000000036eb0] .kernel_map_pages+0x1e0/0x3f8
> [  543.076016] LR [c000000000036ea4] .kernel_map_pages+0x1d4/0x3f8
> [  543.076025] Call Trace:
> [  543.076033] [c0000000a9083540] [c000000000036ea4] .kernel_map_pages+0x1d4/0x3f8 (unreliable)
> [  543.076053] [c0000000a9083640] [c000000000167638] .get_page_from_freelist+0x6cc/0x8dc
> [  543.076067] [c0000000a9083800] [c000000000167a48] .__alloc_pages_nodemask+0x200/0x96c
> [  543.076082] [c0000000a90839c0] [c0000000001ade44] .alloc_pages_vma+0x160/0x1e4
> [  543.076098] [c0000000a9083a80] [c00000000018ce04] .handle_pte_fault+0x1b0/0x7e8
> [  543.076113] [c0000000a9083b50] [c00000000018d5a8] .handle_mm_fault+0x16c/0x1a0
> [  543.076129] [c0000000a9083c00] [c0000000007bf1dc] .do_page_fault+0x4d0/0x7a4
> [  543.076144] [c0000000a9083e30] [c0000000000090e8] handle_page_fault+0x10/0x30
> [  543.076155] Instruction dump:
> [  543.076163] 7c630038 78631d88 e80a0000 f8410028 7c0903a6 e91f01de e96a0010 e84a0008
> [  543.076192] 4e800421 e8410028 7c7107b4 7a200fe0 <0b000000> 7f63db78 48785781 60000000
> [  543.076224] ---[ end trace bd5807e8d6ae186b ]---
> 
> Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>
> ---
>  arch/powerpc/mm/hash_utils_64.c |   10 ++++++----
>  1 file changed, 6 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
> index a7f54f0..4b449a0 100644
> --- a/arch/powerpc/mm/hash_utils_64.c
> +++ b/arch/powerpc/mm/hash_utils_64.c
> @@ -1272,7 +1272,7 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
>  	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
>  	unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
>  	unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
> -	int ret;
> +	long ret;
>  
>  	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
>  	hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
> @@ -1280,9 +1280,11 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
>  	/* Don't create HPTE entries for bad address */
>  	if (!vsid)
>  		return;
> -	ret = ppc_md.hpte_insert(hpteg, vpn, __pa(vaddr),
> -				 mode, 0,
> -				 mmu_linear_psize, mmu_kernel_ssize);
> +
> +	ret = hpte_insert_repeating(hash, vpn, __pa(vaddr),
> +				    mode, mmu_linear_psize,
> +				    mmu_kernel_ssize);
> +
>  	BUG_ON (ret < 0);
>  	spin_lock(&linear_map_hash_lock);
>  	BUG_ON(linear_map_hash_slots[lmi] & 0x80);
> -- 
> 1.7.9.5
> 

^ permalink raw reply

* Re: [RFC PATCH v2 3/4] powerpc: Don't bolt the hpte in kernel_map_linear_page()
From: Benjamin Herrenschmidt @ 2013-04-15  6:56 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, Li Zhong
In-Reply-To: <20130415035031.GA7494@iris.ozlabs.ibm.com>

On Mon, 2013-04-15 at 13:50 +1000, Paul Mackerras wrote:
> On Fri, Apr 12, 2013 at 10:16:59AM +0800, Li Zhong wrote:
> > It seems that in kernel_unmap_linear_page(), it only checks whether there
> > is a map in the linear_map_hash_slots array, so seems we don't need bolt
> > the hpte.
> 
> I don't exactly understand your rationale here, but I don't think it's
> safe not to have linear mapping pages bolted.  Basically, if a page
> will be used in the process of calling hash_page to demand-fault an
> HPTE into the hash table, then that page needs to be bolted, otherwise
> we can get an infinite recursion of HPT misses.  That includes all
> kernel stack pages, among other things, so I think we need to leave
> the HPTE_V_BOLTED in there.

I suspect Li's confusion comes from the fact that he doesn't realize
that we might evict random hash slots. If the linear mapping hash
entries could only be thrown out via kernel_unmap_linear_page() then his
comment would make sense. However this isn't the case.

Li: When faulting something in, if both the primary and secondary
buckets are full, we "somewhat randomly" evict the content of a slot and
replace it. However we only do that on non-bolted slots.

This is why the linear mapping (and the vmemmap) must be bolted.

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH 1/8] Remove syslog prefix in uncompressed oops text
From: Michael Ellerman @ 2013-04-15  7:20 UTC (permalink / raw)
  To: Aruna Balakrishnaiah
  Cc: jkenisto, mahesh, linux-kernel, linuxppc-dev, paulus, anton
In-Reply-To: <20130410072100.20150.74661.stgit@aruna-ThinkPad-T420>

On Wed, Apr 10, 2013 at 12:51:00PM +0530, Aruna Balakrishnaiah wrote:
> Removal of syslog prefix in the uncompressed oops text will
> help in capturing more oops data.

Why does it help? Does this affect any existing tools?

cheers

^ permalink raw reply

* Re: [PATCH] powerpc/perf: Power8 PMU support
From: Benjamin Herrenschmidt @ 2013-04-15  7:31 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: linux-kernel, acme, linuxppc-dev, Paul Mackerras, Anton Blanchard,
	sukadev
In-Reply-To: <1365999438-20578-1-git-send-email-michael@ellerman.id.au>

On Mon, 2013-04-15 at 14:17 +1000, Michael Ellerman wrote:
> This patch adds preliminary support for the power8 PMU to perf.

Might be worthwhile to have a small blurb explaining roughly what you
mean by "preliminary" :-)

Cheers,
Ben.

> Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
> ---
>  arch/powerpc/perf/Makefile     |    3 +-
>  arch/powerpc/perf/power8-pmu.c |  454 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 456 insertions(+), 1 deletion(-)
>  create mode 100644 arch/powerpc/perf/power8-pmu.c
> 
> diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
> index af3fac2..472db18 100644
> --- a/arch/powerpc/perf/Makefile
> +++ b/arch/powerpc/perf/Makefile
> @@ -4,7 +4,8 @@ obj-$(CONFIG_PERF_EVENTS)	+= callchain.o
>  
>  obj-$(CONFIG_PPC_PERF_CTRS)	+= core-book3s.o
>  obj64-$(CONFIG_PPC_PERF_CTRS)	+= power4-pmu.o ppc970-pmu.o power5-pmu.o \
> -				   power5+-pmu.o power6-pmu.o power7-pmu.o
> +				   power5+-pmu.o power6-pmu.o power7-pmu.o \
> +				   power8-pmu.o
>  obj32-$(CONFIG_PPC_PERF_CTRS)	+= mpc7450-pmu.o
>  
>  obj-$(CONFIG_FSL_EMB_PERF_EVENT) += core-fsl-emb.o
> diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
> new file mode 100644
> index 0000000..106ae0b
> --- /dev/null
> +++ b/arch/powerpc/perf/power8-pmu.c
> @@ -0,0 +1,454 @@
> +/*
> + * Performance counter support for POWER8 processors.
> + *
> + * Copyright 2009 Paul Mackerras, IBM Corporation.
> + * Copyright 2013 Michael Ellerman, IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/perf_event.h>
> +#include <asm/firmware.h>
> +
> +
> +/*
> + * Some power8 event codes.
> + */
> +#define PM_CYC				0x0001e
> +#define PM_GCT_NOSLOT_CYC		0x100f8
> +#define PM_CMPLU_STALL			0x4000a	/* or 0x1e054 */
> +#define PM_INST_CMPL			0x00002
> +#define PM_BRU_FIN			0x10068
> +#define PM_BR_MPRED_CMPL		0x400f6
> +
> +
> +/*
> + * Raw event encoding for POWER8:
> + *
> + *        60        56        52        48        44        40        36        32
> + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
> + *                                     [      thresh_cmp     ]   [  thresh_ctl   ]
> + *                                                                       |
> + *                                       thresh start/stop OR FAB match -*
> + *
> + *        28        24        20        16        12         8         4         0
> + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
> + *   [   ] [  sample ]   [cache]   [ pmc ]   [unit ]   c     m   [    pmcxsel    ]
> + *     |        |           |                          |     |
> + *     |        |           |                          |     *- mark
> + *     |        |           *- L1/L2/L3 cache_sel      |
> + *     |        |                                      |
> + *     |        *- sampling mode for marked events     *- combine
> + *     |
> + *     *- thresh_sel
> + *
> + * Below uses IBM bit numbering.
> + *
> + * MMCR1[x:y] = unit    (PMCxUNIT)
> + * MMCR1[x]   = combine (PMCxCOMB)
> + *
> + * if pmc == 3 and unit == 0 and pmcxsel[0:6] == 0b0101011
> + *	# PM_MRK_FAB_RSP_MATCH
> + *	MMCR1[20:27] = thresh_ctl   (FAB_CRESP_MATCH / FAB_TYPE_MATCH)
> + * else if pmc == 4 and unit == 0xf and pmcxsel[0:6] == 0b0101001
> + *	# PM_MRK_FAB_RSP_MATCH_CYC
> + *	MMCR1[20:27] = thresh_ctl   (FAB_CRESP_MATCH / FAB_TYPE_MATCH)
> + * else
> + *	MMCRA[48:55] = thresh_ctl   (THRESH START/END)
> + *
> + * if thresh_sel:
> + *	MMCRA[45:47] = thresh_sel
> + *
> + * if thresh_cmp:
> + *	MMCRA[22:24] = thresh_cmp[0:2]
> + *	MMCRA[25:31] = thresh_cmp[3:9]
> + *
> + * if unit == 6 or unit == 7
> + *	MMCRC[53:55] = cache_sel[1:3]      (L2EVENT_SEL)
> + * else if unit == 8 or unit == 9:
> + *	if cache_sel[0] == 0: # L3 bank
> + *		MMCRC[47:49] = cache_sel[1:3]  (L3EVENT_SEL0)
> + *	else if cache_sel[0] == 1:
> + *		MMCRC[50:51] = cache_sel[2:3]  (L3EVENT_SEL1)
> + * else if cache_sel[1]: # L1 event
> + *	MMCR1[16] = cache_sel[2]
> + *	MMCR1[17] = cache_sel[3]
> + *
> + * if mark:
> + *	MMCRA[63]    = 1		(SAMPLE_ENABLE)
> + *	MMCRA[57:59] = sample[0:2]	(RAND_SAMP_ELIG)
> + *	MMCRA[61:62] = sample[3:4]	(RAND_SAMP_MODE)
> + *
> + */
> +
> +#define EVENT_THR_CMP_SHIFT	40	/* Threshold CMP value */
> +#define EVENT_THR_CMP_MASK	0x3ff
> +#define EVENT_THR_CTL_SHIFT	32	/* Threshold control value (start/stop) */
> +#define EVENT_THR_CTL_MASK	0xffull
> +#define EVENT_THR_SEL_SHIFT	29	/* Threshold select value */
> +#define EVENT_THR_SEL_MASK	0x7
> +#define EVENT_THRESH_SHIFT	29	/* All threshold bits */
> +#define EVENT_THRESH_MASK	0x1fffffull
> +#define EVENT_SAMPLE_SHIFT	24	/* Sampling mode & eligibility */
> +#define EVENT_SAMPLE_MASK	0x1f
> +#define EVENT_CACHE_SEL_SHIFT	20	/* L2/L3 cache select */
> +#define EVENT_CACHE_SEL_MASK	0xf
> +#define EVENT_IS_L1		(4 << EVENT_CACHE_SEL_SHIFT)
> +#define EVENT_PMC_SHIFT		16	/* PMC number (1-based) */
> +#define EVENT_PMC_MASK		0xf
> +#define EVENT_UNIT_SHIFT	12	/* Unit */
> +#define EVENT_UNIT_MASK		0xf
> +#define EVENT_COMBINE_SHIFT	11	/* Combine bit */
> +#define EVENT_COMBINE_MASK	0x1
> +#define EVENT_MARKED_SHIFT	8	/* Marked bit */
> +#define EVENT_MARKED_MASK	0x1
> +#define EVENT_IS_MARKED		(EVENT_MARKED_MASK << EVENT_MARKED_SHIFT)
> +#define EVENT_PSEL_MASK		0xff	/* PMCxSEL value */
> +
> +/*
> + * Layout of constraint bits:
> + *
> + *        60        56        52        48        44        40        36        32
> + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
> + *   [   fab_match   ]         [       thresh_cmp      ] [   thresh_ctl    ] [   ]
> + *                                                                             |
> + *                                                                 thresh_sel -*
> + *
> + *        28        24        20        16        12         8         4         0
> + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
> + *                       [ ]   [  sample ]   [     ]   [6] [5]   [4] [3]   [2] [1]
> + *                        |                     |
> + *      L1 I/D qualifier -*                     |      Count of events for each PMC.
> + *                                              |        p1, p2, p3, p4, p5, p6.
> + *                     nc - number of counters -*
> + *
> + * The PMC fields P1..P6, and NC, are adder fields. As we accumulate constraints
> + * we want the low bit of each field to be added to any existing value.
> + *
> + * Everything else is a value field.
> + */
> +
> +#define CNST_FAB_MATCH_VAL(v)	(((v) & EVENT_THR_CTL_MASK) << 56)
> +#define CNST_FAB_MATCH_MASK	CNST_FAB_MATCH_VAL(EVENT_THR_CTL_MASK)
> +
> +/* We just throw all the threshold bits into the constraint */
> +#define CNST_THRESH_VAL(v)	(((v) & EVENT_THRESH_MASK) << 32)
> +#define CNST_THRESH_MASK	CNST_THRESH_VAL(EVENT_THRESH_MASK)
> +
> +#define CNST_L1_QUAL_VAL(v)	(((v) & 3) << 22)
> +#define CNST_L1_QUAL_MASK	CNST_L1_QUAL_VAL(3)
> +
> +#define CNST_SAMPLE_VAL(v)	(((v) & EVENT_SAMPLE_MASK) << 16)
> +#define CNST_SAMPLE_MASK	CNST_SAMPLE_VAL(EVENT_SAMPLE_MASK)
> +
> +/*
> + * For NC we are counting up to 4 events. This requires three bits, and we need
> + * the fifth event to overflow and set the 4th bit. To achieve that we bias the
> + * fields by 3 in test_adder.
> + */
> +#define CNST_NC_SHIFT		12
> +#define CNST_NC_VAL		(1 << CNST_NC_SHIFT)
> +#define CNST_NC_MASK		(8 << CNST_NC_SHIFT)
> +#define POWER8_TEST_ADDER	(3 << CNST_NC_SHIFT)
> +
> +/*
> + * For the per-PMC fields we have two bits. The low bit is added, so if two
> + * events ask for the same PMC the sum will overflow, setting the high bit,
> + * indicating an error. So our mask sets the high bit.
> + */
> +#define CNST_PMC_SHIFT(pmc)	((pmc - 1) * 2)
> +#define CNST_PMC_VAL(pmc)	(1 << CNST_PMC_SHIFT(pmc))
> +#define CNST_PMC_MASK(pmc)	(2 << CNST_PMC_SHIFT(pmc))
> +
> +/* Our add_fields is defined as: */
> +#define POWER8_ADD_FIELDS	\
> +	CNST_PMC_VAL(1) | CNST_PMC_VAL(2) | CNST_PMC_VAL(3) | \
> +	CNST_PMC_VAL(4) | CNST_PMC_VAL(5) | CNST_PMC_VAL(6) | CNST_NC_VAL
> +
> +
> +/* Bits in MMCR1 for POWER8 */
> +#define MMCR1_UNIT_SHIFT(pmc)		(60 - (4 * ((pmc) - 1)))
> +#define MMCR1_COMBINE_SHIFT(pmc)	(35 - ((pmc) - 1))
> +#define MMCR1_PMCSEL_SHIFT(pmc)		(24 - (((pmc) - 1)) * 8)
> +#define MMCR1_DC_QUAL_SHIFT		47
> +#define MMCR1_IC_QUAL_SHIFT		46
> +
> +/* Bits in MMCRA for POWER8 */
> +#define MMCRA_SAMP_MODE_SHIFT		1
> +#define MMCRA_SAMP_ELIG_SHIFT		4
> +#define MMCRA_THR_CTL_SHIFT		8
> +#define MMCRA_THR_SEL_SHIFT		16
> +#define MMCRA_THR_CMP_SHIFT		32
> +#define MMCRA_SDAR_MODE_TLB		(1ull << 42)
> +
> +
> +static inline bool event_is_fab_match(u64 event)
> +{
> +	/* Only check pmc, unit and pmcxsel, ignore the edge bit (0) */
> +	event &= 0xff0fe;
> +
> +	/* PM_MRK_FAB_RSP_MATCH & PM_MRK_FAB_RSP_MATCH_CYC */
> +	return (event == 0x30056 || event == 0x4f052);
> +}
> +
> +static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
> +{
> +	unsigned int unit, pmc, cache;
> +	unsigned long mask, value;
> +
> +	mask = value = 0;
> +
> +	pmc   = (event >> EVENT_PMC_SHIFT)       & EVENT_PMC_MASK;
> +	unit  = (event >> EVENT_UNIT_SHIFT)      & EVENT_UNIT_MASK;
> +	cache = (event >> EVENT_CACHE_SEL_SHIFT) & EVENT_CACHE_SEL_MASK;
> +
> +	if (pmc) {
> +		if (pmc > 6)
> +			return -1;
> +
> +		mask  |= CNST_PMC_MASK(pmc);
> +		value |= CNST_PMC_VAL(pmc);
> +
> +		if (pmc >= 5 && event != 0x500fa && event != 0x600f4)
> +			return -1;
> +	}
> +
> +	if (pmc <= 4) {
> +		/*
> +		 * Add to number of counters in use. Note this includes events with
> +		 * a PMC of 0 - they still need a PMC, it's just assigned later.
> +		 * Don't count events on PMC 5 & 6, there is only one valid event
> +		 * on each of those counters, and they are handled above.
> +		 */
> +		mask  |= CNST_NC_MASK;
> +		value |= CNST_NC_VAL;
> +	}
> +
> +	if (unit >= 6 && unit <= 9) {
> +		/*
> +		 * L2/L3 events contain a cache selector field, which is
> +		 * supposed to be programmed into MMCRC. However MMCRC is only
> +		 * HV writable, and there is no API for guest kernels to modify
> +		 * it. The solution is for the hypervisor to initialise the
> +		 * field to zeroes, and for us to only ever allow events that
> +		 * have a cache selector of zero.
> +		 */
> +		if (cache)
> +			return -1;
> +
> +	} else if (event & EVENT_IS_L1) {
> +		mask  |= CNST_L1_QUAL_MASK;
> +		value |= CNST_L1_QUAL_VAL(cache);
> +	}
> +
> +	if (event & EVENT_IS_MARKED) {
> +		mask  |= CNST_SAMPLE_MASK;
> +		value |= CNST_SAMPLE_VAL(event >> EVENT_SAMPLE_SHIFT);
> +	}
> +
> +	/*
> +	 * Special case for PM_MRK_FAB_RSP_MATCH and PM_MRK_FAB_RSP_MATCH_CYC,
> +	 * the threshold control bits are used for the match value.
> +	 */
> +	if (event_is_fab_match(event)) {
> +		mask  |= CNST_FAB_MATCH_MASK;
> +		value |= CNST_FAB_MATCH_VAL(event >> EVENT_THR_CTL_SHIFT);
> +	} else {
> +		/*
> +		 * Check the mantissa upper two bits are not zero, unless the
> +		 * exponent is also zero. See the THRESH_CMP_MANTISSA doc.
> +		 */
> +		unsigned int cmp, exp;
> +
> +		cmp = (event >> EVENT_THR_CMP_SHIFT) & EVENT_THR_CMP_MASK;
> +		exp = cmp >> 7;
> +
> +		if (exp && (cmp & 0x60) == 0)
> +			return -1;
> +
> +		mask  |= CNST_THRESH_MASK;
> +		value |= CNST_THRESH_VAL(event >> EVENT_THRESH_SHIFT);
> +	}
> +
> +	*maskp = mask;
> +	*valp = value;
> +
> +	return 0;
> +}
> +
> +static int power8_compute_mmcr(u64 event[], int n_ev,
> +			       unsigned int hwc[], unsigned long mmcr[])
> +{
> +	unsigned long mmcra, mmcr1, unit, combine, psel, cache, val;
> +	unsigned int pmc, pmc_inuse;
> +	int i;
> +
> +	pmc_inuse = 0;
> +
> +	/* First pass to count resource use */
> +	for (i = 0; i < n_ev; ++i) {
> +		pmc = (event[i] >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK;
> +		if (pmc)
> +			pmc_inuse |= 1 << pmc;
> +	}
> +
> +	/* In continuous sampling mode, update SDAR on TLB miss */
> +	mmcra = MMCRA_SDAR_MODE_TLB;
> +	mmcr1 = 0;
> +
> +	/* Second pass: assign PMCs, set all MMCR1 fields */
> +	for (i = 0; i < n_ev; ++i) {
> +		pmc     = (event[i] >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK;
> +		unit    = (event[i] >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK;
> +		combine = (event[i] >> EVENT_COMBINE_SHIFT) & EVENT_COMBINE_MASK;
> +		psel    =  event[i] & EVENT_PSEL_MASK;
> +
> +		if (!pmc) {
> +			for (pmc = 1; pmc <= 4; ++pmc) {
> +				if (!(pmc_inuse & (1 << pmc)))
> +					break;
> +			}
> +
> +			pmc_inuse |= 1 << pmc;
> +		}
> +
> +		if (pmc <= 4) {
> +			mmcr1 |= unit << MMCR1_UNIT_SHIFT(pmc);
> +			mmcr1 |= combine << MMCR1_COMBINE_SHIFT(pmc);
> +			mmcr1 |= psel << MMCR1_PMCSEL_SHIFT(pmc);
> +		}
> +
> +		if (event[i] & EVENT_IS_L1) {
> +			cache = event[i] >> EVENT_CACHE_SEL_SHIFT;
> +			mmcr1 |= (cache & 1) << MMCR1_IC_QUAL_SHIFT;
> +			cache >>= 1;
> +			mmcr1 |= (cache & 1) << MMCR1_DC_QUAL_SHIFT;
> +		}
> +
> +		if (event[i] & EVENT_IS_MARKED) {
> +			mmcra |= MMCRA_SAMPLE_ENABLE;
> +
> +			val = (event[i] >> EVENT_SAMPLE_SHIFT) & EVENT_SAMPLE_MASK;
> +			if (val) {
> +				mmcra |= (val &  3) << MMCRA_SAMP_MODE_SHIFT;
> +				mmcra |= (val >> 2) << MMCRA_SAMP_ELIG_SHIFT;
> +			}
> +		}
> +
> +		/*
> +		 * PM_MRK_FAB_RSP_MATCH and PM_MRK_FAB_RSP_MATCH_CYC,
> +		 * the threshold bits are used for the match value.
> +		 */
> +		if (event_is_fab_match(event[i])) {
> +			mmcr1 |= (event[i] >> EVENT_THR_CTL_SHIFT) &
> +				  EVENT_THR_CTL_MASK;
> +		} else {
> +			val = (event[i] >> EVENT_THR_CTL_SHIFT) & EVENT_THR_CTL_MASK;
> +			mmcra |= val << MMCRA_THR_CTL_SHIFT;
> +			val = (event[i] >> EVENT_THR_SEL_SHIFT) & EVENT_THR_SEL_MASK;
> +			mmcra |= val << MMCRA_THR_SEL_SHIFT;
> +			val = (event[i] >> EVENT_THR_CMP_SHIFT) & EVENT_THR_CMP_MASK;
> +			mmcra |= val << MMCRA_THR_CMP_SHIFT;
> +		}
> +
> +		hwc[i] = pmc - 1;
> +	}
> +
> +	/* Return MMCRx values */
> +	mmcr[0] = 0;
> +
> +	/* pmc_inuse is 1-based */
> +	if (pmc_inuse & 2)
> +		mmcr[0] = MMCR0_PMC1CE;
> +
> +	if (pmc_inuse & 0x7c)
> +		mmcr[0] |= MMCR0_PMCjCE;
> +
> +	mmcr[1] = mmcr1;
> +	mmcr[2] = mmcra;
> +
> +	return 0;
> +}
> +
> +static void power8_disable_pmc(unsigned int pmc, unsigned long mmcr[])
> +{
> +	if (pmc <= 3)
> +		mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SHIFT(pmc + 1));
> +}
> +
> +PMU_FORMAT_ATTR(event,		"config:0-49");
> +PMU_FORMAT_ATTR(pmcxsel,	"config:0-7");
> +PMU_FORMAT_ATTR(mark,		"config:8");
> +PMU_FORMAT_ATTR(combine,	"config:11");
> +PMU_FORMAT_ATTR(unit,		"config:12-15");
> +PMU_FORMAT_ATTR(pmc,		"config:16-19");
> +PMU_FORMAT_ATTR(cache_sel,	"config:20-23");
> +PMU_FORMAT_ATTR(sample_mode,	"config:24-28");
> +PMU_FORMAT_ATTR(thresh_sel,	"config:29-31");
> +PMU_FORMAT_ATTR(thresh_stop,	"config:32-35");
> +PMU_FORMAT_ATTR(thresh_start,	"config:36-39");
> +PMU_FORMAT_ATTR(thresh_cmp,	"config:40-49");
> +
> +static struct attribute *power8_pmu_format_attr[] = {
> +	&format_attr_event.attr,
> +	&format_attr_pmcxsel.attr,
> +	&format_attr_mark.attr,
> +	&format_attr_combine.attr,
> +	&format_attr_unit.attr,
> +	&format_attr_pmc.attr,
> +	&format_attr_cache_sel.attr,
> +	&format_attr_sample_mode.attr,
> +	&format_attr_thresh_sel.attr,
> +	&format_attr_thresh_stop.attr,
> +	&format_attr_thresh_start.attr,
> +	&format_attr_thresh_cmp.attr,
> +	NULL,
> +};
> +
> +struct attribute_group power8_pmu_format_group = {
> +	.name = "format",
> +	.attrs = power8_pmu_format_attr,
> +};
> +
> +static const struct attribute_group *power8_pmu_attr_groups[] = {
> +	&power8_pmu_format_group,
> +	NULL,
> +};
> +
> +static int power8_generic_events[] = {
> +	[PERF_COUNT_HW_CPU_CYCLES] =			PM_CYC,
> +	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =	PM_GCT_NOSLOT_CYC,
> +	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =	PM_CMPLU_STALL,
> +	[PERF_COUNT_HW_INSTRUCTIONS] =			PM_INST_CMPL,
> +	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =		PM_BRU_FIN,
> +	[PERF_COUNT_HW_BRANCH_MISSES] =			PM_BR_MPRED_CMPL,
> +};
> +
> +static struct power_pmu power8_pmu = {
> +	.name			= "POWER8",
> +	.n_counter		= 6,
> +	.max_alternatives	= 0,
> +	.add_fields		= POWER8_ADD_FIELDS,
> +	.test_adder		= POWER8_TEST_ADDER,
> +	.compute_mmcr		= power8_compute_mmcr,
> +	.get_constraint		= power8_get_constraint,
> +	.disable_pmc		= power8_disable_pmc,
> +	.flags			= PPMU_HAS_SSLOT | PPMU_HAS_SIER,
> +	.n_generic		= ARRAY_SIZE(power8_generic_events),
> +	.generic_events		= power8_generic_events,
> +	.attr_groups		= power8_pmu_attr_groups,
> +};
> +
> +static int __init init_power8_pmu(void)
> +{
> +	if (!cur_cpu_spec->oprofile_cpu_type ||
> +	    strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8"))
> +		return -ENODEV;
> +
> +	return register_power_pmu(&power8_pmu);
> +}
> +early_initcall(init_power8_pmu);

^ permalink raw reply

* Re: [PATCH 2/8] Add version and timestamp to oops header
From: Michael Ellerman @ 2013-04-15  7:31 UTC (permalink / raw)
  To: Aruna Balakrishnaiah
  Cc: jkenisto, mahesh, linux-kernel, linuxppc-dev, paulus, anton
In-Reply-To: <20130410072112.20150.10281.stgit@aruna-ThinkPad-T420>

On Wed, Apr 10, 2013 at 12:51:12PM +0530, Aruna Balakrishnaiah wrote:
> Introduce version and timestamp information in the oops header.
> oops_log_info (oops header) holds version (to distinguish between old
> and new format oops header), length of the oops text
> (compressed or uncompressed) and timestamp.

This needs a much more detailed explanation.

I think what you're doing is you're overlaying the new information so
that the version field in oops_log_info sits in the same location as the
length field in the old format. And then you're defining the version to
be a value that is an illegal length.

So existing tools will refuse to dump new style partitions,
because they'll think the length is too large. You've tested that?

Updated tools will know about both formats, so will be able to handle
either old or new style partitions.

Is that correct?

And we're adding the timestamp just because we can and it'd be nice to
have?

cheers

^ permalink raw reply

* Re: [PATCH 3/8] Introduce generic read function to read nvram-partitions
From: Michael Ellerman @ 2013-04-15  7:35 UTC (permalink / raw)
  To: Aruna Balakrishnaiah
  Cc: jkenisto, mahesh, linux-kernel, linuxppc-dev, paulus, anton
In-Reply-To: <20130410072125.20150.97063.stgit@aruna-ThinkPad-T420>

On Wed, Apr 10, 2013 at 12:51:25PM +0530, Aruna Balakrishnaiah wrote:
> Introduce generic read function to read nvram partitions other than rtas.
> nvram_read_error_log will be retained which is used to read rtas partition
> from rtasd. nvram_read_partition is the generic read function to read from
> any nvram partition.
> 
> Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
> Reviewed-by: Jim Keniston <jkenisto@us.ibm.com>
> ---
>  arch/powerpc/platforms/pseries/nvram.c |   34 +++++++++++++++++++++++---------
>  1 file changed, 24 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
> index 742735a..6701b71 100644
> --- a/arch/powerpc/platforms/pseries/nvram.c
> +++ b/arch/powerpc/platforms/pseries/nvram.c
> @@ -293,34 +293,37 @@ int nvram_write_error_log(char * buff, int length,
>  	return rc;
>  }
>  
> -/* nvram_read_error_log
> +/* nvram_read_partition
>   *
> - * Reads nvram for error log for at most 'length'
> + * Reads nvram partition for at most 'length'
>   */
> -int nvram_read_error_log(char * buff, int length,
> -                         unsigned int * err_type, unsigned int * error_log_cnt)
> +int nvram_read_partition(struct nvram_os_partition *part, char *buff,
> +			int length, unsigned int *err_type,
> +			unsigned int *error_log_cnt)
>  {
>  	int rc;
>  	loff_t tmp_index;
>  	struct err_log_info info;
>  	
> -	if (rtas_log_partition.index == -1)
> +	if (part->index == -1)
>  		return -1;
>  
> -	if (length > rtas_log_partition.size)
> -		length = rtas_log_partition.size;
> +	if (length > part->size)
> +		length = part->size;
>  
> -	tmp_index = rtas_log_partition.index;
> +	tmp_index = part->index;
>  
>  	rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index);
>  	if (rc <= 0) {
> -		printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
> +		printk(KERN_ERR "nvram_read_partition: "
> +				"Failed nvram_read (%d)\n", rc);

Should be:
	pr_err("%s: Failed ..\n", __func__, ..)

cheers

^ permalink raw reply

* Re: [PATCH 0/8] Nvram-to-pstore
From: Michael Ellerman @ 2013-04-15  7:36 UTC (permalink / raw)
  To: Aruna Balakrishnaiah
  Cc: jkenisto, mahesh, linux-kernel, linuxppc-dev, paulus, anton
In-Reply-To: <20130410071835.20150.56489.stgit@aruna-ThinkPad-T420>

On Wed, Apr 10, 2013 at 12:50:47PM +0530, Aruna Balakrishnaiah wrote:
> Currently the kernel provides the contents of p-series NVRAM only as a
> simple stream of bytes via /dev/nvram, which must be interpreted in user
> space by the nvram command in the powerpc-utils package.  This patch set
> exploits the pstore subsystem to expose each partition in NVRAM as a
> separate file in /dev/pstore. For instance Oops messages will be stored in a
> file named [dmesg-nvram-2].

Please try to fold some of that info into the commit messages for actual
patches. The 0th patch is lost when we commit the series into git.

Also all your patches should have a subject starting with
"powerpc/pseries:".

cheers

^ permalink raw reply

* Re: [PATCH 1/8] Remove syslog prefix in uncompressed oops text
From: aruna @ 2013-04-15  7:39 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: jkenisto, mahesh, linux-kernel, linuxppc-dev, paulus, anton
In-Reply-To: <20130415072027.GA30156@concordia>

On Monday 15 April 2013 12:50 PM, Michael Ellerman wrote:
> On Wed, Apr 10, 2013 at 12:51:00PM +0530, Aruna Balakrishnaiah wrote:
>> Removal of syslog prefix in the uncompressed oops text will
>> help in capturing more oops data.
> Why does it help? Does this affect any existing tools?
>
> cheers
>

By setting the (2nd) syslog argument of kmsg_dump_get_buffer() to false,
we omit <n> line prefixes and thereby capture more of the printk buffer.

No, this should not affect any existing tools.


Regards,
Aruna

^ permalink raw reply

* Re: [PATCH 4/8] Read/Write oops nvram partition via pstore
From: Michael Ellerman @ 2013-04-15  7:55 UTC (permalink / raw)
  To: Aruna Balakrishnaiah
  Cc: jkenisto, mahesh, linux-kernel, linuxppc-dev, paulus, anton
In-Reply-To: <20130410072303.20150.61382.stgit@aruna-ThinkPad-T420>

On Wed, Apr 10, 2013 at 12:53:03PM +0530, Aruna Balakrishnaiah wrote:
> This patch exploits pstore infrastructure in power systems.
> IBM's system p machines provide persistent storage for LPARs

In the kernel we use "pseries" instead of "system p".

> through NVRAM. NVRAM's lnx,oops-log partition is used to log
> oops messages. In case pstore registration fails it will
> fall back to kmsg_dump mechanism.

What are the implications of falling back to kmsg_dump()?


Is there any reason we would not want to enable CONFIG_PSTORE ? ie.
should the pseries platform just select it?

> diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
> index 6701b71..82d32a2 100644
> --- a/arch/powerpc/platforms/pseries/nvram.c
> +++ b/arch/powerpc/platforms/pseries/nvram.c
> @@ -18,6 +18,7 @@
>  #include <linux/spinlock.h>
>  #include <linux/slab.h>
>  #include <linux/kmsg_dump.h>
> +#include <linux/pstore.h>
>  #include <linux/ctype.h>
>  #include <linux/zlib.h>
>  #include <asm/uaccess.h>
> @@ -87,6 +88,25 @@ static struct kmsg_dumper nvram_kmsg_dumper = {
>  	.dump = oops_to_nvram
>  };
>  
> +static int nvram_pstore_open(struct pstore_info *psi);
> +
> +static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
> +				int *count, struct timespec *time, char **buf,
> +				struct pstore_info *psi);
> +
> +static int nvram_pstore_write(enum pstore_type_id type,
> +				enum kmsg_dump_reason reason, u64 *id,
> +				unsigned int part, int count, size_t size,
> +				struct pstore_info *psi);

I think you should be able to rearrange this so that you don't need the
forward declarations.

> +
> +static struct pstore_info nvram_pstore_info = {
> +	.owner = THIS_MODULE,
> +	.name = "nvram",
> +	.open = nvram_pstore_open,
> +	.read = nvram_pstore_read,
> +	.write = nvram_pstore_write,
> +};
> +
>  /* See clobbering_unread_rtas_event() */
>  #define NVRAM_RTAS_READ_TIMEOUT 5		/* seconds */
>  static unsigned long last_unread_rtas_event;	/* timestamp */
> @@ -121,6 +141,13 @@ static char *big_oops_buf, *oops_buf;
>  static char *oops_data;
>  static size_t oops_data_sz;
>  
> +#ifdef CONFIG_PSTORE

If we are going to have CONFIG_PSTORE #ifdefs in this file, I don't see
why there can't be just a single block of code that is #ifdef'ed, rather
than several like you have.

> +static enum pstore_type_id nvram_type_ids[] = {
> +	PSTORE_TYPE_DMESG,
> +	-1
> +};
> +static int read_type;

I don't understand what you're doing with read_type. It looks fishy.

> +#endif
>  /* Compression parameters */
>  #define COMPR_LEVEL 6
>  #define WINDOW_BITS 12
> @@ -455,6 +482,23 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists)
>  	oops_data = oops_buf + sizeof(struct oops_log_info);
>  	oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info);
>  
> +	nvram_pstore_info.buf = oops_data;
> +	nvram_pstore_info.bufsize = oops_data_sz;
> +
> +	rc = pstore_register(&nvram_pstore_info);
> +
> +	if (rc != 0) {
> +		pr_err("nvram: pstore_register() failed, defaults to "
> +				"kmsg_dump; returned %d\n", rc);
> +		goto kmsg_dump;

You don't need the goto.

> +	} else {
> +		/*TODO: Support compression when pstore is configured */

What is the issue here?

> +		pr_info("nvram: Compression of oops text supported only when "
> +				"pstore is not configured");
> +		return;
> +	}
> +
> +kmsg_dump:
>  	/*
>  	 * Figure compression (preceded by elimination of each line's <n>
>  	 * severity prefix) will reduce the oops/panic report to at most
> @@ -663,3 +707,104 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
>  
>  	spin_unlock_irqrestore(&lock, flags);
>  }
> +
> +#ifdef CONFIG_PSTORE

Same comment about too many ifdefs.

> +static int nvram_pstore_open(struct pstore_info *psi)
> +{
> +	read_type = -1;

Locking?

> +	return 0;
> +}
> +
> +/*

Make it a kernel-doc style comment.

> + * Called by pstore_dump() when an oops or panic report is logged to the printk
> + * buffer. @size bytes have been written to oops_buf, starting after the
> + * oops_log_info header.

"@size bytes have", or "@size bytes should be written"?

> + */
> +static int nvram_pstore_write(enum pstore_type_id type,
> +				enum kmsg_dump_reason reason,
> +				u64 *id, unsigned int part, int count,
> +				size_t size, struct pstore_info *psi)
> +{
> +	struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf;
> +
> +	/* part 1 has the recent messages from printk buffer */
> +	if (part > 1 || clobbering_unread_rtas_event())
> +		return -1;
> +
> +	BUG_ON(type != PSTORE_TYPE_DMESG);
> +	BUG_ON(sizeof(*oops_hdr) + size > oops_log_partition.size);

Why would we be called with the wrong type? Would it be better to just
return an error, rather than causing another oops while we're trying to
write the oops?

And couldn't we just clamp the size, rather than BUG'ing.

> +	oops_hdr->version = OOPS_HDR_VERSION;
> +	oops_hdr->report_length = (u16) size;
> +	oops_hdr->timestamp = get_seconds();
> +	(void) nvram_write_os_partition(&oops_log_partition, oops_buf,
> +		(int) (sizeof(*oops_hdr) + size), ERR_TYPE_KERNEL_PANIC,
> +		count);

You definitely don't need the (void). But more to the point why aren't
you checking the return value?

> +	*id = part;

What is this? Part of the API?

> +	return 0;
> +}
> +
> +/*
> + * Reads the oops/panic report.
> + * Returns the length of the data we read from each partition.
> + * Returns 0 if we've been called before.
> + */
> +static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
> +				int *count, struct timespec *time, char **buf,
> +				struct pstore_info *psi)
> +{
> +	struct oops_log_info *oops_hdr;
> +	unsigned int err_type, id_no;
> +	struct nvram_os_partition *part = NULL;
> +	char *buff = NULL;
> +
> +	read_type++;
> +
> +	switch (nvram_type_ids[read_type]) {
> +	case PSTORE_TYPE_DMESG:
> +		part = &oops_log_partition;
> +		*type = PSTORE_TYPE_DMESG;
> +		break;
> +	default:
> +		return 0;
> +	}
> +
> +	buff = kmalloc(part->size, GFP_KERNEL);
> +
> +	if (!buff)
> +		return -ENOMEM;
> +
> +	if (nvram_read_partition(part, buff, part->size, &err_type, &id_no)) {
> +		kfree(buff);
> +		return 0;
> +	}
> +
> +	*count = 0;
> +	*id = id_no;

Can't you just cast in the call to nvram_read_partition() ?

> +	oops_hdr = (struct oops_log_info *)buff;
> +	*buf = buff + sizeof(*oops_hdr);
> +	time->tv_sec = oops_hdr->timestamp;
> +	time->tv_nsec = 0;
> +	return oops_hdr->report_length;
> +}
> +#else
> +static int nvram_pstore_open(struct pstore_info *psi)
> +{
> +	return 0;
> +}
> +
> +static int nvram_pstore_write(enum pstore_type_id type,
> +				enum kmsg_dump_reason reason, u64 *id,
> +				unsigned int part, int count, size_t size,
> +				struct pstore_info *psi)
> +{
> +	return 0;
> +}
> +
> +static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
> +				int *count, struct timespec *time, char **buf,
> +				struct pstore_info *psi)
> +{
> +	return 0;
> +}
> +#endif

I don't understand why we have empty versions of these. If CONFIG_PSTORE
is disabled we should just not register with pstore at all.

cheers

^ permalink raw reply

* Re: [PATCH 2/8] Add version and timestamp to oops header
From: aruna @ 2013-04-15  7:51 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: jkenisto, mahesh, linux-kernel, linuxppc-dev, paulus, anton
In-Reply-To: <20130415073138.GB30156@concordia>

On Monday 15 April 2013 01:01 PM, Michael Ellerman wrote:
> On Wed, Apr 10, 2013 at 12:51:12PM +0530, Aruna Balakrishnaiah wrote:
>> Introduce version and timestamp information in the oops header.
>> oops_log_info (oops header) holds version (to distinguish between old
>> and new format oops header), length of the oops text
>> (compressed or uncompressed) and timestamp.
> This needs a much more detailed explanation.
>
> I think what you're doing is you're overlaying the new information so
> that the version field in oops_log_info sits in the same location as the
> length field in the old format. And then you're defining the version to
> be a value that is an illegal length.

That's right.

> So existing tools will refuse to dump new style partitions,
> because they'll think the length is too large. You've tested that?

Yeah, I have tested that.

>
> Updated tools will know about both formats, so will be able to handle
> either old or new style partitions.
>
> Is that correct?

Yeah, that's correct.

>
> And we're adding the timestamp just because we can and it'd be nice to
> have?

That's right. Also, the main reason for adding the timestamp is that
it will be used when we create a pstore file for oops messages.
The pstore file's timestamp will reflect the timestamp in the oops-header
added during the crash.

> cheers
>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox