LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH kernel v2 REPOST] powerpc/powernv/ioda: Allocate indirect TCE levels on demand
From: Alexey Kardashevskiy @ 2018-06-21  9:36 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Alexey Kardashevskiy, David Gibson, kvm-ppc, Alex Williamson,
	Benjamin Herrenschmidt, Russell Currey
In-Reply-To: <20180621020321.GE32328@umbus.fritz.box>

At the moment we allocate the entire TCE table, twice (hardware part and
userspace translation cache). This normally works as we normally have
contiguous memory and the guest will map entire RAM for 64bit DMA.

However if we have sparse RAM (one example is a memory device), then
we will allocate TCEs which will never be used as the guest only maps
actual memory for DMA. If it is a single level TCE table, there is nothing
we can really do but if it is a multilevel table, we can skip allocating
TCEs we know we won't need.

This adds the ability to allocate only the first level, saving memory.

This changes iommu_table::free() to avoid allocating an extra level;
iommu_table::set() will do this when needed.

This adds @alloc parameter to iommu_table::exchange() to tell the callback
if it can allocate an extra level; the flag is set to "false" for
the realmode KVM handlers of H_PUT_TCE hcalls and the callback returns
H_TOO_HARD.

This still requires the entire table to be counted in mm::locked_vm.

To be conservative, this only does on-demand allocation when
the userspace cache table is requested which is the case of VFIO.

The example math for a system replicating a powernv setup with NVLink2
in a guest:
16GB RAM mapped at 0x0
128GB GPU RAM window (16GB of actual RAM) mapped at 0x244000000000

the table to cover that all with 64K pages takes:
(((0x244000000000 + 0x2000000000) >> 16)*8)>>20 = 4656MB

If we allocate only necessary TCE levels, we will only need:
(((0x400000000 + 0x400000000) >> 16)*8)>>20 = 4MB (plus some for indirect
levels).

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---


This is what I meant to post a few days ago, sorry for the noise.


---
Changes:
v2:
* fixed bug in cleanup path which forced the entire table to be
allocated right before destroying
* added memory allocation error handling to pnv_tce()
---
 arch/powerpc/include/asm/iommu.h              |  7 ++-
 arch/powerpc/platforms/powernv/pci.h          |  6 ++-
 arch/powerpc/kvm/book3s_64_vio_hv.c           |  4 +-
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 73 +++++++++++++++++++++------
 arch/powerpc/platforms/powernv/pci-ioda.c     |  8 +--
 drivers/vfio/vfio_iommu_spapr_tce.c           |  2 +-
 6 files changed, 73 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 4bdcf22..daa3ee5 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -70,7 +70,7 @@ struct iommu_table_ops {
 			unsigned long *hpa,
 			enum dma_data_direction *direction);
 
-	__be64 *(*useraddrptr)(struct iommu_table *tbl, long index);
+	__be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc);
 #endif
 	void (*clear)(struct iommu_table *tbl,
 			long index, long npages);
@@ -122,10 +122,13 @@ struct iommu_table {
 	__be64 *it_userspace; /* userspace view of the table */
 	struct iommu_table_ops *it_ops;
 	struct kref    it_kref;
+	int it_nid;
 };
 
+#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \
+		((tbl)->it_ops->useraddrptr((tbl), (entry), false))
 #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
-		((tbl)->it_ops->useraddrptr((tbl), (entry)))
+		((tbl)->it_ops->useraddrptr((tbl), (entry), true))
 
 /* Pure 2^n version of get_order */
 static inline __attribute_const__
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 5e02408..1fa5590 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -267,8 +267,10 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 		unsigned long attrs);
 extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
 extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction);
-extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index);
+		unsigned long *hpa, enum dma_data_direction *direction,
+		bool alloc);
+extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index,
+		bool alloc);
 extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
 
 extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 8cc1caf..efb90d8 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -200,7 +200,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
 {
 	struct mm_iommu_table_group_mem_t *mem = NULL;
 	const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
 
 	if (!pua)
 		/* it_userspace allocation might be delayed */
@@ -264,7 +264,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
 {
 	long ret;
 	unsigned long hpa = 0;
-	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
 	struct mm_iommu_table_group_mem_t *mem;
 
 	if (!pua)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index 36c2eb0..fe96910 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -48,7 +48,7 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
 	return addr;
 }
 
-static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx)
+static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
 {
 	__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
 	int  level = tbl->it_indirect_levels;
@@ -57,7 +57,23 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx)
 
 	while (level) {
 		int n = (idx & mask) >> (level * shift);
-		unsigned long tce = be64_to_cpu(tmp[n]);
+		unsigned long tce;
+
+		if (tmp[n] == 0) {
+			__be64 *tmp2;
+
+			if (!alloc)
+				return NULL;
+
+			tmp2 = pnv_alloc_tce_level(tbl->it_nid,
+					ilog2(tbl->it_level_size) + 3);
+			if (!tmp2)
+				return NULL;
+
+			tmp[n] = cpu_to_be64(__pa(tmp2) |
+					TCE_PCI_READ | TCE_PCI_WRITE);
+		}
+		tce = be64_to_cpu(tmp[n]);
 
 		tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
 		idx &= ~mask;
@@ -84,7 +100,7 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 			((rpn + i) << tbl->it_page_shift);
 		unsigned long idx = index - tbl->it_offset + i;
 
-		*(pnv_tce(tbl, false, idx)) = cpu_to_be64(newtce);
+		*(pnv_tce(tbl, false, idx, true)) = cpu_to_be64(newtce);
 	}
 
 	return 0;
@@ -92,31 +108,46 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 
 #ifdef CONFIG_IOMMU_API
 int pnv_tce_xchg(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction)
+		unsigned long *hpa, enum dma_data_direction *direction,
+		bool alloc)
 {
 	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
 	unsigned long newtce = *hpa | proto_tce, oldtce;
 	unsigned long idx = index - tbl->it_offset;
+	__be64 *ptce = NULL;
 
 	BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl));
 
+	if (*direction == DMA_NONE) {
+		ptce = pnv_tce(tbl, false, idx, false);
+		if (!ptce) {
+			*hpa = 0;
+			return 0;
+		}
+	}
+
+	if (!ptce) {
+		ptce = pnv_tce(tbl, false, idx, alloc);
+		if (!ptce)
+			return alloc ? H_HARDWARE : H_TOO_HARD;
+	}
+
 	if (newtce & TCE_PCI_WRITE)
 		newtce |= TCE_PCI_READ;
 
-	oldtce = be64_to_cpu(xchg(pnv_tce(tbl, false, idx),
-				  cpu_to_be64(newtce)));
+	oldtce = be64_to_cpu(xchg(ptce, cpu_to_be64(newtce)));
 	*hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
 	*direction = iommu_tce_direction(oldtce);
 
 	return 0;
 }
 
-__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index)
+__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, bool alloc)
 {
 	if (WARN_ON_ONCE(!tbl->it_userspace))
 		return NULL;
 
-	return pnv_tce(tbl, true, index - tbl->it_offset);
+	return pnv_tce(tbl, true, index - tbl->it_offset, alloc);
 }
 #endif
 
@@ -126,14 +157,19 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 
 	for (i = 0; i < npages; i++) {
 		unsigned long idx = index - tbl->it_offset + i;
+		__be64 *ptce = pnv_tce(tbl, false, idx,	false);
 
-		*(pnv_tce(tbl, false, idx)) = cpu_to_be64(0);
+		if (ptce)
+			*ptce = cpu_to_be64(0);
 	}
 }
 
 unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
 {
-	__be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset);
+	__be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset, false);
+
+	if (!ptce)
+		return 0;
 
 	return be64_to_cpu(*ptce);
 }
@@ -224,6 +260,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
 			PAGE_SHIFT);
 	const unsigned long tce_table_size = 1UL << table_shift;
+	unsigned int tmplevels = levels;
 
 	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
 		return -EINVAL;
@@ -231,6 +268,9 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	if (!is_power_of_2(window_size))
 		return -EINVAL;
 
+	if (alloc_userspace_copy && (window_size > (1ULL << 32)))
+		tmplevels = 1;
+
 	/* Adjust direct table size from window_size and levels */
 	entries_shift = (entries_shift + levels - 1) / levels;
 	level_shift = entries_shift + 3;
@@ -241,7 +281,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 
 	/* Allocate TCE table */
 	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
-			levels, tce_table_size, &offset, &total_allocated);
+			tmplevels, tce_table_size, &offset, &total_allocated);
 
 	/* addr==NULL means that the first level allocation failed */
 	if (!addr)
@@ -252,7 +292,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	 * we did not allocate as much as we wanted,
 	 * release partially allocated table.
 	 */
-	if (offset < tce_table_size)
+	if (tmplevels == levels && offset < tce_table_size)
 		goto free_tces_exit;
 
 	/* Allocate userspace view of the TCE table */
@@ -263,8 +303,8 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 				&total_allocated_uas);
 		if (!uas)
 			goto free_tces_exit;
-		if (offset < tce_table_size ||
-				total_allocated_uas != total_allocated)
+		if (tmplevels == levels && (offset < tce_table_size ||
+				total_allocated_uas != total_allocated))
 			goto free_uas_exit;
 	}
 
@@ -275,10 +315,11 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	tbl->it_indirect_levels = levels - 1;
 	tbl->it_allocated_size = total_allocated;
 	tbl->it_userspace = uas;
+	tbl->it_nid = nid;
 
-	pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d\n",
+	pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
 			window_size, tce_table_size, bus_offset, tbl->it_base,
-			tbl->it_userspace, levels);
+			tbl->it_userspace, tmplevels, levels);
 
 	return 0;
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index f3a7829..81489ae 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2011,7 +2011,7 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
 static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
 		unsigned long *hpa, enum dma_data_direction *direction)
 {
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
 
 	if (!ret)
 		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
@@ -2022,7 +2022,7 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
 static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
 		unsigned long *hpa, enum dma_data_direction *direction)
 {
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
 
 	if (!ret)
 		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
@@ -2176,7 +2176,7 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
 static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
 		unsigned long *hpa, enum dma_data_direction *direction)
 {
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
 
 	if (!ret)
 		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
@@ -2187,7 +2187,7 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
 static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
 		unsigned long *hpa, enum dma_data_direction *direction)
 {
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
 
 	if (!ret)
 		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 5a2e8e4..6e174ef 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -635,7 +635,7 @@ static long tce_iommu_create_table(struct tce_container *container,
 			page_shift, window_size, levels, ptbl);
 
 	WARN_ON(!ret && !(*ptbl)->it_ops->free);
-	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
+	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
 
 	return ret;
 }
-- 
2.11.0

^ permalink raw reply related

* [RESEND PATCH 3/3] powerpc: dts: use a correct at24 compatible fallback in ac14xx
From: Bartosz Golaszewski @ 2018-06-21  8:33 UTC (permalink / raw)
  To: Rob Herring, Mark Rutland, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman
  Cc: devicetree, linuxppc-dev, linux-kernel, Bartosz Golaszewski
In-Reply-To: <20180621083305.5322-1-brgl@bgdev.pl>

Using 'at24' as fallback is now deprecated - use the full
'atmel,<model>' string.

Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 arch/powerpc/boot/dts/ac14xx.dts | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/boot/dts/ac14xx.dts b/arch/powerpc/boot/dts/ac14xx.dts
index 83bcfd865167..0be5c4f3265d 100644
--- a/arch/powerpc/boot/dts/ac14xx.dts
+++ b/arch/powerpc/boot/dts/ac14xx.dts
@@ -176,12 +176,12 @@
 			clock-frequency = <400000>;
 
 			at24@30 {
-				compatible = "at24,24c01";
+				compatible = "atmel,24c01";
 				reg = <0x30>;
 			};
 
 			at24@31 {
-				compatible = "at24,24c01";
+				compatible = "atmel,24c01";
 				reg = <0x31>;
 			};
 
@@ -191,42 +191,42 @@
 			};
 
 			at24@50 {
-				compatible = "at24,24c01";
+				compatible = "atmel,24c01";
 				reg = <0x50>;
 			};
 
 			at24@51 {
-				compatible = "at24,24c01";
+				compatible = "atmel,24c01";
 				reg = <0x51>;
 			};
 
 			at24@52 {
-				compatible = "at24,24c01";
+				compatible = "atmel,24c01";
 				reg = <0x52>;
 			};
 
 			at24@53 {
-				compatible = "at24,24c01";
+				compatible = "atmel,24c01";
 				reg = <0x53>;
 			};
 
 			at24@54 {
-				compatible = "at24,24c01";
+				compatible = "atmel,24c01";
 				reg = <0x54>;
 			};
 
 			at24@55 {
-				compatible = "at24,24c01";
+				compatible = "atmel,24c01";
 				reg = <0x55>;
 			};
 
 			at24@56 {
-				compatible = "at24,24c01";
+				compatible = "atmel,24c01";
 				reg = <0x56>;
 			};
 
 			at24@57 {
-				compatible = "at24,24c01";
+				compatible = "atmel,24c01";
 				reg = <0x57>;
 			};
 
-- 
2.17.1

^ permalink raw reply related

* [RESEND PATCH 2/3] powerpc: dts: use 'atmel' as at24 manufacturer for kmcent2
From: Bartosz Golaszewski @ 2018-06-21  8:33 UTC (permalink / raw)
  To: Rob Herring, Mark Rutland, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman
  Cc: devicetree, linuxppc-dev, linux-kernel, Bartosz Golaszewski
In-Reply-To: <20180621083305.5322-1-brgl@bgdev.pl>

Using compatible strings without the <manufacturer> part for at24 is
now deprecated. Use a correct 'atmel,<model>' value.

Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 arch/powerpc/boot/dts/fsl/kmcent2.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/boot/dts/fsl/kmcent2.dts b/arch/powerpc/boot/dts/fsl/kmcent2.dts
index 5922c1ea0e96..3094df05f5ea 100644
--- a/arch/powerpc/boot/dts/fsl/kmcent2.dts
+++ b/arch/powerpc/boot/dts/fsl/kmcent2.dts
@@ -130,7 +130,7 @@
 					#size-cells = <0>;
 
 					eeprom@54 {
-						compatible = "24c02";
+						compatible = "atmel,24c02";
 						reg = <0x54>;
 						pagesize = <2>;
 						read-only;
-- 
2.17.1

^ permalink raw reply related

* [RESEND PATCH 1/3] powerpc: dts: use 'atmel' as at24 manufacturer for pdm360ng
From: Bartosz Golaszewski @ 2018-06-21  8:33 UTC (permalink / raw)
  To: Rob Herring, Mark Rutland, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman
  Cc: devicetree, linuxppc-dev, linux-kernel, Bartosz Golaszewski

Using 'at' as the <manufacturer> part of the compatible string is now
deprecated. Use a correct string: 'atmel,<model>'.

Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 arch/powerpc/boot/dts/pdm360ng.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/boot/dts/pdm360ng.dts b/arch/powerpc/boot/dts/pdm360ng.dts
index 445b88114009..df1283b63d9b 100644
--- a/arch/powerpc/boot/dts/pdm360ng.dts
+++ b/arch/powerpc/boot/dts/pdm360ng.dts
@@ -98,7 +98,7 @@
 			fsl,preserve-clocking;
 
 			eeprom@50 {
-				compatible = "at,24c01";
+				compatible = "atmel,24c01";
 				reg = <0x50>;
 			};
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH 2/2] powerpc/mm: Increase MAX_PHYSMEM_BITS to 128TB with SPARSEMEM_VMEMMAP config
From: Aneesh Kumar K.V @ 2018-06-21  8:31 UTC (permalink / raw)
  To: npiggin, benh, paulus, mpe; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <20180621083158.30849-1-aneesh.kumar@linux.ibm.com>

We do this only with VMEMMAP config so that our page_to_[nid/section] etc are not
impacted.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/sparsemem.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
index bc66712bdc3c..28f5dae25db6 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -6,13 +6,20 @@
 #ifdef CONFIG_SPARSEMEM
 /*
  * SECTION_SIZE_BITS		2^N: how big each section will be
- * MAX_PHYSADDR_BITS		2^N: how much physical address space we have
  * MAX_PHYSMEM_BITS		2^N: how much memory we can have in that space
  */
 #define SECTION_SIZE_BITS       24
-
-#define MAX_PHYSADDR_BITS       46
+/*
+ * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS
+ * if we increase SECTIONS_WIDTH we will not store node details in page->flags and
+ * page_to_nid does a page->section->node lookup
+ * Hence only increase for VMEMMAP.
+ */
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#define MAX_PHYSMEM_BITS        47
+#else
 #define MAX_PHYSMEM_BITS        46
+#endif
 
 #endif /* CONFIG_SPARSEMEM */
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH 1/2] powerpc/mm: Check memblock_add against MAX_PHYSMEM_BITS range
From: Aneesh Kumar K.V @ 2018-06-21  8:31 UTC (permalink / raw)
  To: npiggin, benh, paulus, mpe; +Cc: linuxppc-dev, Aneesh Kumar K.V

With SPARSEMEM config enabled, we make sure that we don't add sections beyond
MAX_PHYSMEM_BITS range. This results in not building vmemmap mapping for
range beyond max range. But our memblock layer looks at the device tree and creates
mapping for the full memory range. Prevent this by checking against
MAX_PHYSMEM_BITS when doing memblock_add.

We don't do a similar check for memblock_reserve_range. If a reserve range is beyond
MAX_PHYSMEM_BITS we expect that to be configured with 'nomap'. Any other
reserved range should come from existing memblock ranges which we already
filtered while adding.

This avoids a crash as below when running on a system with system ram config above
MAX_PHYSMEM_BITS

 Unable to handle kernel paging request for data at address 0xc00a001000000440
 Faulting instruction address: 0xc000000001034118
 cpu 0x0: Vector: 300 (Data Access) at [c00000000124fb30]
     pc: c000000001034118: __free_pages_bootmem+0xc0/0x1c0
     lr: c00000000103b258: free_all_bootmem+0x19c/0x22c
     sp: c00000000124fdb0
    msr: 9000000002001033
    dar: c00a001000000440
  dsisr: 40000000
   current = 0xc00000000120dd00
   paca    = 0xc000000001f60000^I irqmask: 0x03^I irq_happened: 0x01
     pid   = 0, comm = swapper
 [c00000000124fe20] c00000000103b258 free_all_bootmem+0x19c/0x22c
 [c00000000124fee0] c000000001010a68 mem_init+0x3c/0x5c
 [c00000000124ff00] c00000000100401c start_kernel+0x298/0x5e4
 [c00000000124ff90] c00000000000b57c start_here_common+0x1c/0x520

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/kernel/prom.c | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 05e7fb47a7a4..8f32f14ba508 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -440,6 +440,29 @@ static int __init early_init_dt_scan_chosen_ppc(unsigned long node,
 	return 1;
 }
 
+/*
+ * Compare the range against max mem limit and update
+ * size if it cross the limit.
+ */
+
+#ifdef CONFIG_SPARSEMEM
+static bool validate_mem_limit(u64 base, u64 *size)
+{
+	u64 max_mem = 1UL << (MAX_PHYSMEM_BITS);
+
+	if (base >= max_mem)
+		return false;
+	if ((base + *size) > max_mem)
+		*size = max_mem - base;
+	return true;
+}
+#else
+static bool validate_mem_limit(u64 base, u64 *size)
+{
+	return true;
+}
+#endif
+
 #ifdef CONFIG_PPC_PSERIES
 /*
  * Interpret the ibm dynamic reconfiguration memory LMBs.
@@ -494,7 +517,8 @@ static void __init early_init_drmem_lmb(struct drmem_lmb *lmb,
 		}
 
 		DBG("Adding: %llx -> %llx\n", base, size);
-		memblock_add(base, size);
+		if (validate_mem_limit(base, &size))
+			memblock_add(base, size);
 	} while (--rngs);
 }
 #endif /* CONFIG_PPC_PSERIES */
@@ -548,8 +572,10 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
 	}
 
 	/* Add the chunk to the MEMBLOCK list */
-	if (add_mem_to_memblock)
-		memblock_add(base, size);
+	if (add_mem_to_memblock) {
+		if (validate_mem_limit(base, &size))
+			memblock_add(base, size);
+	}
 }
 
 static void __init early_reserve_mem_dt(void)
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH kernel v2 6/6] powerpc/powernv/ioda: Allocate indirect TCE levels on demand
From: Alexey Kardashevskiy @ 2018-06-21  7:16 UTC (permalink / raw)
  To: David Gibson
  Cc: linuxppc-dev, kvm-ppc, Alex Williamson, Benjamin Herrenschmidt,
	Russell Currey
In-Reply-To: <20180621020321.GE32328@umbus.fritz.box>

[-- Attachment #1: Type: text/plain, Size: 7098 bytes --]

On Thu, 21 Jun 2018 12:03:21 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Sun, Jun 17, 2018 at 09:14:28PM +1000, Alexey Kardashevskiy wrote:
> > At the moment we allocate the entire TCE table, twice (hardware part and
> > userspace translation cache). This normally works as we normally have
> > contigous memory and the guest will map entire RAM for 64bit DMA.
> > 
> > However if we have sparse RAM (one example is a memory device), then
> > we will allocate TCEs which will never be used as the guest only maps
> > actual memory for DMA. If it is a single level TCE table, there is nothing
> > we can really do but if it a multilevel table, we can skip allocating
> > TCEs we know we won't need.
> > 
> > This adds ability to allocate only first level, saving memory.
> > 
> > This changes iommu_table::free() to avoid allocating of an extra level;
> > iommu_table::set() will do this when needed.
> > 
> > This adds @alloc parameter to iommu_table::exchange() to tell the callback
> > if it can allocate an extra level; the flag is set to "false" for
> > the realmode KVM handlers of H_PUT_TCE hcalls and the callback returns
> > H_TOO_HARD.
> > 
> > This still requires the entire table to be counted in mm::locked_vm.
> > 
> > To be conservative, this only does on-demand allocation when
> > the usespace cache table is requested which is the case of VFIO.
> > 
> > The example math for a system replicating a powernv setup with NVLink2
> > in a guest:
> > 16GB RAM mapped at 0x0
> > 128GB GPU RAM window (16GB of actual RAM) mapped at 0x244000000000
> > 
> > the table to cover that all with 64K pages takes:
> > (((0x244000000000 + 0x2000000000) >> 16)*8)>>20 = 4556MB
> > 
> > If we allocate only necessary TCE levels, we will only need:
> > (((0x400000000 + 0x400000000) >> 16)*8)>>20 = 4MB (plus some for indirect
> > levels).
> > 
> > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > ---
> > Changes:
> > v2:
> > * fixed bug in cleanup path which forced the entire table to be
> > allocated right before destroying
> > * added memory allocation error handling pnv_tce()
> > ---
> >  arch/powerpc/include/asm/iommu.h              |  7 ++-
> >  arch/powerpc/platforms/powernv/pci.h          |  6 ++-
> >  arch/powerpc/kvm/book3s_64_vio_hv.c           |  4 +-
> >  arch/powerpc/platforms/powernv/pci-ioda-tce.c | 69 ++++++++++++++++++++-------
> >  arch/powerpc/platforms/powernv/pci-ioda.c     |  8 ++--
> >  drivers/vfio/vfio_iommu_spapr_tce.c           |  2 +-
> >  6 files changed, 69 insertions(+), 27 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> > index 4bdcf22..daa3ee5 100644
> > --- a/arch/powerpc/include/asm/iommu.h
> > +++ b/arch/powerpc/include/asm/iommu.h
> > @@ -70,7 +70,7 @@ struct iommu_table_ops {
> >  			unsigned long *hpa,
> >  			enum dma_data_direction *direction);
> >  
> > -	__be64 *(*useraddrptr)(struct iommu_table *tbl, long index);
> > +	__be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc);
> >  #endif
> >  	void (*clear)(struct iommu_table *tbl,
> >  			long index, long npages);
> > @@ -122,10 +122,13 @@ struct iommu_table {
> >  	__be64 *it_userspace; /* userspace view of the table */
> >  	struct iommu_table_ops *it_ops;
> >  	struct kref    it_kref;
> > +	int it_nid;
> >  };
> >  
> > +#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \
> > +		((tbl)->it_ops->useraddrptr((tbl), (entry), false))
> >  #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
> > -		((tbl)->it_ops->useraddrptr((tbl), (entry)))
> > +		((tbl)->it_ops->useraddrptr((tbl), (entry), true))
> >  
> >  /* Pure 2^n version of get_order */
> >  static inline __attribute_const__
> > diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> > index 5e02408..1fa5590 100644
> > --- a/arch/powerpc/platforms/powernv/pci.h
> > +++ b/arch/powerpc/platforms/powernv/pci.h
> > @@ -267,8 +267,10 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
> >  		unsigned long attrs);
> >  extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
> >  extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
> > -		unsigned long *hpa, enum dma_data_direction *direction);
> > -extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index);
> > +		unsigned long *hpa, enum dma_data_direction *direction,
> > +		bool alloc);
> > +extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index,
> > +		bool alloc);
> >  extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
> >  
> >  extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
> > diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> > index db0490c..05b4865 100644
> > --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> > +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> > @@ -200,7 +200,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
> >  {
> >  	struct mm_iommu_table_group_mem_t *mem = NULL;
> >  	const unsigned long pgsize = 1ULL << tbl->it_page_shift;
> > -	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
> > +	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
> >  
> >  	if (!pua)
> >  		/* it_userspace allocation might be delayed */
> > @@ -264,7 +264,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
> >  {
> >  	long ret;
> >  	unsigned long hpa = 0;
> > -	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
> > +	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
> >  	struct mm_iommu_table_group_mem_t *mem;
> >  
> >  	if (!pua)
> > diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> > index 36c2eb0..a7debfb 100644
> > --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> > +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> > @@ -48,7 +48,7 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
> >  	return addr;
> >  }
> >  
> > -static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx)
> > +static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
> >  {
> >  	__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
> >  	int  level = tbl->it_indirect_levels;
> > @@ -57,7 +57,20 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx)
> >  
> >  	while (level) {
> >  		int n = (idx & mask) >> (level * shift);
> > -		unsigned long tce = be64_to_cpu(tmp[n]);
> > +		unsigned long tce;
> > +
> > +		if (tmp[n] == 0) {
> > +			__be64 *tmp2;
> > +
> > +			if (!alloc)
> > +				return NULL;
> > +
> > +			tmp2 = pnv_alloc_tce_level(tbl->it_nid,
> > +					ilog2(tbl->it_level_size) + 3);  
> 
> Can this allocation fail?  If it does you'll crash as you dereference
> NULL just below.


Oh. I fixed both comments and I lost the fix in rebase, I'll repost :(



--
Alexey

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH] powerpc: Wire up io_pgetevents
From: Christoph Hellwig @ 2018-06-21  6:11 UTC (permalink / raw)
  To: Breno Leitao; +Cc: linuxppc-dev, Christoph Hellwig
In-Reply-To: <1529523316-25027-1-git-send-email-leitao@debian.org>

On Wed, Jun 20, 2018 at 04:35:16PM -0300, Breno Leitao wrote:
> Wire up io_pgetevents system call on powerpc.
> 
> io_pgetevents is a new syscall to read asynchronous I/O events from the
> completion queue.
> 
> Tested with libaio branch aio-poll[1] and the io_pgetevents test (#22) passed
> on both ppc64 LE and BE modes.

Looks good to me:

Acked-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply

* Re: [PATCH v2 1/6] powerpc/pkeys: Enable all user-allocatable pkeys at init.
From: Michael Ellerman @ 2018-06-21  4:14 UTC (permalink / raw)
  To: Ram Pai
  Cc: linuxppc-dev, dave.hansen, aneesh.kumar, bsingharora, hbabu,
	mhocko, bauerman, Ulrich.Weigand, fweimer, luto, msuchanek
In-Reply-To: <20180619142537.GC5294@ram.oc3035372033.ibm.com>

Ram Pai <linuxram@us.ibm.com> writes:
> On Tue, Jun 19, 2018 at 10:39:52PM +1000, Michael Ellerman wrote:
>> Ram Pai <linuxram@us.ibm.com> writes:
>> 
>> > In a multithreaded application, a key allocated by one thread must
>> > be activate and usable on all threads.
>> >
>> > Currently this is not the case, because the UAMOR bits for all keys are
>> > disabled by default. When a new key is allocated in one thread, though
>> > the corresponding UAMOR bits for that thread get enabled, the UAMOR bits
>> > for all other existing threads continue to have their bits disabled.
>> > Other threads have no way to set permissions on the key, effectively
>> > making the key useless.
>> 
>> This all seems a bit strongly worded to me. It's arguable whether a key
>> should be usable by the thread that allocated it or all threads.
>> 
>> You could conceivably have a design where threads are blocked from using
>> a key until they're given permission to do so by the thread that
>> allocated the key.
>> 
>> But we're changing the behaviour to match x86 and because we don't have
>> an API to grant another thread access to a key. Right?
>
> correct. The other threads have no way to access or change the
> permissions on the key.

OK.

Though prior to patch 6 all threads have read/write permissions for all
keys, so they don't necessarily need to change permissions on a key
allocated by another thread.

>> > Enable the UAMOR bits for all keys, at process creation. Since the
>> > contents of UAMOR are inherited at fork, all threads are capable of
>> > modifying the permissions on any key.
>> >
>> > BTW: changing the permission on unallocated keys has no effect, till
>> > those keys are not associated with any PTEs. The kernel will anyway
>> > disallow association of unallocated keys with PTEs.
>> 
>> This is an ABI change, which is bad, but I guess we call it a bug fix
>> because things didn't really work previously?
>
> Yes, it's a behavioral change for the better. There is no downside
> to the change because no applications should break. Single threaded
> apps will continue to just work fine. Multithreaded applications,
> which were unable to consume the API/ABI, will now be able to do so.

Multi-threaded applications were able to use the API, as long as they
were satisfied with the semantics it provided, ie. that restrictions on
a key were only possible on the thread that allocated the key.

I'm not trying to argue for the sake of it, it's important that we
understand the subtleties of what we're changing and how it affects
existing software - even if we think there is essentially no existing
software.

I'll try and massage the change log to capture it.

I ended up with what's below.

cheers

  powerpc/pkeys: Give all threads control of their key permissions

  Currently in a multithreaded application, a key allocated by one
  thread is not usable by other threads. By "not usable" we mean that
  other threads are unable to change the access permissions for that
  key for themselves.

  When a new key is allocated in one thread, the corresponding UAMOR
  bits for that thread get enabled, however the UAMOR bits for that key
  for all other threads remain disabled.

  Other threads have no way to set permissions on the key, and the
  current default permissions are that read/write is enabled for all
  keys, which means the key has no effect for other threads. Although
  that may be the desired behaviour in some circumstances, having all
  threads able to control their permissions for the key is more
  flexible.

  The current behaviour also differs from the x86 behaviour, which is
  problematic for users.

  To fix this, enable the UAMOR bits for all keys, at process
  creation (in start_thread(), ie exec time). Since the contents of
  UAMOR are inherited at fork, all threads are capable of modifying the
  permissions on any key.

  This is technically an ABI break on powerpc, but pkey support is
  fairly new on powerpc and not widely used, and this brings us into
  line with x86.

  Fixes: cf43d3b26452 ("powerpc: Enable pkey subsystem")
  Cc: stable@vger.kernel.org # v4.16+
  Tested-by: Florian Weimer <fweimer@redhat.com>
  Signed-off-by: Ram Pai <linuxram@us.ibm.com>
  [mpe: Reword some of the changelog]
  Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

^ permalink raw reply

* Re: [PATCH v2 2/6] powerpc/pkeys: Save the pkey registers before fork
From: Michael Ellerman @ 2018-06-21  4:13 UTC (permalink / raw)
  To: Ram Pai
  Cc: linuxppc-dev, dave.hansen, aneesh.kumar, bsingharora, hbabu,
	mhocko, bauerman, Ulrich.Weigand, fweimer, luto, msuchanek
In-Reply-To: <20180619142811.GD5294@ram.oc3035372033.ibm.com>

Ram Pai <linuxram@us.ibm.com> writes:

> On Tue, Jun 19, 2018 at 10:39:56PM +1000, Michael Ellerman wrote:
>> Ram Pai <linuxram@us.ibm.com> writes:
>> 
>> > When a thread forks the contents of AMR, IAMR, UAMOR registers in the
>> > newly forked thread are not inherited.
>> >
>> > Save the registers before forking, for content of those
>> > registers to be automatically copied into the new thread.
>> >
>> > CC: Michael Ellerman <mpe@ellerman.id.au>
>> > CC: Florian Weimer <fweimer@redhat.com>
>> > CC: Andy Lutomirski <luto@kernel.org>
>> > CC: Thiago Jung Bauermann <bauerman@linux.ibm.com>
>> > Signed-off-by: Ram Pai <linuxram@us.ibm.com>
>> 
>> Again this is an ABI change but we'll call it a bug fix I guess.
>
> Yes, the same defense applies here too. It's a behavioral change for the better.
> Single threaded applications will not see any behavioral change.
> Multithreaded apps, which were unable to consume the behavior, will now be
> able to do so.

Well threads is one thing, but this also affects processes.

And actually without this fix it's possible that a child process could
fault on a region protected in the parent, if the value in the AMR in
the thread struct happens to block access at the time of fork(). The
value in the thread struct would be whatever was in the AMR the last
time the parent was scheduled in. I think?

cheers

^ permalink raw reply

* RE: [PATCH 0/7 v5] Support for fsl-mc bus and its devices in SMMU
From: Nipun Gupta @ 2018-06-21  3:59 UTC (permalink / raw)
  To: robin.murphy@arm.com, will.deacon@arm.com,
	gregkh@linuxfoundation.org
  Cc: hch@lst.de, joro@8bytes.org, m.szyprowski@samsung.com,
	shawnguo@kernel.org, frowand.list@gmail.com,
	iommu@lists.linux-foundation.org, linux-kernel@vger.kernel.org,
	devicetree@vger.kernel.org, linux-arm-kernel@lists.infradead.org,
	linuxppc-dev@lists.ozlabs.org, linux-pci@vger.kernel.org,
	Bharat Bhushan, stuyoder@gmail.com, Leo Li
In-Reply-To: <1526824191-7000-1-git-send-email-nipun.gupta@nxp.com>

Hi Robin/Greg k-h,

Will this patch-set be taken for the next kernel release (and via which tree)?

Thanks,
Nipun

> -----Original Message-----
> From: Nipun Gupta
> Sent: Sunday, May 20, 2018 7:20 PM
> To: robin.murphy@arm.com; will.deacon@arm.com; robh+dt@kernel.org;
> robh@kernel.org; mark.rutland@arm.com; catalin.marinas@arm.com;
> gregkh@linuxfoundation.org; Laurentiu Tudor <laurentiu.tudor@nxp.com>;
> bhelgaas@google.com
> Cc: hch@lst.de; joro@8bytes.org; m.szyprowski@samsung.com;
> shawnguo@kernel.org; frowand.list@gmail.com; iommu@lists.linux-
> foundation.org; linux-kernel@vger.kernel.org; devicetree@vger.kernel.org;
> linux-arm-kernel@lists.infradead.org; linuxppc-dev@lists.ozlabs.org; linux-
> pci@vger.kernel.org; Bharat Bhushan <bharat.bhushan@nxp.com>;
> stuyoder@gmail.com; Leo Li <leoyang.li@nxp.com>; Nipun Gupta
> <nipun.gupta@nxp.com>
> Subject: [PATCH 0/7 v5] Support for fsl-mc bus and its devices in SMMU
>=20
> This patchset defines IOMMU DT binding for fsl-mc bus and adds
> support in SMMU for fsl-mc bus.
>=20
> The patch series is based on top of dma-mapping tree (for-next branch):
> http://git.infradead.org/users/hch/dma-mapping.git
>=20
> These patches
>   - Define property 'iommu-map' for fsl-mc bus (patch 1)
>   - Integrates the fsl-mc bus with the SMMU using this
>     IOMMU binding (patch 2,3,4)
>   - Adds the dma configuration support for fsl-mc bus (patch 5, 6)
>   - Updates the fsl-mc device node with iommu/dma related changes (patch 7)
>=20
> Changes in v2:
>   - use iommu-map property for fsl-mc bus
>   - rebase over patchset https://patchwork.kernel.org/patch/10317337/
>     and make corresponding changes for dma configuration of devices on
>     fsl-mc bus
>=20
> Changes in v3:
>   - move of_map_rid in drivers/of/address.c
>=20
> Changes in v4:
>   - move of_map_rid in drivers/of/base.c
>=20
> Changes in v5:
>   - break patch 5 in two separate patches (now patch 5/7 and patch 6/7)
>   - add changelog text in patch 3/7 and patch 5/7
>   - typo fix
>=20
> Nipun Gupta (7):
>   Docs: dt: add fsl-mc iommu-map device-tree binding
>   iommu: of: make of_pci_map_rid() available for other devices too
>   iommu: support iommu configuration for fsl-mc devices
>   iommu: arm-smmu: Add support for the fsl-mc bus
>   bus: fsl-mc: support dma configure for devices on fsl-mc bus
>   bus: fsl-mc: set coherent dma mask for devices on fsl-mc bus
>   arm64: dts: ls208xa: comply with the iommu map binding for fsl_mc
>=20
>  .../devicetree/bindings/misc/fsl,qoriq-mc.txt      |  39 ++++++++
>  arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi     |   6 +-
>  drivers/bus/fsl-mc/fsl-mc-bus.c                    |  16 +++-
>  drivers/iommu/arm-smmu.c                           |   7 ++
>  drivers/iommu/iommu.c                              |  21 +++++
>  drivers/iommu/of_iommu.c                           |  25 ++++-
>  drivers/of/base.c                                  | 102 +++++++++++++++++++++
>  drivers/of/irq.c                                   |   5 +-
>  drivers/pci/of.c                                   | 101 --------------------
>  include/linux/fsl/mc.h                             |   8 ++
>  include/linux/iommu.h                              |   2 +
>  include/linux/of.h                                 |  11 +++
>  include/linux/of_pci.h                             |  10 --
>  13 files changed, 231 insertions(+), 122 deletions(-)
>=20
> --
> 1.9.1

^ permalink raw reply

* RE: [PATCH] QE GPIO: Add qe_gpio_set_multiple
From: Qiang Zhao @ 2018-06-21  2:38 UTC (permalink / raw)
  To: jocke@infinera.com, York Sun, linuxppc-dev
In-Reply-To: <20180619162216.20030-1-joakim.tjernlund@infinera.com>

T24gMDYvMTkvMjAxOCAwOToyMiBBTSwgSm9ha2ltIFRqZXJubHVuZCB3cm90ZToNCi0tLS0tT3Jp
Z2luYWwgTWVzc2FnZS0tLS0tDQpGcm9tOiBMaW51eHBwYy1kZXYgW21haWx0bzpsaW51eHBwYy1k
ZXYtYm91bmNlcytxaWFuZy56aGFvPW54cC5jb21AbGlzdHMub3psYWJzLm9yZ10gT24gQmVoYWxm
IE9mIEpvYWtpbSBUamVybmx1bmQNClNlbnQ6IDIwMTjE6jbUwjIwyNUgMDoyMg0KVG86IFlvcmsg
U3VuIDx5b3JrLnN1bkBueHAuY29tPjsgbGludXhwcGMtZGV2IDxsaW51eHBwYy1kZXZAbGlzdHMu
b3psYWJzLm9yZz4NClN1YmplY3Q6IFtQQVRDSF0gUUUgR1BJTzogQWRkIHFlX2dwaW9fc2V0X211
bHRpcGxlDQoNClRoaXMgY291c2luIHRvIGdwaW8tbXBjOHh4eCB3YXMgbGFja2luZyBhIG11bHRp
cGxlIHBpbnMgbWV0aG9kLCBhZGQgb25lLg0KDQpTaWduZWQtb2ZmLWJ5OiBKb2FraW0gVGplcm5s
dW5kIDxqb2FraW0udGplcm5sdW5kQGluZmluZXJhLmNvbT4NCi0tLQ0KIGRyaXZlcnMvc29jL2Zz
bC9xZS9ncGlvLmMgfCAyOCArKysrKysrKysrKysrKysrKysrKysrKysrKysrDQogMSBmaWxlIGNo
YW5nZWQsIDI4IGluc2VydGlvbnMoKykNCg0KZGlmZiAtLWdpdCBhL2RyaXZlcnMvc29jL2ZzbC9x
ZS9ncGlvLmMgYi9kcml2ZXJzL3NvYy9mc2wvcWUvZ3Bpby5jIGluZGV4IDNiMjcwNzVjMjFhNy4u
ODE5YmVkMGY1NjY3IDEwMDY0NA0KLS0tIGEvZHJpdmVycy9zb2MvZnNsL3FlL2dwaW8uYw0KKysr
IGIvZHJpdmVycy9zb2MvZnNsL3FlL2dwaW8uYw0KQEAgLTgzLDYgKzgzLDMzIEBAIHN0YXRpYyB2
b2lkIHFlX2dwaW9fc2V0KHN0cnVjdCBncGlvX2NoaXAgKmdjLCB1bnNpZ25lZCBpbnQgZ3Bpbywg
aW50IHZhbCkNCiAJc3Bpbl91bmxvY2tfaXJxcmVzdG9yZSgmcWVfZ2MtPmxvY2ssIGZsYWdzKTsg
IH0NCiANCitzdGF0aWMgdm9pZCBxZV9ncGlvX3NldF9tdWx0aXBsZShzdHJ1Y3QgZ3Bpb19jaGlw
ICpnYywNCisJCQkJIHVuc2lnbmVkIGxvbmcgKm1hc2ssIHVuc2lnbmVkIGxvbmcgKmJpdHMpIHsN
CisJc3RydWN0IG9mX21tX2dwaW9fY2hpcCAqbW1fZ2MgPSB0b19vZl9tbV9ncGlvX2NoaXAoZ2Mp
Ow0KKwlzdHJ1Y3QgcWVfZ3Bpb19jaGlwICpxZV9nYyA9IGdwaW9jaGlwX2dldF9kYXRhKGdjKTsN
CisJc3RydWN0IHFlX3Bpb19yZWdzIF9faW9tZW0gKnJlZ3MgPSBtbV9nYy0+cmVnczsNCisJdW5z
aWduZWQgbG9uZyBmbGFnczsNCisJaW50IGk7DQorDQorCXNwaW5fbG9ja19pcnFzYXZlKCZxZV9n
Yy0+bG9jaywgZmxhZ3MpOw0KKw0KKwlmb3IgKGkgPSAwOyBpIDwgZ2MtPm5ncGlvOyBpKyspIHsN
CisJCWlmICgqbWFzayA9PSAwKQ0KKwkJCWJyZWFrOw0KKwkJaWYgKF9fdGVzdF9hbmRfY2xlYXJf
Yml0KGksIG1hc2spKSB7DQorCQkJaWYgKHRlc3RfYml0KGksIGJpdHMpKQ0KKwkJCQlxZV9nYy0+
Y3BkYXRhIHw9ICgxVSA8PCAoUUVfUElPX1BJTlMgLSAxIC0gaSkpOw0KKwkJCWVsc2UNCisJCQkJ
cWVfZ2MtPmNwZGF0YSAmPSB+KDFVIDw8IChRRV9QSU9fUElOUyAtIDEgLSBpKSk7DQorCQl9DQor
CX0NCisNCisJb3V0X2JlMzIoJnJlZ3MtPmNwZGF0YSwgcWVfZ2MtPmNwZGF0YSk7DQorDQorCXNw
aW5fdW5sb2NrX2lycXJlc3RvcmUoJnFlX2djLT5sb2NrLCBmbGFncyk7IH0NCisNCiBzdGF0aWMg
aW50IHFlX2dwaW9fZGlyX2luKHN0cnVjdCBncGlvX2NoaXAgKmdjLCB1bnNpZ25lZCBpbnQgZ3Bp
bykgIHsNCiAJc3RydWN0IG9mX21tX2dwaW9fY2hpcCAqbW1fZ2MgPSB0b19vZl9tbV9ncGlvX2No
aXAoZ2MpOyBAQCAtMjk4LDYgKzMyNSw3IEBAIHN0YXRpYyBpbnQgX19pbml0IHFlX2FkZF9ncGlv
Y2hpcHModm9pZCkNCiAJCWdjLT5kaXJlY3Rpb25fb3V0cHV0ID0gcWVfZ3Bpb19kaXJfb3V0Ow0K
IAkJZ2MtPmdldCA9IHFlX2dwaW9fZ2V0Ow0KIAkJZ2MtPnNldCA9IHFlX2dwaW9fc2V0Ow0KKwkJ
Z2MtPnNldF9tdWx0aXBsZSA9IHFlX2dwaW9fc2V0X211bHRpcGxlOw0KIA0KIAkJcmV0ID0gb2Zf
bW1fZ3Bpb2NoaXBfYWRkX2RhdGEobnAsIG1tX2djLCBxZV9nYyk7DQogCQlpZiAocmV0KQ0KDQpS
ZXZpZXdlZC1ieTogUWlhbmcgWmhhbyA8cWlhbmcuemhhb0BueHAuY29tPg0KDQo=

^ permalink raw reply

* Re: [PATCH v04 9/9] hotplug/pmt: Update topology after PMT
From: kbuild test robot @ 2018-06-21  2:13 UTC (permalink / raw)
  To: Michael Bringmann
  Cc: kbuild-all, linuxppc-dev, Nathan Fontenot, Michael Bringmann,
	Thomas Falcon, Tyrel Datwyler, John Allen
In-Reply-To: <71195eb3-e11c-f586-d359-7026bdfd9a8a@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 1957 bytes --]

Hi Michael,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on v4.18-rc1 next-20180620]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Michael-Bringmann/powerpc-hotplug-Update-affinity-for-migrated-CPUs/20180621-085543
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-defconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=powerpc 

All errors (new ones prefixed by >>):

   arch/powerpc/platforms/pseries/dlpar.c: In function 'dlpar_pmt':
>> arch/powerpc/platforms/pseries/dlpar.c:453:2: error: implicit declaration of function 'rebuild_sched_domains' [-Werror=implicit-function-declaration]
     rebuild_sched_domains();
     ^~~~~~~~~~~~~~~~~~~~~
   cc1: all warnings being treated as errors

vim +/rebuild_sched_domains +453 arch/powerpc/platforms/pseries/dlpar.c

   435	
   436	static int dlpar_pmt(struct pseries_hp_errorlog *work)
   437	{
   438		struct list_head *pos, *q;
   439	
   440		ssleep(15);
   441	
   442		list_for_each_safe(pos, q, &dlpar_delayed_list) {
   443			struct pseries_hp_errorlog *tmp;
   444	
   445			tmp = list_entry(pos, struct pseries_hp_errorlog, list);
   446			handle_dlpar_errorlog(tmp);
   447	
   448			list_del(pos);
   449			kfree(tmp);
   450		}
   451	
   452		ssleep(5);
 > 453		rebuild_sched_domains();
   454	
   455		return 0;
   456	}
   457	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 23378 bytes --]

^ permalink raw reply

* Re: [PATCH kernel v2 6/6] powerpc/powernv/ioda: Allocate indirect TCE levels on demand
From: David Gibson @ 2018-06-21  2:03 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linuxppc-dev, kvm-ppc, Alex Williamson, Benjamin Herrenschmidt,
	Russell Currey
In-Reply-To: <20180617111428.24349-7-aik@ozlabs.ru>

[-- Attachment #1: Type: text/plain, Size: 14894 bytes --]

On Sun, Jun 17, 2018 at 09:14:28PM +1000, Alexey Kardashevskiy wrote:
> At the moment we allocate the entire TCE table, twice (hardware part and
> userspace translation cache). This normally works as we normally have
> contiguous memory and the guest will map entire RAM for 64bit DMA.
> 
> However if we have sparse RAM (one example is a memory device), then
> we will allocate TCEs which will never be used as the guest only maps
> actual memory for DMA. If it is a single level TCE table, there is nothing
> we can really do but if it is a multilevel table, we can skip allocating
> TCEs we know we won't need.
> 
> This adds ability to allocate only first level, saving memory.
> 
> This changes iommu_table::free() to avoid allocating an extra level;
> iommu_table::set() will do this when needed.
> 
> This adds @alloc parameter to iommu_table::exchange() to tell the callback
> if it can allocate an extra level; the flag is set to "false" for
> the realmode KVM handlers of H_PUT_TCE hcalls and the callback returns
> H_TOO_HARD.
> 
> This still requires the entire table to be counted in mm::locked_vm.
> 
> To be conservative, this only does on-demand allocation when
> the userspace cache table is requested which is the case of VFIO.
> 
> The example math for a system replicating a powernv setup with NVLink2
> in a guest:
> 16GB RAM mapped at 0x0
> 128GB GPU RAM window (16GB of actual RAM) mapped at 0x244000000000
> 
> the table to cover that all with 64K pages takes:
> (((0x244000000000 + 0x2000000000) >> 16)*8)>>20 = 4556MB
> 
> If we allocate only necessary TCE levels, we will only need:
> (((0x400000000 + 0x400000000) >> 16)*8)>>20 = 4MB (plus some for indirect
> levels).
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v2:
> * fixed bug in cleanup path which forced the entire table to be
> allocated right before destroying
> * added memory allocation error handling to pnv_tce()
> ---
>  arch/powerpc/include/asm/iommu.h              |  7 ++-
>  arch/powerpc/platforms/powernv/pci.h          |  6 ++-
>  arch/powerpc/kvm/book3s_64_vio_hv.c           |  4 +-
>  arch/powerpc/platforms/powernv/pci-ioda-tce.c | 69 ++++++++++++++++++++-------
>  arch/powerpc/platforms/powernv/pci-ioda.c     |  8 ++--
>  drivers/vfio/vfio_iommu_spapr_tce.c           |  2 +-
>  6 files changed, 69 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index 4bdcf22..daa3ee5 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -70,7 +70,7 @@ struct iommu_table_ops {
>  			unsigned long *hpa,
>  			enum dma_data_direction *direction);
>  
> -	__be64 *(*useraddrptr)(struct iommu_table *tbl, long index);
> +	__be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc);
>  #endif
>  	void (*clear)(struct iommu_table *tbl,
>  			long index, long npages);
> @@ -122,10 +122,13 @@ struct iommu_table {
>  	__be64 *it_userspace; /* userspace view of the table */
>  	struct iommu_table_ops *it_ops;
>  	struct kref    it_kref;
> +	int it_nid;
>  };
>  
> +#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \
> +		((tbl)->it_ops->useraddrptr((tbl), (entry), false))
>  #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
> -		((tbl)->it_ops->useraddrptr((tbl), (entry)))
> +		((tbl)->it_ops->useraddrptr((tbl), (entry), true))
>  
>  /* Pure 2^n version of get_order */
>  static inline __attribute_const__
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index 5e02408..1fa5590 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -267,8 +267,10 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
>  		unsigned long attrs);
>  extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
>  extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
> -		unsigned long *hpa, enum dma_data_direction *direction);
> -extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index);
> +		unsigned long *hpa, enum dma_data_direction *direction,
> +		bool alloc);
> +extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index,
> +		bool alloc);
>  extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
>  
>  extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index db0490c..05b4865 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -200,7 +200,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
>  {
>  	struct mm_iommu_table_group_mem_t *mem = NULL;
>  	const unsigned long pgsize = 1ULL << tbl->it_page_shift;
> -	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
> +	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
>  
>  	if (!pua)
>  		/* it_userspace allocation might be delayed */
> @@ -264,7 +264,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
>  {
>  	long ret;
>  	unsigned long hpa = 0;
> -	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
> +	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
>  	struct mm_iommu_table_group_mem_t *mem;
>  
>  	if (!pua)
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> index 36c2eb0..a7debfb 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> @@ -48,7 +48,7 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
>  	return addr;
>  }
>  
> -static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx)
> +static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
>  {
>  	__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
>  	int  level = tbl->it_indirect_levels;
> @@ -57,7 +57,20 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx)
>  
>  	while (level) {
>  		int n = (idx & mask) >> (level * shift);
> -		unsigned long tce = be64_to_cpu(tmp[n]);
> +		unsigned long tce;
> +
> +		if (tmp[n] == 0) {
> +			__be64 *tmp2;
> +
> +			if (!alloc)
> +				return NULL;
> +
> +			tmp2 = pnv_alloc_tce_level(tbl->it_nid,
> +					ilog2(tbl->it_level_size) + 3);

Can this allocation fail?  If it does you'll crash as you dereference
NULL just below.

> +			tmp[n] = cpu_to_be64(__pa(tmp2) |
> +					TCE_PCI_READ | TCE_PCI_WRITE);
> +		}
> +		tce = be64_to_cpu(tmp[n]);
>  
>  		tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
>  		idx &= ~mask;
> @@ -84,7 +97,7 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
>  			((rpn + i) << tbl->it_page_shift);
>  		unsigned long idx = index - tbl->it_offset + i;
>  
> -		*(pnv_tce(tbl, false, idx)) = cpu_to_be64(newtce);
> +		*(pnv_tce(tbl, false, idx, true)) = cpu_to_be64(newtce);
>  	}
>  
>  	return 0;
> @@ -92,31 +105,45 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
>  
>  #ifdef CONFIG_IOMMU_API
>  int pnv_tce_xchg(struct iommu_table *tbl, long index,
> -		unsigned long *hpa, enum dma_data_direction *direction)
> +		unsigned long *hpa, enum dma_data_direction *direction,
> +		bool alloc)
>  {
>  	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
>  	unsigned long newtce = *hpa | proto_tce, oldtce;
>  	unsigned long idx = index - tbl->it_offset;
> +	__be64 *ptce;
>  
>  	BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl));
>  
>  	if (newtce & TCE_PCI_WRITE)
>  		newtce |= TCE_PCI_READ;
>  
> -	oldtce = be64_to_cpu(xchg(pnv_tce(tbl, false, idx),
> -				  cpu_to_be64(newtce)));
> +	ptce = pnv_tce(tbl, false, idx, alloc);
> +	if (!ptce) {
> +		if (*direction == DMA_NONE) {
> +			*hpa = 0;
> +			return 0;
> +		}
> +		/* It is likely to be realmode */

"likely" makes me nervous.  Is there a case were it might not be real
mode?  What will happen in that case?

> +		if (!alloc)
> +			return H_TOO_HARD;
> +
> +		return H_HARDWARE;

Has idx already been bounds checked at this point?  If not, couldn't
you get here due to an out of bounds index, in which case it wouldn't
be H_HARDWARE, but H_PARAMETER or something.

> +	}
> +
> +	oldtce = be64_to_cpu(xchg(ptce, cpu_to_be64(newtce)));
>  	*hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
>  	*direction = iommu_tce_direction(oldtce);
>  
>  	return 0;
>  }
>  
> -__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index)
> +__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, bool alloc)
>  {
>  	if (WARN_ON_ONCE(!tbl->it_userspace))
>  		return NULL;
>  
> -	return pnv_tce(tbl, true, index - tbl->it_offset);
> +	return pnv_tce(tbl, true, index - tbl->it_offset, alloc);
>  }
>  #endif
>  
> @@ -126,14 +153,19 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
>  
>  	for (i = 0; i < npages; i++) {
>  		unsigned long idx = index - tbl->it_offset + i;
> +		__be64 *ptce = pnv_tce(tbl, false, idx,	false);
>  
> -		*(pnv_tce(tbl, false, idx)) = cpu_to_be64(0);
> +		if (ptce)
> +			*ptce = cpu_to_be64(0);
>  	}
>  }
>  
>  unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
>  {
> -	__be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset);
> +	__be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset, false);
> +
> +	if (!ptce)
> +		return 0;
>  
>  	return be64_to_cpu(*ptce);
>  }
> @@ -224,6 +256,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>  	unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
>  			PAGE_SHIFT);
>  	const unsigned long tce_table_size = 1UL << table_shift;
> +	unsigned int tmplevels = levels;
>  
>  	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
>  		return -EINVAL;
> @@ -231,6 +264,9 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>  	if (!is_power_of_2(window_size))
>  		return -EINVAL;
>  
> +	if (alloc_userspace_copy && (window_size > (1ULL << 32)))
> +		tmplevels = 1;
> +
>  	/* Adjust direct table size from window_size and levels */
>  	entries_shift = (entries_shift + levels - 1) / levels;
>  	level_shift = entries_shift + 3;
> @@ -241,7 +277,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>  
>  	/* Allocate TCE table */
>  	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
> -			levels, tce_table_size, &offset, &total_allocated);
> +			tmplevels, tce_table_size, &offset, &total_allocated);
>  
>  	/* addr==NULL means that the first level allocation failed */
>  	if (!addr)
> @@ -252,7 +288,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>  	 * we did not allocate as much as we wanted,
>  	 * release partially allocated table.
>  	 */
> -	if (offset < tce_table_size)
> +	if (tmplevels == levels && offset < tce_table_size)
>  		goto free_tces_exit;
>  
>  	/* Allocate userspace view of the TCE table */
> @@ -263,8 +299,8 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>  				&total_allocated_uas);
>  		if (!uas)
>  			goto free_tces_exit;
> -		if (offset < tce_table_size ||
> -				total_allocated_uas != total_allocated)
> +		if (tmplevels == levels && (offset < tce_table_size ||
> +				total_allocated_uas != total_allocated))
>  			goto free_uas_exit;
>  	}
>  
> @@ -275,10 +311,11 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>  	tbl->it_indirect_levels = levels - 1;
>  	tbl->it_allocated_size = total_allocated;
>  	tbl->it_userspace = uas;
> +	tbl->it_nid = nid;
>  
> -	pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d\n",
> +	pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
>  			window_size, tce_table_size, bus_offset, tbl->it_base,
> -			tbl->it_userspace, levels);
> +			tbl->it_userspace, tmplevels, levels);
>  
>  	return 0;
>  
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index c61c04d..d9df620 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -2010,7 +2010,7 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
>  static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
>  		unsigned long *hpa, enum dma_data_direction *direction)
>  {
> -	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
> +	long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
>  
>  	if (!ret)
>  		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
> @@ -2021,7 +2021,7 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
>  static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
>  		unsigned long *hpa, enum dma_data_direction *direction)
>  {
> -	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
> +	long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
>  
>  	if (!ret)
>  		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
> @@ -2175,7 +2175,7 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
>  static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
>  		unsigned long *hpa, enum dma_data_direction *direction)
>  {
> -	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
> +	long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
>  
>  	if (!ret)
>  		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
> @@ -2186,7 +2186,7 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
>  static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
>  		unsigned long *hpa, enum dma_data_direction *direction)
>  {
> -	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
> +	long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
>  
>  	if (!ret)
>  		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 833f926..1e58fb9 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -635,7 +635,7 @@ static long tce_iommu_create_table(struct tce_container *container,
>  			page_shift, window_size, levels, ptbl);
>  
>  	WARN_ON(!ret && !(*ptbl)->it_ops->free);
> -	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
> +	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
>  
>  	return ret;
>  }

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH] powerpc/pkeys: preallocate execute_only key only if the key is available.
From: Ram Pai @ 2018-06-21  1:14 UTC (permalink / raw)
  To: mpe
  Cc: linuxppc-dev, hbabu, mhocko, bauerman, Ulrich.Weigand, fweimer,
	msuchanek
In-Reply-To: <1529472862-26506-1-git-send-email-linuxram@us.ibm.com>

On Tue, Jun 19, 2018 at 10:34:22PM -0700, Ram Pai wrote:
> Key 2 is preallocated and reserved for execute-only key. In rare
> cases if key-2 is unavailable, mprotect(PROT_EXEC) will behave
> incorrectly. NOTE: mprotect(PROT_EXEC) uses execute-only key.
> 
> Ensure key 2 is available for preallocation before reserving it for
> execute_only purpose.
> 
> CC: Michael Ellerman <mpe@ellerman.id.au>
> CC: Thiago Jung Bauermann <bauerman@linux.ibm.com>
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> ---
>  arch/powerpc/mm/pkeys.c |   14 +++++++++-----
>  1 files changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
> index cec990c..2013ef0 100644
> --- a/arch/powerpc/mm/pkeys.c
> +++ b/arch/powerpc/mm/pkeys.c
> @@ -19,6 +19,7 @@
>  u64  pkey_amr_mask;		/* Bits in AMR not to be touched */
>  u64  pkey_iamr_mask;		/* Bits in AMR not to be touched */
>  u64  pkey_uamor_mask;		/* Bits in UMOR not to be touched */
> +u32  execute_only_key = 2;

this cannot be unsigned, since it can get set to -1 if key 2 is not available.  :(
Please ignore this patch.  Will resend a fixed version.

> 
>  #define AMR_BITS_PER_PKEY 2
>  #define AMR_RD_BIT 0x1UL
> @@ -26,7 +27,6 @@
>  #define IAMR_EX_BIT 0x1UL
>  #define PKEY_REG_BITS (sizeof(u64)*8)
>  #define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
> -#define EXECUTE_ONLY_KEY 2
> 
>  static void scan_pkey_feature(void)
>  {
> @@ -122,8 +122,12 @@ int pkey_initialize(void)
>  #else
>  	os_reserved = 0;
>  #endif
> +
> +	if ((pkeys_total - os_reserved) <= execute_only_key)
> +		execute_only_key = -1;
> +


.snip..

RP

^ permalink raw reply

* Re: Build regressions/improvements in v4.18-rc1
From: Michael Ellerman @ 2018-06-21  0:50 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Linux Kernel Mailing List, Linux MIPS Mailing List, linux-ia64,
	Linux-sh list, sparclinux, Andrew Morton, linuxppc-dev
In-Reply-To: <CAMuHMdUfHJS-ykNr-vPPUDfGLsGr62c4R=EThw33-DFNj9ZQNg@mail.gmail.com>

Geert Uytterhoeven <geert@linux-m68k.org> writes:
> On Tue, Jun 19, 2018 at 8:35 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>> Geert Uytterhoeven <geert@linux-m68k.org> writes:
>> > On Mon, Jun 18, 2018 at 11:18 AM Geert Uytterhoeven
>> > <geert@linux-m68k.org> wrote:
>> >> Below is the list of build error/warning regressions/improvements in
>> >> v4.18-rc1[1] compared to v4.17[2].
...
>
>> Relatedly I might move all the randconfig targets from Linus' tree into
>> a separate "linus-rand" tree, so that they don't pollute the results, as
>> I've done for linux-next.
>
> Sounds like a good thing.

OK done.

See eg:
  http://kisskb.ellerman.id.au/kisskb/head/14180/


At the moment the "head" page shows the progress/successful for that
head across all branches, so the linus-rand results get counted together
with the linus results which is a bit annoying.

I'm working on a change so that you can view a head *on a branch* so
that you only see the progress/successful for that branch. I'll try and
polish that up and get it pushed RSN.

cheers

^ permalink raw reply

* [PATCH v04 9/9] hotplug/pmt: Update topology after PMT
From: Michael Bringmann @ 2018-06-21  0:49 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Michael Bringmann, Nathan Fontenot, John Allen, Tyrel Datwyler,
	Thomas Falcon
In-Reply-To: <71195eb3-e11c-f586-d359-7026bdfd9a8a@linux.vnet.ibm.com>

[Sorry.  File error.  Previous copy was older version of file.]

hotplug/pmt: Call rebuild_sched_domains after applying changes
to update CPU associativity i.e. 'readd' CPUs.  This is to
ensure that the deferred calls to arch_update_cpu_topology are
now reflected in the system data structures.

Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/dlpar.c |    4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 4b43fec..74b6287 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -16,6 +16,7 @@
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
 #include <linux/cpu.h>
+#include <linux/cpuset.h>
 #include <linux/slab.h>
 #include <linux/of.h>
 
@@ -449,6 +450,9 @@ static int dlpar_pmt(struct pseries_hp_errorlog *work)
 		kfree(tmp);
 	}
 
+	ssleep(5);
+	rebuild_sched_domains();
+
 	return 0;
 }
 

^ permalink raw reply related

* Re: [PATCH] arch: powerpc: pci-common: fix wrong return value check on phd_id
From: Benjamin Herrenschmidt @ 2018-06-21  0:38 UTC (permalink / raw)
  To: Michael Ellerman, Daniel Walker, Guilherme G. Piccoli
  Cc: Andrew Morton, xe-kernel, linux-kernel, Paul Mackerras,
	linuxppc-dev, Mauro Rodrigues, linux-pci
In-Reply-To: <87in6dvv89.fsf@concordia.ellerman.id.au>

On Thu, 2018-06-21 at 10:28 +1000, Michael Ellerman wrote:
> 
> That's true, though I think yours is the first report we've had of
> problems.
> 
> The old behaviour relied on device tree ordering in nearly all cases, so
> you basically get whatever order your firmware happened to flatten the
> device tree in.
> 
> That tends to be consistent on a single system or with a single firmware
> version, but it's not stable in general. If your firmware changes, or
> you kexec then the ordering can change.
> 
> So I'd definitely prefer we didn't go back to that behaviour, because
> it's basically "random order".
> 
> If there's anything you can do on your end to cope with the ne

I think the numbering change has to be coped with. However:

The main issue I see is that it somewhat hard wires that "reg"
is a 64-bit property with the "interesting" bits in the bottom,
and that "interesting" part somewhat happens to fit in 16-bits.

It would have been better to get the full address out of reg (using the
appropriate size specified in the parent #address-cells) and hash it.

Cheers,
Ben.

^ permalink raw reply

* [PATCH 3/3] migration/memory: Support 'ibm,dynamic-memory-v2'
From: Michael Bringmann @ 2018-06-21  0:35 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Michael Bringmann, Nathan Fontenot, John Allen, Tyrel Datwyler,
	Thomas Falcon
In-Reply-To: <c7b0c1e5-7adf-299b-646f-6988b1fa0f8b@linux.vnet.ibm.com>

migration/memory: This patch adds recognition for changes to the
associativity of memory blocks described by 'ibm,dynamic-memory-v2'
in order to update local and general kernel data structures to
reflect those changes.  This patch builds upon previous enhancements
that scan the device-tree properties to build an LMB array in memory
to reuse and evaluate that structure for the new property.

Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 2573b87..d390dd4 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -1144,7 +1144,8 @@ static int pseries_memory_notifier(struct notifier_block *nb,
 		err = pseries_remove_mem_node(rd->dn);
 		break;
 	case OF_RECONFIG_UPDATE_PROPERTY:
-		if (!strcmp(rd->prop->name, "ibm,dynamic-memory")) {
+		if (!strcmp(rd->prop->name, "ibm,dynamic-memory") ||
+		    !strcmp(rd->prop->name, "ibm,dynamic-memory-v2")) {
 			struct drmem_lmb_info *dinfo =
 				drmem_lmbs_init(rd->prop);
 			if (!dinfo)

^ permalink raw reply related

* [PATCH 2/3] migration/memory: Evaluate LMB assoc changes
From: Michael Bringmann @ 2018-06-21  0:34 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Michael Bringmann, Nathan Fontenot, John Allen, Tyrel Datwyler,
	Thomas Falcon
In-Reply-To: <c7b0c1e5-7adf-299b-646f-6988b1fa0f8b@linux.vnet.ibm.com>

migration/memory: This patch adds code that recognizes changes to
the associativity of memory blocks described by the device-tree
properties in order to drive equivalent 'hotplug' operations to
update local and general kernel data structures to reflect those
changes.  These differences may include:

* Evaluating 'ibm,dynamic-memory' properties when processing the
  topology of LPARs in Post Migration events.  Previous efforts
  only recognized whether a memory block's assignment had changed
  in the property.  Changes here include checking the aa_index
  values for each drc_index of the old/new LMBs and queueing a
  'readd' of any block for which the setting has changed.

* In an LPAR migration scenario, the "ibm,associativity-lookup-arrays"
  property may change.  In the event that a row of the array differs,
  locate all assigned memory blocks with that 'aa_index' and 're-add'
  them to the system memory block data structures.  In the process of
  the 're-add', the system routines will update the corresponding entry
  for the memory in the LMB structures and any other relevant kernel
  data structures.

A number of previous extensions made to the DRMEM code for scanning
device-tree properties and creating LMB arrays were used here to
ensure that the resulting code was simpler and more usable:

* Use new paired list iterator for the DRMEM LMB info arrays to find
  differences in old and new versions of properties.
* Use new iterator for copies of the DRMEM info arrays to evaluate
  completely new structures.
* Combine common code for parsing and evaluating memory description
  properties based on the DRMEM LMB array model to greatly simplify
  extension from the older property 'ibm,dynamic-memory' to the new
  property model of 'ibm,dynamic-memory-v2'.

Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |  161 ++++++++++++++++++-----
 1 file changed, 127 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index c1578f5..2573b87 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -561,8 +561,11 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
 		}
 	}
 
-	if (!lmb_found)
-		rc = -EINVAL;
+	if (!lmb_found) {
+		pr_info("Failed to update memory for drc index %lx\n",
+			(unsigned long) drc_index);
+		return -EINVAL;
+	}
 
 	if (rc)
 		pr_info("Failed to update memory at %llx\n",
@@ -994,13 +997,11 @@ static int pseries_add_mem_node(struct device_node *np)
 	return (ret < 0) ? -EINVAL : 0;
 }
 
-static int pseries_update_drconf_memory(struct of_reconfig_data *pr)
+static int pseries_update_drconf_memory(struct drmem_lmb_info *new_dinfo)
 {
-	struct of_drconf_cell_v1 *new_drmem, *old_drmem;
+	struct drmem_lmb *old_lmb, *new_lmb;
 	unsigned long memblock_size;
-	u32 entries;
-	__be32 *p;
-	int i, rc = -EINVAL;
+	int rc = 0;
 
 	if (rtas_hp_event)
 		return 0;
@@ -1009,42 +1010,126 @@ static int pseries_update_drconf_memory(struct of_reconfig_data *pr)
 	if (!memblock_size)
 		return -EINVAL;
 
-	p = (__be32 *) pr->old_prop->value;
-	if (!p)
-		return -EINVAL;
-
-	/* The first int of the property is the number of lmb's described
-	 * by the property. This is followed by an array of of_drconf_cell
-	 * entries. Get the number of entries and skip to the array of
-	 * of_drconf_cell's.
-	 */
-	entries = be32_to_cpu(*p++);
-	old_drmem = (struct of_drconf_cell_v1 *)p;
+	/* Arrays should have the same size and DRC indexes */
+	for_each_pair_dinfo_lmb(drmem_info, old_lmb, new_dinfo, new_lmb) {
 
-	p = (__be32 *)pr->prop->value;
-	p++;
-	new_drmem = (struct of_drconf_cell_v1 *)p;
+		if (new_lmb->drc_index != old_lmb->drc_index)
+			continue;
 
-	for (i = 0; i < entries; i++) {
-		if ((be32_to_cpu(old_drmem[i].flags) & DRCONF_MEM_ASSIGNED) &&
-		    (!(be32_to_cpu(new_drmem[i].flags) & DRCONF_MEM_ASSIGNED))) {
+		if ((old_lmb->flags & DRCONF_MEM_ASSIGNED) &&
+		    (!(new_lmb->flags & DRCONF_MEM_ASSIGNED))) {
 			rc = pseries_remove_memblock(
-				be64_to_cpu(old_drmem[i].base_addr),
-						     memblock_size);
+				old_lmb->base_addr, memblock_size);
 			break;
-		} else if ((!(be32_to_cpu(old_drmem[i].flags) &
-			    DRCONF_MEM_ASSIGNED)) &&
-			    (be32_to_cpu(new_drmem[i].flags) &
-			    DRCONF_MEM_ASSIGNED)) {
-			rc = memblock_add(be64_to_cpu(old_drmem[i].base_addr),
-					  memblock_size);
+		} else if ((!(old_lmb->flags & DRCONF_MEM_ASSIGNED)) &&
+			   (new_lmb->flags & DRCONF_MEM_ASSIGNED)) {
+			rc = memblock_add(old_lmb->base_addr,
+					memblock_size);
 			rc = (rc < 0) ? -EINVAL : 0;
 			break;
+		} else if ((old_lmb->aa_index != new_lmb->aa_index) &&
+			   (new_lmb->flags & DRCONF_MEM_ASSIGNED)) {
+			dlpar_queue_action(
+					PSERIES_HP_ELOG_RESOURCE_MEM,
+					PSERIES_HP_ELOG_ACTION_READD,
+					new_lmb->drc_index);
 		}
 	}
 	return rc;
 }
 
+static void pseries_update_ala_memory_aai(int aa_index)
+{
+	struct drmem_lmb *lmb;
+
+	/* Readd all LMBs which were previously using the
+	 * specified aa_index value.
+	 */
+	for_each_drmem_lmb(lmb) {
+		if ((lmb->aa_index == aa_index) &&
+			(lmb->flags & DRCONF_MEM_ASSIGNED)) {
+			dlpar_queue_action(
+					PSERIES_HP_ELOG_RESOURCE_MEM,
+					PSERIES_HP_ELOG_ACTION_READD,
+					lmb->drc_index);
+		}
+	}
+}
+
+struct assoc_arrays {
+	u32 n_arrays;
+	u32 array_sz;
+	const __be32 *arrays;
+};
+
+static int pseries_update_ala_memory(struct of_reconfig_data *pr)
+{
+	struct assoc_arrays new_ala, old_ala;
+	__be32 *p;
+	int i, lim;
+
+	if (rtas_hp_event)
+		return 0;
+
+	/*
+	 * The layout of the ibm,associativity-lookup-arrays
+	 * property is a number N indicating the number of
+	 * associativity arrays, followed by a number M
+	 * indicating the size of each associativity array,
+	 * followed by a list of N associativity arrays.
+	 */
+
+	p = (__be32 *) pr->old_prop->value;
+	if (!p)
+		return -EINVAL;
+	old_ala.n_arrays = of_read_number(p++, 1);
+	old_ala.array_sz = of_read_number(p++, 1);
+	old_ala.arrays = p;
+
+	p = (__be32 *) pr->prop->value;
+	if (!p)
+		return -EINVAL;
+	new_ala.n_arrays = of_read_number(p++, 1);
+	new_ala.array_sz = of_read_number(p++, 1);
+	new_ala.arrays = p;
+
+	lim = (new_ala.n_arrays > old_ala.n_arrays) ? old_ala.n_arrays :
+			new_ala.n_arrays;
+
+	if (old_ala.array_sz == new_ala.array_sz) {
+
+		/* Reset any entries where the old and new rows
+		 * the array have changed.
+		 */
+		for (i = 0; i < lim; i++) {
+			int index = (i * new_ala.array_sz);
+
+			if (!memcmp(&old_ala.arrays[index],
+				&new_ala.arrays[index],
+				new_ala.array_sz))
+				continue;
+
+			pseries_update_ala_memory_aai(i);
+		}
+
+		/* Reset any entries representing the extra rows.
+		 * There shouldn't be any, but just in case ...
+		 */
+		for (i = lim; i < new_ala.n_arrays; i++)
+			pseries_update_ala_memory_aai(i);
+
+	} else {
+		/* Update all entries representing these rows;
+		 * as all rows have different sizes, none can
+		 * have equivalent values.
+		 */
+		for (i = 0; i < lim; i++)
+			pseries_update_ala_memory_aai(i);
+	}
+
+	return 0;
+}
+
 static int pseries_memory_notifier(struct notifier_block *nb,
 				   unsigned long action, void *data)
 {
@@ -1059,8 +1144,16 @@ static int pseries_memory_notifier(struct notifier_block *nb,
 		err = pseries_remove_mem_node(rd->dn);
 		break;
 	case OF_RECONFIG_UPDATE_PROPERTY:
-		if (!strcmp(rd->prop->name, "ibm,dynamic-memory"))
-			err = pseries_update_drconf_memory(rd);
+		if (!strcmp(rd->prop->name, "ibm,dynamic-memory")) {
+			struct drmem_lmb_info *dinfo =
+				drmem_lmbs_init(rd->prop);
+			if (!dinfo)
+				return -EINVAL;
+			err = pseries_update_drconf_memory(dinfo);
+			drmem_lmbs_free(dinfo);
+		} else if (!strcmp(rd->prop->name,
+				"ibm,associativity-lookup-arrays"))
+			err = pseries_update_ala_memory(rd);
 		break;
 	}
 	return notifier_from_errno(err);

^ permalink raw reply related

* [PATCH 1/3] powerpc/drmem: Export 'dynamic-memory' loader
From: Michael Bringmann @ 2018-06-21  0:34 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Michael Bringmann, Nathan Fontenot, John Allen, Tyrel Datwyler,
	Thomas Falcon
In-Reply-To: <c7b0c1e5-7adf-299b-646f-6988b1fa0f8b@linux.vnet.ibm.com>

powerpc/drmem: Export many of the functions of DRMEM to parse
"ibm,dynamic-memory" and "ibm,dynamic-memory-v2" during hotplug
operations and for Post Migration events.

Also modify the DRMEM initialization code to allow it to,

* Be called after system initialization
* Provide a separate user copy of the LMB array that it produces
* Free the user copy upon request

In addition, a couple of changes were made to make the creation
of additional copies of the LMB array more useful including,

* Add new iterator to work through a pair of drmem_info arrays.
* Modify DRMEM code to replace usages of dt_root_addr_cells, and
  dt_mem_next_cell, as these are only available at first boot.

Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/drmem.h |   15 ++++++++
 arch/powerpc/mm/drmem.c          |   75 ++++++++++++++++++++++++++++----------
 2 files changed, 70 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index ce242b9..b0e70fd 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -35,6 +35,18 @@ struct drmem_lmb_info {
 		&drmem_info->lmbs[0],				\
 		&drmem_info->lmbs[drmem_info->n_lmbs - 1])
 
+#define for_each_dinfo_lmb(dinfo, lmb)				\
+	for_each_drmem_lmb_in_range((lmb),			\
+		&dinfo->lmbs[0],				\
+		&dinfo->lmbs[dinfo->n_lmbs - 1])
+
+#define for_each_pair_dinfo_lmb(dinfo1, lmb1, dinfo2, lmb2)	\
+	for ((lmb1) = (&dinfo1->lmbs[0]),			\
+	     (lmb2) = (&dinfo2->lmbs[0]);			\
+	     ((lmb1) <= (&dinfo1->lmbs[dinfo1->n_lmbs - 1])) &&	\
+	     ((lmb2) <= (&dinfo2->lmbs[dinfo2->n_lmbs - 1]));	\
+	     (lmb1)++, (lmb2)++)
+
 /*
  * The of_drconf_cell_v1 struct defines the layout of the LMB data
  * specified in the ibm,dynamic-memory device tree property.
@@ -94,6 +106,9 @@ void __init walk_drmem_lmbs(struct device_node *dn,
 			void (*func)(struct drmem_lmb *, const __be32 **));
 int drmem_update_dt(void);
 
+struct drmem_lmb_info *drmem_lmbs_init(struct property *prop);
+void drmem_lmbs_free(struct drmem_lmb_info *dinfo);
+
 #ifdef CONFIG_PPC_PSERIES
 void __init walk_drmem_lmbs_early(unsigned long node,
 			void (*func)(struct drmem_lmb *, const __be32 **));
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 3f18036..13d2abb 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -20,6 +20,7 @@
 
 static struct drmem_lmb_info __drmem_info;
 struct drmem_lmb_info *drmem_info = &__drmem_info;
+static int n_root_addr_cells;
 
 u64 drmem_lmb_memory_max(void)
 {
@@ -193,12 +194,13 @@ int drmem_update_dt(void)
 	return rc;
 }
 
-static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
+static void read_drconf_v1_cell(struct drmem_lmb *lmb,
 				       const __be32 **prop)
 {
 	const __be32 *p = *prop;
 
-	lmb->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+	lmb->base_addr = of_read_number(p, n_root_addr_cells);
+	p += n_root_addr_cells;
 	lmb->drc_index = of_read_number(p++, 1);
 
 	p++; /* skip reserved field */
@@ -209,7 +211,7 @@ static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
 	*prop = p;
 }
 
-static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
+static void __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
 			void (*func)(struct drmem_lmb *, const __be32 **))
 {
 	struct drmem_lmb lmb;
@@ -225,13 +227,14 @@ static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
 	}
 }
 
-static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
+static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
 				       const __be32 **prop)
 {
 	const __be32 *p = *prop;
 
 	dr_cell->seq_lmbs = of_read_number(p++, 1);
-	dr_cell->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+	dr_cell->base_addr = of_read_number(p, n_root_addr_cells);
+	p += n_root_addr_cells;
 	dr_cell->drc_index = of_read_number(p++, 1);
 	dr_cell->aa_index = of_read_number(p++, 1);
 	dr_cell->flags = of_read_number(p++, 1);
@@ -239,7 +242,7 @@ static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
 	*prop = p;
 }
 
-static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
+static void __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
 			void (*func)(struct drmem_lmb *, const __be32 **))
 {
 	struct of_drconf_cell_v2 dr_cell;
@@ -275,6 +278,9 @@ void __init walk_drmem_lmbs_early(unsigned long node,
 	const __be32 *prop, *usm;
 	int len;
 
+	if (n_root_addr_cells == 0)
+		n_root_addr_cells = dt_root_addr_cells;
+
 	prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
 	if (!prop || len < dt_root_size_cells * sizeof(__be32))
 		return;
@@ -353,24 +359,26 @@ void __init walk_drmem_lmbs(struct device_node *dn,
 	}
 }
 
-static void __init init_drmem_v1_lmbs(const __be32 *prop)
+static void init_drmem_v1_lmbs(const __be32 *prop,
+				struct drmem_lmb_info *dinfo)
 {
 	struct drmem_lmb *lmb;
 
-	drmem_info->n_lmbs = of_read_number(prop++, 1);
-	if (drmem_info->n_lmbs == 0)
+	dinfo->n_lmbs = of_read_number(prop++, 1);
+	if (dinfo->n_lmbs == 0)
 		return;
 
-	drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb),
+	dinfo->lmbs = kcalloc(dinfo->n_lmbs, sizeof(*lmb),
 				   GFP_KERNEL);
-	if (!drmem_info->lmbs)
+	if (!dinfo->lmbs)
 		return;
 
-	for_each_drmem_lmb(lmb)
+	for_each_dinfo_lmb(dinfo, lmb)
 		read_drconf_v1_cell(lmb, &prop);
 }
 
-static void __init init_drmem_v2_lmbs(const __be32 *prop)
+static void init_drmem_v2_lmbs(const __be32 *prop,
+				struct drmem_lmb_info *dinfo)
 {
 	struct drmem_lmb *lmb;
 	struct of_drconf_cell_v2 dr_cell;
@@ -386,12 +394,12 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop)
 	p = prop;
 	for (i = 0; i < lmb_sets; i++) {
 		read_drconf_v2_cell(&dr_cell, &p);
-		drmem_info->n_lmbs += dr_cell.seq_lmbs;
+		dinfo->n_lmbs += dr_cell.seq_lmbs;
 	}
 
-	drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb),
+	dinfo->lmbs = kcalloc(dinfo->n_lmbs, sizeof(*lmb),
 				   GFP_KERNEL);
-	if (!drmem_info->lmbs)
+	if (!dinfo->lmbs)
 		return;
 
 	/* second pass, read in the LMB information */
@@ -402,10 +410,10 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop)
 		read_drconf_v2_cell(&dr_cell, &p);
 
 		for (j = 0; j < dr_cell.seq_lmbs; j++) {
-			lmb = &drmem_info->lmbs[lmb_index++];
+			lmb = &dinfo->lmbs[lmb_index++];
 
 			lmb->base_addr = dr_cell.base_addr;
-			dr_cell.base_addr += drmem_info->lmb_size;
+			dr_cell.base_addr += dinfo->lmb_size;
 
 			lmb->drc_index = dr_cell.drc_index;
 			dr_cell.drc_index++;
@@ -416,11 +424,38 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop)
 	}
 }
 
+void drmem_lmbs_free(struct drmem_lmb_info *dinfo)
+{
+	if (dinfo) {
+		kfree(dinfo->lmbs);
+		kfree(dinfo);
+	}
+}
+
+struct drmem_lmb_info *drmem_lmbs_init(struct property *prop)
+{
+	struct drmem_lmb_info *dinfo;
+
+	dinfo = kzalloc(sizeof(*dinfo), GFP_KERNEL);
+	if (!dinfo)
+		return NULL;
+
+	if (!strcmp("ibm,dynamic-memory", prop->name))
+		init_drmem_v1_lmbs(prop->value, dinfo);
+	else if (!strcmp("ibm,dynamic-memory-v2", prop->name))
+		init_drmem_v2_lmbs(prop->value, dinfo);
+
+	return dinfo;
+}
+
 static int __init drmem_init(void)
 {
 	struct device_node *dn;
 	const __be32 *prop;
 
+	if (n_root_addr_cells == 0)
+		n_root_addr_cells = dt_root_addr_cells;
+
 	dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
 	if (!dn) {
 		pr_info("No dynamic reconfiguration memory found\n");
@@ -434,11 +469,11 @@ static int __init drmem_init(void)
 
 	prop = of_get_property(dn, "ibm,dynamic-memory", NULL);
 	if (prop) {
-		init_drmem_v1_lmbs(prop);
+		init_drmem_v1_lmbs(prop, drmem_info);
 	} else {
 		prop = of_get_property(dn, "ibm,dynamic-memory-v2", NULL);
 		if (prop)
-			init_drmem_v2_lmbs(prop);
+			init_drmem_v2_lmbs(prop, drmem_info);
 	}
 
 	of_node_put(dn);

^ permalink raw reply related

* [PATCH 0/3] powerpc/migration: Affinity fix for memory
From: Michael Bringmann @ 2018-06-21  0:34 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Michael Bringmann, Nathan Fontenot, John Allen, Tyrel Datwyler,
	Thomas Falcon

The migration of LPARs across Power systems affects many attributes
including that of the associativity of memory blocks.  The patches
in this set execute when a system is coming up fresh upon a migration
target.  They are intended to,

* Recognize changes to the associativity of memory recorded in
  internal data structures when compared to the latest copies in
  the device tree (e.g. ibm,dynamic-memory, ibm,dynamic-memory-v2).
* Recognize changes to the associativity mapping (e.g.
  ibm,associativity-lookup-arrays), locate all assigned memory blocks
  corresponding to each changed row, and readd all such blocks.
* Generate calls to other code layers to reset the data structures
  related to associativity of memory.
* Re-register the 'changed' entities into the target system.
  Re-registration of memory blocks mostly entails acting as if they
  have been newly hot-added into the target system.

This code builds upon features introduced in a previous patch set
that updates CPUs for affinity changes that may occur during LPM.

Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>

Michael Bringmann (3):
  powerpc/drmem: Export 'dynamic-memory' loader
  migration/memory: Evaluate LMB assoc changes
  migration/memory: Support 'ibm,dynamic-memory-v2'

^ permalink raw reply

* [PATCH v04 9/9] hotplug/pmt: Update topology after PMT
From: Michael Bringmann @ 2018-06-21  0:30 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Michael Bringmann, Nathan Fontenot, John Allen, Tyrel Datwyler,
	Thomas Falcon
In-Reply-To: <0425b353-54b0-6ccd-fbb6-3d26d9448bb5@linux.vnet.ibm.com>

hotplug/pmt: Call rebuild_sched_domains after applying changes
to update CPU associativity i.e. 'readd' CPUs.  This is to
ensure that the deferred calls to arch_update_cpu_topology are
now reflected in the system data structures.

Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/dlpar.c |    3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 4b43fec..9c68032 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -449,6 +449,9 @@ static int dlpar_pmt(struct pseries_hp_errorlog *work)
 		kfree(tmp);
 	}
 
+	ssleep(5);
+	rebuild_sched_domains();
+
 	return 0;
 }
 

^ permalink raw reply related

* [PATCH v04 8/9] hotplug/rtas: No rtas_event_scan during PMT update
From: Michael Bringmann @ 2018-06-21  0:30 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Michael Bringmann, Nathan Fontenot, John Allen, Tyrel Datwyler,
	Thomas Falcon
In-Reply-To: <0425b353-54b0-6ccd-fbb6-3d26d9448bb5@linux.vnet.ibm.com>

hotplug/rtas: Disable rtas_event_scan during device-tree property
updates after migration to reduce conflicts with changes propagated
to other parts of the kernel configuration, such as CPUs or memory.

Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c |    4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 6267b53..f5c9e8f 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -686,14 +686,18 @@ static int dlpar_cpu_readd_by_index(u32 drc_index)
 
 	pr_info("Attempting to re-add CPU, drc index %x\n", drc_index);
 
+	rtas_event_scan_disable();
 	arch_update_cpu_topology_suspend();
 	rc = dlpar_cpu_remove_by_index(drc_index, false);
 	arch_update_cpu_topology_resume();
+	rtas_event_scan_enable();
 
 	if (!rc) {
+		rtas_event_scan_disable();
 		arch_update_cpu_topology_suspend();
 		rc = dlpar_cpu_add(drc_index, false);
 		arch_update_cpu_topology_resume();
+		rtas_event_scan_enable();
 	}
 
 	if (rc)

^ permalink raw reply related

* [PATCH v04 7/9] powerpc/rtas: Allow disabling rtas_event_scan
From: Michael Bringmann @ 2018-06-21  0:30 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Michael Bringmann, Nathan Fontenot, John Allen, Tyrel Datwyler,
	Thomas Falcon
In-Reply-To: <0425b353-54b0-6ccd-fbb6-3d26d9448bb5@linux.vnet.ibm.com>

powerpc/rtas: Provide mechanism by which the rtas_event_scan can
be disabled/re-enabled by other portions of the powerpc code.
Among other things, this simplifies the usage of locking mechanisms
for shared kernel resources.

Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/rtas.h |    4 ++++
 arch/powerpc/kernel/rtasd.c     |   14 ++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 4f45152..a94e3ff 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -386,8 +386,12 @@ extern int early_init_dt_scan_rtas(unsigned long node,
 
 #ifdef CONFIG_PPC_RTAS_DAEMON
 extern void rtas_cancel_event_scan(void);
+extern void rtas_event_scan_disable(void);
+extern void rtas_event_scan_enable(void);
 #else
 static inline void rtas_cancel_event_scan(void) { }
+static inline void rtas_event_scan_disable(void) { }
+static inline void rtas_event_scan_enable(void) { }
 #endif
 
 /* Error types logged.  */
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index f915db9..72f3696 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -455,11 +455,25 @@ static void do_event_scan(void)
  */
 static unsigned long event_scan_delay = 1*HZ;
 static int first_pass = 1;
+static int res_enable = 1;
+
+void rtas_event_scan_disable(void)
+{
+	res_enable = 0;
+}
+
+void rtas_event_scan_enable(void)
+{
+	res_enable = 1;
+}
 
 static void rtas_event_scan(struct work_struct *w)
 {
 	unsigned int cpu;
 
+	if (!res_enable)
+		return;
+
 	do_event_scan();
 
 	get_online_cpus();

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox