LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 05/14] powerpc/kernel/iommu: Add new iommu_table_in_use() helper
From: Leonardo Bras @ 2020-09-11 17:07 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Leonardo Bras, Joel Stanley, Christophe Leroy,
	Alexey Kardashevskiy, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev
In-Reply-To: <20200911170738.82818-1-leobras.c@gmail.com>

Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org, 

Having a function to check if the iommu table has any allocation helps
deciding if a tbl can be reset for using a new DMA window.

It should be enough to replace all instances of !bitmap_empty(tbl...).

iommu_table_in_use() skips reserved memory, so we don't need to worry about
releasing it before testing. This causes iommu_table_release_pages() to
become unnecessary, given it is only used to remove reserved memory for
testing.

Also, only allow storing reserved memory values in tbl if they are valid
in the table, so there is no need to check it in the new helper.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/iommu.c      | 61 +++++++++++++++-----------------
 2 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 5032f1593299..2913e5c8b1f8 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -154,6 +154,7 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
 		int nid, unsigned long res_start, unsigned long res_end);
+bool iommu_table_in_use(struct iommu_table *tbl);
 
 #define IOMMU_TABLE_GROUP_MAX_TABLES	2
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ffb2637dc82b..c838da3d8f32 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -655,34 +655,21 @@ static void iommu_table_reserve_pages(struct iommu_table *tbl,
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
 
+	/* Check if res_start..res_end is a valid range in the table */
+	if (res_start >= res_end || res_start < tbl->it_offset ||
+	    res_end > (tbl->it_offset + tbl->it_size)) {
+		tbl->it_reserved_start = tbl->it_offset;
+		tbl->it_reserved_end = tbl->it_offset;
+		return;
+	}
+
 	tbl->it_reserved_start = res_start;
 	tbl->it_reserved_end = res_end;
 
-	/* Check if res_start..res_end isn't empty and overlaps the table */
-	if (res_start && res_end &&
-			(tbl->it_offset + tbl->it_size < res_start ||
-			 res_end < tbl->it_offset))
-		return;
-
 	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
 		set_bit(i - tbl->it_offset, tbl->it_map);
 }
 
-static void iommu_table_release_pages(struct iommu_table *tbl)
-{
-	int i;
-
-	/*
-	 * In case we have reserved the first bit, we should not emit
-	 * the warning below.
-	 */
-	if (tbl->it_offset == 0)
-		clear_bit(0, tbl->it_map);
-
-	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
-		clear_bit(i - tbl->it_offset, tbl->it_map);
-}
-
 /*
  * Build a iommu_table structure.  This contains a bit map which
  * is used to manage allocation of the tce space.
@@ -743,6 +730,23 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
 	return tbl;
 }
 
+bool iommu_table_in_use(struct iommu_table *tbl)
+{
+	unsigned long start = 0, end;
+
+	/* ignore reserved bit0 */
+	if (tbl->it_offset == 0)
+		start = 1;
+	end = tbl->it_reserved_start - tbl->it_offset;
+	if (find_next_bit(tbl->it_map, end, start) != end)
+		return true;
+
+	start = tbl->it_reserved_end - tbl->it_offset;
+	end = tbl->it_size;
+	return find_next_bit(tbl->it_map, end, start) != end;
+
+}
+
 static void iommu_table_free(struct kref *kref)
 {
 	unsigned long bitmap_sz;
@@ -759,10 +763,8 @@ static void iommu_table_free(struct kref *kref)
 		return;
 	}
 
-	iommu_table_release_pages(tbl);
-
 	/* verify that table contains no entries */
-	if (!bitmap_empty(tbl->it_map, tbl->it_size))
+	if (iommu_table_in_use(tbl))
 		pr_warn("%s: Unexpected TCEs\n", __func__);
 
 	/* calculate bitmap size in bytes */
@@ -1068,18 +1070,13 @@ int iommu_take_ownership(struct iommu_table *tbl)
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_lock(&tbl->pools[i].lock);
 
-	iommu_table_release_pages(tbl);
-
-	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+	if (iommu_table_in_use(tbl)) {
 		pr_err("iommu_tce: it_map is not empty");
 		ret = -EBUSY;
-		/* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
-		iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
-				tbl->it_reserved_end);
-	} else {
-		memset(tbl->it_map, 0xff, sz);
 	}
 
+	memset(tbl->it_map, 0xff, sz);
+
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_unlock(&tbl->pools[i].lock);
 	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
-- 
2.25.4


^ permalink raw reply related

* [PATCH v2 09/14] powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
From: Leonardo Bras @ 2020-09-11 17:07 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Leonardo Bras, Joel Stanley, Christophe Leroy,
	Alexey Kardashevskiy, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev
In-Reply-To: <20200911170738.82818-1-leobras.c@gmail.com>

Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org, 

Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, letting it ready for
of_property_add(), which now occurs in sequence.

This created an opportunity to reorganize the second part of enable_ddw():

Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().

With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().

This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 106 +++++++++++++++----------
 1 file changed, 63 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index bba8584a8f9d..510ccb0521af 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -177,7 +177,7 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
 		if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
 			ret = (int)rc;
 			tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
-			                   (npages_start - (npages + 1)));
+					   (npages_start - (npages + 1)));
 			break;
 		}
 
@@ -215,7 +215,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 	if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
 		return tce_build_pSeriesLP(tbl->it_index, tcenum,
 					   tceshift, npages, uaddr,
-		                           direction, attrs);
+					   direction, attrs);
 	}
 
 	local_irq_save(flags);	/* to protect tcep and the page behind it */
@@ -269,7 +269,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
 		ret = (int)rc;
 		tce_freemulti_pSeriesLP(tbl, tcenum_start,
-		                        (npages_start - (npages + limit)));
+					(npages_start - (npages + limit)));
 		return ret;
 	}
 
@@ -1121,6 +1121,35 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
 			 ret);
 }
 
+static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr,
+					    u32 page_shift, u32 window_shift)
+{
+	struct dynamic_dma_window_prop *ddwprop;
+	struct property *win64;
+
+	win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
+	if (!win64)
+		return NULL;
+
+	win64->name = kstrdup(propname, GFP_KERNEL);
+	ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
+	win64->value = ddwprop;
+	win64->length = sizeof(*ddwprop);
+	if (!win64->name || !win64->value) {
+		kfree(win64);
+		kfree(win64->name);
+		kfree(win64->value);
+		return NULL;
+	}
+
+	ddwprop->liobn = cpu_to_be32(liobn);
+	ddwprop->dma_base = cpu_to_be64(dma_addr);
+	ddwprop->tce_shift = cpu_to_be32(page_shift);
+	ddwprop->window_shift = cpu_to_be32(window_shift);
+
+	return win64;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1138,12 +1167,11 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	struct ddw_query_response query;
 	struct ddw_create_response create;
 	int page_shift;
-	u64 max_addr;
+	u64 max_addr, win_addr;
 	struct device_node *dn;
 	u32 ddw_avail[DDW_APPLICABLE_SIZE];
 	struct direct_window *window;
-	struct property *win64;
-	struct dynamic_dma_window_prop *ddwprop;
+	struct property *win64 = NULL;
 	struct failed_ddw_pdn *fpdn;
 	bool default_win_removed = false;
 
@@ -1245,72 +1273,64 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 		goto out_failed;
 	}
 	len = order_base_2(max_addr);
-	win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
-	if (!win64) {
-		dev_info(&dev->dev,
-			"couldn't allocate property for 64bit dma window\n");
-		goto out_failed;
-	}
-	win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
-	win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
-	win64->length = sizeof(*ddwprop);
-	if (!win64->name || !win64->value) {
-		dev_info(&dev->dev,
-			"couldn't allocate property name and value\n");
-		goto out_free_prop;
-	}
 
 	ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
 	if (ret != 0)
-		goto out_free_prop;
-
-	ddwprop->liobn = cpu_to_be32(create.liobn);
-	ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) |
-			create.addr_lo);
-	ddwprop->tce_shift = cpu_to_be32(page_shift);
-	ddwprop->window_shift = cpu_to_be32(len);
+		goto out_failed;
 
 	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
-		  create.liobn, dn);
+		create.liobn, dn);
 
-	window = ddw_list_new_entry(pdn, ddwprop);
+	win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
+	win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr,
+				    page_shift, len);
+	if (!win64) {
+		dev_info(&dev->dev,
+			 "couldn't allocate property, property name, or value\n");
+		goto out_win_del;
+	}
+
+	ret = of_add_property(pdn, win64);
+	if (ret) {
+		dev_err(&dev->dev, "unable to add dma window property for %pOF: %d",
+			pdn, ret);
+		goto out_prop_free;
+	}
+
+	window = ddw_list_new_entry(pdn, win64->value);
 	if (!window)
-		goto out_clear_window;
+		goto out_prop_del;
 
 	ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
 			win64->value, tce_setrange_multi_pSeriesLP_walk);
 	if (ret) {
 		dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
 			 dn, ret);
-		goto out_free_window;
-	}
-
-	ret = of_add_property(pdn, win64);
-	if (ret) {
-		dev_err(&dev->dev, "unable to add dma window property for %pOF: %d",
-			 pdn, ret);
-		goto out_free_window;
+		goto out_list_del;
 	}
 
 	spin_lock(&direct_window_list_lock);
 	list_add(&window->list, &direct_window_list);
 	spin_unlock(&direct_window_list_lock);
 
-	dev->dev.archdata.dma_offset = be64_to_cpu(ddwprop->dma_base);
+	dev->dev.archdata.dma_offset = win_addr;
 	goto out_unlock;
 
-out_free_window:
+out_list_del:
 	kfree(window);
 
-out_clear_window:
-	remove_ddw(pdn, true);
+out_prop_del:
+	of_remove_property(pdn, win64);
 
-out_free_prop:
+out_prop_free:
 	kfree(win64->name);
 	kfree(win64->value);
 	kfree(win64);
 	win64 = NULL;
 
+out_win_del:
+	remove_ddw(pdn, true);
+
 out_failed:
 	if (default_win_removed)
 		reset_dma_window(dev, pdn);
-- 
2.25.4


^ permalink raw reply related

* [PATCH v2 10/14] powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new helper
From: Leonardo Bras @ 2020-09-11 17:07 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Leonardo Bras, Joel Stanley, Christophe Leroy,
	Alexey Kardashevskiy, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev
In-Reply-To: <20200911170738.82818-1-leobras.c@gmail.com>

Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org, 

Add a new helper _iommu_table_setparms(), and use it in
iommu_table_setparms() and iommu_table_setparms_lpar() to avoid duplicated
code.

Also, setting tbl->it_ops was happening outsite iommu_table_setparms*(),
so move it to the new helper. Since we need the iommu_table_ops to be
declared before used, move iommu_table_lpar_multi_ops and
iommu_table_pseries_ops to before their respective iommu_table_setparms*().

The tce_exchange_pseries() also had to be moved up, since it's used in
iommu_table_lpar_multi_ops.xchg_no_kill.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 149 ++++++++++++-------------
 1 file changed, 72 insertions(+), 77 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 510ccb0521af..abd36b257725 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -495,12 +495,62 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
 	return rc;
 }
 
+#ifdef CONFIG_IOMMU_API
+static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
+				long *tce, enum dma_data_direction *direction,
+				bool realmode)
+{
+	long rc;
+	unsigned long ioba = (unsigned long)index << tbl->it_page_shift;
+	unsigned long flags, oldtce = 0;
+	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
+	unsigned long newtce = *tce | proto_tce;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+
+	rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);
+	if (!rc)
+		rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);
+
+	if (!rc) {
+		*direction = iommu_tce_direction(oldtce);
+		*tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+	}
+
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
+
+	return rc;
+}
+#endif
+
 static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
 		unsigned long num_pfn, void *arg)
 {
 	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
 }
 
+static inline void _iommu_table_setparms(struct iommu_table *tbl, unsigned long busno,
+					 unsigned long liobn, unsigned long win_addr,
+					 unsigned long window_size, unsigned long page_shift,
+					 unsigned long base, struct iommu_table_ops *table_ops)
+{
+	tbl->it_busno = busno;
+	tbl->it_index = liobn;
+	tbl->it_offset = win_addr >> page_shift;
+	tbl->it_size = window_size >> page_shift;
+	tbl->it_page_shift = page_shift;
+	tbl->it_base = base;
+	tbl->it_blocksize = 16;
+	tbl->it_type = TCE_PCI;
+	tbl->it_ops = table_ops;
+}
+
+struct iommu_table_ops iommu_table_pseries_ops = {
+	.set = tce_build_pSeries,
+	.clear = tce_free_pSeries,
+	.get = tce_get_pseries
+};
+
 static void iommu_table_setparms(struct pci_controller *phb,
 				 struct device_node *dn,
 				 struct iommu_table *tbl)
@@ -509,8 +559,13 @@ static void iommu_table_setparms(struct pci_controller *phb,
 	const unsigned long *basep;
 	const u32 *sizep;
 
-	node = phb->dn;
+	/* Test if we are going over 2GB of DMA space */
+	if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
+		udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+		panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+	}
 
+	node = phb->dn;
 	basep = of_get_property(node, "linux,tce-base", NULL);
 	sizep = of_get_property(node, "linux,tce-size", NULL);
 	if (basep == NULL || sizep == NULL) {
@@ -519,33 +574,25 @@ static void iommu_table_setparms(struct pci_controller *phb,
 		return;
 	}
 
-	tbl->it_base = (unsigned long)__va(*basep);
+	_iommu_table_setparms(tbl, phb->bus->number, 0, phb->dma_window_base_cur,
+			      phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
+			      (unsigned long)__va(*basep), &iommu_table_pseries_ops);
 
 	if (!is_kdump_kernel())
 		memset((void *)tbl->it_base, 0, *sizep);
 
-	tbl->it_busno = phb->bus->number;
-	tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
-
-	/* Units of tce entries */
-	tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift;
-
-	/* Test if we are going over 2GB of DMA space */
-	if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
-		udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
-		panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
-	}
-
 	phb->dma_window_base_cur += phb->dma_window_size;
-
-	/* Set the tce table size - measured in entries */
-	tbl->it_size = phb->dma_window_size >> tbl->it_page_shift;
-
-	tbl->it_index = 0;
-	tbl->it_blocksize = 16;
-	tbl->it_type = TCE_PCI;
 }
 
+struct iommu_table_ops iommu_table_lpar_multi_ops = {
+	.set = tce_buildmulti_pSeriesLP,
+#ifdef CONFIG_IOMMU_API
+	.xchg_no_kill = tce_exchange_pseries,
+#endif
+	.clear = tce_freemulti_pSeriesLP,
+	.get = tce_get_pSeriesLP
+};
+
 /*
  * iommu_table_setparms_lpar
  *
@@ -557,28 +604,17 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb,
 				      struct iommu_table_group *table_group,
 				      const __be32 *dma_window)
 {
-	unsigned long offset, size;
+	unsigned long offset, size, liobn;
 
-	of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size);
+	of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);
 
-	tbl->it_busno = phb->bus->number;
-	tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
-	tbl->it_base   = 0;
-	tbl->it_blocksize  = 16;
-	tbl->it_type = TCE_PCI;
-	tbl->it_offset = offset >> tbl->it_page_shift;
-	tbl->it_size = size >> tbl->it_page_shift;
+	_iommu_table_setparms(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, 0,
+			      &iommu_table_lpar_multi_ops);
 
 	table_group->tce32_start = offset;
 	table_group->tce32_size = size;
 }
 
-struct iommu_table_ops iommu_table_pseries_ops = {
-	.set = tce_build_pSeries,
-	.clear = tce_free_pSeries,
-	.get = tce_get_pseries
-};
-
 static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 {
 	struct device_node *dn;
@@ -647,7 +683,6 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	tbl = pci->table_group->tables[0];
 
 	iommu_table_setparms(pci->phb, dn, tbl);
-	tbl->it_ops = &iommu_table_pseries_ops;
 	iommu_init_table(tbl, pci->phb->node, 0, 0);
 
 	/* Divide the rest (1.75GB) among the children */
@@ -658,43 +693,6 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
 }
 
-#ifdef CONFIG_IOMMU_API
-static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
-				long *tce, enum dma_data_direction *direction,
-				bool realmode)
-{
-	long rc;
-	unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
-	unsigned long flags, oldtce = 0;
-	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
-	unsigned long newtce = *tce | proto_tce;
-
-	spin_lock_irqsave(&tbl->large_pool.lock, flags);
-
-	rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);
-	if (!rc)
-		rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);
-
-	if (!rc) {
-		*direction = iommu_tce_direction(oldtce);
-		*tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
-	}
-
-	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
-
-	return rc;
-}
-#endif
-
-struct iommu_table_ops iommu_table_lpar_multi_ops = {
-	.set = tce_buildmulti_pSeriesLP,
-#ifdef CONFIG_IOMMU_API
-	.xchg_no_kill = tce_exchange_pseries,
-#endif
-	.clear = tce_freemulti_pSeriesLP,
-	.get = tce_get_pSeriesLP
-};
-
 static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 {
 	struct iommu_table *tbl;
@@ -729,7 +727,6 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 		tbl = ppci->table_group->tables[0];
 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
 				ppci->table_group, dma_window);
-		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		iommu_init_table(tbl, ppci->phb->node, 0, 0);
 		iommu_register_group(ppci->table_group,
 				pci_domain_nr(bus), 0);
@@ -758,7 +755,6 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
 		tbl = PCI_DN(dn)->table_group->tables[0];
 		iommu_table_setparms(phb, dn, tbl);
-		tbl->it_ops = &iommu_table_pseries_ops;
 		iommu_init_table(tbl, phb->node, 0, 0);
 		set_iommu_table_base(&dev->dev, tbl);
 		return;
@@ -1385,7 +1381,6 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 		tbl = pci->table_group->tables[0];
 		iommu_table_setparms_lpar(pci->phb, pdn, tbl,
 				pci->table_group, dma_window);
-		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		iommu_init_table(tbl, pci->phb->node, 0, 0);
 		iommu_register_group(pci->table_group,
 				pci_domain_nr(pci->phb->bus), 0);
-- 
2.25.4


^ permalink raw reply related

* [PATCH v2 11/14] powerpc/pseries/iommu: Update remove_dma_window() to accept property name
From: Leonardo Bras @ 2020-09-11 17:07 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Leonardo Bras, Joel Stanley, Christophe Leroy,
	Alexey Kardashevskiy, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev
In-Reply-To: <20200911170738.82818-1-leobras.c@gmail.com>

Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org, 

Update remove_dma_window() so it can be used to remove DDW with a given
property name.

This enables the creation of new property names for DDW, so we can
have different usage for it, like indirect mapping.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index abd36b257725..f6a65ecd1db5 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -818,31 +818,32 @@ static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
 			np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
 
-static void remove_ddw(struct device_node *np, bool remove_prop)
+static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_name)
 {
 	struct property *win;
 	u32 ddw_avail[DDW_APPLICABLE_SIZE];
 	int ret = 0;
 
+	win = of_find_property(np, win_name, NULL);
+	if (!win)
+		return -EINVAL;
+
 	ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
 					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
 	if (ret)
-		return;
-
-	win = of_find_property(np, DIRECT64_PROPNAME, NULL);
-	if (!win)
-		return;
+		return 0;
 
 	if (win->length >= sizeof(struct dynamic_dma_window_prop))
 		remove_dma_window(np, ddw_avail, win);
 
 	if (!remove_prop)
-		return;
+		return 0;
 
 	ret = of_remove_property(np, win);
 	if (ret)
 		pr_warn("%pOF: failed to remove direct window property: %d\n",
 			np, ret);
+	return 0;
 }
 
 static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)
@@ -894,7 +895,7 @@ static int find_existing_ddw_windows(void)
 	for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
 		direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
 		if (!direct64 || len < sizeof(*direct64)) {
-			remove_ddw(pdn, true);
+			remove_ddw(pdn, true, DIRECT64_PROPNAME);
 			continue;
 		}
 
@@ -1325,7 +1326,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	win64 = NULL;
 
 out_win_del:
-	remove_ddw(pdn, true);
+	remove_ddw(pdn, true, DIRECT64_PROPNAME);
 
 out_failed:
 	if (default_win_removed)
@@ -1480,7 +1481,7 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
 		 * we have to remove the property when releasing
 		 * the device node.
 		 */
-		remove_ddw(np, false);
+		remove_ddw(np, false, DIRECT64_PROPNAME);
 		if (pci && pci->table_group)
 			iommu_pseries_free_group(pci->table_group,
 					np->full_name);
-- 
2.25.4


^ permalink raw reply related

* [PATCH v2 12/14] powerpc/pseries/iommu: Find existing DDW with given property name
From: Leonardo Bras @ 2020-09-11 17:07 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Leonardo Bras, Joel Stanley, Christophe Leroy,
	Alexey Kardashevskiy, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev
In-Reply-To: <20200911170738.82818-1-leobras.c@gmail.com>

Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org, 

Extract find_existing_ddw_windows() into find_existing_ddw_windows_named()
and calls it with current property name.

This will allow more property names to be checked in
find_existing_ddw_windows(), enabling the creation of new property names,
like the one that will be used for indirect mapping.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index f6a65ecd1db5..9b7c03652e72 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -882,24 +882,21 @@ static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
 	return window;
 }
 
-static int find_existing_ddw_windows(void)
+static void find_existing_ddw_windows_named(const char *name)
 {
 	int len;
 	struct device_node *pdn;
 	struct direct_window *window;
-	const struct dynamic_dma_window_prop *direct64;
-
-	if (!firmware_has_feature(FW_FEATURE_LPAR))
-		return 0;
+	const struct dynamic_dma_window_prop *dma64;
 
-	for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
-		direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
-		if (!direct64 || len < sizeof(*direct64)) {
-			remove_ddw(pdn, true, DIRECT64_PROPNAME);
+	for_each_node_with_property(pdn, name) {
+		dma64 = of_get_property(pdn, name, &len);
+		if (!dma64 || len < sizeof(*dma64)) {
+			remove_ddw(pdn, true, name);
 			continue;
 		}
 
-		window = ddw_list_new_entry(pdn, direct64);
+		window = ddw_list_new_entry(pdn, dma64);
 		if (!window)
 			break;
 
@@ -907,6 +904,14 @@ static int find_existing_ddw_windows(void)
 		list_add(&window->list, &direct_window_list);
 		spin_unlock(&direct_window_list_lock);
 	}
+}
+
+static int find_existing_ddw_windows(void)
+{
+	if (!firmware_has_feature(FW_FEATURE_LPAR))
+		return 0;
+
+	find_existing_ddw_windows_named(DIRECT64_PROPNAME);
 
 	return 0;
 }
-- 
2.25.4


^ permalink raw reply related

* [PATCH v2 13/14] powerpc/pseries/iommu: Make use of DDW for indirect mapping
From: Leonardo Bras @ 2020-09-11 17:07 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Leonardo Bras, Joel Stanley, Christophe Leroy,
	Alexey Kardashevskiy, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev
In-Reply-To: <20200911170738.82818-1-leobras.c@gmail.com>

Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org, 

So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per a PE to limit waste of physical pages.

As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.

The default DMA window uses 4k pages instead of 64k pages, and since
the amount of pages (TCEs) may stay the same (on pHyp case), making
use of DDW instead of the default DMA window for indirect mapping will
expand in up to 16x the amount of memory that can be mapped on DMA.

Indirect mapping will only be used if direct mapping is not a
possibility.

For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.

Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.

Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.

Note: ddw_memory_hotplug_max() was moved up so it can be used in
find_existing_ddw().

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 160 ++++++++++++++++---------
 1 file changed, 103 insertions(+), 57 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 9b7c03652e72..c4de23080d1b 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -375,6 +375,7 @@ static DEFINE_SPINLOCK(direct_window_list_lock);
 /* protects initializing window twice for same device */
 static DEFINE_MUTEX(direct_window_init_mutex);
 #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
 					unsigned long num_pfn, const void *arg)
@@ -846,10 +847,48 @@ static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_
 	return 0;
 }
 
-static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)
+static phys_addr_t ddw_memory_hotplug_max(void)
+{
+	phys_addr_t max_addr = memory_hotplug_max();
+	struct device_node *memory;
+
+	/*
+	 * The "ibm,pmemory" can appear anywhere in the address space.
+	 * Assuming it is still backed by page structs, set the upper limit
+	 * for the huge DMA window as MAX_PHYSMEM_BITS.
+	 */
+	if (of_find_node_by_type(NULL, "ibm,pmemory"))
+		return (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
+			(phys_addr_t)-1 : (1ULL << MAX_PHYSMEM_BITS);
+
+	for_each_node_by_type(memory, "memory") {
+		unsigned long start, size;
+		int n_mem_addr_cells, n_mem_size_cells, len;
+		const __be32 *memcell_buf;
+
+		memcell_buf = of_get_property(memory, "reg", &len);
+		if (!memcell_buf || len <= 0)
+			continue;
+
+		n_mem_addr_cells = of_n_addr_cells(memory);
+		n_mem_size_cells = of_n_size_cells(memory);
+
+		start = of_read_number(memcell_buf, n_mem_addr_cells);
+		memcell_buf += n_mem_addr_cells;
+		size = of_read_number(memcell_buf, n_mem_size_cells);
+		memcell_buf += n_mem_size_cells;
+
+		max_addr = max_t(phys_addr_t, max_addr, start + size);
+	}
+
+	return max_addr;
+}
+
+static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, bool *direct_mapping)
 {
 	struct direct_window *window;
 	const struct dynamic_dma_window_prop *direct64;
+	unsigned long window_size;
 	bool found = false;
 
 	spin_lock(&direct_window_list_lock);
@@ -858,6 +897,10 @@ static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)
 		if (window->device == pdn) {
 			direct64 = window->prop;
 			*dma_addr = be64_to_cpu(direct64->dma_base);
+
+			window_size = (1UL << be32_to_cpu(direct64->window_shift));
+			*direct_mapping = (window_size >= ddw_memory_hotplug_max());
+
 			found = true;
 			break;
 		}
@@ -912,6 +955,7 @@ static int find_existing_ddw_windows(void)
 		return 0;
 
 	find_existing_ddw_windows_named(DIRECT64_PROPNAME);
+	find_existing_ddw_windows_named(DMA64_PROPNAME);
 
 	return 0;
 }
@@ -1054,43 +1098,6 @@ struct failed_ddw_pdn {
 
 static LIST_HEAD(failed_ddw_pdn_list);
 
-static phys_addr_t ddw_memory_hotplug_max(void)
-{
-	phys_addr_t max_addr = memory_hotplug_max();
-	struct device_node *memory;
-
-	/*
-	 * The "ibm,pmemory" can appear anywhere in the address space.
-	 * Assuming it is still backed by page structs, set the upper limit
-	 * for the huge DMA window as MAX_PHYSMEM_BITS.
-	 */
-	if (of_find_node_by_type(NULL, "ibm,pmemory"))
-		return (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
-			(phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS);
-
-	for_each_node_by_type(memory, "memory") {
-		unsigned long start, size;
-		int n_mem_addr_cells, n_mem_size_cells, len;
-		const __be32 *memcell_buf;
-
-		memcell_buf = of_get_property(memory, "reg", &len);
-		if (!memcell_buf || len <= 0)
-			continue;
-
-		n_mem_addr_cells = of_n_addr_cells(memory);
-		n_mem_size_cells = of_n_size_cells(memory);
-
-		start = of_read_number(memcell_buf, n_mem_addr_cells);
-		memcell_buf += n_mem_addr_cells;
-		size = of_read_number(memcell_buf, n_mem_size_cells);
-		memcell_buf += n_mem_size_cells;
-
-		max_addr = max_t(phys_addr_t, max_addr, start + size);
-	}
-
-	return max_addr;
-}
-
 /*
  * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
  * ibm,ddw-extensions, which carries the rtas token for
@@ -1173,14 +1180,19 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	struct device_node *dn;
 	u32 ddw_avail[DDW_APPLICABLE_SIZE];
 	struct direct_window *window;
+	const char *win_name;
 	struct property *win64 = NULL;
 	struct failed_ddw_pdn *fpdn;
-	bool default_win_removed = false;
+	bool default_win_removed = false, direct_mapping = false;
+	struct pci_dn *pci = PCI_DN(pdn);
+	struct iommu_table *tbl = pci->table_group->tables[0];
 
 	mutex_lock(&direct_window_init_mutex);
 
-	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset))
-		goto out_unlock;
+	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &direct_mapping)) {
+		mutex_unlock(&direct_window_init_mutex);
+		return direct_mapping;
+	}
 
 	/*
 	 * If we already went through this for a previous function of
@@ -1266,15 +1278,25 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	}
 
 	/* verify the window * number of ptes will map the partition */
-	/* check largest block * page size > max memory hotplug addr */
 	max_addr = ddw_memory_hotplug_max();
 	if (query.largest_available_block < (max_addr >> page_shift)) {
-		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
-			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
-			  1ULL << page_shift);
+		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu %llu-sized pages\n",
+			max_addr, query.largest_available_block,
+			1ULL << page_shift);
+
+		len = order_base_2(query.largest_available_block << page_shift);
+		win_name = DMA64_PROPNAME;
+	} else {
+		direct_mapping = true;
+		len = order_base_2(max_addr);
+		win_name = DIRECT64_PROPNAME;
+	}
+
+	/* DDW + IOMMU on single window may fail if there is any allocation */
+	if (default_win_removed && !direct_mapping && iommu_table_in_use(tbl)) {
+		dev_dbg(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
 		goto out_failed;
 	}
-	len = order_base_2(max_addr);
 
 	ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
 	if (ret != 0)
@@ -1284,8 +1306,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 		create.liobn, dn);
 
 	win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
-	win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr,
-				    page_shift, len);
+	win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);
 	if (!win64) {
 		dev_info(&dev->dev,
 			 "couldn't allocate property, property name, or value\n");
@@ -1300,15 +1321,37 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	}
 
 	window = ddw_list_new_entry(pdn, win64->value);
-	if (!window)
+	if (!window) {
+		dev_dbg(&dev->dev, "couldn't create new list entry\n");
 		goto out_prop_del;
+	}
 
-	ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
-			win64->value, tce_setrange_multi_pSeriesLP_walk);
-	if (ret) {
-		dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
-			 dn, ret);
-		goto out_list_del;
+	if (direct_mapping) {
+		/* DDW maps the whole partition, so enable direct DMA mapping */
+		ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
+					    win64->value, tce_setrange_multi_pSeriesLP_walk);
+		if (ret) {
+			dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
+				 dn, ret);
+			goto out_list_del;
+		}
+	} else {
+		/* New table for using DDW instead of the default DMA window */
+		tbl = iommu_pseries_alloc_table(pci->phb->node);
+		if (!tbl) {
+			dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
+			goto out_list_del;
+		}
+
+		_iommu_table_setparms(tbl, pci->phb->bus->number, create.liobn, win_addr,
+				      1UL << len, page_shift, 0, &iommu_table_lpar_multi_ops);
+		iommu_init_table(tbl, pci->phb->node, 0, 0);
+
+		/* Free old table and replace by the newer */
+		iommu_tce_table_put(pci->table_group->tables[0]);
+		pci->table_group->tables[0] = tbl;
+
+		set_iommu_table_base(&dev->dev, tbl);
 	}
 
 	spin_lock(&direct_window_list_lock);
@@ -1345,7 +1388,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 
 out_unlock:
 	mutex_unlock(&direct_window_init_mutex);
-	return win64;
+	return win64 && direct_mapping;
 }
 
 static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
@@ -1486,7 +1529,10 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
 		 * we have to remove the property when releasing
 		 * the device node.
 		 */
-		remove_ddw(np, false, DIRECT64_PROPNAME);
+
+		if (remove_ddw(np, false, DIRECT64_PROPNAME))
+			remove_ddw(np, false, DMA64_PROPNAME);
+
 		if (pci && pci->table_group)
 			iommu_pseries_free_group(pci->table_group,
 					np->full_name);
-- 
2.25.4


^ permalink raw reply related

* [PATCH v2 14/14] powerpc/pseries/iommu: Rename "direct window" to "dma window"
From: Leonardo Bras @ 2020-09-11 17:07 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Leonardo Bras, Joel Stanley, Christophe Leroy,
	Alexey Kardashevskiy, Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev
In-Reply-To: <20200911170738.82818-1-leobras.c@gmail.com>

Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org, 

A previous change introduced the usage of DDW as a bigger indirect DMA
mapping when the DDW available size does not map the whole partition.

As most of the code that manipulates direct mappings was reused for
indirect mappings, it's necessary to rename all names and debug/info
messages to reflect that it can be used for both kinds of mapping.

Also, defines DEFAULT_DMA_WIN as "ibm,dma-window" to document that
it's the name of the default DMA window.

Those changes are not supposed to change how the code works in any
way, just adjust naming.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 102 +++++++++++++------------
 1 file changed, 53 insertions(+), 49 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index c4de23080d1b..56638b7f07fc 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -349,7 +349,7 @@ struct dynamic_dma_window_prop {
 	__be32	window_shift;	/* ilog2(tce_window_size) */
 };
 
-struct direct_window {
+struct dma_win {
 	struct device_node *device;
 	const struct dynamic_dma_window_prop *prop;
 	struct list_head list;
@@ -369,13 +369,14 @@ struct ddw_create_response {
 	u32 addr_lo;
 };
 
-static LIST_HEAD(direct_window_list);
+static LIST_HEAD(dma_win_list);
 /* prevents races between memory on/offline and window creation */
-static DEFINE_SPINLOCK(direct_window_list_lock);
+static DEFINE_SPINLOCK(dma_win_list_lock);
 /* protects initializing window twice for same device */
-static DEFINE_MUTEX(direct_window_init_mutex);
+static DEFINE_MUTEX(dma_win_init_mutex);
 #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
 #define DMA64_PROPNAME "linux,dma64-ddr-window-info"
+#define DEFAULT_DMA_WIN "ibm,dma-window"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
 					unsigned long num_pfn, const void *arg)
@@ -706,15 +707,18 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 	pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
 		 dn);
 
-	/* Find nearest ibm,dma-window, walking up the device tree */
+	/*
+	 * Find nearest ibm,dma-window (default DMA window), walking up the
+	 * device tree
+	 */
 	for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
-		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+		dma_window = of_get_property(pdn, DEFAULT_DMA_WIN, NULL);
 		if (dma_window != NULL)
 			break;
 	}
 
 	if (dma_window == NULL) {
-		pr_debug("  no ibm,dma-window property !\n");
+		pr_debug("  no %s property !\n", DEFAULT_DMA_WIN);
 		return;
 	}
 
@@ -810,11 +814,11 @@ static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
 
 	ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
 	if (ret)
-		pr_warn("%pOF: failed to remove direct window: rtas returned "
+		pr_warn("%pOF: failed to remove dma window: rtas returned "
 			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
 			np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 	else
-		pr_debug("%pOF: successfully removed direct window: rtas returned "
+		pr_debug("%pOF: successfully removed dma window: rtas returned "
 			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
 			np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
@@ -842,7 +846,7 @@ static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_
 
 	ret = of_remove_property(np, win);
 	if (ret)
-		pr_warn("%pOF: failed to remove direct window property: %d\n",
+		pr_warn("%pOF: failed to remove dma window property: %d\n",
 			np, ret);
 	return 0;
 }
@@ -886,34 +890,34 @@ static phys_addr_t ddw_memory_hotplug_max(void)
 
 static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, bool *direct_mapping)
 {
-	struct direct_window *window;
-	const struct dynamic_dma_window_prop *direct64;
+	struct dma_win *window;
+	const struct dynamic_dma_window_prop *dma64;
 	unsigned long window_size;
 	bool found = false;
 
-	spin_lock(&direct_window_list_lock);
+	spin_lock(&dma_win_list_lock);
 	/* check if we already created a window and dupe that config if so */
-	list_for_each_entry(window, &direct_window_list, list) {
+	list_for_each_entry(window, &dma_win_list, list) {
 		if (window->device == pdn) {
-			direct64 = window->prop;
-			*dma_addr = be64_to_cpu(direct64->dma_base);
+			dma64 = window->prop;
+			*dma_addr = be64_to_cpu(dma64->dma_base);
 
-			window_size = (1UL << be32_to_cpu(direct64->window_shift));
+			window_size = (1UL << be32_to_cpu(dma64->window_shift));
 			*direct_mapping = (window_size >= ddw_memory_hotplug_max());
 
 			found = true;
 			break;
 		}
 	}
-	spin_unlock(&direct_window_list_lock);
+	spin_unlock(&dma_win_list_lock);
 
 	return found;
 }
 
-static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
-						const struct dynamic_dma_window_prop *dma64)
+static struct dma_win *ddw_list_new_entry(struct device_node *pdn,
+					  const struct dynamic_dma_window_prop *dma64)
 {
-	struct direct_window *window;
+	struct dma_win *window;
 
 	window = kzalloc(sizeof(*window), GFP_KERNEL);
 	if (!window)
@@ -929,7 +933,7 @@ static void find_existing_ddw_windows_named(const char *name)
 {
 	int len;
 	struct device_node *pdn;
-	struct direct_window *window;
+	struct dma_win *window;
 	const struct dynamic_dma_window_prop *dma64;
 
 	for_each_node_with_property(pdn, name) {
@@ -943,9 +947,9 @@ static void find_existing_ddw_windows_named(const char *name)
 		if (!window)
 			break;
 
-		spin_lock(&direct_window_list_lock);
-		list_add(&window->list, &direct_window_list);
-		spin_unlock(&direct_window_list_lock);
+		spin_lock(&dma_win_list_lock);
+		list_add(&window->list, &dma_win_list);
+		spin_unlock(&dma_win_list_lock);
 	}
 }
 
@@ -1179,7 +1183,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	u64 max_addr, win_addr;
 	struct device_node *dn;
 	u32 ddw_avail[DDW_APPLICABLE_SIZE];
-	struct direct_window *window;
+	struct dma_win *window;
 	const char *win_name;
 	struct property *win64 = NULL;
 	struct failed_ddw_pdn *fpdn;
@@ -1187,10 +1191,10 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	struct pci_dn *pci = PCI_DN(pdn);
 	struct iommu_table *tbl = pci->table_group->tables[0];
 
-	mutex_lock(&direct_window_init_mutex);
+	mutex_lock(&dma_win_init_mutex);
 
 	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &direct_mapping)) {
-		mutex_unlock(&direct_window_init_mutex);
+		mutex_unlock(&dma_win_init_mutex);
 		return direct_mapping;
 	}
 
@@ -1241,7 +1245,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 		struct property *default_win;
 		int reset_win_ext;
 
-		default_win = of_find_property(pdn, "ibm,dma-window", NULL);
+		default_win = of_find_property(pdn, DEFAULT_DMA_WIN, NULL);
 		if (!default_win)
 			goto out_failed;
 
@@ -1272,8 +1276,8 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	} else if ((query.page_size & 1) && PAGE_SHIFT >= 12) {
 		page_shift = 12; /* 4kB */
 	} else {
-		dev_dbg(&dev->dev, "no supported direct page size in mask %x",
-			  query.page_size);
+		dev_dbg(&dev->dev, "no supported page size in mask %x",
+			query.page_size);
 		goto out_failed;
 	}
 
@@ -1331,7 +1335,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 		ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
 					    win64->value, tce_setrange_multi_pSeriesLP_walk);
 		if (ret) {
-			dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
+			dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
 				 dn, ret);
 			goto out_list_del;
 		}
@@ -1354,9 +1358,9 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 		set_iommu_table_base(&dev->dev, tbl);
 	}
 
-	spin_lock(&direct_window_list_lock);
-	list_add(&window->list, &direct_window_list);
-	spin_unlock(&direct_window_list_lock);
+	spin_lock(&dma_win_list_lock);
+	list_add(&window->list, &dma_win_list);
+	spin_unlock(&dma_win_list_lock);
 
 	dev->dev.archdata.dma_offset = win_addr;
 	goto out_unlock;
@@ -1387,7 +1391,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	list_add(&fpdn->list, &failed_ddw_pdn_list);
 
 out_unlock:
-	mutex_unlock(&direct_window_init_mutex);
+	mutex_unlock(&dma_win_init_mutex);
 	return win64 && direct_mapping;
 }
 
@@ -1411,7 +1415,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 
 	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
 	     pdn = pdn->parent) {
-		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+		dma_window = of_get_property(pdn, DEFAULT_DMA_WIN, NULL);
 		if (dma_window)
 			break;
 	}
@@ -1461,7 +1465,7 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
 	 */
 	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
 			pdn = pdn->parent) {
-		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+		dma_window = of_get_property(pdn, DEFAULT_DMA_WIN, NULL);
 		if (dma_window)
 			break;
 	}
@@ -1475,29 +1479,29 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
 static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
 		void *data)
 {
-	struct direct_window *window;
+	struct dma_win *window;
 	struct memory_notify *arg = data;
 	int ret = 0;
 
 	switch (action) {
 	case MEM_GOING_ONLINE:
-		spin_lock(&direct_window_list_lock);
-		list_for_each_entry(window, &direct_window_list, list) {
+		spin_lock(&dma_win_list_lock);
+		list_for_each_entry(window, &dma_win_list, list) {
 			ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
 					arg->nr_pages, window->prop);
 			/* XXX log error */
 		}
-		spin_unlock(&direct_window_list_lock);
+		spin_unlock(&dma_win_list_lock);
 		break;
 	case MEM_CANCEL_ONLINE:
 	case MEM_OFFLINE:
-		spin_lock(&direct_window_list_lock);
-		list_for_each_entry(window, &direct_window_list, list) {
+		spin_lock(&dma_win_list_lock);
+		list_for_each_entry(window, &dma_win_list, list) {
 			ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
 					arg->nr_pages, window->prop);
 			/* XXX log error */
 		}
-		spin_unlock(&direct_window_list_lock);
+		spin_unlock(&dma_win_list_lock);
 		break;
 	default:
 		break;
@@ -1518,7 +1522,7 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
 	struct of_reconfig_data *rd = data;
 	struct device_node *np = rd->dn;
 	struct pci_dn *pci = PCI_DN(np);
-	struct direct_window *window;
+	struct dma_win *window;
 
 	switch (action) {
 	case OF_RECONFIG_DETACH_NODE:
@@ -1537,15 +1541,15 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
 			iommu_pseries_free_group(pci->table_group,
 					np->full_name);
 
-		spin_lock(&direct_window_list_lock);
-		list_for_each_entry(window, &direct_window_list, list) {
+		spin_lock(&dma_win_list_lock);
+		list_for_each_entry(window, &dma_win_list, list) {
 			if (window->device == np) {
 				list_del(&window->list);
 				kfree(window);
 				break;
 			}
 		}
-		spin_unlock(&direct_window_list_lock);
+		spin_unlock(&dma_win_list_lock);
 		break;
 	default:
 		err = NOTIFY_DONE;
-- 
2.25.4


^ permalink raw reply related

* Re: [PATCH] selftests/seccomp: fix ptrace tests on powerpc
From: Thadeu Lima de Souza Cascardo @ 2020-09-11 18:06 UTC (permalink / raw)
  To: Kees Cook; +Cc: linuxppc-dev, Shuah Khan, Oleg Nesterov, linux-kselftest
In-Reply-To: <202009081505.D9FE52510B@keescook>

On Tue, Sep 08, 2020 at 04:18:17PM -0700, Kees Cook wrote:
> On Tue, Jun 30, 2020 at 01:47:39PM -0300, Thadeu Lima de Souza Cascardo wrote:
> > As pointed out by Michael Ellerman, the ptrace ABI on powerpc does not
> > allow or require the return code to be set on syscall entry when
> > skipping the syscall. It will always return ENOSYS and the return code
> > must be set on syscall exit.
> > 
> > This code does that, behaving more similarly to strace. It still sets
> > the return code on entry, which is overridden on powerpc, and it will
> > always repeat the same on exit. Also, on powerpc, the errno is not
> > inverted, and depends on ccr.so being set.
> > 
> > This has been tested on powerpc and amd64.
> > 
> > Cc: Michael Ellerman <mpe@ellerman.id.au>
> > Cc: Kees Cook <keescook@google.com>
> > Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
> 
> Yikes, I missed this from a while ago. I apologize for responding so
> late!
> 
> This appears still unfixed; is that correct?
> 

Yes. I will send a v2 on top of recent changes to the test.

> > ---
> >  tools/testing/selftests/seccomp/seccomp_bpf.c | 24 +++++++++++++++----
> >  1 file changed, 20 insertions(+), 4 deletions(-)
> > 
> > diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> > index 252140a52553..b90a9190ba88 100644
> > --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> > +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> > @@ -1738,6 +1738,14 @@ void change_syscall(struct __test_metadata *_metadata,
> >  		TH_LOG("Can't modify syscall return on this architecture");
> >  #else
> >  		regs.SYSCALL_RET = result;
> > +# if defined(__powerpc__)
> > +		if (result < 0) {
> > +			regs.SYSCALL_RET = -result;
> > +			regs.ccr |= 0x10000000;
> > +		} else {
> > +			regs.ccr &= ~0x10000000;
> > +		}
> > +# endif
> >  #endif
> 
> Just so I understand correctly: for ppc to "see" this result, it needs
> to be both negative AND have this specific register set?
> 

Yes. According to Documentation/powerpc/syscall64-abi.rst:

"
- For the sc instruction, both a value and an error condition are returned.
  cr0.SO is the error condition, and r3 is the return value. When cr0.SO is
  clear, the syscall succeeded and r3 is the return value. When cr0.SO is set,
  the syscall failed and r3 is the error value (that normally corresponds to
  errno).
"

So, while some other arches will return -EINVAL, ppc returns EINVAL. And that
is distinguished from, say, read(2) returning 22 bytes read, by using CR.SO.

> >  
> >  #ifdef HAVE_GETREGS
> > @@ -1796,6 +1804,7 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
> >  	int ret, nr;
> >  	unsigned long msg;
> >  	static bool entry;
> > +	int *syscall_nr = args;
> >  
> >  	/*
> >  	 * The traditional way to tell PTRACE_SYSCALL entry/exit
> > @@ -1809,10 +1818,15 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
> >  	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
> >  			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
> >  
> > -	if (!entry)
> > +	if (!entry && !syscall_nr)
> >  		return;
> >  
> > -	nr = get_syscall(_metadata, tracee);
> > +	if (entry)
> > +		nr = get_syscall(_metadata, tracee);
> > +	else
> > +		nr = *syscall_nr;
> 
> This is weird? Shouldn't get_syscall() be modified to do the right thing
> here instead of depending on the extra arg?
> 

R0 might be clobered. Same documentation mentions it as volatile. So, during
syscall exit, we can't tell for sure that R0 will have the original syscall
number. So, we need to grab it during syscall enter, save it somewhere and
reuse it. I used the test context/args for that. That's the main change I had
to deal with after recent changes to the test. I used the variant struct now.

I only saw the need to do this under tracer_ptrace, as that was the only one
changing syscall return values using ptrace. And that can only be done during
syscall exit on ppc (ptrace ABI we can't break). So, changing get_syscall did
not seem necessary.

Thanks.
Cascardo.

> > +	if (syscall_nr)
> > +		*syscall_nr = nr;
> >  
> >  	if (nr == __NR_getpid)
> >  		change_syscall(_metadata, tracee, __NR_getppid, 0);
> > @@ -1889,9 +1903,10 @@ TEST_F(TRACE_syscall, ptrace_syscall_redirected)
> >  
> >  TEST_F(TRACE_syscall, ptrace_syscall_errno)
> >  {
> > +	int syscall_nr = -1;
> >  	/* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
> >  	teardown_trace_fixture(_metadata, self->tracer);
> > -	self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
> > +	self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, &syscall_nr,
> >  					   true);
> >  
> >  	/* Tracer should skip the open syscall, resulting in ESRCH. */
> > @@ -1900,9 +1915,10 @@ TEST_F(TRACE_syscall, ptrace_syscall_errno)
> >  
> >  TEST_F(TRACE_syscall, ptrace_syscall_faked)
> >  {
> > +	int syscall_nr = -1;
> >  	/* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
> >  	teardown_trace_fixture(_metadata, self->tracer);
> > -	self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
> > +	self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, &syscall_nr,
> >  					   true);
> >  
> >  	/* Tracer should skip the gettid syscall, resulting fake pid. */
> > -- 
> > 2.25.1
> > 
> 
> -- 
> Kees Cook

^ permalink raw reply

* [PATCH v2] selftests/seccomp: fix ptrace tests on powerpc
From: Thadeu Lima de Souza Cascardo @ 2020-09-11 18:10 UTC (permalink / raw)
  To: linux-kselftest
  Cc: cascardo, Shuah Khan, Oleg Nesterov, Kees Cook, linuxppc-dev

As pointed out by Michael Ellerman, the ptrace ABI on powerpc does not
allow or require the return code to be set on syscall entry when
skipping the syscall. It will always return ENOSYS and the return code
must be set on syscall exit.

This code does that, behaving more similarly to strace. It still sets
the return code on entry, which is overridden on powerpc, and it will
always repeat the same on exit. Also, on powerpc, the errno is not
inverted, and depends on ccr.so being set.

This has been tested on powerpc and amd64.

Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Kees Cook <keescook@google.com>
Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 81 ++++++++++++-------
 1 file changed, 53 insertions(+), 28 deletions(-)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 7a6d40286a42..0ddc0846e9c0 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -1837,15 +1837,24 @@ void change_syscall(struct __test_metadata *_metadata,
 #endif
 
 	/* If syscall is skipped, change return value. */
-	if (syscall == -1)
+	if (syscall == -1) {
 #ifdef SYSCALL_NUM_RET_SHARE_REG
 		TH_LOG("Can't modify syscall return on this architecture");
-
 #elif defined(__xtensa__)
 		regs.SYSCALL_RET(regs) = result;
+#elif defined(__powerpc__)
+		/* Error is signaled by CR0 SO bit and error code is positive. */
+		if (result < 0) {
+			regs.SYSCALL_RET = -result;
+			regs.ccr |= 0x10000000;
+		} else {
+			regs.SYSCALL_RET = result;
+			regs.ccr &= ~0x10000000;
+		}
 #else
 		regs.SYSCALL_RET = result;
 #endif
+	}
 
 #ifdef HAVE_GETREGS
 	ret = ptrace(PTRACE_SETREGS, tracee, 0, &regs);
@@ -1897,12 +1906,44 @@ void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
 
 }
 
+FIXTURE(TRACE_syscall) {
+	struct sock_fprog prog;
+	pid_t tracer, mytid, mypid, parent;
+};
+
+FIXTURE_VARIANT(TRACE_syscall) {
+	/*
+	 * All of the SECCOMP_RET_TRACE behaviors can be tested with either
+	 * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
+	 * This indicates if we should use SECCOMP_RET_TRACE (false), or
+	 * ptrace (true).
+	 */
+	bool use_ptrace;
+
+	/*
+	 * Some archs (like ppc) only support changing the return code during
+	 * syscall exit when ptrace is used.  As the syscall number might not
+	 * be available anymore during syscall exit, it needs to be saved
+	 * during syscall enter.
+	 */
+	int syscall_nr;
+};
+
+FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
+	.use_ptrace = true,
+};
+
+FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
+	.use_ptrace = false,
+};
+
 void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
 		   int status, void *args)
 {
 	int ret, nr;
 	unsigned long msg;
 	static bool entry;
+	FIXTURE_VARIANT(TRACE_syscall) * variant = args;
 
 	/*
 	 * The traditional way to tell PTRACE_SYSCALL entry/exit
@@ -1916,10 +1957,15 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
 	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
 			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
 
-	if (!entry)
+	if (!entry && !variant)
 		return;
 
-	nr = get_syscall(_metadata, tracee);
+	if (entry)
+		nr = get_syscall(_metadata, tracee);
+	else if (variant)
+		nr = variant->syscall_nr;
+	if (variant)
+		variant->syscall_nr = nr;
 
 	if (nr == __NR_getpid)
 		change_syscall(_metadata, tracee, __NR_getppid, 0);
@@ -1929,29 +1975,6 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
 		change_syscall(_metadata, tracee, -1, -ESRCH);
 }
 
-FIXTURE(TRACE_syscall) {
-	struct sock_fprog prog;
-	pid_t tracer, mytid, mypid, parent;
-};
-
-FIXTURE_VARIANT(TRACE_syscall) {
-	/*
-	 * All of the SECCOMP_RET_TRACE behaviors can be tested with either
-	 * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
-	 * This indicates if we should use SECCOMP_RET_TRACE (false), or
-	 * ptrace (true).
-	 */
-	bool use_ptrace;
-};
-
-FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
-	.use_ptrace = true,
-};
-
-FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
-	.use_ptrace = false,
-};
-
 FIXTURE_SETUP(TRACE_syscall)
 {
 	struct sock_filter filter[] = {
@@ -1992,7 +2015,9 @@ FIXTURE_SETUP(TRACE_syscall)
 	self->tracer = setup_trace_fixture(_metadata,
 					   variant->use_ptrace ? tracer_ptrace
 							       : tracer_seccomp,
-					   NULL, variant->use_ptrace);
+					   variant->use_ptrace ? (void *) variant
+							       : NULL,
+					   variant->use_ptrace);
 
 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 	ASSERT_EQ(0, ret);
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH v2] selftests/seccomp: fix ptrace tests on powerpc
From: Shuah Khan @ 2020-09-11 19:06 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo, linux-kselftest
  Cc: Shuah Khan, Kees Cook, Oleg Nesterov, Shuah Khan, linuxppc-dev
In-Reply-To: <20200911181012.171027-1-cascardo@canonical.com>

On 9/11/20 12:10 PM, Thadeu Lima de Souza Cascardo wrote:
> As pointed out by Michael Ellerman, the ptrace ABI on powerpc does not
> allow or require the return code to be set on syscall entry when
> skipping the syscall. It will always return ENOSYS and the return code
> must be set on syscall exit.
> 
> This code does that, behaving more similarly to strace. It still sets
> the return code on entry, which is overridden on powerpc, and it will
> always repeat the same on exit. Also, on powerpc, the errno is not
> inverted, and depends on ccr.so being set.
> 
> This has been tested on powerpc and amd64.
> 
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Kees Cook <keescook@google.com>
> Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
> ---
>   tools/testing/selftests/seccomp/seccomp_bpf.c | 81 ++++++++++++-------
>   1 file changed, 53 insertions(+), 28 deletions(-)
> 
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 7a6d40286a42..0ddc0846e9c0 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -1837,15 +1837,24 @@ void change_syscall(struct __test_metadata *_metadata,
>   #endif
>   
>   	/* If syscall is skipped, change return value. */
> -	if (syscall == -1)
> +	if (syscall == -1) {
>   #ifdef SYSCALL_NUM_RET_SHARE_REG
>   		TH_LOG("Can't modify syscall return on this architecture");
> -
>   #elif defined(__xtensa__)
>   		regs.SYSCALL_RET(regs) = result;
> +#elif defined(__powerpc__)
> +		/* Error is signaled by CR0 SO bit and error code is positive. */
> +		if (result < 0) {
> +			regs.SYSCALL_RET = -result;
> +			regs.ccr |= 0x10000000;
> +		} else {
> +			regs.SYSCALL_RET = result;
> +			regs.ccr &= ~0x10000000;
> +		}
>   #else
>   		regs.SYSCALL_RET = result;
>   #endif
> +	}
>   
>   #ifdef HAVE_GETREGS
>   	ret = ptrace(PTRACE_SETREGS, tracee, 0, &regs);
> @@ -1897,12 +1906,44 @@ void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
>   
>   }
>   
> +FIXTURE(TRACE_syscall) {
> +	struct sock_fprog prog;
> +	pid_t tracer, mytid, mypid, parent;
> +};
> +
> +FIXTURE_VARIANT(TRACE_syscall) {
> +	/*
> +	 * All of the SECCOMP_RET_TRACE behaviors can be tested with either
> +	 * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
> +	 * This indicates if we should use SECCOMP_RET_TRACE (false), or
> +	 * ptrace (true).
> +	 */
> +	bool use_ptrace;
> +
> +	/*
> +	 * Some archs (like ppc) only support changing the return code during
> +	 * syscall exit when ptrace is used.  As the syscall number might not
> +	 * be available anymore during syscall exit, it needs to be saved
> +	 * during syscall enter.
> +	 */
> +	int syscall_nr;
> +};
> +
> +FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
> +	.use_ptrace = true,
> +};
> +
> +FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
> +	.use_ptrace = false,
> +};
> +
>   void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
>   		   int status, void *args)
>   {
>   	int ret, nr;
>   	unsigned long msg;
>   	static bool entry;
> +	FIXTURE_VARIANT(TRACE_syscall) * variant = args;
>   
>   	/*
>   	 * The traditional way to tell PTRACE_SYSCALL entry/exit
> @@ -1916,10 +1957,15 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
>   	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
>   			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
>   
> -	if (!entry)
> +	if (!entry && !variant)
>   		return;
>   
> -	nr = get_syscall(_metadata, tracee);
> +	if (entry)
> +		nr = get_syscall(_metadata, tracee);
> +	else if (variant)
> +		nr = variant->syscall_nr;
> +	if (variant)
> +		variant->syscall_nr = nr;
>   
>   	if (nr == __NR_getpid)
>   		change_syscall(_metadata, tracee, __NR_getppid, 0);
> @@ -1929,29 +1975,6 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
>   		change_syscall(_metadata, tracee, -1, -ESRCH);
>   }
>   
> -FIXTURE(TRACE_syscall) {
> -	struct sock_fprog prog;
> -	pid_t tracer, mytid, mypid, parent;
> -};
> -
> -FIXTURE_VARIANT(TRACE_syscall) {
> -	/*
> -	 * All of the SECCOMP_RET_TRACE behaviors can be tested with either
> -	 * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
> -	 * This indicates if we should use SECCOMP_RET_TRACE (false), or
> -	 * ptrace (true).
> -	 */
> -	bool use_ptrace;
> -};
> -
> -FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
> -	.use_ptrace = true,
> -};
> -
> -FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
> -	.use_ptrace = false,
> -};
> -
>   FIXTURE_SETUP(TRACE_syscall)
>   {
>   	struct sock_filter filter[] = {
> @@ -1992,7 +2015,9 @@ FIXTURE_SETUP(TRACE_syscall)
>   	self->tracer = setup_trace_fixture(_metadata,
>   					   variant->use_ptrace ? tracer_ptrace
>   							       : tracer_seccomp,
> -					   NULL, variant->use_ptrace);
> +					   variant->use_ptrace ? (void *) variant
> +							       : NULL,
> +					   variant->use_ptrace);
>   
>   	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
>   	ASSERT_EQ(0, ret);
> 

Hi Kees,

If you want to take this through your tree:

Acked-by: Shuah Khan <skhan@linuxfoundation.org>

thanks,
-- Shuah

^ permalink raw reply

* Re: [PATCH] mm/gup: fix gup_fast with dynamic page table folding
From: Linus Torvalds @ 2020-09-11 19:09 UTC (permalink / raw)
  To: Vasily Gorbik
  Cc: Peter Zijlstra, Dave Hansen, Dave Hansen, Paul Mackerras,
	linux-sparc, Alexander Gordeev, Claudio Imbrenda, Will Deacon,
	linux-arch, linux-s390, Christian Borntraeger, Richard Weinberger,
	linux-x86, Russell King, Jason Gunthorpe, Ingo Molnar,
	Catalin Marinas, Andrey Ryabinin, Gerald Schaefer, Heiko Carstens,
	Arnd Bergmann, John Hubbard, Jeff Dike, linux-um, Borislav Petkov,
	Andy Lutomirski, Thomas Gleixner, linux-arm, linux-mm, LKML,
	Andrew Morton, linux-power, Mike Rapoport
In-Reply-To: <patch.git-2c4880212370.your-ad-here.call-01599849957-ext-4686@work.hours>

On Fri, Sep 11, 2020 at 12:04 PM Vasily Gorbik <gor@linux.ibm.com> wrote:
>
> Currently to make sure that every page table entry is read just once
> gup_fast walks perform READ_ONCE and pass pXd value down to the next
> gup_pXd_range function by value e.g.:
[ ... ]

Ack, this looks sane to me.

I was going to ask how horrible it would be to convert all the other
users, but a quick grep convinced me that yeah, it's only GUP that is
this special, and we don't want to make this interface be the real one
for everything else too..

                Linus

^ permalink raw reply

* Re: [PATCH] mm/gup: fix gup_fast with dynamic page table folding
From: Jason Gunthorpe @ 2020-09-11 19:40 UTC (permalink / raw)
  To: Vasily Gorbik
  Cc: Peter Zijlstra, Dave Hansen, linux-mm, Paul Mackerras,
	linux-sparc, Alexander Gordeev, Claudio Imbrenda, Will Deacon,
	linux-arch, linux-s390, Richard Weinberger, linux-x86,
	Russell King, Christian Borntraeger, Ingo Molnar, Catalin Marinas,
	Andrey Ryabinin, Gerald Schaefer, Heiko Carstens, Arnd Bergmann,
	John Hubbard, Jeff Dike, linux-um, Borislav Petkov,
	Andy Lutomirski, Thomas Gleixner, linux-arm, Dave Hansen,
	linux-power, LKML, Andrew Morton, Linus Torvalds, Mike Rapoport
In-Reply-To: <patch.git-2c4880212370.your-ad-here.call-01599849957-ext-4686@work.hours>

On Fri, Sep 11, 2020 at 09:03:06PM +0200, Vasily Gorbik wrote:
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index e8cbc2e795d5..e899d3506671 100644
> +++ b/include/linux/pgtable.h
> @@ -1427,6 +1427,16 @@ typedef unsigned int pgtbl_mod_mask;
>  #define mm_pmd_folded(mm)	__is_defined(__PAGETABLE_PMD_FOLDED)
>  #endif
>  
> +#ifndef p4d_offset_lockless
> +#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&pgd, address)
> +#endif
> +#ifndef pud_offset_lockless
> +#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&p4d, address)
> +#endif
> +#ifndef pmd_offset_lockless
> +#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&pud, address)

Needs brackets: &(pgd) 

These would probably be better as static inlines though, as only s390
compiles will type check pudp like this.

Jason

^ permalink raw reply

* Re: [PATCH] mm/gup: fix gup_fast with dynamic page table folding
From: Jason Gunthorpe @ 2020-09-11 20:05 UTC (permalink / raw)
  To: Vasily Gorbik
  Cc: Peter Zijlstra, Dave Hansen, linux-mm, Paul Mackerras,
	linux-sparc, Alexander Gordeev, Claudio Imbrenda, Will Deacon,
	linux-arch, linux-s390, Richard Weinberger, linux-x86,
	Russell King, Christian Borntraeger, Ingo Molnar, Catalin Marinas,
	Andrey Ryabinin, Gerald Schaefer, Heiko Carstens, Arnd Bergmann,
	John Hubbard, Jeff Dike, linux-um, Borislav Petkov,
	Andy Lutomirski, Thomas Gleixner, linux-arm, Dave Hansen,
	linux-power, LKML, Andrew Morton, Linus Torvalds, Mike Rapoport
In-Reply-To: <20200911194000.GB1221970@ziepe.ca>

On Fri, Sep 11, 2020 at 04:40:00PM -0300, Jason Gunthorpe wrote:
> These would probably be better as static inlines though, as only s390
> compiles will type check pudp like this.

Never mind, it must be a macro - still need brackets though

Jason

^ permalink raw reply

* Re: [PATCH] powerpc/papr_scm: Fix warning triggered by perf_stats_show()
From: Vaibhav Jain @ 2020-09-11 20:06 UTC (permalink / raw)
  To: Ira Weiny
  Cc: Santosh Sivaraj, linux-nvdimm, Aneesh Kumar K . V,
	Oliver O'Halloran, Dan Williams, linuxppc-dev
In-Reply-To: <20200910155552.GN1930795@iweiny-DESK2.sc.intel.com>

Thanks for reviewing this patch Ira,


Ira Weiny <ira.weiny@intel.com> writes:

> On Thu, Sep 10, 2020 at 02:52:12PM +0530, Vaibhav Jain wrote:
>> A warning is reported by the kernel in case perf_stats_show() returns
>> an error code. The warning is of the form below:
>> 
>>  papr_scm ibm,persistent-memory:ibm,pmemory@44100001:
>>  	  Failed to query performance stats, Err:-10
>>  dev_attr_show: perf_stats_show+0x0/0x1c0 [papr_scm] returned bad count
>>  fill_read_buffer: dev_attr_show+0x0/0xb0 returned bad count
>> 
>> On investigation it looks like that the compiler is silently truncating the
>> return value of drc_pmem_query_stats() from 'long' to 'int', since the
>> variable used to store the return code 'rc' is an 'int'. This
>> truncated value is then returned back as a 'ssize_t' back from
>> perf_stats_show() to 'dev_attr_show()' which thinks of it as a large
>> unsigned number and triggers this warning..
>> 
>> To fix this we update the type of variable 'rc' from 'int' to
>> 'ssize_t' that prevents the compiler from truncating the return value
>> of drc_pmem_query_stats() and returning correct signed value back from
>> perf_stats_show().
>> 
>> Fixes: 2d02bf835e573 ('powerpc/papr_scm: Fetch nvdimm performance
>>        stats from PHYP')
>> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
>> ---
>>  arch/powerpc/platforms/pseries/papr_scm.c | 3 ++-
>>  1 file changed, 2 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
>> index a88a707a608aa..9f00b61676ab9 100644
>> --- a/arch/powerpc/platforms/pseries/papr_scm.c
>> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
>> @@ -785,7 +785,8 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
>>  static ssize_t perf_stats_show(struct device *dev,
>>  			       struct device_attribute *attr, char *buf)
>>  {
>> -	int index, rc;
>> +	int index;
>> +	ssize_t rc;
>
> I'm not sure this is really fixing everything here.

The issue is with the statement in perf_stats_show():

'return rc ? rc : seq_buf_used(&s);'

The function seq_buf_used() returns an 'unsigned int' and with 'rc'
typed as 'int', forces a promotion of the expression to 'unsigned int'
which causes a loss of signedness of 'rc' and compiler silently
assigns this unsigned value to the function return typed as 'signed
long'.

Making 'rc', a 'signed long' forces a promotion of the expresion to
'signed long' which preserves the signedness of 'rc' and will also be
compatible with the function return type.

>
> drc_pmem_query_stats() can return negative errno's.  Why are those not checked
> somewhere in perf_stats_show()?
>
For the specific invocation 'drc_pmem_query_stats(p, stats, 0)' we only
expect return value 'rc <=0' with '0' indicating a successful fetch of
nvdimm performance stats from hypervisor. Hence there are no explicit
checks for negative error codes in the functions as all return values
!=0 indicate an error.


> It seems like all this fix is handling is a > 0 return value: 'ret[0]' from
> line 289 in papr_scm.c...  Or something?
No, in case the arg 'num_stats' is '0' and 'buff_stats != NULL' the
variable 'size' is assigned a non-zero value hence that specific branch
you mentioned  is never taken. Instead in case of success
drc_pmem_query_stats() return '0' and in case of an error a negative
error code is returned.

>
> Worse yet drc_pmem_query_stats() is returning ssize_t which is a signed value.
> Therefore, it should not be returning -errno.  I'm surprised the static
> checkers did not catch that.
Didnt quite get the assertion here. The function is marked to return
'ssize_t' because we can return both +ve for success and -ve values to
indicate errors.

>
> I believe I caught similar errors with a patch series before which did not pay
> attention to variable types.
>
> Please audit this code for these types of errors and ensure you are really
> doing the correct thing when using the sysfs interface.  I'm pretty sure bad
> things will eventually happen (if they are not already) if you return some
> really big number to the sysfs core from *_show().
I think this problem is different compared to what you had previously pointed
to. The values returned from drc_pmem_query_stats() can be stored in an
'int' variable too, however it was the silent promotion of a signed type
to unsigned type was what caused this specific issue.

-- 
Cheers
~ Vaibhav

^ permalink raw reply

* Re: [PATCH] powerpc/boot/dts: Fix dtc "pciex" warnings
From: Christian Lamparter @ 2020-09-11 21:09 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, Chris Blake, sfr
In-Reply-To: <CAAd0S9Cc2M+JsqC+3DhLtaQEecweVX-toC2SxNAzNdogFeeTOw@mail.gmail.com>

Hello,

On 2020-09-08 20:52, Christian Lamparter wrote:
> On Tue, Sep 8, 2020 at 9:12 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>> Christian Lamparter <chunkeey@gmail.com> writes:
>>> On 2020-06-23 15:03, Michael Ellerman wrote:
>>>> With CONFIG_OF_ALL_DTBS=y, as set by eg. allmodconfig, we see lots of
>>>> warnings about our dts files, such as:
>>>>
>>>>     arch/powerpc/boot/dts/glacier.dts:492.26-532.5:
>>>>     Warning (pci_bridge): /plb/pciex@d00000000: node name is not "pci"
>>>>     or "pcie"
>>>>
>>>
>>>
>>> Unfortunately yes. This patch will break uboot code in Meraki MX60(W) / MX60.
>>>
>>>   > https://github.com/riptidewave93/meraki-uboot/blob/mx60w-20180413/board/amcc/bluestone/bluestone.c#L1178
>>>
>>> | if (!pci_available()) {
>>> |     fdt_find_and_setprop(blob, "/plb/pciex@d00000000", "status",
>>> |                   "disabled", sizeof("disabled"), 1);
>>> | }
>>>
>>>
>>> Backstory: There are two version of the Meraki MX60. The MX60
>>> and the MX60W. The difference is that the MX60W has a populated
>>> mini-pcie slot on the PCB for a >W<ireless card.
>>>
>>> That said, this is not earth shattering.
>>
>> I'm happy to revert that hunk if you think any one is actually booting
>> mainline on those.
> 
> The MX60(W) or APM82181 in general?
> 
> The APM82181 devices run really well on the kernel 5.8. The APM82181
> had some bitrot and missing pieces. But I started at around 4.4 with
> upstreaming various bits and stuff. For proof, I can post a bootlog from
> my MyBook Live dev station running my mbl-debian on this weekend:
> <https://github.com/chunkeey/mbl-debian>.

Here is the MyBook Live's Single Bootlog for 5.9-rc4.

root@mbl-debian:~# dmesg
[    0.000000] printk: bootconsole [udbg0] enabled
[    0.000000] Linux version 5.9.0-rc4+ (root@debian64) (powerpc-linux-gnu-gcc (Debian 10.2.0-3) 10.2.0, GNU ld (GNU Binutils for Debian) 2.35) #1 Fri Sep 11 22:13:07 CEST 2020
[    0.000000] Using PowerPC 44x Platform machine description
[    0.000000] -----------------------------------------------------
[    0.000000] phys_mem_size     = 0x10000000
[    0.000000] dcache_bsize      = 0x20
[    0.000000] icache_bsize      = 0x20
[    0.000000] cpu_features      = 0x0000000000000120
[    0.000000]   possible        = 0x0000000040000120
[    0.000000]   always          = 0x0000000000000020
[    0.000000] cpu_user_features = 0x8c008000 0x00000000
[    0.000000] mmu_features      = 0x00000008
[    0.000000] -----------------------------------------------------
[    0.000000] Top of RAM: 0x10000000, Total RAM: 0x10000000
[    0.000000] Memory hole size: 0MB
[    0.000000] Zone ranges:
[    0.000000]   Normal   [mem 0x0000000000000000-0x000000000fffffff]
[    0.000000] Movable zone start for each node
[    0.000000] Early memory node ranges
[    0.000000]   node   0: [mem 0x0000000000000000-0x000000000fffffff]
[    0.000000] Initmem setup node 0 [mem 0x0000000000000000-0x000000000fffffff]
[    0.000000] On node 0 totalpages: 16384
[    0.000000]   Normal zone: 40 pages used for memmap
[    0.000000]   Normal zone: 0 pages reserved
[    0.000000]   Normal zone: 16384 pages, LIFO batch:3
[    0.000000] MMU: Allocated 1088 bytes of context maps for 255 contexts
[    0.000000] pcpu-alloc: s0 r0 d32768 u32768 alloc=1*32768
[    0.000000] pcpu-alloc: [0] 0
[    0.000000] Built 1 zonelists, mobility grouping on.  Total pages: 16344
[    0.000000] Kernel command line: root=PARTUUID=25a9cfcc-6b0d-4ae4-abbf-e5e2e3889a40 rw console=ttyS0,115200
[    0.000000] Dentry cache hash table entries: 32768 (order: 3, 131072 bytes, linear)
[    0.000000] Inode-cache hash table entries: 16384 (order: 2, 65536 bytes, linear)
[    0.000000] mem auto-init: stack:off, heap alloc:off, heap free:off
[    0.000000] Memory: 252528K/262144K available (5840K kernel code, 448K rwdata, 1824K rodata, 224K init, 312K bss, 9616K reserved, 0K cma-reserved)
[    0.000000] Kernel virtual memory layout:
[    0.000000]   * 0xffbdc000..0xffffc000  : fixmap
[    0.000000]   * 0xd1000000..0xffbdc000  : vmalloc & ioremap
[    0.000000] random: get_random_u32 called from cache_random_seq_create+0x68/0x128 with crng_init=0
[    0.000000] SLUB: HWalign=32, Order=0-3, MinObjects=0, CPUs=1, Nodes=1
[    0.000000] NR_IRQS: 512, nr_irqs: 512, preallocated irqs: 16
[    0.000000] UIC0 (32 IRQ sources) at DCR 0xc0
[    0.000000] UIC1 (32 IRQ sources) at DCR 0xd0
[    0.000000] UIC2 (32 IRQ sources) at DCR 0xe0
[    0.000000] UIC3 (32 IRQ sources) at DCR 0xf0
[    0.000000] time_init: decrementer frequency = 800.000008 MHz
[    0.000000] time_init: processor frequency   = 800.000008 MHz
[    0.000016] clocksource: timebase: mask: 0xffffffffffffffff max_cycles: 0xb881274fa3, max_idle_ns: 440795210636 ns
[    0.008990] clocksource: timebase mult[1400000] shift[24] registered
[    0.014011] clockevent: decrementer mult[ccccccef] shift[32] cpu[0]
[    0.019133] Console: colour dummy device 80x25
[    0.022213] pid_max: default: 32768 minimum: 301
[    0.025920] Mount-cache hash table entries: 4096 (order: 0, 16384 bytes, linear)
[    0.031945] Mountpoint-cache hash table entries: 4096 (order: 0, 16384 bytes, linear)
[    0.040909] devtmpfs: initialized
[    0.045808] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
[    0.054182] futex hash table entries: 256 (order: -3, 3072 bytes, linear)
[    0.060719] NET: Registered protocol family 16
[    0.067099] DMA: preallocated 128 KiB GFP_KERNEL pool for atomic allocations
[    0.072996] audit: initializing netlink subsys (disabled)
[    0.078210] thermal_sys: Registered thermal governor 'step_wise'
[    0.078219] thermal_sys: Registered thermal governor 'user_space'
[    0.082970] cpuidle: using governor ladder
[    0.090429] cpuidle: using governor menu
[    0.094588] 256k L2-cache enabled
[    0.096586] PCIE0: Port disabled via device-tree
[    0.100007] gpiochip_find_base: found new base at 480
[    0.104395] audit: type=2000 audit(0.040:1): state=initialized audit_enabled=0 res=1
[    0.113209] gpio gpiochip0: (/plb/opb/gpio@ef600b00): added GPIO chardev (254:0)
[    0.119316] gpio gpiochip0: registered GPIOs 480 to 511 on /plb/opb/gpio@ef600b00
[    0.125949] PCI: Probing PCI hardware
[    5.927343] vgaarb: loaded
[    5.929112] SCSI subsystem initialized
[    5.933393] libata version 3.00 loaded.
[    5.936123] usbcore: registered new interface driver usbfs
[    5.940272] usbcore: registered new interface driver hub
[    5.944233] usbcore: registered new device driver usb
[    5.952187] clocksource: Switched to clocksource timebase
[    5.956449] VFS: Disk quotas dquot_6.6.0
[    5.959041] VFS: Dquot-cache hash table entries: 4096 (order 0, 16384 bytes)
[    5.991157] NET: Registered protocol family 2
[    5.994695] tcp_listen_portaddr_hash hash table entries: 2048 (order: 0, 16384 bytes, linear)
[    6.001871] TCP established hash table entries: 4096 (order: 0, 16384 bytes, linear)
[    6.008246] TCP bind hash table entries: 4096 (order: 0, 16384 bytes, linear)
[    6.014029] TCP: Hash tables configured (established 4096 bind 4096)
[    6.019190] MPTCP token hash table entries: 2048 (order: 0, 24576 bytes, linear)
[    6.025300] UDP hash table entries: 1024 (order: 0, 16384 bytes, linear)
[    6.030653] UDP-Lite hash table entries: 1024 (order: 0, 16384 bytes, linear)
[    6.036615] NET: Registered protocol family 1
[    6.041688] RPC: Registered named UNIX socket transport module.
[    6.046236] RPC: Registered udp transport module.
[    6.049577] RPC: Registered tcp transport module.
[    6.052947] RPC: Registered tcp NFSv4.1 backchannel transport module.
[    6.058052] NET: Registered protocol family 44
[    6.061166] PCI: CLS 0 bytes, default 32
[    6.085333] reg-fixed-voltage plb:opb:usb-regulator: GPIO lookup for consumer (null)
[    6.091686] reg-fixed-voltage plb:opb:usb-regulator: using device tree for GPIO lookup
[    6.098262] reg-fixed-voltage plb:opb:usb-regulator: No GPIO consumer (null) found
[    6.105841] reg-fixed-voltage plb:opb:sata1-regulator: GPIO lookup for consumer (null)
[    6.112364] reg-fixed-voltage plb:opb:sata1-regulator: using device tree for GPIO lookup
[    6.119106] reg-fixed-voltage plb:opb:sata1-regulator: No GPIO consumer (null) found
[    6.126852] reg-fixed-voltage plb:opb:sata0-regulator: GPIO lookup for consumer (null)
[    6.133376] reg-fixed-voltage plb:opb:sata0-regulator: using device tree for GPIO lookup
[    6.140118] reg-fixed-voltage plb:opb:sata0-regulator: No GPIO consumer (null) found
[    6.149872] dw_dmac 4bffd0800.dma: DesignWare DMA Controller, 2 channels
[    6.165651] Initialise system trusted keyrings
[    6.168752] Key type blacklist registered
[    6.172713] workingset: timestamp_bits=30 max_order=14 bucket_order=0
[    6.185449] squashfs: version 4.0 (2009/01/31) Phillip Lougher
[    6.350439] Key type asymmetric registered
[    6.353161] Asymmetric key parser 'x509' registered
[    6.356754] Block layer SCSI generic (bsg) driver version 0.4 loaded (major 249)
[    6.362744] io scheduler mq-deadline registered
[    6.365932] io scheduler kyber registered
[    6.369230] gpiochip_find_base: found new base at 472
[    6.372932] gpio-473 (Enable Reset Button, disable NOR): hogged as output/low
[    6.379779] gpio gpiochip1: (4e0000000.gpio): added GPIO chardev (254:1)
[    6.385199] gpio gpiochip1: registered GPIOs 472 to 479 on 4e0000000.gpio
[    6.390720] gpiochip_find_base: found new base at 464
[    6.395145] gpio gpiochip2: (4e0100000.gpio): added GPIO chardev (254:2)
[    6.400555] gpio gpiochip2: registered GPIOs 464 to 471 on 4e0100000.gpio
[    6.408444] Serial: 8250/16550 driver, 2 ports, IRQ sharing enabled
[    6.415420] printk: console [ttyS0] disabled
[    6.418351] of_serial 4ef600300.serial: GPIO lookup for consumer rs485-term
[    6.423929] of_serial 4ef600300.serial: using device tree for GPIO lookup
[    6.429390] of_get_named_gpiod_flags: can't parse 'rs485-term-gpios' property of node '/plb/opb/serial@ef600300[0]'
[    6.438453] of_get_named_gpiod_flags: can't parse 'rs485-term-gpio' property of node '/plb/opb/serial@ef600300[0]'
[    6.447427] of_serial 4ef600300.serial: using lookup tables for GPIO lookup
[    6.453036] of_serial 4ef600300.serial: No GPIO consumer rs485-term found
[    6.458525] 4ef600300.serial: ttyS0 at MMIO 0x4ef600300 (irq = 32, base_baud = 462962) is a TI16750
[    6.466201] printk: console [ttyS0] enabled
[    6.471883] printk: bootconsole [udbg0] disabled
[    6.500228] brd: module loaded
[    6.514232] loop: module loaded
[    6.516963] sata-dwc 4bffd1000.sata: id 0, controller version 1.91
[    6.523635] scsi host0: sata-dwc
[    6.525992] ata1: SATA max UDMA/133 irq 39
[    6.529027] sata-dwc 4bffd1800.sata: id 0, controller version 1.91
[    6.536207] scsi host1: sata-dwc
[    6.538343] ata2: SATA max UDMA/133 irq 40
[    6.541630] platform physmap-flash.0: failed to claim resource 0: [mem 0x08000000-0x07ffffff]
[    6.549935] mdio_bus fixed-0: GPIO lookup for consumer reset
[    6.554301] mdio_bus fixed-0: using lookup tables for GPIO lookup
[    6.559081] mdio_bus fixed-0: No GPIO consumer reset found
[    6.563278] libphy: Fixed MDIO Bus: probed
[    6.566066] PPC 4xx OCP EMAC driver, version 3.54
[    6.570154] MAL v2 /plb/mcmal, 1 TX channels, 1 RX channels
[    6.574722] RGMII /plb/opb/emac-rgmii@ef601500 initialized with MDIO support
[    6.580630] TAH /plb/opb/emac-tah@ef601350 initialized
[    6.584789] /plb/opb/emac-rgmii@ef601500: input 0 in rgmii mode
[    6.730396] mdio_bus 4ef600c00.ethernet: GPIO lookup for consumer reset
[    6.735695] mdio_bus 4ef600c00.ethernet: using device tree for GPIO lookup
[    6.741293] of_get_named_gpiod_flags: parsed 'reset-gpios' property of node '/plb/opb/ethernet@ef600c00/mdio[0]' - status (0)
[    6.751283] libphy: emac_mdio: probed
[    6.761226] mdio_bus 4ef600c00.ethernet:01: GPIO lookup for consumer reset
[    6.766807] mdio_bus 4ef600c00.ethernet:01: using device tree for GPIO lookup
[    6.772660] of_get_named_gpiod_flags: can't parse 'reset-gpios' property of node '/plb/opb/ethernet@ef600c00/mdio/phy@1[0]'
[    6.782469] of_get_named_gpiod_flags: can't parse 'reset-gpio' property of node '/plb/opb/ethernet@ef600c00/mdio/phy@1[0]'
[    6.792186] mdio_bus 4ef600c00.ethernet:01: using lookup tables for GPIO lookup
[    6.798177] mdio_bus 4ef600c00.ethernet:01: No GPIO consumer reset found
[    6.806518] eth0: EMAC-0 /plb/opb/ethernet@ef600c00, MAC 00:90:a9:35:31:25
[    6.812091] eth0: found Broadcom BCM50610 PHY (0x01)
[    6.816240] dwc2 4bff80000.usbotg: mapped PA bff80000 to VA (ptrval)
[    6.821304] dwc2 4bff80000.usbotg: supply vusb_d not found, using dummy regulator
[    6.827610] dwc2 4bff80000.usbotg: supply vusb_a not found, using dummy regulator
[    6.833848] dwc2 4bff80000.usbotg: registering common handler for irq35
[    6.841266] usbcore: registered new interface driver usb-storage
[    6.846140] of_get_named_gpiod_flags: parsed 'gpios' property of node '/plb/opb/keys/reset-button[0]' - status (0)
[    6.855351] input: plb:opb:keys as /devices/platform/plb/plb:opb/plb:opb:keys/input/input0
[    6.862521] ata1: SATA link down (SStatus 0 SControl 300)
[    6.867654] i2c /dev entries driver
[    6.870052] IR NEC protocol handler initialized
[    6.873290] IR RC5(x/sz) protocol handler initialized
[    6.877028] IR RC6 protocol handler initialized
[    6.880242] IR JVC protocol handler initialized
[    6.883451] IR Sony protocol handler initialized
[    6.886756] IR SANYO protocol handler initialized
[    6.890143] IR Sharp protocol handler initialized
[    6.893530] IR MCE Keyboard/mouse protocol handler initialized
[    6.898040] IR XMP protocol handler initialized
[    6.903391] device-mapper: ioctl: 4.42.0-ioctl (2020-02-27) initialised: dm-devel@redhat.com
[    6.910678] of_get_named_gpiod_flags: parsed 'gpios' property of node '/plb/opb/leds/power-red[0]' - status (0)
[    6.919442] gpio-476 (?): no flags found for gpios
[    6.923065] of_get_named_gpiod_flags: parsed 'gpios' property of node '/plb/opb/leds/power-green[0]' - status (0)
[    6.932001] gpio-477 (?): no flags found for gpios
[    6.935587] of_get_named_gpiod_flags: parsed 'gpios' property of node '/plb/opb/leds/power-blue[0]' - status (0)
[    6.944433] gpio-478 (?): no flags found for gpios
[    6.948135] ledtrig-cpu: registered to indicate activity on CPUs
[    7.069005] ata2: SATA link up 3.0 Gbps (SStatus 123 SControl 300)
[    7.126707] ata2.00: ATA-7: ST3808110AS, 3.AAE, max UDMA/133
[    7.131071] ata2.00: 156301488 sectors, multi 0: LBA48 NCQ (depth 1/32)
[    7.193344] ata2.00: configured for UDMA/133
[    7.196475] blk_queue_segment_boundary: set to minimum 3fff
[    7.200991] scsi 1:0:0:0: Direct-Access     ATA      ST3808110AS      E    PQ: 0 ANSI: 5
[    7.250704] sd 1:0:0:0: [sda] 156301488 512-byte logical blocks: (80.0 GB/74.5 GiB)
[    7.257636] sd 1:0:0:0: [sda] Write Protect is off
[    7.261147] sd 1:0:0:0: [sda] Mode Sense: 00 3a 00 00
[    7.265069] sd 1:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
[    7.300288] GPT:Primary header thinks Alt. header is not at the end of the disk.
[    7.306386] GPT:8380415 != 156301487
[    7.308658] GPT:Alternate GPT header not at the end of the disk.
[    7.313349] GPT:8380415 != 156301487
[    7.315615] GPT: Use GNU Parted to correct GPT errors.
[    7.319511]  sda: sda1 sda2 sda3
[    7.323342] sd 1:0:0:0: [sda] Attached SCSI disk
[    8.291046] random: fast init done
[   10.926506] alg: No test for stdrng (crypto4xx_rng)
[   10.930969] usbcore: registered new interface driver usbhid
[   10.935247] usbhid: USB HID core driver
[   10.937991] Initializing XFRM netlink socket
[   10.941514] NET: Registered protocol family 10
[   10.948675] Segment Routing with IPv6
[   10.951112] NET: Registered protocol family 17
[   10.954288] NET: Registered protocol family 15
[   10.957606] 8021q: 802.1Q VLAN Support v1.8
[   10.960602] drmem: No dynamic reconfiguration memory found
[   10.965094] registered taskstats version 1
[   10.967885] Loading compiled-in X.509 certificates
[   10.971451] Key type ._fscrypt registered
[   10.974160] Key type .fscrypt registered
[   10.976768] Key type fscrypt-provisioning registered
[   10.980859] reg-fixed-voltage plb:opb:usb-regulator: GPIO lookup for consumer (null)
[   10.987298] reg-fixed-voltage plb:opb:usb-regulator: using device tree for GPIO lookup
[   10.993930] of_get_named_gpiod_flags: parsed 'gpios' property of node '/plb/opb/usb-regulator[0]' - status (0)
[   11.002906] reg-fixed-voltage plb:opb:sata1-regulator: GPIO lookup for consumer (null)
[   11.009516] reg-fixed-voltage plb:opb:sata1-regulator: using device tree for GPIO lookup
[   11.016311] of_get_named_gpiod_flags: parsed 'gpios' property of node '/plb/opb/sata1-regulator[0]' - status (0)
[   11.025450] reg-fixed-voltage plb:opb:sata0-regulator: GPIO lookup for consumer (null)
[   11.032058] reg-fixed-voltage plb:opb:sata0-regulator: using device tree for GPIO lookup
[   11.038852] of_get_named_gpiod_flags: parsed 'gpios' property of node '/plb/opb/sata0-regulator[0]' - status (0)
[   11.048060] dwc2 4bff80000.usbotg: mapped PA bff80000 to VA (ptrval)
[   11.053139] dwc2 4bff80000.usbotg: supply vusb_d not found, using dummy regulator
[   11.059428] dwc2 4bff80000.usbotg: supply vusb_a not found, using dummy regulator
[   11.065671] dwc2 4bff80000.usbotg: registering common handler for irq35
[   11.072892] dwc2 4bff80000.usbotg: Bad value for GSNPSID: 0x00000000
[   11.078824] md: Waiting for all devices to be available before autodetect
[   11.084315] md: If you don't use raid, use raid=noautodetect
[   11.088657] md: Autodetecting RAID arrays.
[   11.091432] md: autorun ...
[   11.092924] md: ... autorun DONE.
[   11.140266] EXT4-fs (sda3): mounted filesystem with ordered data mode. Opts: (null)
[   11.146665] VFS: Mounted root (ext4 filesystem) on device 8:3.
[   11.160049] devtmpfs: mounted
[   11.162700] Freeing unused kernel memory: 224K
[   11.165920] Run /sbin/init as init process
[   11.168719]   with arguments:
[   11.170377]     /sbin/init
[   11.171775]   with environment:
[   11.173613]     HOME=/
[   11.174662]     TERM=linux
[   12.215915] systemd[1]: System time before build time, advancing clock.
[   12.479770] systemd[1]: systemd 246.4-1 running in system mode. (+PAM +AUDIT +SELINUX +IMA +APPARMOR +SMACK +SYSVINIT +UTMP +LIBCRYPTSETUP +GCRYPT +GNUTLS +ACL +XZ +LZ4 +ZSTD +SECCOMP +BLKID +ELFUTILS +KMOD +IDN2 -IDN +PCRE2 default-hierarchy=hybrid)
[   12.501058] systemd[1]: Detected architecture ppc.
[   12.557673] systemd[1]: Set hostname to <mbl-debian>.

---

root@mbl-debian:~# cat /proc/cpuinfo
processor       : 0
cpu             : APM821XX
clock           : 800.000008MHz
revision        : 28.131 (pvr 12c4 1c83)
bogomips        : 1600.00

timebase        : 800000008
platform        : PowerPC 44x Platform
model           : MyBook Live
Memory          : 256 MB

root@mbl-debian:~# cat /proc/iomem
00000000-0fffffff : System RAM
4bffd0800-4bffd0bff : 4bffd0800.dma dma@bffd0800
4bffd1000-4bffd17ff : 4bffd1000.sata sata@bffd1000
4bffd1800-4bffd1fff : 4bffd1800.sata sata@bffd1800
4e0000000-4e0000000 : 4e0000000.gpio dat
4e0100000-4e0100000 : 4e0100000.gpio dat
4ef600300-4ef600307 : serial

root@mbl-debian:~# cat /proc/interrupts
            CPU0
  18:          0       UIC  11 Edge      L2C
  20:       2597       UIC   6 Level     MAL TX EOB
  21:      23062       UIC   7 Level     MAL RX EOB
  22:          0       UIC   3 Level     MAL SERR
  23:          0       UIC   4 Level     MAL TX DE
  24:          0       UIC   5 Level     MAL RX DE
  29:        735       UIC  29 Level     crypto4xx
  32:        538       UIC   1 Level     ttyS0
  33:          0       UIC  16 Level     EMAC
  38:       2765       UIC  25 Level     dw:dmac-1
  39:          0       UIC  26 Level     sata-dwc[4bffd1000.sata]
  40:       5680       UIC  27 Level     sata-dwc[4bffd1800.sata]
LOC:      25755   Local timer interrupts for timer event device
BCT:          0   Broadcast timer interrupts for timer event device
LOC:          4   Local timer interrupts for others
SPU:          0   Spurious interrupts
PMI:          0   Performance monitoring interrupts
MCE:          0   Machine check exceptions
NMI:          0   System Reset interrupts

root@mbl-debian:~# cat /etc/debian_version
bullseye/sid

^ permalink raw reply

* [PATCH] ibmvfc: Avoid link down on FS9100 canister reboot
From: Brian King @ 2020-09-11 21:28 UTC (permalink / raw)
  To: linux-scsi; +Cc: Brian King, tyreld, linuxppc-dev, martin.petersen

When a canister on a FS9100, or similar storage, running in NPIV mode,
is rebooted, its WWPNs will fail over to another canister. When this
occurs, we see a WWPN going away from the fabric at one N-Port ID,
and, a short time later, the same WWPN appears at a different N-Port ID.
When the canister is fully operational again, the WWPNs fail back to
the original canister. If there is any I/O outstanding to the target
when this occurs, it will result in the implicit logout the ibmvfc driver
issues before removing the rport to fail. When the WWPN then shows up at a
different N-Port ID, and we issue a PLOGI to it, the VIOS will
see that it still has a login for this WWPN at the old N-Port ID,
which results in the VIOS simulating a link down / link up sequence
to the client, in order to get the VIOS and client LPAR in sync.

The patch below improves the way we handle this scenario so as to
avoid the link bounce, which affects all targets under the virtual
host adapter. The change is to utilize the Move Login MAD, which
will work even when I/O is outstanding to the target. The change
only alters the target state machine for the case where the implicit
logout fails prior to deleting the rport. If this implicit logout fails,
we defer deleting the ibmvfc_target object after calling
fc_remote_port_delete. This enables us to later retry the implicit logout
after terminate_rport_io occurs, or to issue the Move Login request if
a WWPN shows up at a new N-Port ID prior to this occurring.

This has been tested by IBM's storage interoperability team
on a FS9100, forcing the failover to occur. With debug tracing enabled
in the ibmvfc driver, we confirmed the move login was sent
in this scenario and confirmed the link bounce no longer occurred.

Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 211 ++++++++++++++++++++++++++++++++++++++---
 drivers/scsi/ibmvscsi/ibmvfc.h |  38 +++++++-
 2 files changed, 232 insertions(+), 17 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 175a165..322bb30 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -134,6 +134,7 @@
 static void ibmvfc_tgt_query_target(struct ibmvfc_target *);
 static void ibmvfc_npiv_logout(struct ibmvfc_host *);
 static void ibmvfc_tgt_implicit_logout_and_del(struct ibmvfc_target *);
+static void ibmvfc_tgt_move_login(struct ibmvfc_target *);
 
 static const char *unknown_error = "unknown error";
 
@@ -431,7 +432,20 @@ static int ibmvfc_set_tgt_action(struct ibmvfc_target *tgt,
 		}
 		break;
 	case IBMVFC_TGT_ACTION_LOGOUT_RPORT_WAIT:
-		if (action == IBMVFC_TGT_ACTION_DEL_RPORT) {
+		if (action == IBMVFC_TGT_ACTION_DEL_RPORT ||
+		    action == IBMVFC_TGT_ACTION_DEL_AND_LOGOUT_RPORT) {
+			tgt->action = action;
+			rc = 0;
+		}
+		break;
+	case IBMVFC_TGT_ACTION_LOGOUT_DELETED_RPORT:
+		if (action == IBMVFC_TGT_ACTION_LOGOUT_RPORT) {
+			tgt->action = action;
+			rc = 0;
+		}
+		break;
+	case IBMVFC_TGT_ACTION_DEL_AND_LOGOUT_RPORT:
+		if (action == IBMVFC_TGT_ACTION_LOGOUT_DELETED_RPORT) {
 			tgt->action = action;
 			rc = 0;
 		}
@@ -441,16 +455,18 @@ static int ibmvfc_set_tgt_action(struct ibmvfc_target *tgt,
 			tgt->action = action;
 			rc = 0;
 		}
+		break;
 	case IBMVFC_TGT_ACTION_DELETED_RPORT:
 		break;
 	default:
-		if (action >= IBMVFC_TGT_ACTION_LOGOUT_RPORT)
-			tgt->add_rport = 0;
 		tgt->action = action;
 		rc = 0;
 		break;
 	}
 
+	if (action >= IBMVFC_TGT_ACTION_LOGOUT_RPORT)
+		tgt->add_rport = 0;
+
 	return rc;
 }
 
@@ -548,7 +564,8 @@ static void ibmvfc_set_host_action(struct ibmvfc_host *vhost,
  **/
 static void ibmvfc_reinit_host(struct ibmvfc_host *vhost)
 {
-	if (vhost->action == IBMVFC_HOST_ACTION_NONE) {
+	if (vhost->action == IBMVFC_HOST_ACTION_NONE &&
+	    vhost->state == IBMVFC_ACTIVE) {
 		if (!ibmvfc_set_host_state(vhost, IBMVFC_INITIALIZING)) {
 			scsi_block_requests(vhost->host);
 			ibmvfc_set_host_action(vhost, IBMVFC_HOST_ACTION_QUERY);
@@ -2574,7 +2591,9 @@ static void ibmvfc_terminate_rport_io(struct fc_rport *rport)
 	struct ibmvfc_host *vhost = shost_priv(shost);
 	struct fc_rport *dev_rport;
 	struct scsi_device *sdev;
-	unsigned long rc;
+	struct ibmvfc_target *tgt;
+	unsigned long rc, flags;
+	unsigned int found;
 
 	ENTER;
 	shost_for_each_device(sdev, shost) {
@@ -2588,6 +2607,25 @@ static void ibmvfc_terminate_rport_io(struct fc_rport *rport)
 
 	if (rc == FAILED)
 		ibmvfc_issue_fc_host_lip(shost);
+
+	spin_lock_irqsave(shost->host_lock, flags);
+	found = 0;
+	list_for_each_entry(tgt, &vhost->targets, queue) {
+		if (tgt->scsi_id == rport->port_id) {
+			found++;
+			break;
+		}
+	}
+
+	if (found && tgt->action == IBMVFC_TGT_ACTION_LOGOUT_DELETED_RPORT) {
+		/* If we get here, that means we previously attempted to send
+		 an implicit logout to the target but it failed, most likely
+		 due to I/O being pending, so we need to send it again */
+		ibmvfc_del_tgt(tgt);
+		ibmvfc_reinit_host(vhost);
+	}
+
+	spin_unlock_irqrestore(shost->host_lock, flags);
 	LEAVE;
 }
 
@@ -3623,7 +3661,15 @@ static void ibmvfc_tgt_implicit_logout_and_del_done(struct ibmvfc_event *evt)
 
 	vhost->discovery_threads--;
 	ibmvfc_free_event(evt);
-	ibmvfc_set_tgt_action(tgt, IBMVFC_TGT_ACTION_DEL_RPORT);
+
+	/* If our state is IBMVFC_HOST_OFFLINE, we could be unloading the driver
+	 in which case we need to free up all the targets. If we are not unloading,
+	 we will still go through a hard reset to get out of offline state, so there
+	 is no need to track the old targets in that case */
+	if (status == IBMVFC_MAD_SUCCESS || vhost->state == IBMVFC_HOST_OFFLINE)
+		ibmvfc_set_tgt_action(tgt, IBMVFC_TGT_ACTION_DEL_RPORT);
+	else
+		ibmvfc_set_tgt_action(tgt, IBMVFC_TGT_ACTION_DEL_AND_LOGOUT_RPORT);
 
 	tgt_dbg(tgt, "Implicit Logout %s\n", (status == IBMVFC_MAD_SUCCESS) ? "succeeded" : "failed");
 	kref_put(&tgt->kref, ibmvfc_release_tgt);
@@ -3662,6 +3708,92 @@ static void ibmvfc_tgt_implicit_logout_and_del(struct ibmvfc_target *tgt)
 }
 
 /**
+ * ibmvfc_tgt_move_login_done - Completion handler for Move Login
+ * @evt:	ibmvfc event struct
+ *
+ **/
+static void ibmvfc_tgt_move_login_done(struct ibmvfc_event *evt)
+{
+	struct ibmvfc_target *tgt = evt->tgt;
+	struct ibmvfc_host *vhost = evt->vhost;
+	struct ibmvfc_move_login *rsp = &evt->xfer_iu->move_login;
+	u32 status = be16_to_cpu(rsp->common.status);
+	int level = IBMVFC_DEFAULT_LOG_LEVEL;
+
+	vhost->discovery_threads--;
+	ibmvfc_set_tgt_action(tgt, IBMVFC_TGT_ACTION_NONE);
+	switch (status) {
+	case IBMVFC_MAD_SUCCESS:
+		tgt_dbg(tgt, "Move Login succeeded for old scsi_id: %llX\n", tgt->old_scsi_id);
+		tgt->ids.node_name = wwn_to_u64(rsp->service_parms.node_name);
+		tgt->ids.port_name = wwn_to_u64(rsp->service_parms.port_name);
+		tgt->ids.port_id = tgt->scsi_id;
+		memcpy(&tgt->service_parms, &rsp->service_parms,
+		       sizeof(tgt->service_parms));
+		memcpy(&tgt->service_parms_change, &rsp->service_parms_change,
+		       sizeof(tgt->service_parms_change));
+		ibmvfc_init_tgt(tgt, ibmvfc_tgt_send_prli);
+		break;
+	case IBMVFC_MAD_DRIVER_FAILED:
+		break;
+	case IBMVFC_MAD_CRQ_ERROR:
+		ibmvfc_retry_tgt_init(tgt, ibmvfc_tgt_move_login);
+		break;
+	case IBMVFC_MAD_FAILED:
+	default:
+		level += ibmvfc_retry_tgt_init(tgt, ibmvfc_tgt_move_login);
+
+		tgt_log(tgt, level, "Move Login failed: old scsi_id: %llX, flags:%x, vios_flags:%x, rc=0x%02X\n",
+			tgt->old_scsi_id, be32_to_cpu(rsp->flags), be16_to_cpu(rsp->vios_flags), status);
+		break;
+	}
+
+	kref_put(&tgt->kref, ibmvfc_release_tgt);
+	ibmvfc_free_event(evt);
+	wake_up(&vhost->work_wait_q);
+}
+
+
+/**
+ * ibmvfc_tgt_move_login - Initiate a move login for specified target
+ * @tgt:		ibmvfc target struct
+ *
+ **/
+static void ibmvfc_tgt_move_login(struct ibmvfc_target *tgt)
+{
+	struct ibmvfc_host *vhost = tgt->vhost;
+	struct ibmvfc_move_login *move;
+	struct ibmvfc_event *evt;
+
+	if (vhost->discovery_threads >= disc_threads)
+		return;
+
+	kref_get(&tgt->kref);
+	evt = ibmvfc_get_event(vhost);
+	vhost->discovery_threads++;
+	ibmvfc_set_tgt_action(tgt, IBMVFC_TGT_ACTION_INIT_WAIT);
+	ibmvfc_init_event(evt, ibmvfc_tgt_move_login_done, IBMVFC_MAD_FORMAT);
+	evt->tgt = tgt;
+	move = &evt->iu.move_login;
+	memset(move, 0, sizeof(*move));
+	move->common.version = cpu_to_be32(1);
+	move->common.opcode = cpu_to_be32(IBMVFC_MOVE_LOGIN);
+	move->common.length = cpu_to_be16(sizeof(*move));
+
+	move->old_scsi_id = cpu_to_be64(tgt->old_scsi_id);
+	move->new_scsi_id = cpu_to_be64(tgt->scsi_id);
+	move->wwpn = cpu_to_be64(tgt->wwpn);
+	move->node_name = cpu_to_be64(tgt->ids.node_name);
+
+	if (ibmvfc_send_event(evt, vhost, default_timeout)) {
+		vhost->discovery_threads--;
+		ibmvfc_set_tgt_action(tgt, IBMVFC_TGT_ACTION_DEL_RPORT);
+		kref_put(&tgt->kref, ibmvfc_release_tgt);
+	} else
+		tgt_dbg(tgt, "Sent Move Login for old scsi_id: %llX\n", tgt->old_scsi_id);
+}
+
+/**
  * ibmvfc_adisc_needs_plogi - Does device need PLOGI?
  * @mad:	ibmvfc passthru mad struct
  * @tgt:	ibmvfc target struct
@@ -3979,24 +4111,62 @@ static void ibmvfc_tgt_query_target(struct ibmvfc_target *tgt)
  * Returns:
  *	0 on success / other on failure
  **/
-static int ibmvfc_alloc_target(struct ibmvfc_host *vhost, u64 scsi_id)
+static int ibmvfc_alloc_target(struct ibmvfc_host *vhost,struct ibmvfc_discover_targets_entry *target)
 {
+	struct ibmvfc_target *stgt = NULL;
+	struct ibmvfc_target *wtgt = NULL;
 	struct ibmvfc_target *tgt;
 	unsigned long flags;
+	u64 scsi_id = be32_to_cpu(target->scsi_id) & IBMVFC_DISC_TGT_SCSI_ID_MASK;
+	u64 wwpn = be64_to_cpu(target->wwpn);
 
+	/* Look to see if we already have a target allocated for this SCSI ID or WWPN */
 	spin_lock_irqsave(vhost->host->host_lock, flags);
 	list_for_each_entry(tgt, &vhost->targets, queue) {
+		if (tgt->wwpn == wwpn) {
+			wtgt = tgt;
+			break;
+		}
+	}
+
+	list_for_each_entry(tgt, &vhost->targets, queue) {
 		if (tgt->scsi_id == scsi_id) {
-			if (tgt->need_login)
-				ibmvfc_init_tgt(tgt, ibmvfc_tgt_implicit_logout);
+			stgt = tgt;
+			break;
+		}
+	}
+
+	if (wtgt && !stgt) {
+		/* A WWPN target has moved and we still are tracking the old SCSI ID.
+		 The only way we should be able to get here is if we attempted to
+		 send an implicit logout for the old SCSI ID and it failed for some
+		 reason, such as there being I/O pending to the target. In this
+		 case, we will have already deleted the rport from the FC transport
+		 so we do a move login, which works even with I/O pending, as it
+		 will cancel any active commands. */
+		if (wtgt->action == IBMVFC_TGT_ACTION_LOGOUT_DELETED_RPORT) {
+			/* Do a move login here. The old target is no longer known to the transport layer
+			 We don't use the normal ibmvfc_set_tgt_action to set this, as
+			 we don't normally want to allow this state change */
+			wtgt->old_scsi_id = wtgt->scsi_id;
+			wtgt->scsi_id = scsi_id;
+			wtgt->action = IBMVFC_TGT_ACTION_INIT;
+			ibmvfc_init_tgt(wtgt, ibmvfc_tgt_move_login);
 			goto unlock_out;
+		} else {
+			tgt_err(wtgt, "Unexpected target state: %d, %p\n", wtgt->action, wtgt->rport);
 		}
+	} else if (stgt) {
+		if (tgt->need_login)
+			ibmvfc_init_tgt(tgt, ibmvfc_tgt_implicit_logout);
+		goto unlock_out;
 	}
 	spin_unlock_irqrestore(vhost->host->host_lock, flags);
 
 	tgt = mempool_alloc(vhost->tgt_pool, GFP_NOIO);
 	memset(tgt, 0, sizeof(*tgt));
 	tgt->scsi_id = scsi_id;
+	tgt->wwpn = wwpn;
 	tgt->vhost = vhost;
 	tgt->need_login = 1;
 	tgt->cancel_key = vhost->task_set++;
@@ -4023,9 +4193,7 @@ static int ibmvfc_alloc_targets(struct ibmvfc_host *vhost)
 	int i, rc;
 
 	for (i = 0, rc = 0; !rc && i < vhost->num_targets; i++)
-		rc = ibmvfc_alloc_target(vhost,
-					 be32_to_cpu(vhost->disc_buf->scsi_id[i]) &
-					 IBMVFC_DISC_TGT_SCSI_ID_MASK);
+		rc = ibmvfc_alloc_target(vhost, &vhost->disc_buf[i]);
 
 	return rc;
 }
@@ -4085,6 +4253,7 @@ static void ibmvfc_discover_targets(struct ibmvfc_host *vhost)
 	mad->bufflen = cpu_to_be32(vhost->disc_buf_sz);
 	mad->buffer.va = cpu_to_be64(vhost->disc_buf_dma);
 	mad->buffer.len = cpu_to_be32(vhost->disc_buf_sz);
+	mad->flags = cpu_to_be32(IBMVFC_DISC_TGT_PORT_ID_WWPN_LIST);
 	ibmvfc_set_host_action(vhost, IBMVFC_HOST_ACTION_INIT_WAIT);
 
 	if (!ibmvfc_send_event(evt, vhost, default_timeout))
@@ -4420,6 +4589,13 @@ static void ibmvfc_tgt_add_rport(struct ibmvfc_target *tgt)
 		del_timer_sync(&tgt->timer);
 		kref_put(&tgt->kref, ibmvfc_release_tgt);
 		return;
+	} else if (rport && tgt->action == IBMVFC_TGT_ACTION_DEL_AND_LOGOUT_RPORT) {
+		tgt_dbg(tgt, "Deleting rport with outstanding I/O\n");
+		ibmvfc_set_tgt_action(tgt, IBMVFC_TGT_ACTION_LOGOUT_DELETED_RPORT);
+		tgt->rport = NULL;
+		spin_unlock_irqrestore(vhost->host->host_lock, flags);
+		fc_remote_port_delete(rport);
+		return;
 	} else if (rport && tgt->action == IBMVFC_TGT_ACTION_DELETED_RPORT) {
 		spin_unlock_irqrestore(vhost->host->host_lock, flags);
 		return;
@@ -4543,6 +4719,15 @@ static void ibmvfc_do_work(struct ibmvfc_host *vhost)
 				del_timer_sync(&tgt->timer);
 				kref_put(&tgt->kref, ibmvfc_release_tgt);
 				return;
+			} else if (tgt->action == IBMVFC_TGT_ACTION_DEL_AND_LOGOUT_RPORT) {
+				tgt_dbg(tgt, "Deleting rport with I/O outstanding\n");
+				rport = tgt->rport;
+				tgt->rport = NULL;
+				ibmvfc_set_tgt_action(tgt, IBMVFC_TGT_ACTION_LOGOUT_DELETED_RPORT);
+				spin_unlock_irqrestore(vhost->host->host_lock, flags);
+				if (rport)
+					fc_remote_port_delete(rport);
+				return;
 			}
 		}
 
@@ -4775,7 +4960,7 @@ static int ibmvfc_alloc_mem(struct ibmvfc_host *vhost)
 		goto free_sg_pool;
 	}
 
-	vhost->disc_buf_sz = sizeof(vhost->disc_buf->scsi_id[0]) * max_targets;
+	vhost->disc_buf_sz = sizeof(*vhost->disc_buf) * max_targets;
 	vhost->disc_buf = dma_alloc_coherent(dev, vhost->disc_buf_sz,
 					     &vhost->disc_buf_dma, GFP_KERNEL);
 
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index e6e1c25..6a21ac3 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -120,6 +120,7 @@ enum ibmvfc_mad_types {
 	IBMVFC_PORT_LOGIN		= 0x0004,
 	IBMVFC_PROCESS_LOGIN	= 0x0008,
 	IBMVFC_QUERY_TARGET	= 0x0010,
+	IBMVFC_MOVE_LOGIN		= 0x0020,
 	IBMVFC_IMPLICIT_LOGOUT	= 0x0040,
 	IBMVFC_PASSTHRU		= 0x0200,
 	IBMVFC_TMF_MAD		= 0x0100,
@@ -197,6 +198,7 @@ struct ibmvfc_service_parms {
 	__be32 ext_len;
 	__be32 reserved[30];
 	__be32 clk_sync_qos[2];
+	__be32 reserved2;
 } __packed __aligned(4);
 
 struct ibmvfc_npiv_login_resp {
@@ -230,15 +232,18 @@ struct ibmvfc_npiv_login_resp {
 	struct ibmvfc_npiv_login_resp resp;
 } __packed __aligned(8);
 
-struct ibmvfc_discover_targets_buf {
-	__be32 scsi_id[1];
+struct ibmvfc_discover_targets_entry {
+	__be32 scsi_id;
+	__be32 pad;
+	__be64 wwpn;
 #define IBMVFC_DISC_TGT_SCSI_ID_MASK	0x00ffffff
-};
+}__packed __aligned(8);
 
 struct ibmvfc_discover_targets {
 	struct ibmvfc_mad_common common;
 	struct srp_direct_buf buffer;
 	__be32 flags;
+#define IBMVFC_DISC_TGT_PORT_ID_WWPN_LIST	0x02
 	__be16 status;
 	__be16 error;
 	__be32 bufflen;
@@ -291,6 +296,26 @@ struct ibmvfc_port_login {
 	__be64 reserved3[2];
 } __packed __aligned(8);
 
+struct ibmvfc_move_login {
+	struct ibmvfc_mad_common common;
+	__be64 old_scsi_id;
+	__be64 new_scsi_id;
+	__be64 wwpn;
+	__be64 node_name;
+	__be32 flags;
+#define IBMVFC_MOVE_LOGIN_IMPLICIT_OLD_FAILED	0x01
+#define IBMVFC_MOVE_LOGIN_IMPLICIT_NEW_FAILED	0x02
+#define IBMVFC_MOVE_LOGIN_PORT_LOGIN_FAILED	0x04
+	__be32 reserved;
+	struct ibmvfc_service_parms service_parms;
+	struct ibmvfc_service_parms service_parms_change;
+	__be32 reserved2;
+	__be16 service_class;
+	__be16 vios_flags;
+#define IBMVFC_MOVE_LOGIN_VF_NOT_SENT_ADAPTER	0x01
+	__be64 reserved3;
+}__packed __aligned(8);
+
 struct ibmvfc_prli_svc_parms {
 	u8 type;
 #define IBMVFC_SCSI_FCP_TYPE		0x08
@@ -646,6 +671,7 @@ struct ibmvfc_async_crq_queue {
 	struct ibmvfc_discover_targets discover_targets;
 	struct ibmvfc_port_login plogi;
 	struct ibmvfc_process_login prli;
+	struct ibmvfc_move_login move_login;
 	struct ibmvfc_query_tgt query_tgt;
 	struct ibmvfc_implicit_logout implicit_logout;
 	struct ibmvfc_tmf tmf;
@@ -664,12 +690,16 @@ enum ibmvfc_target_action {
 	IBMVFC_TGT_ACTION_LOGOUT_RPORT_WAIT,
 	IBMVFC_TGT_ACTION_DEL_RPORT,
 	IBMVFC_TGT_ACTION_DELETED_RPORT,
+	IBMVFC_TGT_ACTION_DEL_AND_LOGOUT_RPORT,
+	IBMVFC_TGT_ACTION_LOGOUT_DELETED_RPORT,
 };
 
 struct ibmvfc_target {
 	struct list_head queue;
 	struct ibmvfc_host *vhost;
 	u64 scsi_id;
+	u64 wwpn;
+	u64 old_scsi_id;
 	struct fc_rport *rport;
 	int target_id;
 	enum ibmvfc_target_action action;
@@ -765,7 +795,7 @@ struct ibmvfc_host {
 	dma_addr_t login_buf_dma;
 	int disc_buf_sz;
 	int log_level;
-	struct ibmvfc_discover_targets_buf *disc_buf;
+	struct ibmvfc_discover_targets_entry *disc_buf;
 	struct mutex passthru_mutex;
 	int task_set;
 	int init_retries;
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH] powerpc/papr_scm: Fix warning triggered by perf_stats_show()
From: Ira Weiny @ 2020-09-11 22:13 UTC (permalink / raw)
  To: Vaibhav Jain
  Cc: Santosh Sivaraj, linux-nvdimm, Aneesh Kumar K . V,
	Oliver O'Halloran, Dan Williams, linuxppc-dev
In-Reply-To: <878sdgqdrd.fsf@vajain21.in.ibm.com>

On Sat, Sep 12, 2020 at 01:36:46AM +0530, Vaibhav Jain wrote:
> Thanks for reviewing this patch Ira,
> 
> 
> Ira Weiny <ira.weiny@intel.com> writes:
> 
> > On Thu, Sep 10, 2020 at 02:52:12PM +0530, Vaibhav Jain wrote:
> >> A warning is reported by the kernel in case perf_stats_show() returns
> >> an error code. The warning is of the form below:
> >> 
> >>  papr_scm ibm,persistent-memory:ibm,pmemory@44100001:
> >>  	  Failed to query performance stats, Err:-10
> >>  dev_attr_show: perf_stats_show+0x0/0x1c0 [papr_scm] returned bad count
> >>  fill_read_buffer: dev_attr_show+0x0/0xb0 returned bad count
> >> 
> >> On investigation it looks like that the compiler is silently truncating the
> >> return value of drc_pmem_query_stats() from 'long' to 'int', since the
> >> variable used to store the return code 'rc' is an 'int'. This
> >> truncated value is then returned back as a 'ssize_t' back from
> >> perf_stats_show() to 'dev_attr_show()' which thinks of it as a large
> >> unsigned number and triggers this warning..
> >> 
> >> To fix this we update the type of variable 'rc' from 'int' to
> >> 'ssize_t' that prevents the compiler from truncating the return value
> >> of drc_pmem_query_stats() and returning correct signed value back from
> >> perf_stats_show().
> >> 
> >> Fixes: 2d02bf835e573 ('powerpc/papr_scm: Fetch nvdimm performance
> >>        stats from PHYP')
> >> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
> >> ---
> >>  arch/powerpc/platforms/pseries/papr_scm.c | 3 ++-
> >>  1 file changed, 2 insertions(+), 1 deletion(-)
> >> 
> >> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
> >> index a88a707a608aa..9f00b61676ab9 100644
> >> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> >> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> >> @@ -785,7 +785,8 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
> >>  static ssize_t perf_stats_show(struct device *dev,
> >>  			       struct device_attribute *attr, char *buf)
> >>  {
> >> -	int index, rc;
> >> +	int index;
> >> +	ssize_t rc;
> >
> > I'm not sure this is really fixing everything here.
> 
> The issue is with the statement in perf_stats_show():
> 
> 'return rc ? rc : seq_buf_used(&s);'
> 
> The function seq_buf_used() returns an 'unsigned int' and with 'rc'
> typed as 'int', forces a promotion of the expression to 'unsigned int'
> which causes a loss of signedness of 'rc' and compiler silently
> assigns this unsigned value to the function return typed as 'signed
> long'.
> 
> Making 'rc', a 'signed long' forces a promotion of the expresion to
> 'signed long' which preserves the signedness of 'rc' and will also be
> compatible with the function return type.

Ok, ok, I read this all wrong.

FWIW I would also cast seq_buf_used() to ssize_t to show you know what you are
doing there.

> 
> >
> > drc_pmem_query_stats() can return negative errno's.  Why are those not checked
> > somewhere in perf_stats_show()?
> >
> For the specific invocation 'drc_pmem_query_stats(p, stats, 0)' we only
> expect return value 'rc <=0' with '0' indicating a successful fetch of
> nvdimm performance stats from hypervisor. Hence there are no explicit
> checks for negative error codes in the functions as all return values
> !=0 indicate an error.
> 
> 
> > It seems like all this fix is handling is a > 0 return value: 'ret[0]' from
> > line 289 in papr_scm.c...  Or something?
> No, in case the arg 'num_stats' is '0' and 'buff_stats != NULL' the
> variable 'size' is assigned a non-zero value hence that specific branch
> you mentioned  is never taken. Instead in case of success
> drc_pmem_query_stats() return '0' and in case of an error a negative
> error code is returned.
> 
> >
> > Worse yet drc_pmem_query_stats() is returning ssize_t which is a signed value.
> > Therefore, it should not be returning -errno.  I'm surprised the static
> > checkers did not catch that.
> Didnt quite get the assertion here. The function is marked to return
> 'ssize_t' because we can return both +ve for success and -ve values to
> indicate errors.

Sorry I was reading this as size_t and meant to say unsigned...  I was looking
at this too quickly.

> 
> >
> > I believe I caught similar errors with a patch series before which did not pay
> > attention to variable types.
> >
> > Please audit this code for these types of errors and ensure you are really
> > doing the correct thing when using the sysfs interface.  I'm pretty sure bad
> > things will eventually happen (if they are not already) if you return some
> > really big number to the sysfs core from *_show().
> I think this problem is different compared to what you had previously pointed
> to. The values returned from drc_pmem_query_stats() can be stored in an
> 'int' variable too, however it was the silent promotion of a signed type
> to unsigned type was what caused this specific issue.

Ok this makes more sense now.  Sorry about not looking more carefully.

But I still think matching up the return of seq_buf_used() is worth it.  I
don't particularly like depending on 'automatic' promotions which make
reviewing code harder like this.

And sorry if my email seemed harsh I did not mean it to be.  I just like when
types are more explicit because I feel like it can avoid issues like this.
(Specifically my confusion over the types...)

:-D

Thanks,
Ira


^ permalink raw reply

* Re: [PATCH v2] selftests/seccomp: fix ptrace tests on powerpc
From: Kees Cook @ 2020-09-11 22:55 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo
  Cc: linuxppc-dev, Shuah Khan, Oleg Nesterov, linux-kselftest
In-Reply-To: <20200911181012.171027-1-cascardo@canonical.com>

On Fri, Sep 11, 2020 at 03:10:12PM -0300, Thadeu Lima de Souza Cascardo wrote:
> As pointed out by Michael Ellerman, the ptrace ABI on powerpc does not
> allow or require the return code to be set on syscall entry when
> skipping the syscall. It will always return ENOSYS and the return code
> must be set on syscall exit.
> 
> This code does that, behaving more similarly to strace. It still sets
> the return code on entry, which is overridden on powerpc, and it will
> always repeat the same on exit. Also, on powerpc, the errno is not
> inverted, and depends on ccr.so being set.
> 
> This has been tested on powerpc and amd64.

This looks like two fixes in one, so this should be split. :)

>  tools/testing/selftests/seccomp/seccomp_bpf.c | 81 ++++++++++++-------
>  1 file changed, 53 insertions(+), 28 deletions(-)
> 
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 7a6d40286a42..0ddc0846e9c0 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -1837,15 +1837,24 @@ void change_syscall(struct __test_metadata *_metadata,
>  #endif
>  
>  	/* If syscall is skipped, change return value. */
> -	if (syscall == -1)
> +	if (syscall == -1) {
>  #ifdef SYSCALL_NUM_RET_SHARE_REG
>  		TH_LOG("Can't modify syscall return on this architecture");
> -
>  #elif defined(__xtensa__)
>  		regs.SYSCALL_RET(regs) = result;
> +#elif defined(__powerpc__)
> +		/* Error is signaled by CR0 SO bit and error code is positive. */
> +		if (result < 0) {
> +			regs.SYSCALL_RET = -result;
> +			regs.ccr |= 0x10000000;
> +		} else {
> +			regs.SYSCALL_RET = result;
> +			regs.ccr &= ~0x10000000;
> +		}
>  #else
>  		regs.SYSCALL_RET = result;
>  #endif
> +	}

I'll send a series soon that will include this bit, since I don't want
to collect these kinds of arch-specific things in the functions. (And
the xtensa one went in without my review!)

> +FIXTURE(TRACE_syscall) {
> +	struct sock_fprog prog;
> +	pid_t tracer, mytid, mypid, parent;
> +};
> +
> +FIXTURE_VARIANT(TRACE_syscall) {
> +	/*
> +	 * All of the SECCOMP_RET_TRACE behaviors can be tested with either
> +	 * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
> +	 * This indicates if we should use SECCOMP_RET_TRACE (false), or
> +	 * ptrace (true).
> +	 */
> +	bool use_ptrace;
> +
> +	/*
> +	 * Some archs (like ppc) only support changing the return code during
> +	 * syscall exit when ptrace is used.  As the syscall number might not
> +	 * be available anymore during syscall exit, it needs to be saved
> +	 * during syscall enter.
> +	 */
> +	int syscall_nr;

This should be part of the fixture struct, not the variant. 

> +};
> +
> +FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
> +	.use_ptrace = true,
> +};
> +
> +FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
> +	.use_ptrace = false,
> +};

i.e. if a member isn't initialized in FIXTURE_VARIANT_ADD, it shouldn't
be defined in FIXTURE_VARIANT. :)

> +
>  void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
>  		   int status, void *args)
>  {
>  	int ret, nr;
>  	unsigned long msg;
>  	static bool entry;
> +	FIXTURE_VARIANT(TRACE_syscall) * variant = args;
>  
>  	/*
>  	 * The traditional way to tell PTRACE_SYSCALL entry/exit
> @@ -1916,10 +1957,15 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
>  	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
>  			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
>  
> -	if (!entry)
> +	if (!entry && !variant)
>  		return;
>  
> -	nr = get_syscall(_metadata, tracee);
> +	if (entry)
> +		nr = get_syscall(_metadata, tracee);
> +	else if (variant)
> +		nr = variant->syscall_nr;
> +	if (variant)
> +		variant->syscall_nr = nr;

So, to be clear this is _only_ an issue for the ptrace side of things,
yes? i.e. seccomp's setting of the return value will correct stick?

-- 
Kees Cook

^ permalink raw reply

* [PATCH] mm/gup: fix gup_fast with dynamic page table folding
From: Vasily Gorbik @ 2020-09-11 19:03 UTC (permalink / raw)
  To: Jason Gunthorpe, John Hubbard, Linus Torvalds
  Cc: Peter Zijlstra, Dave Hansen, linux-mm, Paul Mackerras,
	linux-sparc, Alexander Gordeev, Claudio Imbrenda, Will Deacon,
	linux-arch, linux-s390, Richard Weinberger, linux-x86,
	Russell King, Christian Borntraeger, Ingo Molnar, Catalin Marinas,
	Andrey Ryabinin, Gerald Schaefer, Heiko Carstens, Arnd Bergmann,
	Jeff Dike, linux-um, Borislav Petkov, Andy Lutomirski,
	Thomas Gleixner, linux-arm, Dave Hansen, LKML, Andrew Morton,
	linux-power, Mike Rapoport
In-Reply-To: <20200911070939.GB1362448@hirez.programming.kicks-ass.net>

Currently to make sure that every page table entry is read just once
gup_fast walks perform READ_ONCE and pass pXd value down to the next
gup_pXd_range function by value e.g.:

static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
                         unsigned int flags, struct page **pages, int *nr)
...
        pudp = pud_offset(&p4d, addr);

This function passes a reference on that local value copy to pXd_offset,
and might get the very same pointer in return. This happens when the
level is folded (on most arches), and that pointer should not be iterated.

On s390 due to the fact that each task might have different 5,4 or
3-level address translation and hence different levels folded the logic
is more complex and non-iteratable pointer to a local copy leads to
severe problems.

Here is an example of what happens with gup_fast on s390, for a task
with 3-levels paging, crossing a 2 GB pud boundary:

// addr = 0x1007ffff000, end = 0x10080001000
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
                         unsigned int flags, struct page **pages, int *nr)
{
        unsigned long next;
        pud_t *pudp;

        // pud_offset returns &p4d itself (a pointer to a value on stack)
        pudp = pud_offset(&p4d, addr);
        do {
                // on second iteratation reading "random" stack value
                pud_t pud = READ_ONCE(*pudp);

                // next = 0x10080000000, due to PUD_SIZE/MASK != PGDIR_SIZE/MASK on s390
                next = pud_addr_end(addr, end);
                ...
        } while (pudp++, addr = next, addr != end); // pudp++ iterating over stack

        return 1;
}

This happens since s390 moved to common gup code with
commit d1874a0c2805 ("s390/mm: make the pxd_offset functions more robust")
and commit 1a42010cdc26 ("s390/mm: convert to the generic
get_user_pages_fast code"). s390 tried to mimic static level folding by
changing pXd_offset primitives to always calculate top level page table
offset in pgd_offset and just return the value passed when pXd_offset
has to act as folded.

What is crucial for gup_fast and what has been overlooked is
that PxD_SIZE/MASK and thus pXd_addr_end should also change
correspondingly. And the latter is not possible with dynamic folding.

To fix the issue in addition to pXd values pass original
pXdp pointers down to gup_pXd_range functions. And introduce
pXd_offset_lockless helpers, which take an additional pXd
entry value parameter. This has already been discussed in
https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1

Cc: <stable@vger.kernel.org> # 5.2+
Fixes: 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast code")
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/pgtable.h | 42 +++++++++++++++++++++++----------
 include/linux/pgtable.h         | 10 ++++++++
 mm/gup.c                        | 18 +++++++-------
 3 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7eb01a5459cd..b55561cc8786 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1260,26 +1260,44 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address)
 
 #define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address)
 
-static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address)
 {
-	if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
-		return (p4d_t *) pgd_deref(*pgd) + p4d_index(address);
-	return (p4d_t *) pgd;
+	if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
+		return (p4d_t *) pgd_deref(pgd) + p4d_index(address);
+	return (p4d_t *) pgdp;
 }
+#define p4d_offset_lockless p4d_offset_lockless
 
-static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address)
 {
-	if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
-		return (pud_t *) p4d_deref(*p4d) + pud_index(address);
-	return (pud_t *) p4d;
+	return p4d_offset_lockless(pgdp, *pgdp, address);
+}
+
+static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address)
+{
+	if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
+		return (pud_t *) p4d_deref(p4d) + pud_index(address);
+	return (pud_t *) p4dp;
+}
+#define pud_offset_lockless pud_offset_lockless
+
+static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address)
+{
+	return pud_offset_lockless(p4dp, *p4dp, address);
 }
 #define pud_offset pud_offset
 
-static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+static inline pmd_t *pmd_offset_lockless(pud_t *pudp, pud_t pud, unsigned long address)
+{
+	if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
+		return (pmd_t *) pud_deref(pud) + pmd_index(address);
+	return (pmd_t *) pudp;
+}
+#define pmd_offset_lockless pmd_offset_lockless
+
+static inline pmd_t *pmd_offset(pud_t *pudp, unsigned long address)
 {
-	if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
-		return (pmd_t *) pud_deref(*pud) + pmd_index(address);
-	return (pmd_t *) pud;
+	return pmd_offset_lockless(pudp, *pudp, address);
 }
 #define pmd_offset pmd_offset
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e8cbc2e795d5..e899d3506671 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1427,6 +1427,16 @@ typedef unsigned int pgtbl_mod_mask;
 #define mm_pmd_folded(mm)	__is_defined(__PAGETABLE_PMD_FOLDED)
 #endif
 
+#ifndef p4d_offset_lockless
+#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&pgd, address)
+#endif
+#ifndef pud_offset_lockless
+#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&p4d, address)
+#endif
+#ifndef pmd_offset_lockless
+#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&pud, address)
+#endif
+
 /*
  * p?d_leaf() - true if this entry is a final mapping to a physical address.
  * This differs from p?d_huge() by the fact that they are always available (if
diff --git a/mm/gup.c b/mm/gup.c
index e5739a1974d5..578bf5bd8bf8 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2485,13 +2485,13 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
 	return 1;
 }
 
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
 		unsigned int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pmd_t *pmdp;
 
-	pmdp = pmd_offset(&pud, addr);
+	pmdp = pmd_offset_lockless(pudp, pud, addr);
 	do {
 		pmd_t pmd = READ_ONCE(*pmdp);
 
@@ -2528,13 +2528,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 	return 1;
 }
 
-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
+static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
 			 unsigned int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pud_t *pudp;
 
-	pudp = pud_offset(&p4d, addr);
+	pudp = pud_offset_lockless(p4dp, p4d, addr);
 	do {
 		pud_t pud = READ_ONCE(*pudp);
 
@@ -2549,20 +2549,20 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
 			if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
 					 PUD_SHIFT, next, flags, pages, nr))
 				return 0;
-		} else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
+		} else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
 			return 0;
 	} while (pudp++, addr = next, addr != end);
 
 	return 1;
 }
 
-static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
+static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
 			 unsigned int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	p4d_t *p4dp;
 
-	p4dp = p4d_offset(&pgd, addr);
+	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
 	do {
 		p4d_t p4d = READ_ONCE(*p4dp);
 
@@ -2574,7 +2574,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
 			if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
 					 P4D_SHIFT, next, flags, pages, nr))
 				return 0;
-		} else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
+		} else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
 			return 0;
 	} while (p4dp++, addr = next, addr != end);
 
@@ -2602,7 +2602,7 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
 			if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
 					 PGDIR_SHIFT, next, flags, pages, nr))
 				return;
-		} else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
+		} else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
 			return;
 	} while (pgdp++, addr = next, addr != end);
 }
-- 
⣿⣿⣿⣿⢋⡀⣀⠹⣿⣿⣿⣿
⣿⣿⣿⣿⠠⣶⡦⠀⣿⣿⣿⣿
⣿⣿⣿⠏⣴⣮⣴⣧⠈⢿⣿⣿
⣿⣿⡏⢰⣿⠖⣠⣿⡆⠈⣿⣿
⣿⢛⣵⣄⠙⣶⣶⡟⣅⣠⠹⣿
⣿⣜⣛⠻⢎⣉⣉⣀⠿⣫⣵⣿

^ permalink raw reply related

* [PATCH v2] mm/gup: fix gup_fast with dynamic page table folding
From: Vasily Gorbik @ 2020-09-11 20:36 UTC (permalink / raw)
  To: Jason Gunthorpe, John Hubbard
  Cc: Peter Zijlstra, Dave Hansen, linux-mm, Paul Mackerras,
	linux-sparc, Alexander Gordeev, Claudio Imbrenda, Will Deacon,
	linux-arch, linux-s390, Richard Weinberger, linux-x86,
	Russell King, Christian Borntraeger, Ingo Molnar, Catalin Marinas,
	Andrey Ryabinin, Gerald Schaefer, Heiko Carstens, Arnd Bergmann,
	Jeff Dike, linux-um, Borislav Petkov, Andy Lutomirski,
	Thomas Gleixner, linux-arm, Dave Hansen, linux-power, LKML,
	Andrew Morton, Linus Torvalds, Mike Rapoport
In-Reply-To: <20200911200511.GC1221970@ziepe.ca>

Currently to make sure that every page table entry is read just once
gup_fast walks perform READ_ONCE and pass pXd value down to the next
gup_pXd_range function by value e.g.:

static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
                         unsigned int flags, struct page **pages, int *nr)
...
        pudp = pud_offset(&p4d, addr);

This function passes a reference on that local value copy to pXd_offset,
and might get the very same pointer in return. This happens when the
level is folded (on most arches), and that pointer should not be iterated.

On s390 due to the fact that each task might have different 5,4 or
3-level address translation and hence different levels folded the logic
is more complex and non-iteratable pointer to a local copy leads to
severe problems.

Here is an example of what happens with gup_fast on s390, for a task
with 3-levels paging, crossing a 2 GB pud boundary:

// addr = 0x1007ffff000, end = 0x10080001000
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
                         unsigned int flags, struct page **pages, int *nr)
{
        unsigned long next;
        pud_t *pudp;

        // pud_offset returns &p4d itself (a pointer to a value on stack)
        pudp = pud_offset(&p4d, addr);
        do {
                // on second iteratation reading "random" stack value
                pud_t pud = READ_ONCE(*pudp);

                // next = 0x10080000000, due to PUD_SIZE/MASK != PGDIR_SIZE/MASK on s390
                next = pud_addr_end(addr, end);
                ...
        } while (pudp++, addr = next, addr != end); // pudp++ iterating over stack

        return 1;
}

This happens since s390 moved to common gup code with
commit d1874a0c2805 ("s390/mm: make the pxd_offset functions more robust")
and commit 1a42010cdc26 ("s390/mm: convert to the generic
get_user_pages_fast code"). s390 tried to mimic static level folding by
changing pXd_offset primitives to always calculate top level page table
offset in pgd_offset and just return the value passed when pXd_offset
has to act as folded.

What is crucial for gup_fast and what has been overlooked is
that PxD_SIZE/MASK and thus pXd_addr_end should also change
correspondingly. And the latter is not possible with dynamic folding.

To fix the issue in addition to pXd values pass original
pXdp pointers down to gup_pXd_range functions. And introduce
pXd_offset_lockless helpers, which take an additional pXd
entry value parameter. This has already been discussed in
https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1

Cc: <stable@vger.kernel.org> # 5.2+
Fixes: 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast code")
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
v2: added brackets &pgd -> &(pgd)

 arch/s390/include/asm/pgtable.h | 42 +++++++++++++++++++++++----------
 include/linux/pgtable.h         | 10 ++++++++
 mm/gup.c                        | 18 +++++++-------
 3 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7eb01a5459cd..b55561cc8786 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1260,26 +1260,44 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address)
 
 #define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address)
 
-static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address)
 {
-	if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
-		return (p4d_t *) pgd_deref(*pgd) + p4d_index(address);
-	return (p4d_t *) pgd;
+	if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
+		return (p4d_t *) pgd_deref(pgd) + p4d_index(address);
+	return (p4d_t *) pgdp;
 }
+#define p4d_offset_lockless p4d_offset_lockless
 
-static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address)
 {
-	if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
-		return (pud_t *) p4d_deref(*p4d) + pud_index(address);
-	return (pud_t *) p4d;
+	return p4d_offset_lockless(pgdp, *pgdp, address);
+}
+
+static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address)
+{
+	if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
+		return (pud_t *) p4d_deref(p4d) + pud_index(address);
+	return (pud_t *) p4dp;
+}
+#define pud_offset_lockless pud_offset_lockless
+
+static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address)
+{
+	return pud_offset_lockless(p4dp, *p4dp, address);
 }
 #define pud_offset pud_offset
 
-static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+static inline pmd_t *pmd_offset_lockless(pud_t *pudp, pud_t pud, unsigned long address)
+{
+	if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
+		return (pmd_t *) pud_deref(pud) + pmd_index(address);
+	return (pmd_t *) pudp;
+}
+#define pmd_offset_lockless pmd_offset_lockless
+
+static inline pmd_t *pmd_offset(pud_t *pudp, unsigned long address)
 {
-	if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
-		return (pmd_t *) pud_deref(*pud) + pmd_index(address);
-	return (pmd_t *) pud;
+	return pmd_offset_lockless(pudp, *pudp, address);
 }
 #define pmd_offset pmd_offset
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e8cbc2e795d5..90654cb63e9e 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1427,6 +1427,16 @@ typedef unsigned int pgtbl_mod_mask;
 #define mm_pmd_folded(mm)	__is_defined(__PAGETABLE_PMD_FOLDED)
 #endif
 
+#ifndef p4d_offset_lockless
+#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
+#endif
+#ifndef pud_offset_lockless
+#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
+#endif
+#ifndef pmd_offset_lockless
+#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
+#endif
+
 /*
  * p?d_leaf() - true if this entry is a final mapping to a physical address.
  * This differs from p?d_huge() by the fact that they are always available (if
diff --git a/mm/gup.c b/mm/gup.c
index e5739a1974d5..578bf5bd8bf8 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2485,13 +2485,13 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
 	return 1;
 }
 
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
 		unsigned int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pmd_t *pmdp;
 
-	pmdp = pmd_offset(&pud, addr);
+	pmdp = pmd_offset_lockless(pudp, pud, addr);
 	do {
 		pmd_t pmd = READ_ONCE(*pmdp);
 
@@ -2528,13 +2528,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 	return 1;
 }
 
-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
+static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
 			 unsigned int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pud_t *pudp;
 
-	pudp = pud_offset(&p4d, addr);
+	pudp = pud_offset_lockless(p4dp, p4d, addr);
 	do {
 		pud_t pud = READ_ONCE(*pudp);
 
@@ -2549,20 +2549,20 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
 			if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
 					 PUD_SHIFT, next, flags, pages, nr))
 				return 0;
-		} else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
+		} else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
 			return 0;
 	} while (pudp++, addr = next, addr != end);
 
 	return 1;
 }
 
-static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
+static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
 			 unsigned int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	p4d_t *p4dp;
 
-	p4dp = p4d_offset(&pgd, addr);
+	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
 	do {
 		p4d_t p4d = READ_ONCE(*p4dp);
 
@@ -2574,7 +2574,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
 			if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
 					 P4D_SHIFT, next, flags, pages, nr))
 				return 0;
-		} else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
+		} else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
 			return 0;
 	} while (p4dp++, addr = next, addr != end);
 
@@ -2602,7 +2602,7 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
 			if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
 					 PGDIR_SHIFT, next, flags, pages, nr))
 				return;
-		} else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
+		} else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
 			return;
 	} while (pgdp++, addr = next, addr != end);
 }
-- 
⣿⣿⣿⣿⢋⡀⣀⠹⣿⣿⣿⣿
⣿⣿⣿⣿⠠⣶⡦⠀⣿⣿⣿⣿
⣿⣿⣿⠏⣴⣮⣴⣧⠈⢿⣿⣿
⣿⣿⡏⢰⣿⠖⣠⣿⡆⠈⣿⣿
⣿⢛⣵⣄⠙⣶⣶⡟⣅⣠⠹⣿
⣿⣜⣛⠻⢎⣉⣉⣀⠿⣫⣵⣿

^ permalink raw reply related

* [PATCH] serial: ucc_uart: make qe_uart_set_mctrl() static
From: Jason Yan @ 2020-09-12  3:38 UTC (permalink / raw)
  To: timur, gregkh, jirislaby, linux, leoyang.li, linuxppc-dev,
	linux-serial
  Cc: Hulk Robot, Jason Yan

This eliminates the following sparse warning:

drivers/tty/serial/ucc_uart.c:286:6: warning: symbol 'qe_uart_set_mctrl'
was not declared. Should it be static?

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Jason Yan <yanaijie@huawei.com>
---
 drivers/tty/serial/ucc_uart.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/tty/serial/ucc_uart.c b/drivers/tty/serial/ucc_uart.c
index 3c8c662c69e2..d6a8604157ab 100644
--- a/drivers/tty/serial/ucc_uart.c
+++ b/drivers/tty/serial/ucc_uart.c
@@ -283,7 +283,7 @@ static unsigned int qe_uart_tx_empty(struct uart_port *port)
  * don't need that support. This function must exist, however, otherwise
  * the kernel will panic.
  */
-void qe_uart_set_mctrl(struct uart_port *port, unsigned int mctrl)
+static void qe_uart_set_mctrl(struct uart_port *port, unsigned int mctrl)
 {
 }
 
-- 
2.25.4


^ permalink raw reply related

* Re: [PATCH v5 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling
From: Srikar Dronamraju @ 2020-09-12  4:46 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Michael Neuling, Peter Zijlstra,
	Jordan Niethe, LKML, Nicholas Piggin, Valentin Schneider,
	Oliver O'Halloran, linuxppc-dev, Ingo Molnar
In-Reply-To: <87y2lgr0ic.fsf@mpe.ellerman.id.au>

* Michael Ellerman <mpe@ellerman.id.au> [2020-09-11 21:55:23]:

> Srikar Dronamraju <srikar@linux.vnet.ibm.com> writes:
> > Current code assumes that cpumask of cpus sharing a l2-cache mask will
> > always be a superset of cpu_sibling_mask.
> >
> > Lets stop that assumption. cpu_l2_cache_mask is a superset of
> > cpu_sibling_mask if and only if shared_caches is set.
> 
> I'm seeing oopses with this:
> 
> [    0.117392][    T1] smp: Bringing up secondary CPUs ...
> [    0.156515][    T1] smp: Brought up 2 nodes, 2 CPUs
> [    0.158265][    T1] numa: Node 0 CPUs: 0
> [    0.158520][    T1] numa: Node 1 CPUs: 1
> [    0.167453][    T1] BUG: Unable to handle kernel data access on read at 0x8000000041228298
> [    0.167992][    T1] Faulting instruction address: 0xc00000000018c128
> [    0.168817][    T1] Oops: Kernel access of bad area, sig: 11 [#1]
> [    0.168964][    T1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
> [    0.169417][    T1] Modules linked in:
> [    0.170047][    T1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.9.0-rc2-00095-g7430ad5aa700 #209
> [    0.170305][    T1] NIP:  c00000000018c128 LR: c00000000018c0cc CTR: c00000000004dce0
> [    0.170498][    T1] REGS: c00000007e343880 TRAP: 0380   Not tainted  (5.9.0-rc2-00095-g7430ad5aa700)
> [    0.170602][    T1] MSR:  8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE>  CR: 44002222  XER: 00000000
> [    0.170985][    T1] CFAR: c00000000018c288 IRQMASK: 0
> [    0.170985][    T1] GPR00: 0000000000000000 c00000007e343b10 c00000000173e400 0000000000004000
> [    0.170985][    T1] GPR04: 0000000000000000 0000000000000800 0000000000000800 0000000000000000
> [    0.170985][    T1] GPR08: 0000000000000000 c00000000122c298 c00000003fffc000 c00000007fd05ce8
> [    0.170985][    T1] GPR12: c00000007e0119f8 c000000001930000 00000000ffff8ade 0000000000000000
> [    0.170985][    T1] GPR16: c00000007e3c0640 0000000000000917 c00000007e3c0658 0000000000000008
> [    0.170985][    T1] GPR20: c0000000015d0bb8 00000000ffff8ade c000000000f57400 c000000001817c28
> [    0.170985][    T1] GPR24: c00000000176dc80 c00000007e3c0890 c00000007e3cfe00 0000000000000000
> [    0.170985][    T1] GPR28: c000000001772310 c00000007e011900 c00000007e3c0800 0000000000000001
> [    0.172750][    T1] NIP [c00000000018c128] build_sched_domains+0x808/0x14b0
> [    0.172900][    T1] LR [c00000000018c0cc] build_sched_domains+0x7ac/0x14b0
> [    0.173186][    T1] Call Trace:
> [    0.173484][    T1] [c00000007e343b10] [c00000000018bfe8] build_sched_domains+0x6c8/0x14b0 (unreliable)
> [    0.173821][    T1] [c00000007e343c50] [c00000000018dcdc] sched_init_domains+0xec/0x130
> [    0.174037][    T1] [c00000007e343ca0] [c0000000010d59d8] sched_init_smp+0x50/0xc4
> [    0.174207][    T1] [c00000007e343cd0] [c0000000010b45c4] kernel_init_freeable+0x1b4/0x378
> [    0.174378][    T1] [c00000007e343db0] [c0000000000129fc] kernel_init+0x24/0x158
> [    0.174740][    T1] [c00000007e343e20] [c00000000000d9d0] ret_from_kernel_thread+0x5c/0x6c
> [    0.175050][    T1] Instruction dump:
> [    0.175626][    T1] 554905ee 71480040 7d2907b4 4182016c 2c290000 3920006e 913e002c 41820034
> [    0.175841][    T1] 7c6307b4 e9300020 78631f24 7d58182a <7d2a482a> f93e0080 7d404828 314a0001
> [    0.178340][    T1] ---[ end trace 6876b88dd1d4b3bb ]---
> [    0.178512][    T1]
> [    1.180458][    T1] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
> 
> That's qemu:
> 
> qemu-system-ppc64 -nographic -vga none -M pseries -cpu POWER8 \
>   -kernel build~/vmlinux \
>   -m 2G,slots=2,maxmem=4G \
>   -object memory-backend-ram,size=1G,id=m0 \
>   -object memory-backend-ram,size=1G,id=m1 \
>   -numa node,nodeid=0,memdev=m0 \
>   -numa node,nodeid=1,memdev=m1 \
>   -smp 2,sockets=2,maxcpus=2  \
> 

Thanks Michael for the report and also for identifying the patch and also
giving an easy reproducer. That made my task easy. (My only problem was all
my PowerKVM hosts had a old compiler that refuse to compile never kernels.)

So in this setup, CPU doesn't have a l2-cache. And in that scenario, we
miss updating the l2-cache domain. Actually the initial patch had this
exact code. However it was my mistake. I should have reassessed it before
making changes suggested by Gautham.

Patch below. Do let me know if you want me to send the patch separately.

> 
> On mambo I get:
> 
> [    0.005069][    T1] smp: Bringing up secondary CPUs ...
> [    0.011656][    T1] smp: Brought up 2 nodes, 8 CPUs
> [    0.011682][    T1] numa: Node 0 CPUs: 0-3
> [    0.011709][    T1] numa: Node 1 CPUs: 4-7
> [    0.012015][    T1] BUG: arch topology borken
> [    0.012040][    T1]      the SMT domain not a subset of the CACHE domain
> [    0.012107][    T1] BUG: Unable to handle kernel data access on read at 0x80000001012e7398
> [    0.012142][    T1] Faulting instruction address: 0xc0000000001aa4f0
> [    0.012174][    T1] Oops: Kernel access of bad area, sig: 11 [#1]
> [    0.012206][    T1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
> [    0.012236][    T1] Modules linked in:
> [    0.012264][    T1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.9.0-rc2-00095-g7430ad5aa700 #1
> [    0.012304][    T1] NIP:  c0000000001aa4f0 LR: c0000000001aa498 CTR: 0000000000000000
> [    0.012341][    T1] REGS: c0000000ef583880 TRAP: 0380   Not tainted  (5.9.0-rc2-00095-g7430ad5aa700)
> [    0.012379][    T1] MSR:  9000000002009033 <SF,HV,VEC,EE,ME,IR,DR,RI,LE>  CR: 44002222  XER: 00040000
> [    0.012439][    T1] CFAR: c0000000000101b0 IRQMASK: 0
> [    0.012439][    T1] GPR00: 0000000000000000 c0000000ef583b10 c0000000017fd000 0000000000004000
> [    0.012439][    T1] GPR04: 0000000000000000 0000000000000800 0000000000000000 0000000000000000
> [    0.012439][    T1] GPR08: 0000000000000000 c0000000012eb398 c0000000ffffc000 0000000000000000
> [    0.012439][    T1] GPR12: 0000000000000020 c0000000019f0000 00000000ffff8ad1 0000000000000000
> [    0.012439][    T1] GPR16: c0000000ef068658 c0000000018d7ba8 0000000000000008 c000000001690bb8
> [    0.012439][    T1] GPR20: c00000000182dc80 c0000000ef06be90 00000000ffff8ad1 c000000001014aa8
> [    0.012439][    T1] GPR24: 0000000000000917 c0000000ef068e00 0000000000000000 c0000000ef06be00
> [    0.012439][    T1] GPR28: 0000000000000001 c0000000ef068640 c0000000ef4a1800 c000000001832310
> [    0.012774][    T1] NIP [c0000000001aa4f0] build_sched_domains+0x5c0/0x14f0
> [    0.012812][    T1] LR [c0000000001aa498] build_sched_domains+0x568/0x14f0
> [    0.012842][    T1] Call Trace:
> [    0.012872][    T1] [c0000000ef583b10] [c0000000001aa3b4] build_sched_domains+0x484/0x14f0 (unreliable)
> [    0.012922][    T1] [c0000000ef583c50] [c0000000001ac3d8] sched_init_domains+0xd8/0x120
> [    0.012966][    T1] [c0000000ef583ca0] [c0000000011962d0] sched_init_smp+0x50/0xc4
> [    0.013008][    T1] [c0000000ef583cd0] [c00000000117451c] kernel_init_freeable+0x1b4/0x378
> [    0.013051][    T1] [c0000000ef583db0] [c000000000012994] kernel_init+0x2c/0x158
> [    0.013092][    T1] [c0000000ef583e20] [c00000000000d9d0] ret_from_kernel_thread+0x5c/0x6c
> [    0.013128][    T1] Instruction dump:
> [    0.013151][    T1] e93b003a 712a0040 552a05ee 418203c4 2c2a0000 3920006e 913b002c 41820034
> [    0.013206][    T1] 7c6307b4 e93d0020 78631f24 7d54182a <7d2a482a> f93b0080 7d404828 314a0001
> [    0.013267][    T1] ---[ end trace 1bf5f6f38a9fd096 ]---
> 

I haven't tried Mambo. But the problem report looks similar.

> 
> Did I miss a lead-up patch?
> 

No

-- 
Thanks and Regards
Srikar Dronamraju

------------------------->8--------------------------------------------8<------------

From b25d47b01b7195b1df19083a4043fa6a87a901a3 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Thu, 9 Jul 2020 13:33:38 +0530
Subject: [PATCH v5.1 05/10] powerpc/smp: Dont assume l2-cache to be superset of
 sibling

Current code assumes that cpumask of cpus sharing a l2-cache mask will
always be a superset of cpu_sibling_mask.

Lets stop that assumption. cpu_l2_cache_mask is a superset of
cpu_sibling_mask if and only if shared_caches is set.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: LKML <linux-kernel@vger.kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Anton Blanchard <anton@ozlabs.org>
Cc: Oliver O'Halloran <oohall@gmail.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Jordan Niethe <jniethe5@gmail.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
Changelog v1 -> v2:
	Set cpumask after verifying l2-cache. (Gautham)

Changelog v5 -> v5.1:
	Set cpumask before verifying l2-cache. (Michael Ellerman)

 arch/powerpc/kernel/smp.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 9f4333d..a87afdf 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1185,6 +1185,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
 	struct device_node *l2_cache, *np;
 	int i;
 
+	cpumask_set_cpu(cpu, mask_fn(cpu));
 	l2_cache = cpu_to_l2cache(cpu);
 	if (!l2_cache)
 		return false;
@@ -1271,29 +1272,30 @@ static void add_cpu_to_masks(int cpu)
 	 * add it to it's own thread sibling mask.
 	 */
 	cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+	cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 
 	for (i = first_thread; i < first_thread + threads_per_core; i++)
 		if (cpu_online(i))
 			set_cpus_related(i, cpu, cpu_sibling_mask);
 
 	add_cpu_to_smallcore_masks(cpu);
-	/*
-	 * Copy the thread sibling mask into the cache sibling mask
-	 * and mark any CPUs that share an L2 with this CPU.
-	 */
-	for_each_cpu(i, cpu_sibling_mask(cpu))
-		set_cpus_related(cpu, i, cpu_l2_cache_mask);
 	update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
-	/*
-	 * Copy the cache sibling mask into core sibling mask and mark
-	 * any CPUs on the same chip as this CPU.
-	 */
-	for_each_cpu(i, cpu_l2_cache_mask(cpu))
-		set_cpus_related(cpu, i, cpu_core_mask);
+	if (pkg_id == -1) {
+		struct cpumask *(*mask)(int) = cpu_sibling_mask;
+
+		/*
+		 * Copy the sibling mask into core sibling mask and
+		 * mark any CPUs on the same chip as this CPU.
+		 */
+		if (shared_caches)
+			mask = cpu_l2_cache_mask;
+
+		for_each_cpu(i, mask(cpu))
+			set_cpus_related(cpu, i, cpu_core_mask);
 
-	if (pkg_id == -1)
 		return;
+	}
 
 	for_each_cpu(i, cpu_online_mask)
 		if (get_physical_package_id(i) == pkg_id)
-- 
2.17.1


^ permalink raw reply related

* [PATCH v2] powerpc/papr_scm: Fix warning triggered by perf_stats_show()
From: Vaibhav Jain @ 2020-09-12  8:14 UTC (permalink / raw)
  To: linuxppc-dev, linux-nvdimm
  Cc: Santosh Sivaraj, Oliver O'Halloran, Aneesh Kumar K . V,
	Vaibhav Jain, Dan Williams, Ira Weiny

A warning is reported by the kernel in case perf_stats_show() returns
an error code. The warning is of the form below:

 papr_scm ibm,persistent-memory:ibm,pmemory@44100001:
 	  Failed to query performance stats, Err:-10
 dev_attr_show: perf_stats_show+0x0/0x1c0 [papr_scm] returned bad count
 fill_read_buffer: dev_attr_show+0x0/0xb0 returned bad count

On investigation it looks like that the compiler is silently truncating the
return value of drc_pmem_query_stats() from 'long' to 'int', since the
variable used to store the return code 'rc' is an 'int'. This
truncated value is then returned back as a 'ssize_t' back from
perf_stats_show() to 'dev_attr_show()' which thinks of it as a large
unsigned number and triggers this warning..

To fix this we update the type of variable 'rc' from 'int' to
'ssize_t' that prevents the compiler from truncating the return value
of drc_pmem_query_stats() and returning correct signed value back from
perf_stats_show().

Fixes: 2d02bf835e573 ('powerpc/papr_scm: Fetch nvdimm performance
       stats from PHYP')
Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
---
Changelog:

v2: Added an explicit cast to the expression calling 'seq_buf_used()'
    and triggering this issue. [ Ira ]
---
 arch/powerpc/platforms/pseries/papr_scm.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index a88a707a608aa..5493bc847bd08 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -785,7 +785,8 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
 static ssize_t perf_stats_show(struct device *dev,
 			       struct device_attribute *attr, char *buf)
 {
-	int index, rc;
+	int index;
+	ssize_t rc;
 	struct seq_buf s;
 	struct papr_scm_perf_stat *stat;
 	struct papr_scm_perf_stats *stats;
@@ -820,7 +821,7 @@ static ssize_t perf_stats_show(struct device *dev,
 
 free_stats:
 	kfree(stats);
-	return rc ? rc : seq_buf_used(&s);
+	return rc ? rc : (ssize_t)seq_buf_used(&s);
 }
 DEVICE_ATTR_ADMIN_RO(perf_stats);
 
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH] powerpc/papr_scm: Fix warning triggered by perf_stats_show()
From: Vaibhav Jain @ 2020-09-12  8:27 UTC (permalink / raw)
  To: Ira Weiny
  Cc: Santosh Sivaraj, linux-nvdimm, Aneesh Kumar K . V,
	Oliver O'Halloran, Dan Williams, linuxppc-dev
In-Reply-To: <20200911221308.GP1930795@iweiny-DESK2.sc.intel.com>

Ira Weiny <ira.weiny@intel.com> writes:

> On Sat, Sep 12, 2020 at 01:36:46AM +0530, Vaibhav Jain wrote:
>> Thanks for reviewing this patch Ira,
>> 
>> 
>> Ira Weiny <ira.weiny@intel.com> writes:
>> 
>> > On Thu, Sep 10, 2020 at 02:52:12PM +0530, Vaibhav Jain wrote:
>> >> A warning is reported by the kernel in case perf_stats_show() returns
>> >> an error code. The warning is of the form below:
>> >> 
>> >>  papr_scm ibm,persistent-memory:ibm,pmemory@44100001:
>> >>  	  Failed to query performance stats, Err:-10
>> >>  dev_attr_show: perf_stats_show+0x0/0x1c0 [papr_scm] returned bad count
>> >>  fill_read_buffer: dev_attr_show+0x0/0xb0 returned bad count
>> >> 
>> >> On investigation it looks like that the compiler is silently truncating the
>> >> return value of drc_pmem_query_stats() from 'long' to 'int', since the
>> >> variable used to store the return code 'rc' is an 'int'. This
>> >> truncated value is then returned back as a 'ssize_t' back from
>> >> perf_stats_show() to 'dev_attr_show()' which thinks of it as a large
>> >> unsigned number and triggers this warning..
>> >> 
>> >> To fix this we update the type of variable 'rc' from 'int' to
>> >> 'ssize_t' that prevents the compiler from truncating the return value
>> >> of drc_pmem_query_stats() and returning correct signed value back from
>> >> perf_stats_show().
>> >> 
>> >> Fixes: 2d02bf835e573 ('powerpc/papr_scm: Fetch nvdimm performance
>> >>        stats from PHYP')
>> >> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
>> >> ---
>> >>  arch/powerpc/platforms/pseries/papr_scm.c | 3 ++-
>> >>  1 file changed, 2 insertions(+), 1 deletion(-)
>> >> 
>> >> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
>> >> index a88a707a608aa..9f00b61676ab9 100644
>> >> --- a/arch/powerpc/platforms/pseries/papr_scm.c
>> >> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
>> >> @@ -785,7 +785,8 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
>> >>  static ssize_t perf_stats_show(struct device *dev,
>> >>  			       struct device_attribute *attr, char *buf)
>> >>  {
>> >> -	int index, rc;
>> >> +	int index;
>> >> +	ssize_t rc;
>> >
>> > I'm not sure this is really fixing everything here.
>> 
>> The issue is with the statement in perf_stats_show():
>> 
>> 'return rc ? rc : seq_buf_used(&s);'
>> 
>> The function seq_buf_used() returns an 'unsigned int' and with 'rc'
>> typed as 'int', forces a promotion of the expression to 'unsigned int'
>> which causes a loss of signedness of 'rc' and compiler silently
>> assigns this unsigned value to the function return typed as 'signed
>> long'.
>> 
>> Making 'rc', a 'signed long' forces a promotion of the expresion to
>> 'signed long' which preserves the signedness of 'rc' and will also be
>> compatible with the function return type.
>
> Ok, ok, I read this all wrong.
>
> FWIW I would also cast seq_buf_used() to ssize_t to show you know what you are
> doing there.
>
>> 
>> >
>> > drc_pmem_query_stats() can return negative errno's.  Why are those not checked
>> > somewhere in perf_stats_show()?
>> >
>> For the specific invocation 'drc_pmem_query_stats(p, stats, 0)' we only
>> expect return value 'rc <=0' with '0' indicating a successful fetch of
>> nvdimm performance stats from hypervisor. Hence there are no explicit
>> checks for negative error codes in the functions as all return values
>> !=0 indicate an error.
>> 
>> 
>> > It seems like all this fix is handling is a > 0 return value: 'ret[0]' from
>> > line 289 in papr_scm.c...  Or something?
>> No, in case the arg 'num_stats' is '0' and 'buff_stats != NULL' the
>> variable 'size' is assigned a non-zero value hence that specific branch
>> you mentioned  is never taken. Instead in case of success
>> drc_pmem_query_stats() return '0' and in case of an error a negative
>> error code is returned.
>> 
>> >
>> > Worse yet drc_pmem_query_stats() is returning ssize_t which is a signed value.
>> > Therefore, it should not be returning -errno.  I'm surprised the static
>> > checkers did not catch that.
>> Didnt quite get the assertion here. The function is marked to return
>> 'ssize_t' because we can return both +ve for success and -ve values to
>> indicate errors.
>
> Sorry I was reading this as size_t and meant to say unsigned...  I was looking
> at this too quickly.
>
>> 
>> >
>> > I believe I caught similar errors with a patch series before which did not pay
>> > attention to variable types.
>> >
>> > Please audit this code for these types of errors and ensure you are really
>> > doing the correct thing when using the sysfs interface.  I'm pretty sure bad
>> > things will eventually happen (if they are not already) if you return some
>> > really big number to the sysfs core from *_show().
>> I think this problem is different compared to what you had previously pointed
>> to. The values returned from drc_pmem_query_stats() can be stored in an
>> 'int' variable too, however it was the silent promotion of a signed type
>> to unsigned type was what caused this specific issue.
>
> Ok this makes more sense now.  Sorry about not looking more carefully.
>
> But I still think matching up the return of seq_buf_used() is worth it.  I
> don't particularly like depending on 'automatic' promotions which make
> reviewing code harder like this.
Agree, I have sent out a v2 addressing this.

>
> And sorry if my email seemed harsh I did not mean it to be.  I just like when
> types are more explicit because I feel like it can avoid issues like this.
> (Specifically my confusion over the types...)
>
> :-D
No worries :-)


Thanks,
-- 
Cheers
~ Vaibhav

^ permalink raw reply

* [PATCH 03/15] selftests/seccomp: mips: Define SYSCALL_NUM_SET macro
From: Kees Cook @ 2020-09-12 11:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: Thadeu Lima de Souza Cascardo, Will Drewry, Kees Cook,
	linux-xtensa, linux-mips, Andy Lutomirski, Max Filippov,
	linux-arm-kernel, linux-kselftest, linuxppc-dev,
	Christian Brauner
In-Reply-To: <20200912110820.597135-1-keescook@chromium.org>

Remove the mips special-case in change_syscall().

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 1c83e743bfb1..02a9a6599746 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -1742,6 +1742,13 @@ TEST_F(TRACE_poke, getpid_runs_normally)
 # define ARCH_REGS		struct pt_regs
 # define SYSCALL_NUM(_regs)	(_regs).regs[2]
 # define SYSCALL_SYSCALL_NUM	regs[4]
+# define SYSCALL_NUM_SET(_regs, _nr)			\
+	do {						\
+		if ((_regs).regs[2] == __NR_O32_Linux)	\
+			(_regs).regs[4] = _nr;		\
+		else					\
+			(_regs).regs[2] = _nr;		\
+	} while (0)
 # define SYSCALL_RET(_regs)	(_regs).regs[2]
 # define SYSCALL_NUM_RET_SHARE_REG
 #elif defined(__xtensa__)
@@ -1839,17 +1846,11 @@ void change_syscall(struct __test_metadata *_metadata,
 
 #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
 	defined(__s390__) || defined(__hppa__) || defined(__riscv) || \
-	defined(__xtensa__) || defined(__csky__) || defined(__sh__)
+	defined(__xtensa__) || defined(__csky__) || defined(__sh__) || \
+	defined(__mips__)
 	{
 		SYSCALL_NUM_SET(regs, syscall);
 	}
-#elif defined(__mips__)
-	{
-		if (SYSCALL_NUM(regs) == __NR_O32_Linux)
-			regs.SYSCALL_SYSCALL_NUM = syscall;
-		else
-			SYSCALL_NUM_SET(regs, syscall);
-	}
 
 #elif defined(__arm__)
 # ifndef PTRACE_SET_SYSCALL
-- 
2.25.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox