LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v3 2/6] powerpc/pseries/iommu: Update call to ibm, query-pe-dma-windows
From: Leonardo Bras @ 2020-07-03  6:18 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Leonardo Bras, Thiago Jung Bauermann,
	Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200703061844.111865-1-leobras.c@gmail.com>

From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.

This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.

This ends up requiring the update of
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.

Also, a routine was created for helping reading the ddw extensions as
suggested by LoPAR: First reading the size of the extension array from
index 0, checking if the property exists, and then returning it's value.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 91 +++++++++++++++++++++++---
 1 file changed, 81 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index ac0d6376bdad..1a933c4e8bba 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -47,6 +47,12 @@ enum {
 	DDW_APPLICABLE_SIZE
 };
 
+enum {
+	DDW_EXT_SIZE = 0,
+	DDW_EXT_RESET_DMA_WIN = 1,
+	DDW_EXT_QUERY_OUT_SIZE = 2
+};
+
 static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
 	struct iommu_table_group *table_group;
@@ -342,7 +348,7 @@ struct direct_window {
 /* Dynamic DMA Window support */
 struct ddw_query_response {
 	u32 windows_available;
-	u32 largest_available_block;
+	u64 largest_available_block;
 	u32 page_size;
 	u32 migration_capable;
 };
@@ -877,14 +883,62 @@ static int find_existing_ddw_windows(void)
 }
 machine_arch_initcall(pseries, find_existing_ddw_windows);
 
+/**
+ * ddw_read_ext - Get the value of an DDW extension
+ * @np:		device node from which the extension value is to be read.
+ * @extnum:	index number of the extension.
+ * @value:	pointer to return value, modified when extension is available.
+ *
+ * Checks if "ibm,ddw-extensions" exists for this node, and get the value
+ * on index 'extnum'.
+ * It can be used only to check if a property exists, passing value == NULL.
+ *
+ * Returns:
+ *	0 if extension successfully read
+ *	-EINVAL if the "ibm,ddw-extensions" does not exist,
+ *	-ENODATA if "ibm,ddw-extensions" does not have a value, and
+ *	-EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.
+ */
+static inline int ddw_read_ext(const struct device_node *np, int extnum,
+			       u32 *value)
+{
+	static const char propname[] = "ibm,ddw-extensions";
+	u32 count;
+	int ret;
+
+	ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);
+	if (ret)
+		return ret;
+
+	if (count < extnum)
+		return -EOVERFLOW;
+
+	if (!value)
+		value = &count;
+
+	return of_property_read_u32_index(np, propname, extnum, value);
+}
+
 static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
-			struct ddw_query_response *query)
+		     struct ddw_query_response *query,
+		     struct device_node *parent)
 {
 	struct device_node *dn;
 	struct pci_dn *pdn;
-	u32 cfg_addr;
+	u32 cfg_addr, ext_query, query_out[5];
 	u64 buid;
-	int ret;
+	int ret, out_sz;
+
+	/*
+	 * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many
+	 * output parameters ibm,query-pe-dma-windows will have, ranging from
+	 * 5 to 6.
+	 */
+	ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
+	if (!ret && ext_query == 1)
+		out_sz = 6;
+	else
+		out_sz = 5;
 
 	/*
 	 * Get the config address and phb buid of the PE window.
@@ -897,11 +951,28 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
 	buid = pdn->phb->buid;
 	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
 
-	ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, 5, (u32 *)query,
+	ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
 			cfg_addr, BUID_HI(buid), BUID_LO(buid));
-	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
-		" returned %d\n", ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr,
-		 BUID_HI(buid), BUID_LO(buid), ret);
+	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d\n",
+		 ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
+		 BUID_LO(buid), ret);
+
+	switch (out_sz) {
+	case 5:
+		query->windows_available = query_out[0];
+		query->largest_available_block = query_out[1];
+		query->page_size = query_out[2];
+		query->migration_capable = query_out[3];
+		break;
+	case 6:
+		query->windows_available = query_out[0];
+		query->largest_available_block = ((u64)query_out[1] << 32) |
+						 query_out[2];
+		query->page_size = query_out[3];
+		query->migration_capable = query_out[4];
+		break;
+	}
+
 	return ret;
 }
 
@@ -1049,7 +1120,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	 * of page sizes: supported and supported for migrate-dma.
 	 */
 	dn = pci_device_to_OF_node(dev);
-	ret = query_ddw(dev, ddw_avail, &query);
+	ret = query_ddw(dev, ddw_avail, &query, pdn);
 	if (ret != 0)
 		goto out_failed;
 
@@ -1077,7 +1148,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	/* check largest block * page size > max memory hotplug addr */
 	max_addr = ddw_memory_hotplug_max();
 	if (query.largest_available_block < (max_addr >> page_shift)) {
-		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
+		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
 			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
 			  1ULL << page_shift);
 		goto out_failed;
-- 
2.25.4


^ permalink raw reply related

* [PATCH v3 3/6] powerpc/pseries/iommu: Move window-removing part of remove_ddw into remove_dma_window
From: Leonardo Bras @ 2020-07-03  6:18 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Leonardo Bras, Thiago Jung Bauermann,
	Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200703061844.111865-1-leobras.c@gmail.com>

Move the window-removing part of remove_ddw into a new function
(remove_dma_window), so it can be used to remove other DMA windows.

It's useful for removing DMA windows that don't create DIRECT64_PROPNAME
property, like the default DMA window from the device, which uses
"ibm,dma-window".

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 45 +++++++++++++++-----------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 1a933c4e8bba..4e33147825cc 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -781,25 +781,14 @@ static int __init disable_ddw_setup(char *str)
 
 early_param("disable_ddw", disable_ddw_setup);
 
-static void remove_ddw(struct device_node *np, bool remove_prop)
+static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
+			      struct property *win)
 {
 	struct dynamic_dma_window_prop *dwp;
-	struct property *win64;
-	u32 ddw_avail[DDW_APPLICABLE_SIZE];
 	u64 liobn;
-	int ret = 0;
-
-	ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
-					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
-
-	win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
-	if (!win64)
-		return;
-
-	if (ret || win64->length < sizeof(*dwp))
-		goto delprop;
+	int ret;
 
-	dwp = win64->value;
+	dwp = win->value;
 	liobn = (u64)be32_to_cpu(dwp->liobn);
 
 	/* clear the whole window, note the arg is in kernel pages */
@@ -821,10 +810,30 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
 		pr_debug("%pOF: successfully removed direct window: rtas returned "
 			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
 			np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
+}
+
+static void remove_ddw(struct device_node *np, bool remove_prop)
+{
+	struct property *win;
+	u32 ddw_avail[DDW_APPLICABLE_SIZE];
+	int ret = 0;
+
+	ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
+					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
+	if (ret)
+		return;
+
+	win = of_find_property(np, DIRECT64_PROPNAME, NULL);
+	if (!win)
+		return;
+
+	if (win->length >= sizeof(struct dynamic_dma_window_prop))
+		remove_dma_window(np, ddw_avail, win);
+
+	if (!remove_prop)
+		return;
 
-delprop:
-	if (remove_prop)
-		ret = of_remove_property(np, win64);
+	ret = of_remove_property(np, win);
 	if (ret)
 		pr_warn("%pOF: failed to remove direct window property: %d\n",
 			np, ret);
-- 
2.25.4


^ permalink raw reply related

* [PATCH v3 4/6] powerpc/pseries/iommu: Remove default DMA window before creating DDW
From: Leonardo Bras @ 2020-07-03  6:18 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Leonardo Bras, Thiago Jung Bauermann,
	Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200703061844.111865-1-leobras.c@gmail.com>

On LoPAR "DMA Window Manipulation Calls", it's recommended to remove the
default DMA window for the device, before attempting to configure a DDW,
in order to make the maximum resources available for the next DDW to be
created.

This is a requirement for using DDW on devices in which hypervisor
allows only one DMA window.

If setting up a new DDW fails anywhere after the removal of this
default DMA window, it's needed to restore the default DMA window.
For this, an implementation of ibm,reset-pe-dma-windows rtas call is
needed:

Platforms supporting the DDW option starting with LoPAR level 2.7 implement
ibm,ddw-extensions. The first extension available (index 2) carries the
token for ibm,reset-pe-dma-windows rtas call, which is used to restore
the default DMA window for a device, if it has been deleted.

It does so by resetting the TCE table allocation for the PE to it's
boot time value, available in "ibm,dma-window" device tree node.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 83 +++++++++++++++++++++-----
 1 file changed, 69 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 4e33147825cc..5b520ac354c6 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1066,6 +1066,38 @@ static phys_addr_t ddw_memory_hotplug_max(void)
 	return max_addr;
 }
 
+/*
+ * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
+ * ibm,ddw-extensions, which carries the rtas token for
+ * ibm,reset-pe-dma-windows.
+ * That rtas-call can be used to restore the default DMA window for the device.
+ */
+static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
+{
+	int ret;
+	u32 cfg_addr, reset_dma_win;
+	u64 buid;
+	struct device_node *dn;
+	struct pci_dn *pdn;
+
+	ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
+	if (ret)
+		return;
+
+	dn = pci_device_to_OF_node(dev);
+	pdn = PCI_DN(dn);
+	buid = pdn->phb->buid;
+	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
+
+	ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
+			BUID_LO(buid));
+	if (ret)
+		dev_info(&dev->dev,
+			 "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
+			 reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
+			 ret);
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1079,7 +1111,7 @@ static phys_addr_t ddw_memory_hotplug_max(void)
  */
 static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
-	int len, ret;
+	int len, ret, reset_win_ext;
 	struct ddw_query_response query;
 	struct ddw_create_response create;
 	int page_shift;
@@ -1087,7 +1119,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	struct device_node *dn;
 	u32 ddw_avail[DDW_APPLICABLE_SIZE];
 	struct direct_window *window;
-	struct property *win64;
+	struct property *win64, *default_win = NULL;
 	struct dynamic_dma_window_prop *ddwprop;
 	struct failed_ddw_pdn *fpdn;
 
@@ -1122,7 +1154,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	if (ret)
 		goto out_failed;
 
-       /*
+	/*
 	 * Query if there is a second window of size to map the
 	 * whole partition.  Query returns number of windows, largest
 	 * block assigned to PE (partition endpoint), and two bitmasks
@@ -1133,14 +1165,34 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	if (ret != 0)
 		goto out_failed;
 
+	/*
+	 * If there is no window available, remove the default DMA window,
+	 * if it's present. This will make all the resources available to the
+	 * new DDW window.
+	 * If anything fails after this, we need to restore it, so also check
+	 * for extensions presence.
+	 */
 	if (query.windows_available == 0) {
-		/*
-		 * no additional windows are available for this device.
-		 * We might be able to reallocate the existing window,
-		 * trading in for a larger page size.
-		 */
-		dev_dbg(&dev->dev, "no free dynamic windows");
-		goto out_failed;
+		default_win = of_find_property(pdn, "ibm,dma-window", NULL);
+		if (!default_win)
+			goto out_failed;
+
+		reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
+		if (reset_win_ext)
+			goto out_failed;
+
+		remove_dma_window(pdn, ddw_avail, default_win);
+
+		/* Query again, to check if the window is available */
+		ret = query_ddw(dev, ddw_avail, &query, pdn);
+		if (ret != 0)
+			goto out_restore_defwin;
+
+		if (query.windows_available == 0) {
+			/* no windows are available for this device. */
+			dev_dbg(&dev->dev, "no free dynamic windows");
+			goto out_restore_defwin;
+		}
 	}
 	if (query.page_size & 4) {
 		page_shift = 24; /* 16MB */
@@ -1151,7 +1203,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	} else {
 		dev_dbg(&dev->dev, "no supported direct page size in mask %x",
 			  query.page_size);
-		goto out_failed;
+		goto out_restore_defwin;
 	}
 	/* verify the window * number of ptes will map the partition */
 	/* check largest block * page size > max memory hotplug addr */
@@ -1160,14 +1212,14 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
 			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
 			  1ULL << page_shift);
-		goto out_failed;
+		goto out_restore_defwin;
 	}
 	len = order_base_2(max_addr);
 	win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
 	if (!win64) {
 		dev_info(&dev->dev,
 			"couldn't allocate property for 64bit dma window\n");
-		goto out_failed;
+		goto out_restore_defwin;
 	}
 	win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
 	win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
@@ -1230,8 +1282,11 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	kfree(win64->value);
 	kfree(win64);
 
-out_failed:
+out_restore_defwin:
+	if (default_win && reset_win_ext == 0)
+		reset_dma_window(dev, pdn);
 
+out_failed:
 	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
 	if (!fpdn)
 		goto out_unlock;
-- 
2.25.4


^ permalink raw reply related

* [PATCH v3 5/6] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition
From: Leonardo Bras @ 2020-07-03  6:18 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Leonardo Bras, Thiago Jung Bauermann,
	Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200703061844.111865-1-leobras.c@gmail.com>

As of today, if the biggest DDW that can be created can't map the whole
partition, it's creation is skipped and the default DMA window
"ibm,dma-window" is used instead.

Usually this DDW is bigger than the default DMA window, and it performs
better, so it would be nice to use it instead.

The ddw created will be used for direct mapping by default.
If it's not available, indirect mapping sill be used instead.

As there will never have both mappings at the same time, the same property
name can be used for the created DDW.

So renaming
define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
to
define DMA64_PROPNAME "linux,dma64-ddr-window-info"
looks the right thing to do.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 38 ++++++++++++++++----------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 5b520ac354c6..c652177de09c 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -364,7 +364,7 @@ static LIST_HEAD(direct_window_list);
 static DEFINE_SPINLOCK(direct_window_list_lock);
 /* protects initializing window twice for same device */
 static DEFINE_MUTEX(direct_window_init_mutex);
-#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
 					unsigned long num_pfn, const void *arg)
@@ -690,7 +690,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 	struct iommu_table *tbl;
 	struct device_node *dn, *pdn;
 	struct pci_dn *ppci;
-	const __be32 *dma_window = NULL;
+	const __be32 *dma_window = NULL, *alt_dma_window = NULL;
 
 	dn = pci_bus_to_OF_node(bus);
 
@@ -704,8 +704,13 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 			break;
 	}
 
+	/* If there is a DDW available, use it instead */
+	alt_dma_window = of_get_property(pdn, DMA64_PROPNAME, NULL);
+	if (alt_dma_window)
+		dma_window = alt_dma_window;
+
 	if (dma_window == NULL) {
-		pr_debug("  no ibm,dma-window property !\n");
+		pr_debug("  no ibm,dma-window nor linux,dma64-ddr-window-info property !\n");
 		return;
 	}
 
@@ -823,7 +828,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
 	if (ret)
 		return;
 
-	win = of_find_property(np, DIRECT64_PROPNAME, NULL);
+	win = of_find_property(np, DMA64_PROPNAME, NULL);
 	if (!win)
 		return;
 
@@ -869,8 +874,8 @@ static int find_existing_ddw_windows(void)
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
 		return 0;
 
-	for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
-		direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
+	for_each_node_with_property(pdn, DMA64_PROPNAME) {
+		direct64 = of_get_property(pdn, DMA64_PROPNAME, &len);
 		if (!direct64)
 			continue;
 
@@ -1205,23 +1210,26 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 			  query.page_size);
 		goto out_restore_defwin;
 	}
+
 	/* verify the window * number of ptes will map the partition */
-	/* check largest block * page size > max memory hotplug addr */
 	max_addr = ddw_memory_hotplug_max();
 	if (query.largest_available_block < (max_addr >> page_shift)) {
-		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
-			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
-			  1ULL << page_shift);
-		goto out_restore_defwin;
+		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu %llu-sized pages\n",
+			max_addr, query.largest_available_block,
+			1ULL << page_shift);
+
+		len = order_base_2(query.largest_available_block << page_shift);
+	} else {
+		len = order_base_2(max_addr);
 	}
-	len = order_base_2(max_addr);
+
 	win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
 	if (!win64) {
 		dev_info(&dev->dev,
 			"couldn't allocate property for 64bit dma window\n");
 		goto out_restore_defwin;
 	}
-	win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
+	win64->name = kstrdup(DMA64_PROPNAME, GFP_KERNEL);
 	win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
 	win64->length = sizeof(*ddwprop);
 	if (!win64->name || !win64->value) {
@@ -1268,7 +1276,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	list_add(&window->list, &direct_window_list);
 	spin_unlock(&direct_window_list_lock);
 
-	dma_addr = be64_to_cpu(ddwprop->dma_base);
+	/* Only returns the dma_addr if DDW maps the whole partition */
+	if (len == order_base_2(max_addr))
+		dma_addr = be64_to_cpu(ddwprop->dma_base);
 	goto out_unlock;
 
 out_free_window:
-- 
2.25.4


^ permalink raw reply related

* [PATCH v3 6/6] powerpc/pseries/iommu: Rename "direct window" to "dma window"
From: Leonardo Bras @ 2020-07-03  6:18 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Leonardo Bras, Thiago Jung Bauermann,
	Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200703061844.111865-1-leobras.c@gmail.com>

A previous change introduced the usage of DDW as a bigger indirect DMA
mapping when the DDW available size does not map the whole partition.

As most of the code that manipulates direct mappings was reused for
indirect mappings, it's necessary to rename all names and debug/info
messages to reflect that it can be used for both kinds of mapping.

Also, defines DEFAULT_DMA_WIN as "ibm,dma-window" to document that
it's the name of the default DMA window.

Those changes are not supposed to change how the code works in any
way, just adjust naming.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 101 +++++++++++++------------
 1 file changed, 53 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index c652177de09c..070b80efc43a 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -339,7 +339,7 @@ struct dynamic_dma_window_prop {
 	__be32	window_shift;	/* ilog2(tce_window_size) */
 };
 
-struct direct_window {
+struct dma_win {
 	struct device_node *device;
 	const struct dynamic_dma_window_prop *prop;
 	struct list_head list;
@@ -359,12 +359,13 @@ struct ddw_create_response {
 	u32 addr_lo;
 };
 
-static LIST_HEAD(direct_window_list);
+static LIST_HEAD(dma_win_list);
 /* prevents races between memory on/offline and window creation */
-static DEFINE_SPINLOCK(direct_window_list_lock);
+static DEFINE_SPINLOCK(dma_win_list_lock);
 /* protects initializing window twice for same device */
-static DEFINE_MUTEX(direct_window_init_mutex);
+static DEFINE_MUTEX(dma_win_init_mutex);
 #define DMA64_PROPNAME "linux,dma64-ddr-window-info"
+#define DEFAULT_DMA_WIN "ibm,dma-window"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
 					unsigned long num_pfn, const void *arg)
@@ -697,9 +698,12 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 	pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
 		 dn);
 
-	/* Find nearest ibm,dma-window, walking up the device tree */
+	/*
+	 * Find nearest ibm,dma-window (default DMA window), walking up the
+	 * device tree
+	 */
 	for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
-		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+		dma_window = of_get_property(pdn, DEFAULT_DMA_WIN, NULL);
 		if (dma_window != NULL)
 			break;
 	}
@@ -710,7 +714,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 		dma_window = alt_dma_window;
 
 	if (dma_window == NULL) {
-		pr_debug("  no ibm,dma-window nor linux,dma64-ddr-window-info property !\n");
+		pr_debug("  no %s nor %s property !\n",
+			 DEFAULT_DMA_WIN, DMA64_PROPNAME);
 		return;
 	}
 
@@ -808,11 +813,11 @@ static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
 
 	ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
 	if (ret)
-		pr_warn("%pOF: failed to remove direct window: rtas returned "
+		pr_warn("%pOF: failed to remove dma window: rtas returned "
 			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
 			np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 	else
-		pr_debug("%pOF: successfully removed direct window: rtas returned "
+		pr_debug("%pOF: successfully removed dma window: rtas returned "
 			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
 			np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
@@ -840,26 +845,26 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
 
 	ret = of_remove_property(np, win);
 	if (ret)
-		pr_warn("%pOF: failed to remove direct window property: %d\n",
+		pr_warn("%pOF: failed to remove dma window property: %d\n",
 			np, ret);
 }
 
 static u64 find_existing_ddw(struct device_node *pdn)
 {
-	struct direct_window *window;
-	const struct dynamic_dma_window_prop *direct64;
+	struct dma_win *window;
+	const struct dynamic_dma_window_prop *dma64;
 	u64 dma_addr = 0;
 
-	spin_lock(&direct_window_list_lock);
+	spin_lock(&dma_win_list_lock);
 	/* check if we already created a window and dupe that config if so */
-	list_for_each_entry(window, &direct_window_list, list) {
+	list_for_each_entry(window, &dma_win_list, list) {
 		if (window->device == pdn) {
-			direct64 = window->prop;
-			dma_addr = be64_to_cpu(direct64->dma_base);
+			dma64 = window->prop;
+			dma_addr = be64_to_cpu(dma64->dma_base);
 			break;
 		}
 	}
-	spin_unlock(&direct_window_list_lock);
+	spin_unlock(&dma_win_list_lock);
 
 	return dma_addr;
 }
@@ -868,15 +873,15 @@ static int find_existing_ddw_windows(void)
 {
 	int len;
 	struct device_node *pdn;
-	struct direct_window *window;
-	const struct dynamic_dma_window_prop *direct64;
+	struct dma_win *window;
+	const struct dynamic_dma_window_prop *dma64;
 
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
 		return 0;
 
 	for_each_node_with_property(pdn, DMA64_PROPNAME) {
-		direct64 = of_get_property(pdn, DMA64_PROPNAME, &len);
-		if (!direct64)
+		dma64 = of_get_property(pdn, DMA64_PROPNAME, &len);
+		if (!dma64)
 			continue;
 
 		window = kzalloc(sizeof(*window), GFP_KERNEL);
@@ -887,10 +892,10 @@ static int find_existing_ddw_windows(void)
 		}
 
 		window->device = pdn;
-		window->prop = direct64;
-		spin_lock(&direct_window_list_lock);
-		list_add(&window->list, &direct_window_list);
-		spin_unlock(&direct_window_list_lock);
+		window->prop = dma64;
+		spin_lock(&dma_win_list_lock);
+		list_add(&window->list, &dma_win_list);
+		spin_unlock(&dma_win_list_lock);
 	}
 
 	return 0;
@@ -1123,12 +1128,12 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	u64 dma_addr, max_addr;
 	struct device_node *dn;
 	u32 ddw_avail[DDW_APPLICABLE_SIZE];
-	struct direct_window *window;
+	struct dma_win *window;
 	struct property *win64, *default_win = NULL;
 	struct dynamic_dma_window_prop *ddwprop;
 	struct failed_ddw_pdn *fpdn;
 
-	mutex_lock(&direct_window_init_mutex);
+	mutex_lock(&dma_win_init_mutex);
 
 	dma_addr = find_existing_ddw(pdn);
 	if (dma_addr != 0)
@@ -1178,7 +1183,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	 * for extensions presence.
 	 */
 	if (query.windows_available == 0) {
-		default_win = of_find_property(pdn, "ibm,dma-window", NULL);
+		default_win = of_find_property(pdn, DEFAULT_DMA_WIN, NULL);
 		if (!default_win)
 			goto out_failed;
 
@@ -1206,8 +1211,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	} else if (query.page_size & 1) {
 		page_shift = 12; /* 4kB */
 	} else {
-		dev_dbg(&dev->dev, "no supported direct page size in mask %x",
-			  query.page_size);
+		dev_dbg(&dev->dev, "no supported page size in mask %x",
+			query.page_size);
 		goto out_restore_defwin;
 	}
 
@@ -1258,7 +1263,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
 			win64->value, tce_setrange_multi_pSeriesLP_walk);
 	if (ret) {
-		dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
+		dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
 			 dn, ret);
 		goto out_free_window;
 	}
@@ -1272,9 +1277,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 
 	window->device = pdn;
 	window->prop = ddwprop;
-	spin_lock(&direct_window_list_lock);
-	list_add(&window->list, &direct_window_list);
-	spin_unlock(&direct_window_list_lock);
+	spin_lock(&dma_win_list_lock);
+	list_add(&window->list, &dma_win_list);
+	spin_unlock(&dma_win_list_lock);
 
 	/* Only returns the dma_addr if DDW maps the whole partition */
 	if (len == order_base_2(max_addr))
@@ -1304,7 +1309,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	list_add(&fpdn->list, &failed_ddw_pdn_list);
 
 out_unlock:
-	mutex_unlock(&direct_window_init_mutex);
+	mutex_unlock(&dma_win_init_mutex);
 	return dma_addr;
 }
 
@@ -1328,7 +1333,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 
 	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
 	     pdn = pdn->parent) {
-		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+		dma_window = of_get_property(pdn, DEFAULT_DMA_WIN, NULL);
 		if (dma_window)
 			break;
 	}
@@ -1379,7 +1384,7 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
 	 */
 	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
 			pdn = pdn->parent) {
-		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+		dma_window = of_get_property(pdn, DEFAULT_DMA_WIN, NULL);
 		if (dma_window)
 			break;
 	}
@@ -1396,29 +1401,29 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
 static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
 		void *data)
 {
-	struct direct_window *window;
+	struct dma_win *window;
 	struct memory_notify *arg = data;
 	int ret = 0;
 
 	switch (action) {
 	case MEM_GOING_ONLINE:
-		spin_lock(&direct_window_list_lock);
-		list_for_each_entry(window, &direct_window_list, list) {
+		spin_lock(&dma_win_list_lock);
+		list_for_each_entry(window, &dma_win_list, list) {
 			ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
 					arg->nr_pages, window->prop);
 			/* XXX log error */
 		}
-		spin_unlock(&direct_window_list_lock);
+		spin_unlock(&dma_win_list_lock);
 		break;
 	case MEM_CANCEL_ONLINE:
 	case MEM_OFFLINE:
-		spin_lock(&direct_window_list_lock);
-		list_for_each_entry(window, &direct_window_list, list) {
+		spin_lock(&dma_win_list_lock);
+		list_for_each_entry(window, &dma_win_list, list) {
 			ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
 					arg->nr_pages, window->prop);
 			/* XXX log error */
 		}
-		spin_unlock(&direct_window_list_lock);
+		spin_unlock(&dma_win_list_lock);
 		break;
 	default:
 		break;
@@ -1439,7 +1444,7 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
 	struct of_reconfig_data *rd = data;
 	struct device_node *np = rd->dn;
 	struct pci_dn *pci = PCI_DN(np);
-	struct direct_window *window;
+	struct dma_win *window;
 
 	switch (action) {
 	case OF_RECONFIG_DETACH_NODE:
@@ -1455,15 +1460,15 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
 			iommu_pseries_free_group(pci->table_group,
 					np->full_name);
 
-		spin_lock(&direct_window_list_lock);
-		list_for_each_entry(window, &direct_window_list, list) {
+		spin_lock(&dma_win_list_lock);
+		list_for_each_entry(window, &dma_win_list, list) {
 			if (window->device == np) {
 				list_del(&window->list);
 				kfree(window);
 				break;
 			}
 		}
-		spin_unlock(&direct_window_list_lock);
+		spin_unlock(&dma_win_list_lock);
 		break;
 	default:
 		err = NOTIFY_DONE;
-- 
2.25.4


^ permalink raw reply related

* Re: [PATCH v2 5/6] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition
From: Leonardo Bras @ 2020-07-03  6:27 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <2c5dc8d2-f379-5a5f-844a-f4eea233f265@ozlabs.ru>

On Thu, 2020-07-02 at 10:31 +1000, Alexey Kardashevskiy wrote:
> 
> On 02/07/2020 09:48, Leonardo Bras wrote:
> > On Wed, 2020-07-01 at 16:57 -0300, Leonardo Bras wrote:
> > > > It is not necessarily "direct" anymore as the name suggests, you may
> > > > want to change that. DMA64_PROPNAME, may be. Thanks,
> > > > 
> > > 
> > > Yeah, you are right.
> > > I will change this for next version, also changing the string name to
> > > reflect this.
> > > 
> > > -#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
> > > +#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
> > > 
> > > Is that ok?
> > > 
> > > Thank you for helping!
> > 
> > In fact, there is a lot of places in this file where it's called direct
> > window. Should I replace everything?
> > Should it be in a separated patch?
> 
> If it looks simple and you write a nice commit log explaining all that
> and why you are not reusing the existing ibm,dma-window property (to
> provide a clue what "reset" will reset to? is there any other reason?)
> for that - sure, do it :)
> 

v3 available here:
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=187348&state=%2A&archive=both

Best regards,
Leonardo


^ permalink raw reply

* Re: [PATCH V3 (RESEND) 2/3] mm/sparsemem: Enable vmem_altmap support in vmemmap_alloc_block_buf()
From: Anshuman Khandual @ 2020-07-03  6:32 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: x86, H. Peter Anvin, Peter Zijlstra, Dave Hansen, linuxppc-dev,
	linux-kernel, linux-mm, Ingo Molnar, Paul Mackerras,
	Andy Lutomirski, Borislav Petkov, Thomas Gleixner, Will Deacon,
	Andrew Morton, linux-arm-kernel
In-Reply-To: <20200702140752.GF22241@gaia>



On 07/02/2020 07:37 PM, Catalin Marinas wrote:
> On Thu, Jun 18, 2020 at 06:45:29AM +0530, Anshuman Khandual wrote:
>> There are many instances where vmemap allocation is often switched between
>> regular memory and device memory just based on whether altmap is available
>> or not. vmemmap_alloc_block_buf() is used in various platforms to allocate
>> vmemmap mappings. Lets also enable it to handle altmap based device memory
>> allocation along with existing regular memory allocations. This will help
>> in avoiding the altmap based allocation switch in many places.
>>
>> While here also implement a regular memory allocation fallback mechanism
>> when the first preferred device memory allocation fails. This will ensure
>> preserving the existing semantics on powerpc platform. To summarize there
>> are three different methods to call vmemmap_alloc_block_buf().
>>
>> (., NULL,   false) /* Allocate from system RAM */
>> (., altmap, false) /* Allocate from altmap without any fallback */
>> (., altmap, true)  /* Allocate from altmap with fallback (system RAM) */
> [...]
>> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
>> index bc73abf0bc25..01e25b56eccb 100644
>> --- a/arch/powerpc/mm/init_64.c
>> +++ b/arch/powerpc/mm/init_64.c
>> @@ -225,12 +225,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>>  		 * fall back to system memory if the altmap allocation fail.
>>  		 */
>>  		if (altmap && !altmap_cross_boundary(altmap, start, page_size)) {
>> -			p = altmap_alloc_block_buf(page_size, altmap);
>> -			if (!p)
>> -				pr_debug("altmap block allocation failed, falling back to system memory");
>> +			p = vmemmap_alloc_block_buf(page_size, node,
>> +						    altmap, true);
>> +		} else {
>> +			p = vmemmap_alloc_block_buf(page_size, node,
>> +						    NULL, false);
>>  		}
>> -		if (!p)
>> -			p = vmemmap_alloc_block_buf(page_size, node);
>>  		if (!p)
>>  			return -ENOMEM;
> 
> Is the fallback argument actually necessary. It may be cleaner to just
> leave the code as is with the choice between altmap and NULL. If an arch
> needs a fallback (only powerpc), they have the fallback in place
> already. I don't see the powerpc code any better after this change.
> 
> I'm fine with the altmap argument though.

Okay. Will drop 'fallback' from vmemmap_alloc_block_buf() and update the
callers. There will also be a single change in the subsequent patch i.e
vmemmap_alloc_block_buf(PMD_SIZE, node, altmap).

^ permalink raw reply

* [PATCH] powerpc/perf: Add kernel support for new MSR[HV PR] bits in trace-imc.
From: Anju T Sudhakar @ 2020-07-03  6:36 UTC (permalink / raw)
  To: mpe; +Cc: maddy, linuxppc-dev, anju

IMC trace-mode record has MSR[HV PR] bits added in the third DW. 
These bits can be used to set the cpumode for the instruction pointer
captured in each sample.                                                                  
                                                                           
Add support in kernel to use these bits to set the cpumode for         
each sample.                                                               
                                                                           
Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/imc-pmu.h |  5 +++++
 arch/powerpc/perf/imc-pmu.c        | 29 ++++++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h
index 4da4fcba0684..4f897993b710 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -99,6 +99,11 @@ struct trace_imc_data {
  */
 #define IMC_TRACE_RECORD_TB1_MASK      0x3ffffffffffULL
 
+/*
+ * Bit 0:1 in third DW of IMC trace record
+ * specifies the MSR[HV PR] values.
+ */
+#define IMC_TRACE_RECORD_VAL_HVPR(x)	((x) >> 62)
 
 /*
  * Device tree parser code detects IMC pmu support and
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..310922fed9eb 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1178,11 +1178,30 @@ static int trace_imc_prepare_sample(struct trace_imc_data *mem,
 	header->size = sizeof(*header) + event->header_size;
 	header->misc = 0;
 
-	if (is_kernel_addr(data->ip))
-		header->misc |= PERF_RECORD_MISC_KERNEL;
-	else
-		header->misc |= PERF_RECORD_MISC_USER;
-
+	if (cpu_has_feature(CPU_FTRS_POWER9)) {
+		if (is_kernel_addr(data->ip))
+			header->misc |= PERF_RECORD_MISC_KERNEL;
+		else
+			header->misc |= PERF_RECORD_MISC_USER;
+	} else {
+		switch (IMC_TRACE_RECORD_VAL_HVPR(mem->val)) {
+		case 0:/* when MSR HV and PR not set in the trace-record */
+			header->misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+			break;
+		case 1: /* MSR HV is 0 and PR is 1 */
+			header->misc |= PERF_RECORD_MISC_GUEST_USER;
+			break;
+		case 2: /* MSR Hv is 1 and PR is 0 */
+			header->misc |= PERF_RECORD_MISC_HYPERVISOR;
+			break;
+		case 3: /* MSR HV is 1 and PR is 1 */
+			header->misc |= PERF_RECORD_MISC_USER;
+			break;
+		default:
+			pr_info("IMC: Unable to set the flag based on MSR bits\n");
+			break;
+		}
+	}
 	perf_event_header__init_id(header, data, event);
 
 	return 0;
-- 
2.25.4


^ permalink raw reply related

* Re: [PATCH] powerpc/powernv: machine check handler for POWER10
From: kernel test robot @ 2020-07-03  6:40 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev
  Cc: Mahesh Salgaonkar, kbuild-all, Nicholas Piggin
In-Reply-To: <20200702233343.1128026-1-npiggin@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2412 bytes --]

Hi Nicholas,

I love your patch! Perhaps something to improve:

[auto build test WARNING on powerpc/next]
[also build test WARNING on v5.8-rc3 next-20200702]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use  as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/powerpc-powernv-machine-check-handler-for-POWER10/20200703-073739
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   arch/powerpc/kernel/mce_power.c:709:6: warning: no previous prototype for '__machine_check_early_realmode_p7' [-Wmissing-prototypes]
     709 | long __machine_check_early_realmode_p7(struct pt_regs *regs)
         |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/powerpc/kernel/mce_power.c:717:6: warning: no previous prototype for '__machine_check_early_realmode_p8' [-Wmissing-prototypes]
     717 | long __machine_check_early_realmode_p8(struct pt_regs *regs)
         |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/powerpc/kernel/mce_power.c:722:6: warning: no previous prototype for '__machine_check_early_realmode_p9' [-Wmissing-prototypes]
     722 | long __machine_check_early_realmode_p9(struct pt_regs *regs)
         |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> arch/powerpc/kernel/mce_power.c:740:6: warning: no previous prototype for '__machine_check_early_realmode_p10' [-Wmissing-prototypes]
     740 | long __machine_check_early_realmode_p10(struct pt_regs *regs)
         |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

vim +/__machine_check_early_realmode_p10 +740 arch/powerpc/kernel/mce_power.c

   739	
 > 740	long __machine_check_early_realmode_p10(struct pt_regs *regs)

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 69688 bytes --]

^ permalink raw reply

* [PATCH v2 0/6] powerpc: queued spinlocks and rwlocks
From: Nicholas Piggin @ 2020-07-03  7:35 UTC (permalink / raw)
  Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Boqun Feng,
	linux-kernel, Nicholas Piggin, virtualization, Ingo Molnar,
	kvm-ppc, Waiman Long, Will Deacon

v2 is updated to account for feedback from Will, Peter, and
Waiman (thank you), and trims off a couple of RFC and unrelated
patches.

Thanks,
Nick

Nicholas Piggin (6):
  powerpc/powernv: must include hvcall.h to get PAPR defines
  powerpc/pseries: move some PAPR paravirt functions to their own file
  powerpc: move spinlock implementation to simple_spinlock
  powerpc/64s: implement queued spinlocks and rwlocks
  powerpc/pseries: implement paravirt qspinlocks for SPLPAR
  powerpc/qspinlock: optimised atomic_try_cmpxchg_lock that adds the
    lock hint

 arch/powerpc/Kconfig                          |  13 +
 arch/powerpc/include/asm/Kbuild               |   2 +
 arch/powerpc/include/asm/atomic.h             |  28 ++
 arch/powerpc/include/asm/paravirt.h           |  89 +++++
 arch/powerpc/include/asm/qspinlock.h          |  80 +++++
 arch/powerpc/include/asm/qspinlock_paravirt.h |   5 +
 arch/powerpc/include/asm/simple_spinlock.h    | 292 +++++++++++++++++
 .../include/asm/simple_spinlock_types.h       |  21 ++
 arch/powerpc/include/asm/spinlock.h           | 308 +-----------------
 arch/powerpc/include/asm/spinlock_types.h     |  17 +-
 arch/powerpc/lib/Makefile                     |   3 +
 arch/powerpc/lib/locks.c                      |  12 +-
 arch/powerpc/platforms/powernv/pci-ioda-tce.c |   1 +
 arch/powerpc/platforms/pseries/Kconfig        |   5 +
 arch/powerpc/platforms/pseries/setup.c        |   6 +-
 include/asm-generic/qspinlock.h               |   4 +
 16 files changed, 564 insertions(+), 322 deletions(-)
 create mode 100644 arch/powerpc/include/asm/paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/simple_spinlock.h
 create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h

-- 
2.23.0


^ permalink raw reply

* [PATCH v2 1/6] powerpc/powernv: must include hvcall.h to get PAPR defines
From: Nicholas Piggin @ 2020-07-03  7:35 UTC (permalink / raw)
  Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Boqun Feng,
	linux-kernel, Nicholas Piggin, virtualization, Ingo Molnar,
	kvm-ppc, Waiman Long, Will Deacon
In-Reply-To: <20200703073516.1354108-1-npiggin@gmail.com>

An include goes away in future patches which breaks compilation
without this.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index f923359d8afc..8eba6ece7808 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -15,6 +15,7 @@
 
 #include <asm/iommu.h>
 #include <asm/tce.h>
+#include <asm/hvcall.h> /* share error returns with PAPR */
 #include "pci.h"
 
 unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 2/6] powerpc/pseries: move some PAPR paravirt functions to their own file
From: Nicholas Piggin @ 2020-07-03  7:35 UTC (permalink / raw)
  Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Boqun Feng,
	linux-kernel, Nicholas Piggin, virtualization, Ingo Molnar,
	kvm-ppc, Waiman Long, Will Deacon
In-Reply-To: <20200703073516.1354108-1-npiggin@gmail.com>

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/paravirt.h | 61 +++++++++++++++++++++++++++++
 arch/powerpc/include/asm/spinlock.h | 24 +-----------
 arch/powerpc/lib/locks.c            | 12 +++---
 3 files changed, 68 insertions(+), 29 deletions(-)
 create mode 100644 arch/powerpc/include/asm/paravirt.h

diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
new file mode 100644
index 000000000000..7a8546660a63
--- /dev/null
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_PARAVIRT_H
+#define __ASM_PARAVIRT_H
+#ifdef __KERNEL__
+
+#include <linux/jump_label.h>
+#include <asm/smp.h>
+#ifdef CONFIG_PPC64
+#include <asm/paca.h>
+#include <asm/hvcall.h>
+#endif
+
+#ifdef CONFIG_PPC_SPLPAR
+DECLARE_STATIC_KEY_FALSE(shared_processor);
+
+static inline bool is_shared_processor(void)
+{
+	return static_branch_unlikely(&shared_processor);
+}
+
+/* If bit 0 is set, the cpu has been preempted */
+static inline u32 yield_count_of(int cpu)
+{
+	__be32 yield_count = READ_ONCE(lppaca_of(cpu).yield_count);
+	return be32_to_cpu(yield_count);
+}
+
+static inline void yield_to_preempted(int cpu, u32 yield_count)
+{
+	plpar_hcall_norets(H_CONFER, get_hard_smp_processor_id(cpu), yield_count);
+}
+#else
+static inline bool is_shared_processor(void)
+{
+	return false;
+}
+
+static inline u32 yield_count_of(int cpu)
+{
+	return 0;
+}
+
+extern void ___bad_yield_to_preempted(void);
+static inline void yield_to_preempted(int cpu, u32 yield_count)
+{
+	___bad_yield_to_preempted(); /* This would be a bug */
+}
+#endif
+
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+	if (!is_shared_processor())
+		return false;
+	if (yield_count_of(cpu) & 1)
+		return true;
+	return false;
+}
+
+#endif /* __KERNEL__ */
+#endif /* __ASM_PARAVIRT_H */
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 2d620896cdae..79be9bb10bbb 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -15,11 +15,10 @@
  *
  * (the type definitions are in asm/spinlock_types.h)
  */
-#include <linux/jump_label.h>
 #include <linux/irqflags.h>
+#include <asm/paravirt.h>
 #ifdef CONFIG_PPC64
 #include <asm/paca.h>
-#include <asm/hvcall.h>
 #endif
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
@@ -35,18 +34,6 @@
 #define LOCK_TOKEN	1
 #endif
 
-#ifdef CONFIG_PPC_PSERIES
-DECLARE_STATIC_KEY_FALSE(shared_processor);
-
-#define vcpu_is_preempted vcpu_is_preempted
-static inline bool vcpu_is_preempted(int cpu)
-{
-	if (!static_branch_unlikely(&shared_processor))
-		return false;
-	return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1);
-}
-#endif
-
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
 	return lock.slock == 0;
@@ -110,15 +97,6 @@ static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
 static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
 #endif
 
-static inline bool is_shared_processor(void)
-{
-#ifdef CONFIG_PPC_SPLPAR
-	return static_branch_unlikely(&shared_processor);
-#else
-	return false;
-#endif
-}
-
 static inline void spin_yield(arch_spinlock_t *lock)
 {
 	if (is_shared_processor())
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 6440d5943c00..04165b7a163f 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -27,14 +27,14 @@ void splpar_spin_yield(arch_spinlock_t *lock)
 		return;
 	holder_cpu = lock_value & 0xffff;
 	BUG_ON(holder_cpu >= NR_CPUS);
-	yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+
+	yield_count = yield_count_of(holder_cpu);
 	if ((yield_count & 1) == 0)
 		return;		/* virtual cpu is currently running */
 	rmb();
 	if (lock->slock != lock_value)
 		return;		/* something has changed */
-	plpar_hcall_norets(H_CONFER,
-		get_hard_smp_processor_id(holder_cpu), yield_count);
+	yield_to_preempted(holder_cpu, yield_count);
 }
 EXPORT_SYMBOL_GPL(splpar_spin_yield);
 
@@ -53,13 +53,13 @@ void splpar_rw_yield(arch_rwlock_t *rw)
 		return;		/* no write lock at present */
 	holder_cpu = lock_value & 0xffff;
 	BUG_ON(holder_cpu >= NR_CPUS);
-	yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+
+	yield_count = yield_count_of(holder_cpu);
 	if ((yield_count & 1) == 0)
 		return;		/* virtual cpu is currently running */
 	rmb();
 	if (rw->lock != lock_value)
 		return;		/* something has changed */
-	plpar_hcall_norets(H_CONFER,
-		get_hard_smp_processor_id(holder_cpu), yield_count);
+	yield_to_preempted(holder_cpu, yield_count);
 }
 #endif
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 3/6] powerpc: move spinlock implementation to simple_spinlock
From: Nicholas Piggin @ 2020-07-03  7:35 UTC (permalink / raw)
  Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Boqun Feng,
	linux-kernel, Nicholas Piggin, virtualization, Ingo Molnar,
	kvm-ppc, Waiman Long, Will Deacon
In-Reply-To: <20200703073516.1354108-1-npiggin@gmail.com>

To prepare for queued spinlocks. This is a simple rename except to update
preprocessor guard name and a file reference.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/simple_spinlock.h    | 292 ++++++++++++++++++
 .../include/asm/simple_spinlock_types.h       |  21 ++
 arch/powerpc/include/asm/spinlock.h           | 285 +----------------
 arch/powerpc/include/asm/spinlock_types.h     |  12 +-
 4 files changed, 315 insertions(+), 295 deletions(-)
 create mode 100644 arch/powerpc/include/asm/simple_spinlock.h
 create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h

diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h
new file mode 100644
index 000000000000..e048c041c4a9
--- /dev/null
+++ b/arch/powerpc/include/asm/simple_spinlock.h
@@ -0,0 +1,292 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_SIMPLE_SPINLOCK_H
+#define __ASM_SIMPLE_SPINLOCK_H
+#ifdef __KERNEL__
+
+/*
+ * Simple spin lock operations.  
+ *
+ * Copyright (C) 2001-2004 Paul Mackerras <paulus@au.ibm.com>, IBM
+ * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ * Copyright (C) 2002 Dave Engebretsen <engebret@us.ibm.com>, IBM
+ *	Rework to support virtual processors
+ *
+ * Type of int is used as a full 64b word is not necessary.
+ *
+ * (the type definitions are in asm/simple_spinlock_types.h)
+ */
+#include <linux/irqflags.h>
+#include <asm/paravirt.h>
+#ifdef CONFIG_PPC64
+#include <asm/paca.h>
+#endif
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+
+#ifdef CONFIG_PPC64
+/* use 0x800000yy when locked, where yy == CPU number */
+#ifdef __BIG_ENDIAN__
+#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
+#else
+#define LOCK_TOKEN	(*(u32 *)(&get_paca()->paca_index))
+#endif
+#else
+#define LOCK_TOKEN	1
+#endif
+
+static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+	return lock.slock == 0;
+}
+
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
+{
+	smp_mb();
+	return !arch_spin_value_unlocked(*lock);
+}
+
+/*
+ * This returns the old value in the lock, so we succeeded
+ * in getting the lock if the return value is 0.
+ */
+static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock)
+{
+	unsigned long tmp, token;
+
+	token = LOCK_TOKEN;
+	__asm__ __volatile__(
+"1:	" PPC_LWARX(%0,0,%2,1) "\n\
+	cmpwi		0,%0,0\n\
+	bne-		2f\n\
+	stwcx.		%1,0,%2\n\
+	bne-		1b\n"
+	PPC_ACQUIRE_BARRIER
+"2:"
+	: "=&r" (tmp)
+	: "r" (token), "r" (&lock->slock)
+	: "cr0", "memory");
+
+	return tmp;
+}
+
+static inline int arch_spin_trylock(arch_spinlock_t *lock)
+{
+	return __arch_spin_trylock(lock) == 0;
+}
+
+/*
+ * On a system with shared processors (that is, where a physical
+ * processor is multiplexed between several virtual processors),
+ * there is no point spinning on a lock if the holder of the lock
+ * isn't currently scheduled on a physical processor.  Instead
+ * we detect this situation and ask the hypervisor to give the
+ * rest of our timeslice to the lock holder.
+ *
+ * So that we can tell which virtual processor is holding a lock,
+ * we put 0x80000000 | smp_processor_id() in the lock when it is
+ * held.  Conveniently, we have a word in the paca that holds this
+ * value.
+ */
+
+#if defined(CONFIG_PPC_SPLPAR)
+/* We only yield to the hypervisor if we are in shared processor mode */
+void splpar_spin_yield(arch_spinlock_t *lock);
+void splpar_rw_yield(arch_rwlock_t *lock);
+#else /* SPLPAR */
+static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
+static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
+#endif
+
+static inline void spin_yield(arch_spinlock_t *lock)
+{
+	if (is_shared_processor())
+		splpar_spin_yield(lock);
+	else
+		barrier();
+}
+
+static inline void rw_yield(arch_rwlock_t *lock)
+{
+	if (is_shared_processor())
+		splpar_rw_yield(lock);
+	else
+		barrier();
+}
+
+static inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+	while (1) {
+		if (likely(__arch_spin_trylock(lock) == 0))
+			break;
+		do {
+			HMT_low();
+			if (is_shared_processor())
+				splpar_spin_yield(lock);
+		} while (unlikely(lock->slock != 0));
+		HMT_medium();
+	}
+}
+
+static inline
+void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
+{
+	unsigned long flags_dis;
+
+	while (1) {
+		if (likely(__arch_spin_trylock(lock) == 0))
+			break;
+		local_save_flags(flags_dis);
+		local_irq_restore(flags);
+		do {
+			HMT_low();
+			if (is_shared_processor())
+				splpar_spin_yield(lock);
+		} while (unlikely(lock->slock != 0));
+		HMT_medium();
+		local_irq_restore(flags_dis);
+	}
+}
+#define arch_spin_lock_flags arch_spin_lock_flags
+
+static inline void arch_spin_unlock(arch_spinlock_t *lock)
+{
+	__asm__ __volatile__("# arch_spin_unlock\n\t"
+				PPC_RELEASE_BARRIER: : :"memory");
+	lock->slock = 0;
+}
+
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! it is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get a
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ */
+
+#ifdef CONFIG_PPC64
+#define __DO_SIGN_EXTEND	"extsw	%0,%0\n"
+#define WRLOCK_TOKEN		LOCK_TOKEN	/* it's negative */
+#else
+#define __DO_SIGN_EXTEND
+#define WRLOCK_TOKEN		(-1)
+#endif
+
+/*
+ * This returns the old value in the lock + 1,
+ * so we got a read lock if the return value is > 0.
+ */
+static inline long __arch_read_trylock(arch_rwlock_t *rw)
+{
+	long tmp;
+
+	__asm__ __volatile__(
+"1:	" PPC_LWARX(%0,0,%1,1) "\n"
+	__DO_SIGN_EXTEND
+"	addic.		%0,%0,1\n\
+	ble-		2f\n"
+"	stwcx.		%0,0,%1\n\
+	bne-		1b\n"
+	PPC_ACQUIRE_BARRIER
+"2:"	: "=&r" (tmp)
+	: "r" (&rw->lock)
+	: "cr0", "xer", "memory");
+
+	return tmp;
+}
+
+/*
+ * This returns the old value in the lock,
+ * so we got the write lock if the return value is 0.
+ */
+static inline long __arch_write_trylock(arch_rwlock_t *rw)
+{
+	long tmp, token;
+
+	token = WRLOCK_TOKEN;
+	__asm__ __volatile__(
+"1:	" PPC_LWARX(%0,0,%2,1) "\n\
+	cmpwi		0,%0,0\n\
+	bne-		2f\n"
+"	stwcx.		%1,0,%2\n\
+	bne-		1b\n"
+	PPC_ACQUIRE_BARRIER
+"2:"	: "=&r" (tmp)
+	: "r" (token), "r" (&rw->lock)
+	: "cr0", "memory");
+
+	return tmp;
+}
+
+static inline void arch_read_lock(arch_rwlock_t *rw)
+{
+	while (1) {
+		if (likely(__arch_read_trylock(rw) > 0))
+			break;
+		do {
+			HMT_low();
+			if (is_shared_processor())
+				splpar_rw_yield(rw);
+		} while (unlikely(rw->lock < 0));
+		HMT_medium();
+	}
+}
+
+static inline void arch_write_lock(arch_rwlock_t *rw)
+{
+	while (1) {
+		if (likely(__arch_write_trylock(rw) == 0))
+			break;
+		do {
+			HMT_low();
+			if (is_shared_processor())
+				splpar_rw_yield(rw);
+		} while (unlikely(rw->lock != 0));
+		HMT_medium();
+	}
+}
+
+static inline int arch_read_trylock(arch_rwlock_t *rw)
+{
+	return __arch_read_trylock(rw) > 0;
+}
+
+static inline int arch_write_trylock(arch_rwlock_t *rw)
+{
+	return __arch_write_trylock(rw) == 0;
+}
+
+static inline void arch_read_unlock(arch_rwlock_t *rw)
+{
+	long tmp;
+
+	__asm__ __volatile__(
+	"# read_unlock\n\t"
+	PPC_RELEASE_BARRIER
+"1:	lwarx		%0,0,%1\n\
+	addic		%0,%0,-1\n"
+"	stwcx.		%0,0,%1\n\
+	bne-		1b"
+	: "=&r"(tmp)
+	: "r"(&rw->lock)
+	: "cr0", "xer", "memory");
+}
+
+static inline void arch_write_unlock(arch_rwlock_t *rw)
+{
+	__asm__ __volatile__("# write_unlock\n\t"
+				PPC_RELEASE_BARRIER: : :"memory");
+	rw->lock = 0;
+}
+
+#define arch_spin_relax(lock)	spin_yield(lock)
+#define arch_read_relax(lock)	rw_yield(lock)
+#define arch_write_relax(lock)	rw_yield(lock)
+
+/* See include/linux/spinlock.h */
+#define smp_mb__after_spinlock()   smp_mb()
+
+#endif /* __KERNEL__ */
+#endif /* __ASM_SIMPLE_SPINLOCK_H */
diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h
new file mode 100644
index 000000000000..7c2b48ce62dc
--- /dev/null
+++ b/arch/powerpc/include/asm/simple_spinlock_types.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H
+#define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile unsigned int slock;
+} arch_spinlock_t;
+
+#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
+
+typedef struct {
+	volatile signed int lock;
+} arch_rwlock_t;
+
+#define __ARCH_RW_LOCK_UNLOCKED		{ 0 }
+
+#endif
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 79be9bb10bbb..21357fe05fe0 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -3,290 +3,7 @@
 #define __ASM_SPINLOCK_H
 #ifdef __KERNEL__
 
-/*
- * Simple spin lock operations.  
- *
- * Copyright (C) 2001-2004 Paul Mackerras <paulus@au.ibm.com>, IBM
- * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- * Copyright (C) 2002 Dave Engebretsen <engebret@us.ibm.com>, IBM
- *	Rework to support virtual processors
- *
- * Type of int is used as a full 64b word is not necessary.
- *
- * (the type definitions are in asm/spinlock_types.h)
- */
-#include <linux/irqflags.h>
-#include <asm/paravirt.h>
-#ifdef CONFIG_PPC64
-#include <asm/paca.h>
-#endif
-#include <asm/synch.h>
-#include <asm/ppc-opcode.h>
-
-#ifdef CONFIG_PPC64
-/* use 0x800000yy when locked, where yy == CPU number */
-#ifdef __BIG_ENDIAN__
-#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
-#else
-#define LOCK_TOKEN	(*(u32 *)(&get_paca()->paca_index))
-#endif
-#else
-#define LOCK_TOKEN	1
-#endif
-
-static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
-{
-	return lock.slock == 0;
-}
-
-static inline int arch_spin_is_locked(arch_spinlock_t *lock)
-{
-	smp_mb();
-	return !arch_spin_value_unlocked(*lock);
-}
-
-/*
- * This returns the old value in the lock, so we succeeded
- * in getting the lock if the return value is 0.
- */
-static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock)
-{
-	unsigned long tmp, token;
-
-	token = LOCK_TOKEN;
-	__asm__ __volatile__(
-"1:	" PPC_LWARX(%0,0,%2,1) "\n\
-	cmpwi		0,%0,0\n\
-	bne-		2f\n\
-	stwcx.		%1,0,%2\n\
-	bne-		1b\n"
-	PPC_ACQUIRE_BARRIER
-"2:"
-	: "=&r" (tmp)
-	: "r" (token), "r" (&lock->slock)
-	: "cr0", "memory");
-
-	return tmp;
-}
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-	return __arch_spin_trylock(lock) == 0;
-}
-
-/*
- * On a system with shared processors (that is, where a physical
- * processor is multiplexed between several virtual processors),
- * there is no point spinning on a lock if the holder of the lock
- * isn't currently scheduled on a physical processor.  Instead
- * we detect this situation and ask the hypervisor to give the
- * rest of our timeslice to the lock holder.
- *
- * So that we can tell which virtual processor is holding a lock,
- * we put 0x80000000 | smp_processor_id() in the lock when it is
- * held.  Conveniently, we have a word in the paca that holds this
- * value.
- */
-
-#if defined(CONFIG_PPC_SPLPAR)
-/* We only yield to the hypervisor if we are in shared processor mode */
-void splpar_spin_yield(arch_spinlock_t *lock);
-void splpar_rw_yield(arch_rwlock_t *lock);
-#else /* SPLPAR */
-static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
-static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
-#endif
-
-static inline void spin_yield(arch_spinlock_t *lock)
-{
-	if (is_shared_processor())
-		splpar_spin_yield(lock);
-	else
-		barrier();
-}
-
-static inline void rw_yield(arch_rwlock_t *lock)
-{
-	if (is_shared_processor())
-		splpar_rw_yield(lock);
-	else
-		barrier();
-}
-
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-	while (1) {
-		if (likely(__arch_spin_trylock(lock) == 0))
-			break;
-		do {
-			HMT_low();
-			if (is_shared_processor())
-				splpar_spin_yield(lock);
-		} while (unlikely(lock->slock != 0));
-		HMT_medium();
-	}
-}
-
-static inline
-void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
-{
-	unsigned long flags_dis;
-
-	while (1) {
-		if (likely(__arch_spin_trylock(lock) == 0))
-			break;
-		local_save_flags(flags_dis);
-		local_irq_restore(flags);
-		do {
-			HMT_low();
-			if (is_shared_processor())
-				splpar_spin_yield(lock);
-		} while (unlikely(lock->slock != 0));
-		HMT_medium();
-		local_irq_restore(flags_dis);
-	}
-}
-#define arch_spin_lock_flags arch_spin_lock_flags
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-	__asm__ __volatile__("# arch_spin_unlock\n\t"
-				PPC_RELEASE_BARRIER: : :"memory");
-	lock->slock = 0;
-}
-
-/*
- * Read-write spinlocks, allowing multiple readers
- * but only one writer.
- *
- * NOTE! it is quite common to have readers in interrupts
- * but no interrupt writers. For those circumstances we
- * can "mix" irq-safe locks - any writer needs to get a
- * irq-safe write-lock, but readers can get non-irqsafe
- * read-locks.
- */
-
-#ifdef CONFIG_PPC64
-#define __DO_SIGN_EXTEND	"extsw	%0,%0\n"
-#define WRLOCK_TOKEN		LOCK_TOKEN	/* it's negative */
-#else
-#define __DO_SIGN_EXTEND
-#define WRLOCK_TOKEN		(-1)
-#endif
-
-/*
- * This returns the old value in the lock + 1,
- * so we got a read lock if the return value is > 0.
- */
-static inline long __arch_read_trylock(arch_rwlock_t *rw)
-{
-	long tmp;
-
-	__asm__ __volatile__(
-"1:	" PPC_LWARX(%0,0,%1,1) "\n"
-	__DO_SIGN_EXTEND
-"	addic.		%0,%0,1\n\
-	ble-		2f\n"
-"	stwcx.		%0,0,%1\n\
-	bne-		1b\n"
-	PPC_ACQUIRE_BARRIER
-"2:"	: "=&r" (tmp)
-	: "r" (&rw->lock)
-	: "cr0", "xer", "memory");
-
-	return tmp;
-}
-
-/*
- * This returns the old value in the lock,
- * so we got the write lock if the return value is 0.
- */
-static inline long __arch_write_trylock(arch_rwlock_t *rw)
-{
-	long tmp, token;
-
-	token = WRLOCK_TOKEN;
-	__asm__ __volatile__(
-"1:	" PPC_LWARX(%0,0,%2,1) "\n\
-	cmpwi		0,%0,0\n\
-	bne-		2f\n"
-"	stwcx.		%1,0,%2\n\
-	bne-		1b\n"
-	PPC_ACQUIRE_BARRIER
-"2:"	: "=&r" (tmp)
-	: "r" (token), "r" (&rw->lock)
-	: "cr0", "memory");
-
-	return tmp;
-}
-
-static inline void arch_read_lock(arch_rwlock_t *rw)
-{
-	while (1) {
-		if (likely(__arch_read_trylock(rw) > 0))
-			break;
-		do {
-			HMT_low();
-			if (is_shared_processor())
-				splpar_rw_yield(rw);
-		} while (unlikely(rw->lock < 0));
-		HMT_medium();
-	}
-}
-
-static inline void arch_write_lock(arch_rwlock_t *rw)
-{
-	while (1) {
-		if (likely(__arch_write_trylock(rw) == 0))
-			break;
-		do {
-			HMT_low();
-			if (is_shared_processor())
-				splpar_rw_yield(rw);
-		} while (unlikely(rw->lock != 0));
-		HMT_medium();
-	}
-}
-
-static inline int arch_read_trylock(arch_rwlock_t *rw)
-{
-	return __arch_read_trylock(rw) > 0;
-}
-
-static inline int arch_write_trylock(arch_rwlock_t *rw)
-{
-	return __arch_write_trylock(rw) == 0;
-}
-
-static inline void arch_read_unlock(arch_rwlock_t *rw)
-{
-	long tmp;
-
-	__asm__ __volatile__(
-	"# read_unlock\n\t"
-	PPC_RELEASE_BARRIER
-"1:	lwarx		%0,0,%1\n\
-	addic		%0,%0,-1\n"
-"	stwcx.		%0,0,%1\n\
-	bne-		1b"
-	: "=&r"(tmp)
-	: "r"(&rw->lock)
-	: "cr0", "xer", "memory");
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *rw)
-{
-	__asm__ __volatile__("# write_unlock\n\t"
-				PPC_RELEASE_BARRIER: : :"memory");
-	rw->lock = 0;
-}
-
-#define arch_spin_relax(lock)	spin_yield(lock)
-#define arch_read_relax(lock)	rw_yield(lock)
-#define arch_write_relax(lock)	rw_yield(lock)
-
-/* See include/linux/spinlock.h */
-#define smp_mb__after_spinlock()   smp_mb()
+#include <asm/simple_spinlock.h>
 
 #endif /* __KERNEL__ */
 #endif /* __ASM_SPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
index 87adaf13b7e8..3906f52dae65 100644
--- a/arch/powerpc/include/asm/spinlock_types.h
+++ b/arch/powerpc/include/asm/spinlock_types.h
@@ -6,16 +6,6 @@
 # error "please don't include this file directly"
 #endif
 
-typedef struct {
-	volatile unsigned int slock;
-} arch_spinlock_t;
-
-#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
-
-typedef struct {
-	volatile signed int lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED		{ 0 }
+#include <asm/simple_spinlock_types.h>
 
 #endif
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 4/6] powerpc/64s: implement queued spinlocks and rwlocks
From: Nicholas Piggin @ 2020-07-03  7:35 UTC (permalink / raw)
  Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Boqun Feng,
	linux-kernel, Nicholas Piggin, virtualization, Ingo Molnar,
	kvm-ppc, Waiman Long, Will Deacon
In-Reply-To: <20200703073516.1354108-1-npiggin@gmail.com>

These have shown significantly improved performance and fairness when
spinlock contention is moderate to high on very large systems.

 [ Numbers hopefully forthcoming after more testing, but initial
   results look good ]

Thanks to the fast path, single threaded performance is not noticably
hurt.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/Kconfig                      | 13 ++++++++++++
 arch/powerpc/include/asm/Kbuild           |  2 ++
 arch/powerpc/include/asm/qspinlock.h      | 25 +++++++++++++++++++++++
 arch/powerpc/include/asm/spinlock.h       |  5 +++++
 arch/powerpc/include/asm/spinlock_types.h |  5 +++++
 arch/powerpc/lib/Makefile                 |  3 +++
 include/asm-generic/qspinlock.h           |  2 ++
 7 files changed, 55 insertions(+)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9fa23eb320ff..b17575109876 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -145,6 +145,8 @@ config PPC
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
+	select ARCH_USE_QUEUED_RWLOCKS		if PPC_QUEUED_SPINLOCKS
+	select ARCH_USE_QUEUED_SPINLOCKS	if PPC_QUEUED_SPINLOCKS
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select ARCH_WEAK_RELEASE_ACQUIRE
 	select BINFMT_ELF
@@ -490,6 +492,17 @@ config HOTPLUG_CPU
 
 	  Say N if you are unsure.
 
+config PPC_QUEUED_SPINLOCKS
+	bool "Queued spinlocks"
+	depends on SMP
+	default "y" if PPC_BOOK3S_64
+	help
+	  Say Y here to use to use queued spinlocks which are more complex
+	  but give better salability and fairness on large SMP and NUMA
+	  systems.
+
+	  If unsure, say "Y" if you have lots of cores, otherwise "N".
+
 config ARCH_CPU_PROBE_RELEASE
 	def_bool y
 	depends on HOTPLUG_CPU
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index dadbcf3a0b1e..1dd8b6adff5e 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -6,5 +6,7 @@ generated-y += syscall_table_spu.h
 generic-y += export.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
+generic-y += qrwlock.h
+generic-y += qspinlock.h
 generic-y += vtime.h
 generic-y += early_ioremap.h
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
new file mode 100644
index 000000000000..c49e33e24edd
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_QSPINLOCK_H
+#define _ASM_POWERPC_QSPINLOCK_H
+
+#include <asm-generic/qspinlock_types.h>
+
+#define _Q_PENDING_LOOPS	(1 << 9) /* not tuned */
+
+#define smp_mb__after_spinlock()   smp_mb()
+
+static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
+{
+	/*
+	 * This barrier was added to simple spinlocks by commit 51d7d5205d338,
+	 * but it should now be possible to remove it, asm arm64 has done with
+	 * commit c6f5d02b6a0f.
+	 */
+	smp_mb();
+	return atomic_read(&lock->val);
+}
+#define queued_spin_is_locked queued_spin_is_locked
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 21357fe05fe0..434615f1d761 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -3,7 +3,12 @@
 #define __ASM_SPINLOCK_H
 #ifdef __KERNEL__
 
+#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
+#else
 #include <asm/simple_spinlock.h>
+#endif
 
 #endif /* __KERNEL__ */
 #endif /* __ASM_SPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
index 3906f52dae65..c5d742f18021 100644
--- a/arch/powerpc/include/asm/spinlock_types.h
+++ b/arch/powerpc/include/asm/spinlock_types.h
@@ -6,6 +6,11 @@
 # error "please don't include this file directly"
 #endif
 
+#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#include <asm-generic/qrwlock_types.h>
+#else
 #include <asm/simple_spinlock_types.h>
+#endif
 
 #endif
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 5e994cda8e40..d66a645503eb 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -41,7 +41,10 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
 obj64-y	+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
 	   memcpy_64.o memcpy_mcsafe_64.o
 
+ifndef CONFIG_PPC_QUEUED_SPINLOCKS
 obj64-$(CONFIG_SMP)	+= locks.o
+endif
+
 obj64-$(CONFIG_ALTIVEC)	+= vmx-helper.o
 obj64-$(CONFIG_KPROBES_SANITY_TEST)	+= test_emulate_step.o \
 					   test_emulate_step_exec_instr.o
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index fde943d180e0..fb0a814d4395 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -12,6 +12,7 @@
 
 #include <asm-generic/qspinlock_types.h>
 
+#ifndef queued_spin_is_locked
 /**
  * queued_spin_is_locked - is the spinlock locked?
  * @lock: Pointer to queued spinlock structure
@@ -25,6 +26,7 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
 	 */
 	return atomic_read(&lock->val);
 }
+#endif
 
 /**
  * queued_spin_value_unlocked - is the spinlock structure unlocked?
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
From: Nicholas Piggin @ 2020-07-03  7:35 UTC (permalink / raw)
  Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Boqun Feng,
	linux-kernel, Nicholas Piggin, virtualization, Ingo Molnar,
	kvm-ppc, Waiman Long, Will Deacon
In-Reply-To: <20200703073516.1354108-1-npiggin@gmail.com>

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/paravirt.h           | 28 ++++++++++
 arch/powerpc/include/asm/qspinlock.h          | 55 +++++++++++++++++++
 arch/powerpc/include/asm/qspinlock_paravirt.h |  5 ++
 arch/powerpc/platforms/pseries/Kconfig        |  5 ++
 arch/powerpc/platforms/pseries/setup.c        |  6 +-
 include/asm-generic/qspinlock.h               |  2 +
 6 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h

diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
index 7a8546660a63..f2d51f929cf5 100644
--- a/arch/powerpc/include/asm/paravirt.h
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -29,6 +29,16 @@ static inline void yield_to_preempted(int cpu, u32 yield_count)
 {
 	plpar_hcall_norets(H_CONFER, get_hard_smp_processor_id(cpu), yield_count);
 }
+
+static inline void prod_cpu(int cpu)
+{
+	plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu));
+}
+
+static inline void yield_to_any(void)
+{
+	plpar_hcall_norets(H_CONFER, -1, 0);
+}
 #else
 static inline bool is_shared_processor(void)
 {
@@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 yield_count)
 {
 	___bad_yield_to_preempted(); /* This would be a bug */
 }
+
+extern void ___bad_yield_to_any(void);
+static inline void yield_to_any(void)
+{
+	___bad_yield_to_any(); /* This would be a bug */
+}
+
+extern void ___bad_prod_cpu(void);
+static inline void prod_cpu(int cpu)
+{
+	___bad_prod_cpu(); /* This would be a bug */
+}
+
 #endif
 
 #define vcpu_is_preempted vcpu_is_preempted
@@ -57,5 +80,10 @@ static inline bool vcpu_is_preempted(int cpu)
 	return false;
 }
 
+static inline bool pv_is_native_spin_unlock(void)
+{
+     return !is_shared_processor();
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_PARAVIRT_H */
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
index c49e33e24edd..0960a0de2467 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -3,9 +3,36 @@
 #define _ASM_POWERPC_QSPINLOCK_H
 
 #include <asm-generic/qspinlock_types.h>
+#include <asm/paravirt.h>
 
 #define _Q_PENDING_LOOPS	(1 << 9) /* not tuned */
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+
+static __always_inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+	if (!is_shared_processor())
+		native_queued_spin_lock_slowpath(lock, val);
+	else
+		__pv_queued_spin_lock_slowpath(lock, val);
+}
+#else
+extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+#endif
+
+static __always_inline void queued_spin_lock(struct qspinlock *lock)
+{
+	u32 val = 0;
+
+	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
+		return;
+
+	queued_spin_lock_slowpath(lock, val);
+}
+#define queued_spin_lock queued_spin_lock
+
 #define smp_mb__after_spinlock()   smp_mb()
 
 static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
@@ -20,6 +47,34 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
 }
 #define queued_spin_is_locked queued_spin_is_locked
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define SPIN_THRESHOLD (1<<15) /* not tuned */
+
+static __always_inline void pv_wait(u8 *ptr, u8 val)
+{
+	if (*ptr != val)
+		return;
+	yield_to_any();
+	/*
+	 * We could pass in a CPU here if waiting in the queue and yield to
+	 * the previous CPU in the queue.
+	 */
+}
+
+static __always_inline void pv_kick(int cpu)
+{
+	prod_cpu(cpu);
+}
+
+extern void __pv_init_lock_hash(void);
+
+static inline void pv_spinlocks_init(void)
+{
+	__pv_init_lock_hash();
+}
+
+#endif
+
 #include <asm-generic/qspinlock.h>
 
 #endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
new file mode 100644
index 000000000000..6dbdb8a4f84f
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_QSPINLOCK_PARAVIRT_H
+#define __ASM_QSPINLOCK_PARAVIRT_H
+
+#endif /* __ASM_QSPINLOCK_PARAVIRT_H */
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 24c18362e5ea..756e727b383f 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -25,9 +25,14 @@ config PPC_PSERIES
 	select SWIOTLB
 	default y
 
+config PARAVIRT_SPINLOCKS
+	bool
+	default n
+
 config PPC_SPLPAR
 	depends on PPC_PSERIES
 	bool "Support for shared-processor logical partitions"
+	select PARAVIRT_SPINLOCKS if PPC_QUEUED_SPINLOCKS
 	help
 	  Enabling this option will make the kernel run more efficiently
 	  on logically-partitioned pSeries systems which use shared
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 2db8469e475f..747a203d9453 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void)
 	if (firmware_has_feature(FW_FEATURE_LPAR)) {
 		vpa_init(boot_cpuid);
 
-		if (lppaca_shared_proc(get_lppaca()))
+		if (lppaca_shared_proc(get_lppaca())) {
 			static_branch_enable(&shared_processor);
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+			pv_spinlocks_init();
+#endif
+		}
 
 		ppc_md.power_save = pseries_lpar_idle;
 		ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index fb0a814d4395..38ca14e79a86 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -69,6 +69,7 @@ static __always_inline int queued_spin_trylock(struct qspinlock *lock)
 
 extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 
+#ifndef queued_spin_lock
 /**
  * queued_spin_lock - acquire a queued spinlock
  * @lock: Pointer to queued spinlock structure
@@ -82,6 +83,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
 
 	queued_spin_lock_slowpath(lock, val);
 }
+#endif
 
 #ifndef queued_spin_unlock
 /**
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 6/6] powerpc/qspinlock: optimised atomic_try_cmpxchg_lock that adds the lock hint
From: Nicholas Piggin @ 2020-07-03  7:35 UTC (permalink / raw)
  Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Boqun Feng,
	linux-kernel, Nicholas Piggin, virtualization, Ingo Molnar,
	kvm-ppc, Waiman Long, Will Deacon
In-Reply-To: <20200703073516.1354108-1-npiggin@gmail.com>

This brings the behaviour of the uncontended fast path back to
roughly equivalent to simple spinlocks -- a single atomic op with
lock hint.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/atomic.h    | 28 ++++++++++++++++++++++++++++
 arch/powerpc/include/asm/qspinlock.h |  2 +-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 498785ffc25f..f6a3d145ffb7 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -193,6 +193,34 @@ static __inline__ int atomic_dec_return_relaxed(atomic_t *v)
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 #define atomic_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new))
 
+/*
+ * Don't want to override the generic atomic_try_cmpxchg_acquire, because
+ * we add a lock hint to the lwarx, which may not be wanted for the
+ * _acquire case (and is not used by the other _acquire variants so it
+ * would be a surprise).
+ */
+static __always_inline bool
+atomic_try_cmpxchg_lock(atomic_t *v, int *old, int new)
+{
+	int r, o = *old;
+
+	__asm__ __volatile__ (
+"1:\t"	PPC_LWARX(%0,0,%2,1) "	# atomic_try_cmpxchg_acquire	\n"
+"	cmpw	0,%0,%3							\n"
+"	bne-	2f							\n"
+"	stwcx.	%4,0,%2							\n"
+"	bne-	1b							\n"
+"\t"	PPC_ACQUIRE_BARRIER "						\n"
+"2:									\n"
+	: "=&r" (r), "+m" (v->counter)
+	: "r" (&v->counter), "r" (o), "r" (new)
+	: "cr0", "memory");
+
+	if (unlikely(r != o))
+		*old = r;
+	return likely(r == o);
+}
+
 /**
  * atomic_fetch_add_unless - add unless the number is a given value
  * @v: pointer of type atomic_t
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
index 0960a0de2467..beb6aa4628e7 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -26,7 +26,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
 {
 	u32 val = 0;
 
-	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
+	if (likely(atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL)))
 		return;
 
 	queued_spin_lock_slowpath(lock, val);
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH v5 3/3] mm/page_alloc: Keep memoryless cpuless node 0 offline
From: Michal Suchánek @ 2020-07-03  9:10 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Gautham R Shenoy, Srikar Dronamraju, David Hildenbrand,
	linuxppc-dev, linux-kernel, linux-mm, Satheesh Rajendran,
	Mel Gorman, Kirill A. Shutemov, Andrew Morton, Linus Torvalds,
	Christopher Lameter, Vlastimil Babka
In-Reply-To: <20200701122110.GT2369@dhcp22.suse.cz>

On Wed, Jul 01, 2020 at 02:21:10PM +0200, Michal Hocko wrote:
> On Wed 01-07-20 13:30:57, David Hildenbrand wrote:
> > On 01.07.20 13:06, David Hildenbrand wrote:
> > > On 01.07.20 13:01, Srikar Dronamraju wrote:
> > >> * David Hildenbrand <david@redhat.com> [2020-07-01 12:15:54]:
> > >>
> > >>> On 01.07.20 12:04, Srikar Dronamraju wrote:
> > >>>> * Michal Hocko <mhocko@kernel.org> [2020-07-01 10:42:00]:
> > >>>>
> > >>>>>
> > >>>>>>
> > >>>>>> 2. Also existence of dummy node also leads to inconsistent information. The
> > >>>>>> number of online nodes is inconsistent with the information in the
> > >>>>>> device-tree and resource-dump
> > >>>>>>
> > >>>>>> 3. When the dummy node is present, single node non-Numa systems end up showing
> > >>>>>> up as NUMA systems and numa_balancing gets enabled. This will mean we take
> > >>>>>> the hit from the unnecessary numa hinting faults.
> > >>>>>
> > >>>>> I have to say that I dislike the node online/offline state and directly
> > >>>>> exporting that to the userspace. Users should only care whether the node
> > >>>>> has memory/cpus. Numa nodes can be online without any memory. Just
> > >>>>> offline all the present memory blocks but do not physically hot remove
> > >>>>> them and you are in the same situation. If users are confused by an
> > >>>>> output of tools like numactl -H then those could be updated and hide
> > >>>>> nodes without any memory&cpus.
> > >>>>>
> > >>>>> The autonuma problem sounds interesting but again this patch doesn't
> > >>>>> really solve the underlying problem because I strongly suspect that the
> > >>>>> problem is still there when a numa node gets all its memory offline as
> > >>>>> mentioned above.
> 
> I would really appreciate a feedback to these two as well.
> 
> > >>>>> While I completely agree that making node 0 special is wrong, I have
> > >>>>> still hard time to review this very simply looking patch because all the
> > >>>>> numa initialization is so spread around that this might just blow up
> > >>>>> at unexpected places. IIRC we have discussed testing in the previous
> > >>>>> version and David has provided a way to emulate these configurations
> > >>>>> on x86. Did you manage to use those instruction for additional testing
> > >>>>> on other than ppc architectures?
> > >>>>>
> > >>>>
> > >>>> I have tried all the steps that David mentioned and reported back at
> > >>>> https://lore.kernel.org/lkml/20200511174731.GD1961@linux.vnet.ibm.com/t/#u
> > >>>>
> > >>>> As a summary, David's steps are still not creating a memoryless/cpuless on
> > >>>> x86 VM.
> > >>>
> > >>> Now, that is wrong. You get a memoryless/cpuless node, which is *not
> > >>> online*. Once you hotplug some memory, it will switch online. Once you
> > >>> remove memory, it will switch back offline.
> > >>>
> > >>
> > >> Let me clarify, we are looking for a node 0 which is cpuless/memoryless at
> > >> boot.  The code in question tries to handle a cpuless/memoryless node 0 at
> > >> boot.
> > > 
> > > I was just correcting your statement, because it was wrong.
> > > 
> > > Could be that x86 code maps PXM 1 to node 0 because PXM 1 does neither
> > > have CPUs nor memory. That would imply that we can, in fact, never have
> > > node 0 offline during boot.
> > > 
> > 
> > Yep, looks like it.
> > 
> > [    0.009726] SRAT: PXM 1 -> APIC 0x00 -> Node 0
> > [    0.009727] SRAT: PXM 1 -> APIC 0x01 -> Node 0
> > [    0.009727] SRAT: PXM 1 -> APIC 0x02 -> Node 0
> > [    0.009728] SRAT: PXM 1 -> APIC 0x03 -> Node 0
> > [    0.009731] ACPI: SRAT: Node 0 PXM 1 [mem 0x00000000-0x0009ffff]
> > [    0.009732] ACPI: SRAT: Node 0 PXM 1 [mem 0x00100000-0xbfffffff]
> > [    0.009733] ACPI: SRAT: Node 0 PXM 1 [mem 0x100000000-0x13fffffff]
> 
> This begs a question whether ppc can do the same thing?
Or x86 stop doing it so that you can see on what node you are running?

What's the point of this indirection other than another way of avoiding
empty node 0?

Thanks

Michal

^ permalink raw reply

* Re: [PATCH v4 24/41] powerpc/book3s64/pkeys: Store/restore userspace AMR correctly on entry and exit from kernel
From: Nicholas Piggin @ 2020-07-03  9:18 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linuxppc-dev, mpe; +Cc: linuxram, bauerman
In-Reply-To: <20200615061430.770174-25-aneesh.kumar@linux.ibm.com>

Excerpts from Aneesh Kumar K.V's message of June 15, 2020 4:14 pm:
> This prepare kernel to operate with a different value than userspace AMR.
> For this, AMR needs to be saved and restored on entry and return from the
> kernel.
> 
> With KUAP we modify kernel AMR when accessing user address from the kernel
> via copy_to/from_user interfaces.
> 
> If MMU_FTR_KEY is enabled we always use the key mechanism to implement KUAP
> feature. If MMU_FTR_KEY is not supported and if we support MMU_FTR_KUAP
> (radix translation on POWER9), we can skip restoring AMR on return
> to userspace. Userspace won't be using AMR in that specific config.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/book3s/64/kup.h | 141 ++++++++++++++++++-----
>  arch/powerpc/kernel/entry_64.S           |   6 +-
>  arch/powerpc/kernel/exceptions-64s.S     |   4 +-
>  arch/powerpc/kernel/syscall_64.c         |  26 ++++-
>  4 files changed, 144 insertions(+), 33 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
> index e6ee1c34842f..6979cd1a0003 100644
> --- a/arch/powerpc/include/asm/book3s/64/kup.h
> +++ b/arch/powerpc/include/asm/book3s/64/kup.h
> @@ -13,18 +13,47 @@
>  
>  #ifdef __ASSEMBLY__
>  
> -.macro kuap_restore_amr	gpr1, gpr2
> -#ifdef CONFIG_PPC_KUAP
> +.macro kuap_restore_user_amr gpr1
> +#if defined(CONFIG_PPC_MEM_KEYS)
>  	BEGIN_MMU_FTR_SECTION_NESTED(67)
> -	mfspr	\gpr1, SPRN_AMR
> +	/*
> +	 * AMR is going to be different when
> +	 * returning to userspace.
> +	 */
> +	ld	\gpr1, STACK_REGS_KUAP(r1)
> +	isync
> +	mtspr	SPRN_AMR, \gpr1
> +
> +	/* No isync required, see kuap_restore_user_amr() */
> +	END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_PKEY , 67)
> +#endif
> +.endm
> +
> +.macro kuap_restore_kernel_amr	gpr1, gpr2
> +#if defined(CONFIG_PPC_MEM_KEYS)
> +	BEGIN_MMU_FTR_SECTION_NESTED(67)
> +	b	99f  // handle_pkey_restore_amr
> +	END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_PKEY , 67)
> +
> +	BEGIN_MMU_FTR_SECTION_NESTED(68)
> +	b	99f  // handle_kuap_restore_amr
> +	MMU_FTR_SECTION_ELSE_NESTED(68)
> +	b	100f  // skip_restore_amr
> +	ALT_MMU_FTR_SECTION_END_NESTED_IFSET(MMU_FTR_KUAP, 68)
> +
> +99:
> +	/*
> +	 * AMR is going to be mostly the same since we are
> +	 * returning to the kernel. Compare and do a mtspr.
> +	 */
>  	ld	\gpr2, STACK_REGS_KUAP(r1)
> +	mfspr	\gpr1, SPRN_AMR
>  	cmpd	\gpr1, \gpr2
> -	beq	998f
> +	beq	100f
>  	isync
>  	mtspr	SPRN_AMR, \gpr2
>  	/* No isync required, see kuap_restore_amr() */
> -998:
> -	END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_KUAP, 67)
> +100:  // skip_restore_amr

Can't you code it like this? (_IFCLR requires none of the bits to be 
set)

BEGIN_MMU_FTR_SECTION_NESTED(67)
	b	99f	// nothing using AMR, no need to restore
END_MMU_FTR_SECTION_NESTED_IFCLR(MMU_FTR_PKEY | MMU_FTR_KUAP, 67)

That saves you a branch in the common case of using AMR. Similar
for others.

> @@ -69,22 +133,40 @@
>  
>  extern u64 default_uamor;
>  
> -static inline void kuap_restore_amr(struct pt_regs *regs, unsigned long amr)
> +static inline void kuap_restore_user_amr(struct pt_regs *regs)
>  {
> -	if (mmu_has_feature(MMU_FTR_KUAP) && unlikely(regs->kuap != amr)) {
> -		isync();
> -		mtspr(SPRN_AMR, regs->kuap);
> -		/*
> -		 * No isync required here because we are about to RFI back to
> -		 * previous context before any user accesses would be made,
> -		 * which is a CSI.
> -		 */
> +	if (!mmu_has_feature(MMU_FTR_PKEY))
> +		return;

If you have PKEY but not KUAP, do you still have to restore?

> +
> +	isync();
> +	mtspr(SPRN_AMR, regs->kuap);
> +	/*
> +	 * No isync required here because we are about to rfi
> +	 * back to previous context before any user accesses
> +	 * would be made, which is a CSI.
> +	 */
> +}
> +
> +static inline void kuap_restore_kernel_amr(struct pt_regs *regs,
> +					   unsigned long amr)
> +{
> +	if (mmu_has_feature(MMU_FTR_KUAP) || mmu_has_feature(MMU_FTR_PKEY)) {
> +
> +		if (unlikely(regs->kuap != amr)) {
> +			isync();
> +			mtspr(SPRN_AMR, regs->kuap);
> +			/*
> +			 * No isync required here because we are about to rfi
> +			 * back to previous context before any user accesses
> +			 * would be made, which is a CSI.
> +			 */
> +		}
>  	}
>  }
>  
>  static inline unsigned long kuap_get_and_check_amr(void)
>  {
> -	if (mmu_has_feature(MMU_FTR_KUAP)) {
> +	if (mmu_has_feature(MMU_FTR_KUAP) || mmu_has_feature(MMU_FTR_PKEY)) {
>  		unsigned long amr = mfspr(SPRN_AMR);
>  		if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG)) /* kuap_check_amr() */
>  			WARN_ON_ONCE(amr != AMR_KUAP_BLOCKED);

We could do a static key that's based on this condition, but that can 
wait for another day.

>  
>  static inline void kuap_check_amr(void)
>  {
> -	if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_KUAP))
> +	if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) &&
> +	    (mmu_has_feature(MMU_FTR_KUAP) || mmu_has_feature(MMU_FTR_PKEY)))
>  		WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED);
>  }
>  
>  #else /* CONFIG_PPC_MEM_KEYS */
>  
> -static inline void kuap_restore_amr(struct pt_regs *regs, unsigned long amr)
> +static inline void kuap_restore_user_amr(struct pt_regs *regs)
> +{
> +}
> +
> +static inline void kuap_restore_kernel_amr(struct pt_regs *regs, unsigned long amr)
>  {
>  }
>  
> @@ -113,6 +200,7 @@ static inline unsigned long kuap_get_and_check_amr(void)
>  {
>  	return 0;
>  }
> +
>  #endif /* CONFIG_PPC_MEM_KEYS */
>  
>  
> @@ -187,7 +275,6 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
>  		    "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read");
>  }
>  #endif /* CONFIG_PPC_KUAP */
> -
>  #endif /* __ASSEMBLY__ */
>  
>  #endif /* _ASM_POWERPC_BOOK3S_64_KUP_H */

Unnecessary hunks.

> diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
> index 9d49338e0c85..a087cbe0b17d 100644
> --- a/arch/powerpc/kernel/entry_64.S
> +++ b/arch/powerpc/kernel/entry_64.S
> @@ -481,8 +481,8 @@ _ASM_NOKPROBE_SYMBOL(fast_interrupt_return)
>  	kuap_check_amr r3, r4
>  	ld	r5,_MSR(r1)
>  	andi.	r0,r5,MSR_PR
> -	bne	.Lfast_user_interrupt_return
> -	kuap_restore_amr r3, r4
> +	bne	.Lfast_user_interrupt_return_amr
> +	kuap_restore_kernel_amr r3, r4
>  	andi.	r0,r5,MSR_RI
>  	li	r3,0 /* 0 return value, no EMULATE_STACK_STORE */
>  	bne+	.Lfast_kernel_interrupt_return
> @@ -502,6 +502,8 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return)
>  	cmpdi	r3,0
>  	bne-	.Lrestore_nvgprs
>  
> +.Lfast_user_interrupt_return_amr:
> +	kuap_restore_user_amr r3
>  .Lfast_user_interrupt_return:
>  	ld	r11,_NIP(r1)
>  	ld	r12,_MSR(r1)
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index e70ebb5c318c..8226af444d77 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -971,7 +971,7 @@ EXC_COMMON_BEGIN(system_reset_common)
>  	ld	r10,SOFTE(r1)
>  	stb	r10,PACAIRQSOFTMASK(r13)
>  
> -	kuap_restore_amr r9, r10
> +	kuap_restore_kernel_amr r9, r10
>  	EXCEPTION_RESTORE_REGS
>  	RFI_TO_USER_OR_KERNEL
>  
> @@ -2784,7 +2784,7 @@ EXC_COMMON_BEGIN(soft_nmi_common)
>  	ld	r10,SOFTE(r1)
>  	stb	r10,PACAIRQSOFTMASK(r13)
>  
> -	kuap_restore_amr r9, r10
> +	kuap_restore_kernel_amr r9, r10
>  	EXCEPTION_RESTORE_REGS hsrr=0
>  	RFI_TO_KERNEL
>  
> diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c
> index 7e560a01afa4..fded67982fbe 100644
> --- a/arch/powerpc/kernel/syscall_64.c
> +++ b/arch/powerpc/kernel/syscall_64.c
> @@ -35,7 +35,21 @@ notrace long system_call_exception(long r3, long r4, long r5,
>  	BUG_ON(!FULL_REGS(regs));
>  	BUG_ON(regs->softe != IRQS_ENABLED);
>  
> -	kuap_check_amr();
> +#ifdef CONFIG_PPC_MEM_KEYS
> +	if (mmu_has_feature(MMU_FTR_PKEY)) {
> +		unsigned long amr;
> +		/*
> +		 * When entering from userspace we mostly have the AMR
> +		 * different from kernel default values. Hence don't compare.
> +		 */
> +		amr = mfspr(SPRN_AMR);
> +		regs->kuap = amr;
> +		if (mmu_has_feature(MMU_FTR_KUAP))
> +			mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
> +		isync();

isync should be inside the if(). Again do pkeys need to save this if 
KUAP is not being used? I haven't really looked at how all that works, 
but what's changing for the PKEY && !KUAP case?

This would be nice if it could all go into a wrapper function rather 
than ifdef.

> +	} else
> +#endif
> +		kuap_check_amr();
>  
>  	account_cpu_user_entry();
>  
> @@ -222,6 +236,10 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
>  
>  	account_cpu_user_exit();
>  
> +	/*
> +	 * We do this at the end so that we do context switch with KERNEL AMR
> +	 */
> +	kuap_restore_user_amr(regs);
>  	return ret;

Comment doesn't make sense, newline required before return.

>  }
>  
> @@ -306,6 +324,10 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
>  
>  	account_cpu_user_exit();
>  
> +	/*
> +	 * We do this at the end so that we do context switch with KERNEL AMR
> +	 */
> +	kuap_restore_user_amr(regs);

Duplicated comments I prefer to just have like this instead of trying to 
keep them in sync. Can complete the circular reference by having a

* similarly in interrupt_exit_user_prepare

in the main comment, but if they come close to one another in the same 
file it's not so important to keep them together.

 +	kuap_restore_user_amr(regs); /* see syscall_exit_prepare */

>  	return ret;
>  }
>  
> @@ -376,7 +398,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
>  	 * which would cause Read-After-Write stalls. Hence, we take the AMR
>  	 * value from the check above.
>  	 */
> -	kuap_restore_amr(regs, amr);
> +	kuap_restore_kernel_amr(regs, amr);
>  
>  	return ret;
>  }
> -- 
> 2.26.2
> 
> 

^ permalink raw reply

* Re: [PATCH 1/2] dt-bindings: sound: fsl-asoc-card: add new compatible for I2S slave
From: Arnaud Ferraris @ 2020-07-03  9:23 UTC (permalink / raw)
  To: Mark Brown
  Cc: devicetree, alsa-devel, Timur Tabi, Xiubo Li, Liam Girdwood,
	linux-kernel, Takashi Iwai, Nicolin Chen, Rob Herring,
	Fabio Estevam, kernel, linuxppc-dev
In-Reply-To: <20200702154207.GK4483@sirena.org.uk>



Le 02/07/2020 à 17:42, Mark Brown a écrit :
> On Thu, Jul 02, 2020 at 05:28:03PM +0200, Arnaud Ferraris wrote:
>> Le 02/07/2020 à 16:31, Mark Brown a écrit :
> 
>>> Why require that the CODEC be clock master here - why not make this
>>> configurable, reusing the properties from the generic and audio graph
>>> cards?
> 
>> This is partly because I'm not sure how to do it (yet), but mostly
>> because I don't have the hardware to test this (the 2 CODECs present on
>> my only i.MX6 board are both clock master)
> 
> Take a look at what the generic cards are doing, it's a library function 
> asoc_simple_parse_daifmt().  It's not the end of the world if you can't
> test it properly - if it turns out it's buggy somehow someone can always
> fix the code later but an ABI is an ABI so we can't change it.
> 

Thanks for the hints, I'll look into it.

Regards,
Arnaud

^ permalink raw reply

* Re: [PATCH v5 3/3] mm/page_alloc: Keep memoryless cpuless node 0 offline
From: Michal Hocko @ 2020-07-03  9:24 UTC (permalink / raw)
  To: Michal Suchánek
  Cc: Gautham R Shenoy, Andi Kleen, Srikar Dronamraju,
	David Hildenbrand, linuxppc-dev, linux-kernel, linux-mm,
	Satheesh Rajendran, Mel Gorman, Kirill A. Shutemov, Andrew Morton,
	Linus Torvalds, Christopher Lameter, Vlastimil Babka
In-Reply-To: <20200703091001.GJ21462@kitsune.suse.cz>

[Cc Andi]

On Fri 03-07-20 11:10:01, Michal Suchanek wrote:
> On Wed, Jul 01, 2020 at 02:21:10PM +0200, Michal Hocko wrote:
> > On Wed 01-07-20 13:30:57, David Hildenbrand wrote:
[...]
> > > Yep, looks like it.
> > > 
> > > [    0.009726] SRAT: PXM 1 -> APIC 0x00 -> Node 0
> > > [    0.009727] SRAT: PXM 1 -> APIC 0x01 -> Node 0
> > > [    0.009727] SRAT: PXM 1 -> APIC 0x02 -> Node 0
> > > [    0.009728] SRAT: PXM 1 -> APIC 0x03 -> Node 0
> > > [    0.009731] ACPI: SRAT: Node 0 PXM 1 [mem 0x00000000-0x0009ffff]
> > > [    0.009732] ACPI: SRAT: Node 0 PXM 1 [mem 0x00100000-0xbfffffff]
> > > [    0.009733] ACPI: SRAT: Node 0 PXM 1 [mem 0x100000000-0x13fffffff]
> > 
> > This begs a question whether ppc can do the same thing?
> Or x86 stop doing it so that you can see on what node you are running?
> 
> What's the point of this indirection other than another way of avoiding
> empty node 0?

Honestly, I do not have any idea. I've traced it down to
Author: Andi Kleen <ak@suse.de>
Date:   Tue Jan 11 15:35:48 2005 -0800

    [PATCH] x86_64: Fix ACPI SRAT NUMA parsing

    Fix fallout from the recent nodemask_t changes. The node ids assigned
    in the SRAT parser were off by one.

    I added a new first_unset_node() function to nodemask.h to allocate
    IDs sanely.

    Signed-off-by: Andi Kleen <ak@suse.de>
    Signed-off-by: Linus Torvalds <torvalds@osdl.org>

which doesn't really tell all that much. The historical baggage and a
long term behavior which is not really trivial to fix I suspect.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v4 24/41] powerpc/book3s64/pkeys: Store/restore userspace AMR correctly on entry and exit from kernel
From: Aneesh Kumar K.V @ 2020-07-03  9:30 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev, mpe; +Cc: linuxram, bauerman
In-Reply-To: <1593766495.dqos6wxr3o.astroid@bobo.none>

On 7/3/20 2:48 PM, Nicholas Piggin wrote:
> Excerpts from Aneesh Kumar K.V's message of June 15, 2020 4:14 pm:
>> This prepare kernel to operate with a different value than userspace AMR.
>> For this, AMR needs to be saved and restored on entry and return from the
>> kernel.
>>
>> With KUAP we modify kernel AMR when accessing user address from the kernel
>> via copy_to/from_user interfaces.
>>
>> If MMU_FTR_KEY is enabled we always use the key mechanism to implement KUAP
>> feature. If MMU_FTR_KEY is not supported and if we support MMU_FTR_KUAP
>> (radix translation on POWER9), we can skip restoring AMR on return
>> to userspace. Userspace won't be using AMR in that specific config.
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>   arch/powerpc/include/asm/book3s/64/kup.h | 141 ++++++++++++++++++-----
>>   arch/powerpc/kernel/entry_64.S           |   6 +-
>>   arch/powerpc/kernel/exceptions-64s.S     |   4 +-
>>   arch/powerpc/kernel/syscall_64.c         |  26 ++++-
>>   4 files changed, 144 insertions(+), 33 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
>> index e6ee1c34842f..6979cd1a0003 100644
>> --- a/arch/powerpc/include/asm/book3s/64/kup.h
>> +++ b/arch/powerpc/include/asm/book3s/64/kup.h
>> @@ -13,18 +13,47 @@
>>   
>>   #ifdef __ASSEMBLY__
>>   
>> -.macro kuap_restore_amr	gpr1, gpr2
>> -#ifdef CONFIG_PPC_KUAP
>> +.macro kuap_restore_user_amr gpr1
>> +#if defined(CONFIG_PPC_MEM_KEYS)
>>   	BEGIN_MMU_FTR_SECTION_NESTED(67)
>> -	mfspr	\gpr1, SPRN_AMR
>> +	/*
>> +	 * AMR is going to be different when
>> +	 * returning to userspace.
>> +	 */
>> +	ld	\gpr1, STACK_REGS_KUAP(r1)
>> +	isync
>> +	mtspr	SPRN_AMR, \gpr1
>> +
>> +	/* No isync required, see kuap_restore_user_amr() */
>> +	END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_PKEY , 67)
>> +#endif
>> +.endm
>> +
>> +.macro kuap_restore_kernel_amr	gpr1, gpr2
>> +#if defined(CONFIG_PPC_MEM_KEYS)
>> +	BEGIN_MMU_FTR_SECTION_NESTED(67)
>> +	b	99f  // handle_pkey_restore_amr
>> +	END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_PKEY , 67)
>> +
>> +	BEGIN_MMU_FTR_SECTION_NESTED(68)
>> +	b	99f  // handle_kuap_restore_amr
>> +	MMU_FTR_SECTION_ELSE_NESTED(68)
>> +	b	100f  // skip_restore_amr
>> +	ALT_MMU_FTR_SECTION_END_NESTED_IFSET(MMU_FTR_KUAP, 68)
>> +
>> +99:
>> +	/*
>> +	 * AMR is going to be mostly the same since we are
>> +	 * returning to the kernel. Compare and do a mtspr.
>> +	 */
>>   	ld	\gpr2, STACK_REGS_KUAP(r1)
>> +	mfspr	\gpr1, SPRN_AMR
>>   	cmpd	\gpr1, \gpr2
>> -	beq	998f
>> +	beq	100f
>>   	isync
>>   	mtspr	SPRN_AMR, \gpr2
>>   	/* No isync required, see kuap_restore_amr() */
>> -998:
>> -	END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_KUAP, 67)
>> +100:  // skip_restore_amr
> 
> Can't you code it like this? (_IFCLR requires none of the bits to be
> set)
> 
> BEGIN_MMU_FTR_SECTION_NESTED(67)
> 	b	99f	// nothing using AMR, no need to restore
> END_MMU_FTR_SECTION_NESTED_IFCLR(MMU_FTR_PKEY | MMU_FTR_KUAP, 67)
> 
> That saves you a branch in the common case of using AMR. Similar
> for others.



Yes i could switch to that. The code is taking extra 200 cycles even 
with KUAP/KUEP disabled and no keys being used on hash. I am yet to 
analyze this closely. So will rework things based on that analysis.

> 
>> @@ -69,22 +133,40 @@
>>   
>>   extern u64 default_uamor;
>>   
>> -static inline void kuap_restore_amr(struct pt_regs *regs, unsigned long amr)
>> +static inline void kuap_restore_user_amr(struct pt_regs *regs)
>>   {
>> -	if (mmu_has_feature(MMU_FTR_KUAP) && unlikely(regs->kuap != amr)) {
>> -		isync();
>> -		mtspr(SPRN_AMR, regs->kuap);
>> -		/*
>> -		 * No isync required here because we are about to RFI back to
>> -		 * previous context before any user accesses would be made,
>> -		 * which is a CSI.
>> -		 */
>> +	if (!mmu_has_feature(MMU_FTR_PKEY))
>> +		return;
> 
> If you have PKEY but not KUAP, do you still have to restore?
> 


Yes, because user space pkey is now set on the exit path. This is needed 
to handle things like exec/fork().


>> +
>> +	isync();
>> +	mtspr(SPRN_AMR, regs->kuap);
>> +	/*
>> +	 * No isync required here because we are about to rfi
>> +	 * back to previous context before any user accesses
>> +	 * would be made, which is a CSI.
>> +	 */
>> +}
>> +
>> +static inline void kuap_restore_kernel_amr(struct pt_regs *regs,
>> +					   unsigned long amr)
>> +{
>> +	if (mmu_has_feature(MMU_FTR_KUAP) || mmu_has_feature(MMU_FTR_PKEY)) {
>> +
>> +		if (unlikely(regs->kuap != amr)) {
>> +			isync();
>> +			mtspr(SPRN_AMR, regs->kuap);
>> +			/*
>> +			 * No isync required here because we are about to rfi
>> +			 * back to previous context before any user accesses
>> +			 * would be made, which is a CSI.
>> +			 */
>> +		}
>>   	}
>>   }
>>   
>>   static inline unsigned long kuap_get_and_check_amr(void)
>>   {
>> -	if (mmu_has_feature(MMU_FTR_KUAP)) {
>> +	if (mmu_has_feature(MMU_FTR_KUAP) || mmu_has_feature(MMU_FTR_PKEY)) {
>>   		unsigned long amr = mfspr(SPRN_AMR);
>>   		if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG)) /* kuap_check_amr() */
>>   			WARN_ON_ONCE(amr != AMR_KUAP_BLOCKED);
> 
> We could do a static key that's based on this condition, but that can
> wait for another day.
> 
>>   
>>   static inline void kuap_check_amr(void)
>>   {
>> -	if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_KUAP))
>> +	if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) &&
>> +	    (mmu_has_feature(MMU_FTR_KUAP) || mmu_has_feature(MMU_FTR_PKEY)))
>>   		WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED);
>>   }
>>   
>>   #else /* CONFIG_PPC_MEM_KEYS */
>>   
>> -static inline void kuap_restore_amr(struct pt_regs *regs, unsigned long amr)
>> +static inline void kuap_restore_user_amr(struct pt_regs *regs)
>> +{
>> +}
>> +
>> +static inline void kuap_restore_kernel_amr(struct pt_regs *regs, unsigned long amr)
>>   {
>>   }
>>   
>> @@ -113,6 +200,7 @@ static inline unsigned long kuap_get_and_check_amr(void)
>>   {
>>   	return 0;
>>   }
>> +
>>   #endif /* CONFIG_PPC_MEM_KEYS */
>>   
>>   
>> @@ -187,7 +275,6 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
>>   		    "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read");
>>   }
>>   #endif /* CONFIG_PPC_KUAP */
>> -
>>   #endif /* __ASSEMBLY__ */
>>   
>>   #endif /* _ASM_POWERPC_BOOK3S_64_KUP_H */
> 
> Unnecessary hunks.


will remove

> 
>> diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
>> index 9d49338e0c85..a087cbe0b17d 100644
>> --- a/arch/powerpc/kernel/entry_64.S
>> +++ b/arch/powerpc/kernel/entry_64.S
>> @@ -481,8 +481,8 @@ _ASM_NOKPROBE_SYMBOL(fast_interrupt_return)
>>   	kuap_check_amr r3, r4
>>   	ld	r5,_MSR(r1)
>>   	andi.	r0,r5,MSR_PR
>> -	bne	.Lfast_user_interrupt_return
>> -	kuap_restore_amr r3, r4
>> +	bne	.Lfast_user_interrupt_return_amr
>> +	kuap_restore_kernel_amr r3, r4
>>   	andi.	r0,r5,MSR_RI
>>   	li	r3,0 /* 0 return value, no EMULATE_STACK_STORE */
>>   	bne+	.Lfast_kernel_interrupt_return
>> @@ -502,6 +502,8 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return)
>>   	cmpdi	r3,0
>>   	bne-	.Lrestore_nvgprs
>>   
>> +.Lfast_user_interrupt_return_amr:
>> +	kuap_restore_user_amr r3
>>   .Lfast_user_interrupt_return:
>>   	ld	r11,_NIP(r1)
>>   	ld	r12,_MSR(r1)
>> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
>> index e70ebb5c318c..8226af444d77 100644
>> --- a/arch/powerpc/kernel/exceptions-64s.S
>> +++ b/arch/powerpc/kernel/exceptions-64s.S
>> @@ -971,7 +971,7 @@ EXC_COMMON_BEGIN(system_reset_common)
>>   	ld	r10,SOFTE(r1)
>>   	stb	r10,PACAIRQSOFTMASK(r13)
>>   
>> -	kuap_restore_amr r9, r10
>> +	kuap_restore_kernel_amr r9, r10
>>   	EXCEPTION_RESTORE_REGS
>>   	RFI_TO_USER_OR_KERNEL
>>   
>> @@ -2784,7 +2784,7 @@ EXC_COMMON_BEGIN(soft_nmi_common)
>>   	ld	r10,SOFTE(r1)
>>   	stb	r10,PACAIRQSOFTMASK(r13)
>>   
>> -	kuap_restore_amr r9, r10
>> +	kuap_restore_kernel_amr r9, r10
>>   	EXCEPTION_RESTORE_REGS hsrr=0
>>   	RFI_TO_KERNEL
>>   
>> diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c
>> index 7e560a01afa4..fded67982fbe 100644
>> --- a/arch/powerpc/kernel/syscall_64.c
>> +++ b/arch/powerpc/kernel/syscall_64.c
>> @@ -35,7 +35,21 @@ notrace long system_call_exception(long r3, long r4, long r5,
>>   	BUG_ON(!FULL_REGS(regs));
>>   	BUG_ON(regs->softe != IRQS_ENABLED);
>>   
>> -	kuap_check_amr();
>> +#ifdef CONFIG_PPC_MEM_KEYS
>> +	if (mmu_has_feature(MMU_FTR_PKEY)) {
>> +		unsigned long amr;
>> +		/*
>> +		 * When entering from userspace we mostly have the AMR
>> +		 * different from kernel default values. Hence don't compare.
>> +		 */
>> +		amr = mfspr(SPRN_AMR);
>> +		regs->kuap = amr;
>> +		if (mmu_has_feature(MMU_FTR_KUAP))
>> +			mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
>> +		isync();
> 
> isync should be inside the if(). Again do pkeys need to save this if
> KUAP is not being used? I haven't really looked at how all that works,
> but what's changing for the PKEY && !KUAP case?
> 

There is no SPR switch in context switch now and all the AMR/IAMR 
handling is now in the exit to userspace.


> This would be nice if it could all go into a wrapper function rather
> than ifdef.
> 

will do

>> +	} else
>> +#endif
>> +		kuap_check_amr();
>>   
>>   	account_cpu_user_entry();
>>   
>> @@ -222,6 +236,10 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
>>   
>>   	account_cpu_user_exit();
>>   
>> +	/*
>> +	 * We do this at the end so that we do context switch with KERNEL AMR
>> +	 */
>> +	kuap_restore_user_amr(regs);
>>   	return ret;
> 
> Comment doesn't make sense, newline required before return.


Ok the detail there was we need to make sure we restore AMR towrads the 
end and make sure all the kernel code continue to run with KERNEL AMR 
value. There is a schedule() call in there with _TIF_NEED_RESCHED. But 
those details are not really relevant. That was me tracking down some 
issues and writing comment around that part of the code. The only real 
detail is switch to userspace AMR in the end.



> 
>>   }
>>   
>> @@ -306,6 +324,10 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
>>   
>>   	account_cpu_user_exit();
>>   
>> +	/*
>> +	 * We do this at the end so that we do context switch with KERNEL AMR
>> +	 */
>> +	kuap_restore_user_amr(regs);
> 
> Duplicated comments I prefer to just have like this instead of trying to
> keep them in sync. Can complete the circular reference by having a
> 
> * similarly in interrupt_exit_user_prepare
> 
> in the main comment, but if they come close to one another in the same
> file it's not so important to keep them together.
> 
>   +	kuap_restore_user_amr(regs); /* see syscall_exit_prepare */
> 
>>   	return ret;
>>   }
>>   
>> @@ -376,7 +398,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
>>   	 * which would cause Read-After-Write stalls. Hence, we take the AMR
>>   	 * value from the check above.
>>   	 */
>> -	kuap_restore_amr(regs, amr);
>> +	kuap_restore_kernel_amr(regs, amr);
>>   
>>   	return ret;
>>   }
>> -- 
>> 2.26.2
>>
>>


-aneesh

^ permalink raw reply

* Re: [PATCH 0/4] ASoC: fsl_asrc: allow selecting arbitrary clocks
From: Arnaud Ferraris @ 2020-07-03  9:38 UTC (permalink / raw)
  To: Nicolin Chen
  Cc: devicetree, alsa-devel, Timur Tabi, Xiubo Li, Liam Girdwood,
	Mark Brown, linux-kernel, Takashi Iwai, Rob Herring,
	Fabio Estevam, kernel, linuxppc-dev
In-Reply-To: <20200702184226.GA23935@Asurada-Nvidia>

Hi Nic,

Le 02/07/2020 à 20:42, Nicolin Chen a écrit :
> Hi Arnaud,
> 
> On Thu, Jul 02, 2020 at 04:22:31PM +0200, Arnaud Ferraris wrote:
>> The current ASRC driver hardcodes the input and output clocks used for
>> sample rate conversions. In order to allow greater flexibility and to
>> cover more use cases, it would be preferable to select the clocks using
>> device-tree properties.
> 
> We recent just merged a new change that auto-selecting internal
> clocks based on sample rates as the first option -- ideal ratio
> mode is the fallback mode now. Please refer to:
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?h=next-20200702&id=d0250cf4f2abfbea64ed247230f08f5ae23979f0

That looks interesting, thanks for pointing this out!
I'll rebase and see how it works for my use-case, will keep you informed.

Regards,
Arnaud

^ permalink raw reply

* Re: [PATCH V3 (RESEND) 2/3] mm/sparsemem: Enable vmem_altmap support in vmemmap_alloc_block_buf()
From: Michael Ellerman @ 2020-07-03 10:43 UTC (permalink / raw)
  To: Catalin Marinas, Anshuman Khandual
  Cc: x86, Peter Zijlstra, Dave Hansen, linuxppc-dev, linux-kernel,
	linux-mm, Ingo Molnar, Paul Mackerras, Andy Lutomirski,
	H. Peter Anvin, Borislav Petkov, Thomas Gleixner, Will Deacon,
	Andrew Morton, linux-arm-kernel
In-Reply-To: <20200702140752.GF22241@gaia>

Catalin Marinas <catalin.marinas@arm.com> writes:
> On Thu, Jun 18, 2020 at 06:45:29AM +0530, Anshuman Khandual wrote:
>> There are many instances where vmemap allocation is often switched between
>> regular memory and device memory just based on whether altmap is available
>> or not. vmemmap_alloc_block_buf() is used in various platforms to allocate
>> vmemmap mappings. Lets also enable it to handle altmap based device memory
>> allocation along with existing regular memory allocations. This will help
>> in avoiding the altmap based allocation switch in many places.
>> 
>> While here also implement a regular memory allocation fallback mechanism
>> when the first preferred device memory allocation fails. This will ensure
>> preserving the existing semantics on powerpc platform. To summarize there
>> are three different methods to call vmemmap_alloc_block_buf().
>> 
>> (., NULL,   false) /* Allocate from system RAM */
>> (., altmap, false) /* Allocate from altmap without any fallback */
>> (., altmap, true)  /* Allocate from altmap with fallback (system RAM) */
> [...]
>> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
>> index bc73abf0bc25..01e25b56eccb 100644
>> --- a/arch/powerpc/mm/init_64.c
>> +++ b/arch/powerpc/mm/init_64.c
>> @@ -225,12 +225,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>>  		 * fall back to system memory if the altmap allocation fail.
>>  		 */
>>  		if (altmap && !altmap_cross_boundary(altmap, start, page_size)) {
>> -			p = altmap_alloc_block_buf(page_size, altmap);
>> -			if (!p)
>> -				pr_debug("altmap block allocation failed, falling back to system memory");
>> +			p = vmemmap_alloc_block_buf(page_size, node,
>> +						    altmap, true);
>> +		} else {
>> +			p = vmemmap_alloc_block_buf(page_size, node,
>> +						    NULL, false);
>>  		}
>> -		if (!p)
>> -			p = vmemmap_alloc_block_buf(page_size, node);
>>  		if (!p)
>>  			return -ENOMEM;
>
> Is the fallback argument actually necessary. It may be cleaner to just
> leave the code as is with the choice between altmap and NULL. If an arch
> needs a fallback (only powerpc), they have the fallback in place
> already. I don't see the powerpc code any better after this change.

Yeah I agree.

cheers

^ permalink raw reply

* [RFC PATCH v0 0/2] Use H_RPT_INVALIDATE for nested guest
From: Bharata B Rao @ 2020-07-03 10:44 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev; +Cc: aneesh.kumar, Bharata B Rao, npiggin

This patchset adds support for the new hcall H_RPT_INVALIDATE
(currently handles nested case only) and replaces the nested tlb flush
calls with this new hcall if the support for the same exists.

This applies on top of "[PATCH v3 0/3] Off-load TLB invalidations to host
for !GTSE" patchset that was posted at:

https://lore.kernel.org/linuxppc-dev/20200703053608.12884-1-bharata@linux.ibm.com/T/#t

H_RPT_INVALIDATE
================
Syntax:
int64   /* H_Success: Return code on successful completion */
        /* H_Busy - repeat the call with the same */
        /* H_Parameter, H_P2, H_P3, H_P4, H_P5 : Invalid parameters */
        hcall(const uint64 H_RPT_INVALIDATE, /* Invalidate RPT translation lookaside information */
              uint64 pid,       /* PID/LPID to invalidate */
              uint64 target,    /* Invalidation target */
              uint64 type,      /* Type of lookaside information */
              uint64 pageSizes,     /* Page sizes */
              uint64 start,     /* Start of Effective Address (EA) range (inclusive) */
              uint64 end)       /* End of EA range (exclusive) */

Invalidation targets (target)
-----------------------------
Core MMU        0x01 /* All virtual processors in the partition */
Core local MMU  0x02 /* Current virtual processor */
Nest MMU        0x04 /* All nest/accelerator agents in use by the partition */

A combination of the above can be specified, except core and core local.

Type of translation to invalidate (type)
---------------------------------------
NESTED       0x0001  /* Invalidate nested guest partition-scope */
TLB          0x0002  /* Invalidate TLB */
PWC          0x0004  /* Invalidate Page Walk Cache */
PRT          0x0008  /* Invalidate Process Table Entries if NESTED is clear */
PAT          0x0008  /* Invalidate Partition Table Entries if NESTED is set */

A combination of the above can be specified.

Page size mask (pageSizes)
--------------------------
4K              0x01
64K             0x02
2M              0x04
1G              0x08
All sizes       (-1UL)

A combination of the above can be specified.
All page sizes can be selected with -1.

Semantics: Invalidate radix tree lookaside information
           matching the parameters given.
* Return H_P2, H_P3 or H_P4 if target, type, or pageSizes parameters are
  different from the defined values.
* Return H_PARAMETER if NESTED is set and pid is not a valid nested
  LPID allocated to this partition
* Return H_P5 if (start, end) doesn't form a valid range. Start and end
  should be a valid Quadrant address and  end > start.
* Return H_NotSupported if the partition is not in running in radix
  translation mode.
* May invalidate more translation information than requested.
* If start = 0 and end = -1, set the range to cover all valid addresses.
  Else start and end should be aligned to 4kB (lower 11 bits clear).
* If NESTED is clear, then invalidate process scoped lookaside information.
  Else pid specifies a nested LPID, and the invalidation is performed
  on nested guest partition table and nested guest partition scope real
  addresses.
* If pid = 0 and NESTED is clear, then valid addresses are quadrant 3 and
  quadrant 0 spaces, Else valid addresses are quadrant 0.
* Pages which are fully covered by the range are to be invalidated.
  Those which are partially covered are considered outside invalidation
  range, which allows a caller to optimally invalidate ranges that may
  contain mixed page sizes.
* Return H_SUCCESS on success.

Bharata B Rao (2):
  KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE (nested case
    only)
  KVM: PPC: Book3S HV: Use H_RPT_INVALIDATE in nested KVM

 Documentation/virt/kvm/api.rst                |  17 +++
 .../include/asm/book3s/64/tlbflush-radix.h    |  18 +++
 arch/powerpc/include/asm/firmware.h           |   4 +-
 arch/powerpc/include/asm/kvm_book3s.h         |   3 +
 arch/powerpc/kvm/book3s_64_mmu_radix.c        |  26 ++++-
 arch/powerpc/kvm/book3s_hv.c                  |  32 ++++++
 arch/powerpc/kvm/book3s_hv_nested.c           | 107 +++++++++++++++++-
 arch/powerpc/kvm/powerpc.c                    |   3 +
 arch/powerpc/mm/book3s64/radix_tlb.c          |   4 -
 arch/powerpc/platforms/pseries/firmware.c     |   1 +
 include/uapi/linux/kvm.h                      |   1 +
 11 files changed, 204 insertions(+), 12 deletions(-)

-- 
2.21.3

^ permalink raw reply

* [RFC PATCH v0 1/2] KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE (nested case only)
From: Bharata B Rao @ 2020-07-03 10:44 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev; +Cc: aneesh.kumar, Bharata B Rao, npiggin
In-Reply-To: <20200703104420.21349-1-bharata@linux.ibm.com>

Implements H_RPT_INVALIDATE hcall and supports only nested case
currently.

A KVM capability KVM_CAP_RPT_INVALIDATE is added to indicate the
support for this hcall.

Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
---
 Documentation/virt/kvm/api.rst                | 17 ++++
 .../include/asm/book3s/64/tlbflush-radix.h    | 18 ++++
 arch/powerpc/include/asm/kvm_book3s.h         |  3 +
 arch/powerpc/kvm/book3s_hv.c                  | 32 +++++++
 arch/powerpc/kvm/book3s_hv_nested.c           | 94 +++++++++++++++++++
 arch/powerpc/kvm/powerpc.c                    |  3 +
 arch/powerpc/mm/book3s64/radix_tlb.c          |  4 -
 include/uapi/linux/kvm.h                      |  1 +
 8 files changed, 168 insertions(+), 4 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 426f94582b7a..d235d16a4bf0 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5843,6 +5843,23 @@ controlled by the kvm module parameter halt_poll_ns. This capability allows
 the maximum halt time to specified on a per-VM basis, effectively overriding
 the module parameter for the target VM.
 
+7.21 KVM_CAP_RPT_INVALIDATE
+--------------------------
+
+:Capability: KVM_CAP_RPT_INVALIDATE
+:Architectures: ppc
+:Type: vm
+
+This capability indicates that the kernel is capable of handling
+H_RPT_INVALIDATE hcall.
+
+In order to enable the use of H_RPT_INVALIDATE in the guest,
+user space might have to advertise it for the guest. For example,
+IBM pSeries (sPAPR) guest starts using it if "hcall-rpt-invalidate" is
+present in the "ibm,hypertas-functions" device-tree property.
+
+This capability is always enabled.
+
 8. Other capabilities.
 ======================
 
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 94439e0cefc9..aace7e9b2397 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -4,6 +4,10 @@
 
 #include <asm/hvcall.h>
 
+#define RIC_FLUSH_TLB 0
+#define RIC_FLUSH_PWC 1
+#define RIC_FLUSH_ALL 2
+
 struct vm_area_struct;
 struct mm_struct;
 struct mmu_gather;
@@ -21,6 +25,20 @@ static inline u64 psize_to_rpti_pgsize(unsigned long psize)
 	return H_RPTI_PAGE_ALL;
 }
 
+static inline int rpti_pgsize_to_psize(unsigned long page_size)
+{
+	if (page_size == H_RPTI_PAGE_4K)
+		return MMU_PAGE_4K;
+	if (page_size == H_RPTI_PAGE_64K)
+		return MMU_PAGE_64K;
+	if (page_size == H_RPTI_PAGE_2M)
+		return MMU_PAGE_2M;
+	if (page_size == H_RPTI_PAGE_1G)
+		return MMU_PAGE_1G;
+	else
+		return MMU_PAGE_64K; /* Default */
+}
+
 static inline int mmu_get_ap(int psize)
 {
 	return mmu_psize_defs[psize].ap;
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index d32ec9ae73bd..0f1c5fa6e8ce 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -298,6 +298,9 @@ void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
 void kvmhv_release_all_nested(struct kvm *kvm);
 long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
 long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
+long kvmhv_h_rpti_nested(struct kvm_vcpu *vcpu, unsigned long lpid,
+			 unsigned long type, unsigned long pg_sizes,
+			 unsigned long start, unsigned long end);
 int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu,
 			  u64 time_limit, unsigned long lpcr);
 void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6bf66649ab92..2f772183f249 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -895,6 +895,28 @@ static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
 	return yield_count;
 }
 
+static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
+				    unsigned long pid, unsigned long target,
+				    unsigned long type, unsigned long pg_sizes,
+				    unsigned long start, unsigned long end)
+{
+	if (end < start)
+		return H_P5;
+
+	if ((!type & H_RPTI_TYPE_NESTED))
+		return H_P3;
+
+	if (!nesting_enabled(vcpu->kvm))
+		return H_FUNCTION;
+
+	/* Support only cores as target */
+	if (target != H_RPTI_TARGET_CMMU)
+		return H_P2;
+
+	return kvmhv_h_rpti_nested(vcpu, pid, (type & ~H_RPTI_TYPE_NESTED),
+				   pg_sizes, start, end);
+}
+
 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 {
 	unsigned long req = kvmppc_get_gpr(vcpu, 3);
@@ -1103,6 +1125,14 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 		 */
 		ret = kvmppc_h_svm_init_abort(vcpu->kvm);
 		break;
+	case H_RPT_INVALIDATE:
+		ret = kvmppc_h_rpt_invalidate(vcpu, kvmppc_get_gpr(vcpu, 4),
+					      kvmppc_get_gpr(vcpu, 5),
+					      kvmppc_get_gpr(vcpu, 6),
+					      kvmppc_get_gpr(vcpu, 7),
+					      kvmppc_get_gpr(vcpu, 8),
+					      kvmppc_get_gpr(vcpu, 9));
+		break;
 
 	default:
 		return RESUME_HOST;
@@ -1149,6 +1179,7 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
 	case H_XIRR_X:
 #endif
 	case H_PAGE_INIT:
+	case H_RPT_INVALIDATE:
 		return 1;
 	}
 
@@ -5284,6 +5315,7 @@ static unsigned int default_hcall_list[] = {
 	H_XIRR,
 	H_XIRR_X,
 #endif
+	H_RPT_INVALIDATE,
 	0
 };
 
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 2c849a65db77..efb78d37f29a 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -1141,6 +1141,100 @@ long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
 	return H_SUCCESS;
 }
 
+static long do_tlb_invalidate_nested_tlb(struct kvm_vcpu *vcpu,
+					 unsigned long lpid,
+					 unsigned long page_size,
+					 unsigned long ap,
+					 unsigned long start,
+					 unsigned long end)
+{
+	unsigned long addr = start;
+	int ret;
+
+	do {
+		ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap,
+						   get_epn(addr));
+		if (ret)
+			return ret;
+		addr += page_size;
+	} while (addr < end);
+
+	return ret;
+}
+
+static long do_tlb_invalidate_nested_all(struct kvm_vcpu *vcpu,
+					 unsigned long lpid)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_nested_guest *gp;
+
+	gp = kvmhv_get_nested(kvm, lpid, false);
+	if (gp) {
+		kvmhv_emulate_tlbie_lpid(vcpu, gp, RIC_FLUSH_ALL);
+		kvmhv_put_nested(gp);
+	}
+	return H_SUCCESS;
+}
+
+long kvmhv_h_rpti_nested(struct kvm_vcpu *vcpu, unsigned long lpid,
+			 unsigned long type, unsigned long pg_sizes,
+			 unsigned long start, unsigned long end)
+{
+	struct kvm_nested_guest *gp;
+	long ret;
+	unsigned long psize, ap;
+
+	/*
+	 * If L2 lpid isn't valid, we need to return H_PARAMETER.
+	 * Nested KVM issues a L2 lpid flush call when creating
+	 * partition table entries for L2. This happens even before
+	 * the corresponding shadow lpid is created in HV. Until
+	 * this is fixed, ignore such flush requests.
+	 */
+	gp = kvmhv_find_nested(vcpu->kvm, lpid);
+	if (!gp)
+		return H_SUCCESS;
+
+	if ((type & H_RPTI_TYPE_NESTED_ALL) == H_RPTI_TYPE_NESTED_ALL)
+		return do_tlb_invalidate_nested_all(vcpu, lpid);
+
+	if ((type & H_RPTI_TYPE_TLB) == H_RPTI_TYPE_TLB) {
+		if (pg_sizes & H_RPTI_PAGE_64K) {
+			psize = rpti_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_64K);
+			ap = mmu_get_ap(psize);
+
+			ret = do_tlb_invalidate_nested_tlb(vcpu, lpid,
+							   (1UL << 16),
+							   ap, start, end);
+			if (ret)
+				return H_P4;
+		}
+
+		if (pg_sizes & H_RPTI_PAGE_2M) {
+			psize = rpti_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_2M);
+			ap = mmu_get_ap(psize);
+
+			ret = do_tlb_invalidate_nested_tlb(vcpu, lpid,
+							   (1UL << 21),
+							   ap, start, end);
+			if (ret)
+				return H_P4;
+		}
+
+		if (pg_sizes & H_RPTI_PAGE_1G) {
+			psize = rpti_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_1G);
+			ap = mmu_get_ap(psize);
+
+			ret = do_tlb_invalidate_nested_tlb(vcpu, lpid,
+							   (1UL << 30),
+							   ap, start, end);
+			if (ret)
+				return H_P4;
+		}
+	}
+	return H_SUCCESS;
+}
+
 /* Used to convert a nested guest real address to a L1 guest real address */
 static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
 				       struct kvm_nested_guest *gp,
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index dd7d141e33e8..fddd742dafeb 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -675,6 +675,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = hv_enabled && kvmppc_hv_ops->enable_svm &&
 			!kvmppc_hv_ops->enable_svm(NULL);
 		break;
+	case KVM_CAP_RPT_INVALIDATE:
+		r = 1;
+		break;
 #endif
 	default:
 		r = 0;
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 0d233763441f..ffbb43fd1c6f 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -18,10 +18,6 @@
 #include <asm/cputhreads.h>
 #include <asm/plpar_wrappers.h>
 
-#define RIC_FLUSH_TLB 0
-#define RIC_FLUSH_PWC 1
-#define RIC_FLUSH_ALL 2
-
 /*
  * tlbiel instruction for radix, set invalidation
  * i.e., r=1 and is=01 or is=10 or is=11
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4fdf30316582..67170f9267ff 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1031,6 +1031,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_SECURE_GUEST 181
 #define KVM_CAP_HALT_POLL 182
 #define KVM_CAP_ASYNC_PF_INT 183
+#define KVM_CAP_RPT_INVALIDATE 184
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.21.3


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox