LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v11 10/12] swiotlb: Add restricted DMA pool initialization
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-1-tientzu@chromium.org>

Add the initialization function to create restricted DMA pools from
matching reserved-memory nodes.

Regardless of the swiotlb setting, the restricted DMA pool is preferred if
available.

The restricted DMA pools provide a basic level of protection against the
DMA overwriting buffer contents at unexpected times. However, to protect
against general data leakage and system memory corruption, the system
needs to provide a way to lock down the memory access, e.g., MPU.

Signed-off-by: Claire Chang <tientzu@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h |  3 +-
 kernel/dma/Kconfig      | 14 ++++++++
 kernel/dma/swiotlb.c    | 75 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 2d5ec670e064..9616346b727f 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -73,7 +73,8 @@ extern enum swiotlb_force swiotlb_force;
  *		range check to see if the memory was in fact allocated by this
  *		API.
  * @nslabs:	The number of IO TLB blocks (in groups of 64) between @start and
- *		@end. This is command line adjustable via setup_io_tlb_npages.
+ *		@end. For default swiotlb, this is command line adjustable via
+ *		setup_io_tlb_npages.
  * @used:	The number of used IO TLB block.
  * @list:	The free list describing the number of free entries available
  *		from each index.
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 77b405508743..3e961dc39634 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -80,6 +80,20 @@ config SWIOTLB
 	bool
 	select NEED_DMA_MAP_STATE
 
+config DMA_RESTRICTED_POOL
+	bool "DMA Restricted Pool"
+	depends on OF && OF_RESERVED_MEM
+	select SWIOTLB
+	help
+	  This enables support for restricted DMA pools which provide a level of
+	  DMA memory protection on systems with limited hardware protection
+	  capabilities, such as those lacking an IOMMU.
+
+	  For more information see
+	  <Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt>
+	  and <kernel/dma/swiotlb.c>.
+	  If unsure, say "n".
+
 #
 # Should be selected if we can mmap non-coherent mappings to userspace.
 # The only thing that is really required is a way to set an uncached bit
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 6ad85b48f101..f3f271f7e272 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -39,6 +39,13 @@
 #ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
 #endif
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_reserved_mem.h>
+#include <linux/slab.h>
+#endif
 
 #include <asm/io.h>
 #include <asm/dma.h>
@@ -742,4 +749,72 @@ bool swiotlb_free(struct device *dev, struct page *page, size_t size)
 	return true;
 }
 
+static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
+				    struct device *dev)
+{
+	struct io_tlb_mem *mem = rmem->priv;
+	unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
+
+	/*
+	 * Since multiple devices can share the same pool, the private data,
+	 * io_tlb_mem struct, will be initialized by the first device attached
+	 * to it.
+	 */
+	if (!mem) {
+		mem = kzalloc(struct_size(mem, slots, nslabs), GFP_KERNEL);
+		if (!mem)
+			return -ENOMEM;
+
+		swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false);
+		mem->force = true;
+		set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
+				     rmem->size >> PAGE_SHIFT);
+
+		rmem->priv = mem;
+
+		if (IS_ENABLED(CONFIG_DEBUG_FS)) {
+			mem->debugfs =
+				debugfs_create_dir(rmem->name, debugfs_dir);
+			swiotlb_create_debugfs_files(mem);
+		}
+	}
+
+	dev->dma_io_tlb_mem = mem;
+
+	return 0;
+}
+
+static void rmem_swiotlb_device_release(struct reserved_mem *rmem,
+					struct device *dev)
+{
+	dev->dma_io_tlb_mem = io_tlb_default_mem;
+}
+
+static const struct reserved_mem_ops rmem_swiotlb_ops = {
+	.device_init = rmem_swiotlb_device_init,
+	.device_release = rmem_swiotlb_device_release,
+};
+
+static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
+{
+	unsigned long node = rmem->fdt_node;
+
+	if (of_get_flat_dt_prop(node, "reusable", NULL) ||
+	    of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
+	    of_get_flat_dt_prop(node, "linux,dma-default", NULL) ||
+	    of_get_flat_dt_prop(node, "no-map", NULL))
+		return -EINVAL;
+
+	if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) {
+		pr_err("Restricted DMA pool must be accessible within the linear mapping.");
+		return -EINVAL;
+	}
+
+	rmem->ops = &rmem_swiotlb_ops;
+	pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n",
+		&rmem->base, (unsigned long)rmem->size / SZ_1M);
+	return 0;
+}
+
+RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup);
 #endif /* CONFIG_DMA_RESTRICTED_POOL */
-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply related

* [PATCH v11 11/12] dt-bindings: of: Add restricted DMA pool
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-1-tientzu@chromium.org>

Introduce the new compatible string, restricted-dma-pool, for restricted
DMA. One can specify the address and length of the restricted DMA memory
region by restricted-dma-pool in the reserved-memory node.

Signed-off-by: Claire Chang <tientzu@chromium.org>
---
 .../reserved-memory/reserved-memory.txt       | 36 +++++++++++++++++--
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt b/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
index e8d3096d922c..46804f24df05 100644
--- a/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
+++ b/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
@@ -51,6 +51,23 @@ compatible (optional) - standard definition
           used as a shared pool of DMA buffers for a set of devices. It can
           be used by an operating system to instantiate the necessary pool
           management subsystem if necessary.
+        - restricted-dma-pool: This indicates a region of memory meant to be
+          used as a pool of restricted DMA buffers for a set of devices. The
+          memory region would be the only region accessible to those devices.
+          When using this, the no-map and reusable properties must not be set,
+          so the operating system can create a virtual mapping that will be used
+          for synchronization. The main purpose for restricted DMA is to
+          mitigate the lack of DMA access control on systems without an IOMMU,
+          which could result in the DMA accessing the system memory at
+          unexpected times and/or unexpected addresses, possibly leading to data
+          leakage or corruption. The feature on its own provides a basic level
+          of protection against the DMA overwriting buffer contents at
+          unexpected times. However, to protect against general data leakage and
+          system memory corruption, the system needs to provide a way to lock down
+          the memory access, e.g., MPU. Note that since coherent allocation
+          needs remapping, one must set up another device coherent pool by
+          shared-dma-pool and use dma_alloc_from_dev_coherent instead for atomic
+          coherent allocation.
         - vendor specific string in the form <vendor>,[<device>-]<usage>
 no-map (optional) - empty property
     - Indicates the operating system must not create a virtual mapping
@@ -85,10 +102,11 @@ memory-region-names (optional) - a list of names, one for each corresponding
 
 Example
 -------
-This example defines 3 contiguous regions are defined for Linux kernel:
+This example defines 4 contiguous regions for Linux kernel:
 one default of all device drivers (named linux,cma@72000000 and 64MiB in size),
-one dedicated to the framebuffer device (named framebuffer@78000000, 8MiB), and
-one for multimedia processing (named multimedia-memory@77000000, 64MiB).
+one dedicated to the framebuffer device (named framebuffer@78000000, 8MiB),
+one for multimedia processing (named multimedia-memory@77000000, 64MiB), and
+one for restricted dma pool (named restricted_dma_reserved@0x50000000, 64MiB).
 
 / {
 	#address-cells = <1>;
@@ -120,6 +138,11 @@ one for multimedia processing (named multimedia-memory@77000000, 64MiB).
 			compatible = "acme,multimedia-memory";
 			reg = <0x77000000 0x4000000>;
 		};
+
+		restricted_dma_reserved: restricted_dma_reserved {
+			compatible = "restricted-dma-pool";
+			reg = <0x50000000 0x4000000>;
+		};
 	};
 
 	/* ... */
@@ -138,4 +161,11 @@ one for multimedia processing (named multimedia-memory@77000000, 64MiB).
 		memory-region = <&multimedia_reserved>;
 		/* ... */
 	};
+
+	pcie_device: pcie_device@0,0 {
+		reg = <0x83010000 0x0 0x00000000 0x0 0x00100000
+		       0x83010000 0x0 0x00100000 0x0 0x00100000>;
+		memory-region = <&restricted_dma_reserved>;
+		/* ... */
+	};
 };
-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply related

* [PATCH v11 12/12] of: Add plumbing for restricted DMA pool
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-1-tientzu@chromium.org>

If a device is not behind an IOMMU, we look up the device node and set
up restricted DMA when a restricted-dma-pool region is present.

Signed-off-by: Claire Chang <tientzu@chromium.org>
---
 drivers/of/address.c    | 33 +++++++++++++++++++++++++++++++++
 drivers/of/device.c     |  3 +++
 drivers/of/of_private.h |  6 ++++++
 3 files changed, 42 insertions(+)

diff --git a/drivers/of/address.c b/drivers/of/address.c
index 73ddf2540f3f..cdf700fba5c4 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -8,6 +8,7 @@
 #include <linux/logic_pio.h>
 #include <linux/module.h>
 #include <linux/of_address.h>
+#include <linux/of_reserved_mem.h>
 #include <linux/pci.h>
 #include <linux/pci_regs.h>
 #include <linux/sizes.h>
@@ -1022,6 +1023,38 @@ int of_dma_get_range(struct device_node *np, const struct bus_dma_region **map)
 	of_node_put(node);
 	return ret;
 }
+
+int of_dma_set_restricted_buffer(struct device *dev, struct device_node *np)
+{
+	struct device_node *node, *of_node = dev->of_node;
+	int count, i;
+
+	count = of_property_count_elems_of_size(of_node, "memory-region",
+						sizeof(u32));
+	/*
+	 * If dev->of_node doesn't exist or doesn't contain memory-region, try
+	 * the OF node having DMA configuration.
+	 */
+	if (count <= 0) {
+		of_node = np;
+		count = of_property_count_elems_of_size(
+			of_node, "memory-region", sizeof(u32));
+	}
+
+	for (i = 0; i < count; i++) {
+		node = of_parse_phandle(of_node, "memory-region", i);
+		/*
+		 * There might be multiple memory regions, but only one
+		 * restricted-dma-pool region is allowed.
+		 */
+		if (of_device_is_compatible(node, "restricted-dma-pool") &&
+		    of_device_is_available(node))
+			return of_reserved_mem_device_init_by_idx(dev, of_node,
+								  i);
+	}
+
+	return 0;
+}
 #endif /* CONFIG_HAS_DMA */
 
 /**
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 6cb86de404f1..e68316836a7a 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -165,6 +165,9 @@ int of_dma_configure_id(struct device *dev, struct device_node *np,
 
 	arch_setup_dma_ops(dev, dma_start, size, iommu, coherent);
 
+	if (!iommu)
+		return of_dma_set_restricted_buffer(dev, np);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(of_dma_configure_id);
diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
index d9e6a324de0a..25cebbed5f02 100644
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -161,12 +161,18 @@ struct bus_dma_region;
 #if defined(CONFIG_OF_ADDRESS) && defined(CONFIG_HAS_DMA)
 int of_dma_get_range(struct device_node *np,
 		const struct bus_dma_region **map);
+int of_dma_set_restricted_buffer(struct device *dev, struct device_node *np);
 #else
 static inline int of_dma_get_range(struct device_node *np,
 		const struct bus_dma_region **map)
 {
 	return -ENODEV;
 }
+static inline int of_dma_set_restricted_buffer(struct device *dev,
+					       struct device_node *np)
+{
+	return -ENODEV;
+}
 #endif
 
 #endif /* _LINUX_OF_PRIVATE_H */
-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply related

* Re: [PATCH v10 00/12] Restricted DMA
From: Claire Chang @ 2021-06-16  3:55 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, Jianxiong Gao,
	sstabellini, Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree,
	Daniel Vetter, airlied, maarten.lankhorst, linuxppc-dev,
	jani.nikula, Nicolas Boichat, rodrigo.vivi, Bjorn Helgaas,
	Dan Williams, Andy Shevchenko, Greg KH, Randy Dunlap, lkml,
	Tomasz Figa, list@263.net:IOMMU DRIVERS, Jim Quinlan,
	Robin Murphy, bauerman
In-Reply-To: <20210615132711.553451-1-tientzu@chromium.org>

v11 https://lore.kernel.org/patchwork/cover/1447216/

On Tue, Jun 15, 2021 at 9:27 PM Claire Chang <tientzu@chromium.org> wrote:
>
> This series implements mitigations for lack of DMA access control on
> systems without an IOMMU, which could result in the DMA accessing the
> system memory at unexpected times and/or unexpected addresses, possibly
> leading to data leakage or corruption.
>
> For example, we plan to use the PCI-e bus for Wi-Fi and that PCI-e bus is
> not behind an IOMMU. As PCI-e, by design, gives the device full access to
> system memory, a vulnerability in the Wi-Fi firmware could easily escalate
> to a full system exploit (remote wifi exploits: [1a], [1b] that shows a
> full chain of exploits; [2], [3]).
>
> To mitigate the security concerns, we introduce restricted DMA. Restricted
> DMA utilizes the existing swiotlb to bounce streaming DMA in and out of a
> specially allocated region and does memory allocation from the same region.
> The feature on its own provides a basic level of protection against the DMA
> overwriting buffer contents at unexpected times. However, to protect
> against general data leakage and system memory corruption, the system needs
> to provide a way to restrict the DMA to a predefined memory region (this is
> usually done at firmware level, e.g. MPU in ATF on some ARM platforms [4]).
>
> [1a] https://googleprojectzero.blogspot.com/2017/04/over-air-exploiting-broadcoms-wi-fi_4.html
> [1b] https://googleprojectzero.blogspot.com/2017/04/over-air-exploiting-broadcoms-wi-fi_11.html
> [2] https://blade.tencent.com/en/advisories/qualpwn/
> [3] https://www.bleepingcomputer.com/news/security/vulnerabilities-found-in-highly-popular-firmware-for-wifi-chips/
> [4] https://github.com/ARM-software/arm-trusted-firmware/blob/master/plat/mediatek/mt8183/drivers/emi_mpu/emi_mpu.c#L132
>
> v10:
> Address the comments in v9 to
>   - fix the dev->dma_io_tlb_mem assignment
>   - propagate swiotlb_force setting into io_tlb_default_mem->force
>   - move set_memory_decrypted out of swiotlb_init_io_tlb_mem
>   - move debugfs_dir declaration into the main CONFIG_DEBUG_FS block
>   - add swiotlb_ prefix to find_slots and release_slots
>   - merge the 3 alloc/free related patches
>   - move the CONFIG_DMA_RESTRICTED_POOL later
>
> v9:
> Address the comments in v7 to
>   - set swiotlb active pool to dev->dma_io_tlb_mem
>   - get rid of get_io_tlb_mem
>   - dig out the device struct for is_swiotlb_active
>   - move debugfs_create_dir out of swiotlb_create_debugfs
>   - do set_memory_decrypted conditionally in swiotlb_init_io_tlb_mem
>   - use IS_ENABLED in kernel/dma/direct.c
>   - fix redefinition of 'of_dma_set_restricted_buffer'
> https://lore.kernel.org/patchwork/cover/1445081/
>
> v8:
> - Fix reserved-memory.txt and add the reg property in example.
> - Fix sizeof for of_property_count_elems_of_size in
>   drivers/of/address.c#of_dma_set_restricted_buffer.
> - Apply Will's suggestion to try the OF node having DMA configuration in
>   drivers/of/address.c#of_dma_set_restricted_buffer.
> - Fix typo in the comment of drivers/of/address.c#of_dma_set_restricted_buffer.
> - Add error message for PageHighMem in
>   kernel/dma/swiotlb.c#rmem_swiotlb_device_init and move it to
>   rmem_swiotlb_setup.
> - Fix the message string in rmem_swiotlb_setup.
> https://lore.kernel.org/patchwork/cover/1437112/
>
> v7:
> Fix debugfs, PageHighMem and comment style in rmem_swiotlb_device_init
> https://lore.kernel.org/patchwork/cover/1431031/
>
> v6:
> Address the comments in v5
> https://lore.kernel.org/patchwork/cover/1423201/
>
> v5:
> Rebase on latest linux-next
> https://lore.kernel.org/patchwork/cover/1416899/
>
> v4:
> - Fix spinlock bad magic
> - Use rmem->name for debugfs entry
> - Address the comments in v3
> https://lore.kernel.org/patchwork/cover/1378113/
>
> v3:
> Using only one reserved memory region for both streaming DMA and memory
> allocation.
> https://lore.kernel.org/patchwork/cover/1360992/
>
> v2:
> Building on top of swiotlb.
> https://lore.kernel.org/patchwork/cover/1280705/
>
> v1:
> Using dma_map_ops.
> https://lore.kernel.org/patchwork/cover/1271660/
>
>
> Claire Chang (12):
>   swiotlb: Refactor swiotlb init functions
>   swiotlb: Refactor swiotlb_create_debugfs
>   swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used
>   swiotlb: Update is_swiotlb_buffer to add a struct device argument
>   swiotlb: Update is_swiotlb_active to add a struct device argument
>   swiotlb: Use is_dev_swiotlb_force for swiotlb data bouncing
>   swiotlb: Move alloc_size to swiotlb_find_slots
>   swiotlb: Refactor swiotlb_tbl_unmap_single
>   swiotlb: Add restricted DMA pool initialization
>   swiotlb: Add restricted DMA alloc/free support
>   dt-bindings: of: Add restricted DMA pool
>   of: Add plumbing for restricted DMA pool
>
>  .../reserved-memory/reserved-memory.txt       |  36 ++-
>  drivers/base/core.c                           |   4 +
>  drivers/gpu/drm/i915/gem/i915_gem_internal.c  |   2 +-
>  drivers/gpu/drm/nouveau/nouveau_ttm.c         |   2 +-
>  drivers/iommu/dma-iommu.c                     |  12 +-
>  drivers/of/address.c                          |  33 +++
>  drivers/of/device.c                           |   3 +
>  drivers/of/of_private.h                       |   6 +
>  drivers/pci/xen-pcifront.c                    |   2 +-
>  drivers/xen/swiotlb-xen.c                     |   2 +-
>  include/linux/device.h                        |   4 +
>  include/linux/swiotlb.h                       |  40 ++-
>  kernel/dma/Kconfig                            |  14 +
>  kernel/dma/direct.c                           |  60 +++--
>  kernel/dma/direct.h                           |   8 +-
>  kernel/dma/swiotlb.c                          | 255 +++++++++++++-----
>  16 files changed, 380 insertions(+), 103 deletions(-)
>
> --
> 2.32.0.272.g935e593368-goog
>

^ permalink raw reply

* Re: [PATCH v11 09/12] swiotlb: Add restricted DMA alloc/free support
From: Claire Chang @ 2021-06-16  4:04 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, Jianxiong Gao,
	sstabellini, Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree,
	Daniel Vetter, airlied, maarten.lankhorst, linuxppc-dev,
	jani.nikula, Nicolas Boichat, rodrigo.vivi, Bjorn Helgaas,
	Dan Williams, Andy Shevchenko, Greg KH, Randy Dunlap, lkml,
	Tomasz Figa, list@263.net:IOMMU DRIVERS, Jim Quinlan,
	Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-10-tientzu@chromium.org>

On Wed, Jun 16, 2021 at 11:54 AM Claire Chang <tientzu@chromium.org> wrote:
>
> Add the functions, swiotlb_{alloc,free} to support the memory allocation
> from restricted DMA pool.
>
> The restricted DMA pool is preferred if available.
>
> Note that since coherent allocation needs remapping, one must set up
> another device coherent pool by shared-dma-pool and use
> dma_alloc_from_dev_coherent instead for atomic coherent allocation.
>
> Signed-off-by: Claire Chang <tientzu@chromium.org>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> ---
>  include/linux/swiotlb.h | 15 +++++++++++++
>  kernel/dma/direct.c     | 50 ++++++++++++++++++++++++++++++-----------
>  kernel/dma/swiotlb.c    | 45 +++++++++++++++++++++++++++++++++++--
>  3 files changed, 95 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> index efcd56e3a16c..2d5ec670e064 100644
> --- a/include/linux/swiotlb.h
> +++ b/include/linux/swiotlb.h
> @@ -156,4 +156,19 @@ static inline void swiotlb_adjust_size(unsigned long size)
>  extern void swiotlb_print_info(void);
>  extern void swiotlb_set_max_segment(unsigned int);
>
> +#ifdef CONFIG_DMA_RESTRICTED_POOL
> +struct page *swiotlb_alloc(struct device *dev, size_t size);
> +bool swiotlb_free(struct device *dev, struct page *page, size_t size);
> +#else
> +static inline struct page *swiotlb_alloc(struct device *dev, size_t size)
> +{
> +       return NULL;
> +}
> +static inline bool swiotlb_free(struct device *dev, struct page *page,
> +                               size_t size)
> +{
> +       return false;
> +}
> +#endif /* CONFIG_DMA_RESTRICTED_POOL */
> +
>  #endif /* __LINUX_SWIOTLB_H */
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 3713461d6fe0..da0e09621230 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -75,6 +75,15 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
>                 min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
>  }
>
> +static void __dma_direct_free_pages(struct device *dev, struct page *page,
> +                                   size_t size)
> +{
> +       if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
> +           swiotlb_free(dev, page, size))
> +               return;
> +       dma_free_contiguous(dev, page, size);
> +}
> +
>  static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
>                 gfp_t gfp)
>  {
> @@ -86,7 +95,16 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
>
>         gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
>                                            &phys_limit);
> -       page = dma_alloc_contiguous(dev, size, gfp);
> +       if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL)) {
> +               page = swiotlb_alloc(dev, size);
> +               if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
> +                       __dma_direct_free_pages(dev, page, size);
> +                       return NULL;
> +               }
> +       }
> +
> +       if (!page)
> +               page = dma_alloc_contiguous(dev, size, gfp);
>         if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
>                 dma_free_contiguous(dev, page, size);
>                 page = NULL;
> @@ -142,7 +160,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
>                 gfp |= __GFP_NOWARN;
>
>         if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
> -           !force_dma_unencrypted(dev)) {
> +           !force_dma_unencrypted(dev) && !is_dev_swiotlb_force(dev)) {
>                 page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
>                 if (!page)
>                         return NULL;
> @@ -155,18 +173,23 @@ void *dma_direct_alloc(struct device *dev, size_t size,
>         }
>
>         if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
> -           !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
> -           !dev_is_dma_coherent(dev))
> +           !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev) &&
> +           !is_dev_swiotlb_force(dev))
>                 return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);

Just noticed that after propagating the swiotlb_force setting into
io_tlb_default_mem->force, the memory allocation behavior for
swiotlb_force will change (i.e. arch_dma_alloc and
dma_direct_alloc_from_pool will always be skipped).

>
>         /*
>          * Remapping or decrypting memory may block. If either is required and
>          * we can't block, allocate the memory from the atomic pools.
> +        * If restricted DMA (i.e., is_dev_swiotlb_force) is required, one must
> +        * set up another device coherent pool by shared-dma-pool and use
> +        * dma_alloc_from_dev_coherent instead.
>          */
>         if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
>             !gfpflags_allow_blocking(gfp) &&
>             (force_dma_unencrypted(dev) ||
> -            (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev))))
> +            (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
> +             !dev_is_dma_coherent(dev))) &&
> +           !is_dev_swiotlb_force(dev))
>                 return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);

And here.

>
>         /* we always manually zero the memory once we are done */
> @@ -237,7 +260,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
>                         return NULL;
>         }
>  out_free_pages:
> -       dma_free_contiguous(dev, page, size);
> +       __dma_direct_free_pages(dev, page, size);
>         return NULL;
>  }
>
> @@ -247,15 +270,15 @@ void dma_direct_free(struct device *dev, size_t size,
>         unsigned int page_order = get_order(size);
>
>         if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
> -           !force_dma_unencrypted(dev)) {
> +           !force_dma_unencrypted(dev) && !is_dev_swiotlb_force(dev)) {
>                 /* cpu_addr is a struct page cookie, not a kernel address */
>                 dma_free_contiguous(dev, cpu_addr, size);
>                 return;
>         }
>
>         if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
> -           !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
> -           !dev_is_dma_coherent(dev)) {
> +           !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev) &&
> +           !is_dev_swiotlb_force(dev)) {
>                 arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
>                 return;
>         }
> @@ -273,7 +296,7 @@ void dma_direct_free(struct device *dev, size_t size,
>         else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
>                 arch_dma_clear_uncached(cpu_addr, size);
>
> -       dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
> +       __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
>  }
>
>  struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
> @@ -283,7 +306,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
>         void *ret;
>
>         if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
> -           force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp))
> +           force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) &&
> +           !is_dev_swiotlb_force(dev))
>                 return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
>
>         page = __dma_direct_alloc_pages(dev, size, gfp);
> @@ -310,7 +334,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
>         *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
>         return page;
>  out_free_pages:
> -       dma_free_contiguous(dev, page, size);
> +       __dma_direct_free_pages(dev, page, size);
>         return NULL;
>  }
>
> @@ -329,7 +353,7 @@ void dma_direct_free_pages(struct device *dev, size_t size,
>         if (force_dma_unencrypted(dev))
>                 set_memory_encrypted((unsigned long)vaddr, 1 << page_order);
>
> -       dma_free_contiguous(dev, page, size);
> +       __dma_direct_free_pages(dev, page, size);
>  }
>
>  #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index fec4934b9926..6ad85b48f101 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -462,8 +462,9 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
>
>         index = wrap = wrap_index(mem, ALIGN(mem->index, stride));
>         do {
> -               if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
> -                   (orig_addr & iotlb_align_mask)) {
> +               if (orig_addr &&
> +                   (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
> +                           (orig_addr & iotlb_align_mask)) {
>                         index = wrap_index(mem, index + 1);
>                         continue;
>                 }
> @@ -702,3 +703,43 @@ static int __init swiotlb_create_default_debugfs(void)
>  late_initcall(swiotlb_create_default_debugfs);
>
>  #endif
> +
> +#ifdef CONFIG_DMA_RESTRICTED_POOL
> +struct page *swiotlb_alloc(struct device *dev, size_t size)
> +{
> +       struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
> +       phys_addr_t tlb_addr;
> +       int index;
> +
> +       /*
> +        * Skip io_tlb_default_mem since swiotlb_alloc doesn't support atomic
> +        * coherent allocation. Otherwise might break existing devices.
> +        * One must set up another device coherent pool by shared-dma-pool and
> +        * use dma_alloc_from_dev_coherent instead for atomic coherent
> +        * allocation to avoid memory remapping.
> +        */
> +       if (!mem || mem == io_tlb_default_mem)
> +               return NULL;
> +
> +       index = swiotlb_find_slots(dev, 0, size);
> +       if (index == -1)
> +               return NULL;
> +
> +       tlb_addr = slot_addr(mem->start, index);
> +
> +       return pfn_to_page(PFN_DOWN(tlb_addr));
> +}
> +
> +bool swiotlb_free(struct device *dev, struct page *page, size_t size)
> +{
> +       phys_addr_t tlb_addr = page_to_phys(page);
> +
> +       if (!is_swiotlb_buffer(dev, tlb_addr))
> +               return false;
> +
> +       swiotlb_release_slots(dev, tlb_addr);
> +
> +       return true;
> +}
> +
> +#endif /* CONFIG_DMA_RESTRICTED_POOL */
> --
> 2.32.0.272.g935e593368-goog
>

^ permalink raw reply

* Re: [PATCH 6/8] powerpc/membarrier: Remove special barrier on mm switch
From: Nicholas Piggin @ 2021-06-16  4:36 UTC (permalink / raw)
  To: Andy Lutomirski, x86
  Cc: Dave Hansen, Peter Zijlstra, LKML, linux-mm, Mathieu Desnoyers,
	Paul Mackerras, Andrew Morton, linuxppc-dev
In-Reply-To: <d9034f951329830b766feb08099c15b714e08e8f.1623813516.git.luto@kernel.org>

Excerpts from Andy Lutomirski's message of June 16, 2021 1:21 pm:
> powerpc did the following on some, but not all, paths through
> switch_mm_irqs_off():
> 
>        /*
>         * Only need the full barrier when switching between processes.
>         * Barrier when switching from kernel to userspace is not
>         * required here, given that it is implied by mmdrop(). Barrier
>         * when switching from userspace to kernel is not needed after
>         * store to rq->curr.
>         */
>        if (likely(!(atomic_read(&next->membarrier_state) &
>                     (MEMBARRIER_STATE_PRIVATE_EXPEDITED |
>                      MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
>                return;
> 
> This is puzzling: if !prev, then one might expect that we are switching
> from kernel to user, not user to kernel, which is inconsistent with the
> comment.  But this is all nonsense, because the one and only caller would
> never have prev == NULL and would, in fact, OOPS if prev == NULL.

Yeah that's strange, code definitely doesn't match comment. Good catch.

> 
> In any event, this code is unnecessary, since the new generic
> membarrier_finish_switch_mm() provides the same barrier without arch help.

If that's merged then I think this could be too. I'll do a bit more 
digging into this too.

Thanks,
Nick

> 
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
>  arch/powerpc/include/asm/membarrier.h | 27 ---------------------------
>  arch/powerpc/mm/mmu_context.c         |  2 --
>  2 files changed, 29 deletions(-)
>  delete mode 100644 arch/powerpc/include/asm/membarrier.h
> 
> diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h
> deleted file mode 100644
> index 6e20bb5c74ea..000000000000
> --- a/arch/powerpc/include/asm/membarrier.h
> +++ /dev/null
> @@ -1,27 +0,0 @@
> -#ifndef _ASM_POWERPC_MEMBARRIER_H
> -#define _ASM_POWERPC_MEMBARRIER_H
> -
> -static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
> -					     struct mm_struct *next,
> -					     struct task_struct *tsk)
> -{
> -	/*
> -	 * Only need the full barrier when switching between processes.
> -	 * Barrier when switching from kernel to userspace is not
> -	 * required here, given that it is implied by mmdrop(). Barrier
> -	 * when switching from userspace to kernel is not needed after
> -	 * store to rq->curr.
> -	 */
> -	if (likely(!(atomic_read(&next->membarrier_state) &
> -		     (MEMBARRIER_STATE_PRIVATE_EXPEDITED |
> -		      MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
> -		return;
> -
> -	/*
> -	 * The membarrier system call requires a full memory barrier
> -	 * after storing to rq->curr, before going back to user-space.
> -	 */
> -	smp_mb();
> -}
> -
> -#endif /* _ASM_POWERPC_MEMBARRIER_H */
> diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
> index a857af401738..8daa95b3162b 100644
> --- a/arch/powerpc/mm/mmu_context.c
> +++ b/arch/powerpc/mm/mmu_context.c
> @@ -85,8 +85,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>  
>  	if (new_on_cpu)
>  		radix_kvm_prefetch_workaround(next);
> -	else
> -		membarrier_arch_switch_mm(prev, next, tsk);
>  
>  	/*
>  	 * The actual HW switching method differs between the various
> -- 
> 2.31.1
> 
> 

^ permalink raw reply

* Re: [PATCH v12 1/6] kasan: allow an architecture to disable inline instrumentation
From: Daniel Axtens @ 2021-06-16  4:39 UTC (permalink / raw)
  To: Marco Elver
  Cc: aneesh.kumar, LKML, Linux Memory Management List, kasan-dev,
	linuxppc-dev
In-Reply-To: <CANpmjNOa-a=M-EgdkneiWDD0eCF-DELjMFxAeJzGQz6AgCdNWg@mail.gmail.com>

Hi Marco,

@@ -12,6 +12,15 @@ config HAVE_ARCH_KASAN_HW_TAGS
>>  config HAVE_ARCH_KASAN_VMALLOC
>>         bool
>>
>> +# Sometimes an architecture might not be able to support inline instrumentation
>> +# but might be able to support outline instrumentation. This option allows an
>> +# arch to prevent inline and stack instrumentation from being enabled.
>
> This comment could be moved into 'help' of this new config option.

It could. I did wonder if that made sense given that this is not a user
selectable option so I'm not sure if the help will ever be visible, but
I see that we do this sort of thing in Kconfig.kcsan and Kconfig.kgdb.
I've changed it over.

>> +# ppc64 turns on virtual memory late in boot, after calling into generic code
>> +# like the device-tree parser, so it uses this in conjunction with a hook in
>> +# outline mode to avoid invalid access early in boot.
>
> I think the ppc64-related comment isn't necessary and can be moved to
> arch/ppc64 somewhere, if there isn't one already.

Fair enough. I'll pull it out of this file and look for a good place to
put the information in arch/powerpc in a later patch/series.

Kind regards,
Daniel


^ permalink raw reply

* Re: [PATCH v12 2/6] kasan: allow architectures to provide an outline readiness check
From: Daniel Axtens @ 2021-06-16  4:41 UTC (permalink / raw)
  To: Marco Elver
  Cc: aneesh.kumar, LKML, Linux Memory Management List, kasan-dev,
	linuxppc-dev, Aneesh Kumar K . V
In-Reply-To: <CANpmjNN2=gdDBPzYQYsmOtLQVVjSz2qFcwcTMEqB=s_ZWndJLg@mail.gmail.com>

Hi Marco,
>> +       /* Don't touch the shadow memory if arch isn't ready */
>> +       if (!kasan_arch_is_ready())
>> +               return;
>> +
>
> What about kasan_poison_last_granule()? kasan_unpoison() currently
> seems to potentially trip on that.

Ah the perils of rebasing an old series! I'll re-audit the generic code
for functions that touch memory and make sure I have covered them all.

Thanks for the review.

Kind regards,
Daniel

>
> -- 
> You received this message because you are subscribed to the Google Groups "kasan-dev" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to kasan-dev+unsubscribe@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/kasan-dev/CANpmjNN2%3DgdDBPzYQYsmOtLQVVjSz2qFcwcTMEqB%3Ds_ZWndJLg%40mail.gmail.com.

^ permalink raw reply

* Re: [PATCH 8/8] membarrier: Rewrite sync_core_before_usermode() and improve documentation
From: Nicholas Piggin @ 2021-06-16  4:45 UTC (permalink / raw)
  To: Andy Lutomirski, x86
  Cc: Will Deacon, linux-mm, Peter Zijlstra, LKML, stable, Dave Hansen,
	Mathieu Desnoyers, Catalin Marinas, Paul Mackerras, Andrew Morton,
	linuxppc-dev, linux-arm-kernel
In-Reply-To: <07a8b963002cb955b7516e61bad19514a3acaa82.1623813516.git.luto@kernel.org>

Excerpts from Andy Lutomirski's message of June 16, 2021 1:21 pm:
> The old sync_core_before_usermode() comments suggested that a non-icache-syncing
> return-to-usermode instruction is x86-specific and that all other
> architectures automatically notice cross-modified code on return to
> userspace.
> 
> This is misleading.  The incantation needed to modify code from one
> CPU and execute it on another CPU is highly architecture dependent.
> On x86, according to the SDM, one must modify the code, issue SFENCE
> if the modification was WC or nontemporal, and then issue a "serializing
> instruction" on the CPU that will execute the code.  membarrier() can do
> the latter.
> 
> On arm64 and powerpc, one must flush the icache and then flush the pipeline
> on the target CPU, although the CPU manuals don't necessarily use this
> language.
> 
> So let's drop any pretense that we can have a generic way to define or
> implement membarrier's SYNC_CORE operation and instead require all
> architectures to define the helper and supply their own documentation as to
> how to use it.  This means x86, arm64, and powerpc for now.  Let's also
> rename the function from sync_core_before_usermode() to
> membarrier_sync_core_before_usermode() because the precise flushing details
> may very well be specific to membarrier, and even the concept of
> "sync_core" in the kernel is mostly an x86-ism.
> 
> (It may well be the case that, on real x86 processors, synchronizing the
>  icache (which requires no action at all) and "flushing the pipeline" is
>  sufficient, but trying to use this language would be confusing at best.
>  LFENCE does something awfully like "flushing the pipeline", but the SDM
>  does not permit LFENCE as an alternative to a "serializing instruction"
>  for this purpose.)
> 
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: x86@kernel.org
> Cc: stable@vger.kernel.org
> Fixes: 70216e18e519 ("membarrier: Provide core serializing command, *_SYNC_CORE")
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
>  .../membarrier-sync-core/arch-support.txt     | 68 ++++++-------------
>  arch/arm64/include/asm/sync_core.h            | 19 ++++++
>  arch/powerpc/include/asm/sync_core.h          | 14 ++++
>  arch/x86/Kconfig                              |  1 -
>  arch/x86/include/asm/sync_core.h              |  7 +-
>  arch/x86/kernel/alternative.c                 |  2 +-
>  arch/x86/kernel/cpu/mce/core.c                |  2 +-
>  arch/x86/mm/tlb.c                             |  3 +-
>  drivers/misc/sgi-gru/grufault.c               |  2 +-
>  drivers/misc/sgi-gru/gruhandles.c             |  2 +-
>  drivers/misc/sgi-gru/grukservices.c           |  2 +-
>  include/linux/sched/mm.h                      |  1 -
>  include/linux/sync_core.h                     | 21 ------
>  init/Kconfig                                  |  3 -
>  kernel/sched/membarrier.c                     | 15 ++--
>  15 files changed, 75 insertions(+), 87 deletions(-)
>  create mode 100644 arch/arm64/include/asm/sync_core.h
>  create mode 100644 arch/powerpc/include/asm/sync_core.h
>  delete mode 100644 include/linux/sync_core.h
> 
> diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> index 883d33b265d6..41c9ebcb275f 100644
> --- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> +++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> @@ -5,51 +5,25 @@
>  #
>  # Architecture requirements
>  #
> -# * arm/arm64/powerpc
>  #
> -# Rely on implicit context synchronization as a result of exception return
> -# when returning from IPI handler, and when returning to user-space.
> -#
> -# * x86
> -#
> -# x86-32 uses IRET as return from interrupt, which takes care of the IPI.
> -# However, it uses both IRET and SYSEXIT to go back to user-space. The IRET
> -# instruction is core serializing, but not SYSEXIT.
> -#
> -# x86-64 uses IRET as return from interrupt, which takes care of the IPI.
> -# However, it can return to user-space through either SYSRETL (compat code),
> -# SYSRETQ, or IRET.
> -#
> -# Given that neither SYSRET{L,Q}, nor SYSEXIT, are core serializing, we rely
> -# instead on write_cr3() performed by switch_mm() to provide core serialization
> -# after changing the current mm, and deal with the special case of kthread ->
> -# uthread (temporarily keeping current mm into active_mm) by issuing a
> -# sync_core_before_usermode() in that specific case.
> -#
> -    -----------------------
> -    |         arch |status|
> -    -----------------------
> -    |       alpha: | TODO |
> -    |         arc: | TODO |
> -    |         arm: |  ok  |
> -    |       arm64: |  ok  |
> -    |        csky: | TODO |
> -    |       h8300: | TODO |
> -    |     hexagon: | TODO |
> -    |        ia64: | TODO |
> -    |        m68k: | TODO |
> -    |  microblaze: | TODO |
> -    |        mips: | TODO |
> -    |       nds32: | TODO |
> -    |       nios2: | TODO |
> -    |    openrisc: | TODO |
> -    |      parisc: | TODO |
> -    |     powerpc: |  ok  |
> -    |       riscv: | TODO |
> -    |        s390: | TODO |
> -    |          sh: | TODO |
> -    |       sparc: | TODO |
> -    |          um: | TODO |
> -    |         x86: |  ok  |
> -    |      xtensa: | TODO |
> -    -----------------------
> +# An architecture that wants to support
> +# MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE needs to define precisely what it
> +# is supposed to do and implement membarrier_sync_core_before_usermode() to
> +# make it do that.  Then it can select ARCH_HAS_MEMBARRIER_SYNC_CORE via
> +# Kconfig.  Unfortunately, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is not a
> +# fantastic API and may not make sense on all architectures.  Once an
> +# architecture meets these requirements,
> +#
> +# On x86, a program can safely modify code, issue
> +# MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, and then execute that code, via
> +# the modified address or an alias, from any thread in the calling process.
> +#
> +# On arm64, a program can modify code, flush the icache as needed, and issue
> +# MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE to force a "context synchronizing
> +# event", aka pipeline flush on all CPUs that might run the calling process.
> +# Then the program can execute the modified code as long as it is executed
> +# from an address consistent with the icache flush and the CPU's cache type.
> +#
> +# On powerpc, a program can use MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE
> +# similarly to arm64.  It would be nice if the powerpc maintainers could
> +# add a more clear explanation.
> diff --git a/arch/arm64/include/asm/sync_core.h b/arch/arm64/include/asm/sync_core.h
> new file mode 100644
> index 000000000000..74996bf533bb
> --- /dev/null
> +++ b/arch/arm64/include/asm/sync_core.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_ARM64_SYNC_CORE_H
> +#define _ASM_ARM64_SYNC_CORE_H
> +
> +#include <asm/barrier.h>
> +
> +/*
> + * On arm64, anyone trying to use membarrier() to handle JIT code is
> + * required to first flush the icache and then do SYNC_CORE.  All that's
> + * needed after the icache flush is to execute a "context synchronization
> + * event".  Right now, ERET does this, and we are guaranteed to ERET before
> + * any user code runs.  If Linux ever programs the CPU to make ERET stop
> + * being a context synchronizing event, then this will need to be adjusted.
> + */
> +static inline void membarrier_sync_core_before_usermode(void)
> +{
> +}
> +
> +#endif /* _ASM_ARM64_SYNC_CORE_H */
> diff --git a/arch/powerpc/include/asm/sync_core.h b/arch/powerpc/include/asm/sync_core.h
> new file mode 100644
> index 000000000000..589fdb34beab
> --- /dev/null
> +++ b/arch/powerpc/include/asm/sync_core.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_POWERPC_SYNC_CORE_H
> +#define _ASM_POWERPC_SYNC_CORE_H
> +
> +#include <asm/barrier.h>
> +
> +/*
> + * XXX: can a powerpc person put an appropriate comment here?
> + */
> +static inline void membarrier_sync_core_before_usermode(void)
> +{
> +}
> +
> +#endif /* _ASM_POWERPC_SYNC_CORE_H */

powerpc's can just go in asm/membarrier.h

/*
 * The RFI family of instructions are context synchronising, and
 * that is how we return to userspace, so nothing is required here.
 */

> diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
> index c32c32a2441e..f72a6ab3fac2 100644
> --- a/kernel/sched/membarrier.c
> +++ b/kernel/sched/membarrier.c
> @@ -5,6 +5,9 @@
>   * membarrier system call
>   */
>  #include "sched.h"
> +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
> +#include <asm/sync_core.h>
> +#endif

Can you

#else
static inline void membarrier_sync_core_before_usermode(void)
{
 /* this gets constant folded out */
}
#endif

And avoid adding the ifdefs in the following code?

Otherwise I think this is good.

Acked-by: Nicholas Piggin <npiggin@gmail.com>

Thanks,
Nick

>  
>  /*
>   * The basic principle behind the regular memory barrier mode of membarrier()
> @@ -221,6 +224,7 @@ static void ipi_mb(void *info)
>  	smp_mb();	/* IPIs should be serializing but paranoid. */
>  }
>  
> +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
>  static void ipi_sync_core(void *info)
>  {
>  	/*
> @@ -230,13 +234,14 @@ static void ipi_sync_core(void *info)
>  	 * the big comment at the top of this file.
>  	 *
>  	 * A sync_core() would provide this guarantee, but
> -	 * sync_core_before_usermode() might end up being deferred until
> -	 * after membarrier()'s smp_mb().
> +	 * membarrier_sync_core_before_usermode() might end up being deferred
> +	 * until after membarrier()'s smp_mb().
>  	 */
>  	smp_mb();	/* IPIs should be serializing but paranoid. */
>  
> -	sync_core_before_usermode();
> +	membarrier_sync_core_before_usermode();
>  }
> +#endif
>  
>  static void ipi_rseq(void *info)
>  {
> @@ -368,12 +373,14 @@ static int membarrier_private_expedited(int flags, int cpu_id)
>  	smp_call_func_t ipi_func = ipi_mb;
>  
>  	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
> -		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
> +#ifndef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
>  			return -EINVAL;
> +#else
>  		if (!(atomic_read(&mm->membarrier_state) &
>  		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
>  			return -EPERM;
>  		ipi_func = ipi_sync_core;
> +#endif
>  	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
>  		if (!IS_ENABLED(CONFIG_RSEQ))
>  			return -EINVAL;
> -- 
> 2.31.1
> 
> 

^ permalink raw reply

* [PATCH v2 0/6] mremap fixes
From: Aneesh Kumar K.V @ 2021-06-16  4:52 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev

This patch series is split out from [PATCH v7 00/11] Speedup mremap on ppc64
(https://lore.kernel.org/linux-mm/20210607055131.156184-1-aneesh.kumar@linux.ibm.com)
dropping ppc64 specific changes.

This patchset is dependent on
https://lore.kernel.org/linux-mm/20210615110859.320299-1-aneesh.kumar@linux.ibm.com

ppc64 specific changes will be sent as a separate series depending on this patchset.

Changes from v1:
* cc stable@kernel.org
* Use the correct config for TRANSPARENT_HUGEPAGE_PUD
* use pud_pgtable instead of pud_page_vaddr


Aneesh Kumar K.V (6):
  selftest/mremap_test: Update the test to handle pagesize other than 4K
  selftest/mremap_test: Avoid crash with static build
  mm/mremap: Convert huge PUD move to separate helper
  mm/mremap: Don't enable optimized PUD move if page table levels is 2
  mm/mremap: Use pmd/pud_populate to update page table entries
  mm/mremap: hold the rmap lock in write mode when moving page table
    entries.

 mm/mremap.c                              |  92 +++++++++++++++---
 tools/testing/selftests/vm/mremap_test.c | 118 ++++++++++++-----------
 2 files changed, 142 insertions(+), 68 deletions(-)

-- 
2.31.1


^ permalink raw reply

* [PATCH v2 1/6] selftest/mremap_test: Update the test to handle pagesize other than 4K
From: Aneesh Kumar K.V @ 2021-06-16  4:52 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev
In-Reply-To: <20210616045239.370802-1-aneesh.kumar@linux.ibm.com>

Instead of hardcoding a 4K page size, fetch it using sysconf(). The performance
measurement tests still assume that 2M and 1G are the hugepage sizes.

Reviewed-by: Kalesh Singh <kaleshsingh@google.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 tools/testing/selftests/vm/mremap_test.c | 113 ++++++++++++-----------
 1 file changed, 61 insertions(+), 52 deletions(-)

diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c
index 9c391d016922..c9a5461eb786 100644
--- a/tools/testing/selftests/vm/mremap_test.c
+++ b/tools/testing/selftests/vm/mremap_test.c
@@ -45,14 +45,15 @@ enum {
 	_4MB = 4ULL << 20,
 	_1GB = 1ULL << 30,
 	_2GB = 2ULL << 30,
-	PTE = _4KB,
 	PMD = _2MB,
 	PUD = _1GB,
 };
 
+#define PTE page_size
+
 #define MAKE_TEST(source_align, destination_align, size,	\
 		  overlaps, should_fail, test_name)		\
-{								\
+(struct test){							\
 	.name = test_name,					\
 	.config = {						\
 		.src_alignment = source_align,			\
@@ -252,12 +253,17 @@ static int parse_args(int argc, char **argv, unsigned int *threshold_mb,
 	return 0;
 }
 
+#define MAX_TEST 13
+#define MAX_PERF_TEST 3
 int main(int argc, char **argv)
 {
 	int failures = 0;
 	int i, run_perf_tests;
 	unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
 	unsigned int pattern_seed;
+	struct test test_cases[MAX_TEST];
+	struct test perf_test_cases[MAX_PERF_TEST];
+	int page_size;
 	time_t t;
 
 	pattern_seed = (unsigned int) time(&t);
@@ -268,56 +274,59 @@ int main(int argc, char **argv)
 	ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n",
 		       threshold_mb, pattern_seed);
 
-	struct test test_cases[] = {
-		/* Expected mremap failures */
-		MAKE_TEST(_4KB, _4KB, _4KB, OVERLAPPING, EXPECT_FAILURE,
-		  "mremap - Source and Destination Regions Overlapping"),
-		MAKE_TEST(_4KB, _1KB, _4KB, NON_OVERLAPPING, EXPECT_FAILURE,
-		  "mremap - Destination Address Misaligned (1KB-aligned)"),
-		MAKE_TEST(_1KB, _4KB, _4KB, NON_OVERLAPPING, EXPECT_FAILURE,
-		  "mremap - Source Address Misaligned (1KB-aligned)"),
-
-		/* Src addr PTE aligned */
-		MAKE_TEST(PTE, PTE, _8KB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "8KB mremap - Source PTE-aligned, Destination PTE-aligned"),
-
-		/* Src addr 1MB aligned */
-		MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "2MB mremap - Source 1MB-aligned, Destination PTE-aligned"),
-		MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned"),
-
-		/* Src addr PMD aligned */
-		MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "4MB mremap - Source PMD-aligned, Destination PTE-aligned"),
-		MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "4MB mremap - Source PMD-aligned, Destination 1MB-aligned"),
-		MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "4MB mremap - Source PMD-aligned, Destination PMD-aligned"),
-
-		/* Src addr PUD aligned */
-		MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "2GB mremap - Source PUD-aligned, Destination PTE-aligned"),
-		MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "2GB mremap - Source PUD-aligned, Destination 1MB-aligned"),
-		MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "2GB mremap - Source PUD-aligned, Destination PMD-aligned"),
-		MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "2GB mremap - Source PUD-aligned, Destination PUD-aligned"),
-	};
-
-	struct test perf_test_cases[] = {
-		/*
-		 * mremap 1GB region - Page table level aligned time
-		 * comparison.
-		 */
-		MAKE_TEST(PTE, PTE, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "1GB mremap - Source PTE-aligned, Destination PTE-aligned"),
-		MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "1GB mremap - Source PMD-aligned, Destination PMD-aligned"),
-		MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "1GB mremap - Source PUD-aligned, Destination PUD-aligned"),
-	};
+	page_size = sysconf(_SC_PAGESIZE);
+
+	/* Expected mremap failures */
+	test_cases[0] =	MAKE_TEST(page_size, page_size, page_size,
+				  OVERLAPPING, EXPECT_FAILURE,
+				  "mremap - Source and Destination Regions Overlapping");
+
+	test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size,
+				  NON_OVERLAPPING, EXPECT_FAILURE,
+				  "mremap - Destination Address Misaligned (1KB-aligned)");
+	test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size,
+				  NON_OVERLAPPING, EXPECT_FAILURE,
+				  "mremap - Source Address Misaligned (1KB-aligned)");
+
+	/* Src addr PTE aligned */
+	test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2,
+				  NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "8KB mremap - Source PTE-aligned, Destination PTE-aligned");
+
+	/* Src addr 1MB aligned */
+	test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "2MB mremap - Source 1MB-aligned, Destination PTE-aligned");
+	test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned");
+
+	/* Src addr PMD aligned */
+	test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "4MB mremap - Source PMD-aligned, Destination PTE-aligned");
+	test_cases[7] =	MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "4MB mremap - Source PMD-aligned, Destination 1MB-aligned");
+	test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "4MB mremap - Source PMD-aligned, Destination PMD-aligned");
+
+	/* Src addr PUD aligned */
+	test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "2GB mremap - Source PUD-aligned, Destination PTE-aligned");
+	test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				   "2GB mremap - Source PUD-aligned, Destination 1MB-aligned");
+	test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				   "2GB mremap - Source PUD-aligned, Destination PMD-aligned");
+	test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				   "2GB mremap - Source PUD-aligned, Destination PUD-aligned");
+
+	perf_test_cases[0] =  MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+					"1GB mremap - Source PTE-aligned, Destination PTE-aligned");
+	/*
+	 * mremap 1GB region - Page table level aligned time
+	 * comparison.
+	 */
+	perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				       "1GB mremap - Source PMD-aligned, Destination PMD-aligned");
+	perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				       "1GB mremap - Source PUD-aligned, Destination PUD-aligned");
 
 	run_perf_tests =  (threshold_mb == VALIDATION_NO_THRESHOLD) ||
 				(threshold_mb * _1MB >= _1GB);
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 2/6] selftest/mremap_test: Avoid crash with static build
From: Aneesh Kumar K.V @ 2021-06-16  4:52 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev
In-Reply-To: <20210616045239.370802-1-aneesh.kumar@linux.ibm.com>

With a large mmap map size, we can overlap with the text area and using
MAP_FIXED results in unmapping that area. Switch to MAP_FIXED_NOREPLACE
and handle the EEXIST error.

Reviewed-by: Kalesh Singh <kaleshsingh@google.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 tools/testing/selftests/vm/mremap_test.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c
index c9a5461eb786..0624d1bd71b5 100644
--- a/tools/testing/selftests/vm/mremap_test.c
+++ b/tools/testing/selftests/vm/mremap_test.c
@@ -75,9 +75,10 @@ static void *get_source_mapping(struct config c)
 retry:
 	addr += c.src_alignment;
 	src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE,
-			MAP_FIXED | MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+			MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+			-1, 0);
 	if (src_addr == MAP_FAILED) {
-		if (errno == EPERM)
+		if (errno == EPERM || errno == EEXIST)
 			goto retry;
 		goto error;
 	}
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 3/6] mm/mremap: Convert huge PUD move to separate helper
From: Aneesh Kumar K.V @ 2021-06-16  4:52 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev
In-Reply-To: <20210616045239.370802-1-aneesh.kumar@linux.ibm.com>

With TRANSPARENT_HUGEPAGE_PUD enabled the kernel can find huge PUD entries.
Add a helper to move huge PUD entries on mremap().

This will be used by a later patch to optimize mremap of PUD_SIZE aligned
level 4 PTE mapped address.

This also makes sure we support mremap on huge PUD entries even with
CONFIG_HAVE_MOVE_PUD disabled.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/mremap.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 72 insertions(+), 7 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 47c255b60150..958ecdc6f29d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -324,10 +324,61 @@ static inline bool move_normal_pud(struct vm_area_struct *vma,
 }
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
+			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+	spinlock_t *old_ptl, *new_ptl;
+	struct mm_struct *mm = vma->vm_mm;
+	pud_t pud;
+
+	/*
+	 * The destination pud shouldn't be established, free_pgtables()
+	 * should have released it.
+	 */
+	if (WARN_ON_ONCE(!pud_none(*new_pud)))
+		return false;
+
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * ptlocks because exclusive mmap_lock prevents deadlock.
+	 */
+	old_ptl = pud_lock(vma->vm_mm, old_pud);
+	new_ptl = pud_lockptr(mm, new_pud);
+	if (new_ptl != old_ptl)
+		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+	/* Clear the pud */
+	pud = *old_pud;
+	pud_clear(old_pud);
+
+	VM_BUG_ON(!pud_none(*new_pud));
+
+	/* Set the new pud */
+	/* mark soft_dirty when we add pud level soft dirty support */
+	set_pud_at(mm, new_addr, new_pud, pud);
+	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
+	spin_unlock(old_ptl);
+
+	return true;
+}
+#else
+static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
+			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+	WARN_ON_ONCE(1);
+	return false;
+
+}
+#endif
+
 enum pgt_entry {
 	NORMAL_PMD,
 	HPAGE_PMD,
 	NORMAL_PUD,
+	HPAGE_PUD,
 };
 
 /*
@@ -347,6 +398,7 @@ static __always_inline unsigned long get_extent(enum pgt_entry entry,
 		mask = PMD_MASK;
 		size = PMD_SIZE;
 		break;
+	case HPAGE_PUD:
 	case NORMAL_PUD:
 		mask = PUD_MASK;
 		size = PUD_SIZE;
@@ -395,6 +447,11 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
 			move_huge_pmd(vma, old_addr, new_addr, old_entry,
 				      new_entry);
 		break;
+	case HPAGE_PUD:
+		moved = move_huge_pud(vma, old_addr, new_addr, old_entry,
+				      new_entry);
+		break;
+
 	default:
 		WARN_ON_ONCE(1);
 		break;
@@ -414,6 +471,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 	unsigned long extent, old_end;
 	struct mmu_notifier_range range;
 	pmd_t *old_pmd, *new_pmd;
+	pud_t *old_pud, *new_pud;
 
 	old_end = old_addr + len;
 	flush_cache_range(vma, old_addr, old_end);
@@ -429,15 +487,22 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 		 * PUD level if possible.
 		 */
 		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
-		if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
-			pud_t *old_pud, *new_pud;
 
-			old_pud = get_old_pud(vma->vm_mm, old_addr);
-			if (!old_pud)
+		old_pud = get_old_pud(vma->vm_mm, old_addr);
+		if (!old_pud)
+			continue;
+		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
+		if (!new_pud)
+			break;
+		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
+			if (extent == HPAGE_PUD_SIZE) {
+				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
+					       old_pud, new_pud, need_rmap_locks);
+				/* We ignore and continue on error? */
 				continue;
-			new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
-			if (!new_pud)
-				break;
+			}
+		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
+
 			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
 					   old_pud, new_pud, need_rmap_locks))
 				continue;
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 4/6] mm/mremap: Don't enable optimized PUD move if page table levels is 2
From: Aneesh Kumar K.V @ 2021-06-16  4:52 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev
In-Reply-To: <20210616045239.370802-1-aneesh.kumar@linux.ibm.com>

With a two-level page table, don't enable move_normal_pud.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/mremap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 958ecdc6f29d..97313e316a4d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -276,7 +276,7 @@ static inline bool move_normal_pmd(struct vm_area_struct *vma,
 }
 #endif
 
-#ifdef CONFIG_HAVE_MOVE_PUD
+#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
 static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
 {
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 5/6] mm/mremap: Use pmd/pud_poplulate to update page table entries
From: Aneesh Kumar K.V @ 2021-06-16  4:52 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev
In-Reply-To: <20210616045239.370802-1-aneesh.kumar@linux.ibm.com>

pmd/pud_populate is the right interface to be used to set the respective
page table entries. Some architectures like ppc64 do assume that set_pmd/pud_at
can only be used to set a hugepage PTE. Since we are not setting up a hugepage
PTE here, use the pmd/pud_populate interface.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/mremap.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 97313e316a4d..72fa0491681e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -26,6 +26,7 @@
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
 
 #include "internal.h"
 
@@ -258,8 +259,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 
 	VM_BUG_ON(!pmd_none(*new_pmd));
 
-	/* Set the new pmd */
-	set_pmd_at(mm, new_addr, new_pmd, pmd);
+	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
 	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
@@ -306,8 +306,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 
 	VM_BUG_ON(!pud_none(*new_pud));
 
-	/* Set the new pud */
-	set_pud_at(mm, new_addr, new_pud, pud);
+	pud_populate(mm, new_pud, pud_pgtable(pud));
 	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 6/6] mm/mremap: hold the rmap lock in write mode when moving page table entries.
From: Aneesh Kumar K.V @ 2021-06-16  4:52 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Hugh Dickins, Linus Torvalds, npiggin,
	kaleshsingh, joel, Kirill A . Shutemov, stable, linuxppc-dev,
	Kirill A . Shutemov
In-Reply-To: <20210616045239.370802-1-aneesh.kumar@linux.ibm.com>

To avoid a race between rmap walk and mremap, mremap does take_rmap_locks().
The lock was taken to ensure that the rmap walk doesn't miss a page table entry due to
PTE moves via move_page_tables(). The kernel does further optimization of
this lock such that if we are going to find the newly added vma after the
old vma, the rmap lock is not taken. This is because rmap walk would find the
vmas in the same order and if we don't find the page table attached to
older vma we would find it with the new vma which we would iterate later.

As explained in commit eb66ae030829 ("mremap: properly flush TLB before releasing the page")
mremap is special in that it doesn't take ownership of the page. The
optimized version for PUD/PMD aligned mremap also doesn't hold the ptl lock.
This can result in stale TLB entries as shown below.

This patch updates the rmap locking requirement in mremap to handle the race condition
explained below with optimized mremap::

Optimized PMD move

    CPU 1                           CPU 2                                   CPU 3

    mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one

    mmap_write_lock_killable()

                                    addr = old_addr
                                    lock(pte_ptl)
    lock(pmd_ptl)
    pmd = *old_pmd
    pmd_clear(old_pmd)
    flush_tlb_range(old_addr)

    *new_pmd = pmd
                                                                            *new_addr = 10; and fills
                                                                            TLB with new addr
                                                                            and old pfn

    unlock(pmd_ptl)
                                    ptep_clear_flush()
                                    old pfn is free.
                                                                            Stale TLB entry

Optimized PUD move also suffers from a similar race.
Both of the above race conditions can be fixed if we force the mremap path to take the rmap lock.

Cc: stable@vger.kernel.org
Fixes: 2c91bd4a4e2e ("mm: speed up mremap by 20x on large regions")
Fixes: c49dd3401802 ("mm: speedup mremap on 1GB or larger regions")
Link: https://lore.kernel.org/linux-mm/CAHk-=wgXVR04eBNtxQfevontWnP6FDm+oj5vauQXP3S-huwbPw@mail.gmail.com
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/mremap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 72fa0491681e..c3cad539a7aa 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -503,7 +503,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
 
 			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
-					   old_pud, new_pud, need_rmap_locks))
+					   old_pud, new_pud, true))
 				continue;
 		}
 
@@ -530,7 +530,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 			 * moving at the PMD level if possible.
 			 */
 			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
-					   old_pmd, new_pmd, need_rmap_locks))
+					   old_pmd, new_pmd, true))
 				continue;
 		}
 
-- 
2.31.1


^ permalink raw reply related

* Re: [PATCH 0/5] cpufreq: cppc: Fix suspend/resume specific races with FIE code
From: Viresh Kumar @ 2021-06-16  4:57 UTC (permalink / raw)
  To: Qian Cai
  Cc: linuxppc-dev, Vincent Guittot, linux-doc, Jonathan Corbet,
	Dirk Brandewie, linux-pm, Srinivas Pandruvada, Rafael Wysocki,
	linux-kernel, Paul Mackerras, Ionela Voinescu, Len Brown
In-Reply-To: <19527d26-526e-6c6f-431d-7b78ed92bb34@quicinc.com>

On 15-06-21, 08:17, Qian Cai wrote:
> On 6/15/2021 3:50 AM, Viresh Kumar wrote:
> > This is a strange place to get the issue from. And this is a new
> > issue.
> 
> Well, it was still the same exercises with CPU online/offline.
> 
> > 
> >> [  488.151939][  T670]  kthread+0x3ac/0x460
> >> [  488.155854][  T670]  ret_from_fork+0x10/0x18
> >> [  488.160120][  T670] Code: 911e8000 aa1303e1 910a0000 941b595b (d4210000)
> >> [  488.166901][  T670] ---[ end trace e637e2d38b2cc087 ]---
> >> [  488.172206][  T670] Kernel panic - not syncing: Oops - BUG: Fatal exception
> >> [  488.179182][  T670] SMP: stopping secondary CPUs
> >> [  489.209347][  T670] SMP: failed to stop secondary CPUs 0-1,10-11,16-17,31
> >> [  489.216128][  T][  T670] Memoryn ]---
> > 
> > Can you give details on what exactly did you try to do, to get this ?
> > Normal boot or something more ?
> 
> Basically, it has the cpufreq driver as CPPC and the governor as
> schedutil. Running a few workloads to get CPU scaling up and down.
> Later, try to offline all CPUs until the last one and then online
> all CPUs.

Hmm, okay.

So I basically have very similar setup with 8 cores (1-policy
per-cpu), the only difference is I don't end up reading the
performance counters, everything else remains same. So I should see
issues now just like you, in case there are any.

Since the insmod/rmmod setup is a bit different, this is what I tried
today for around an hour with CONFIG_DEBUG_LIST and RCU debugging
options.

while true; do
    for i in `seq 1 7`;
    do
        echo 0 > /sys/devices/system/cpu/cpu$i/online;
    done;

    for i in `seq 1 7`;
    do
        echo 1 > /sys/devices/system/cpu/cpu$i/online;
    done;
done

I don't see any crashes, oops or warnings with latest stuff.

> I am hesitate to try this at the moment because this all feel like
> shooting in the dark.

I understand your point and you aren't completely wrong here. It
wasn't completely in dark but since I am unable to reproduce the issue
at my end, I asked for help.

FWIW, I think one of the possible causes of corruption of kthread thing
could have been because of the race in the topology related code. I
already fixed that in my tree yesterday.

> Ideally, you will be able to get access to one
> of those arm64 servers (Huawei, Ampere, TX2, FJ etc) eventually and
> really try the same exercises yourself with those debugging options
> like list debugging and KASAN on. That way you could fix things way
> efficiently.

Yeah, I thought of this work being over and I am not a user of it
normally. I had to enable it for ARM servers and I took help of my
colleagues (Vincent Guittot and Ionela) for testing the same.

I have also asked Vincent to give it a try again.

> I could share you the .config once you are there. Last
> but not least, once you get better narrow down of the issues, I'd
> hope to see someone else familiar with the code there to get review
> of those patches first (feel free to Cc me once you are ready to
> post) before I'll rerun the whole things again. That way we don't
> waste time on each other backing and forth chasing the shadow.

I did send the stuff up for review and this last thing (you reported)
was a different race altogether, so asked for testing without reviews.

Anyway, I am quite sure my tests have covered such issues now. I will
send out patches again soon.

Thanks Qian.

-- 
viresh

^ permalink raw reply

* [PATCH v8 0/3] Speedup mremap on ppc64
From: Aneesh Kumar K.V @ 2021-06-16  4:57 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev

This patchset enables MOVE_PMD/MOVE_PUD support on power. This requires
the platform to support updating higher-level page tables without
updating page table entries. It also requires invalidating the Page Walk
Cache on architectures that have one.

This patchset is dependent on
https://lore.kernel.org/linux-mm/20210616045239.370802-1-aneesh.kumar@linux.ibm.com/

Changes from v7:
* Split mremap fixes to a separate series

Changes from v6:
* Update ppc64 flush_tlb_range to invalidate page walk cache.
* Add patches to fix race between mremap and page out
* Add patch to fix build error with page table levels 2

Changes from v5:
* Drop patch mm/mremap: Move TLB flush outside page table lock
* Add fixes for race between optimized mremap and page out

Changes from v4:
* Change function name and arguments based on review feedback.

Changes from v3:
* Fix build error reported by kernel test robot
* Address review feedback.

Changes from v2:
* switch from using mmu_gather to flush_pte_tlb_pwc_range() 

Changes from v1:
* Rebase to recent upstream
* Fix build issues with tlb_gather_mmu changes

Aneesh Kumar K.V (3):
  mm/mremap: Allow arch runtime override
  powerpc/book3s64/mm: Update flush_tlb_range to flush page walk cache
  powerpc/mm: Enable HAVE_MOVE_PMD support

 .../include/asm/book3s/64/tlbflush-radix.h    |  2 +
 arch/powerpc/include/asm/tlb.h                |  6 +++
 arch/powerpc/mm/book3s64/radix_hugetlbpage.c  |  8 +++-
 arch/powerpc/mm/book3s64/radix_tlb.c          | 44 ++++++++++++-------
 arch/powerpc/platforms/Kconfig.cputype        |  2 +
 mm/mremap.c                                   | 15 ++++++-
 6 files changed, 58 insertions(+), 19 deletions(-)

-- 
2.31.1

^ permalink raw reply

* [PATCH v8 1/3] mm/mremap: Allow arch runtime override
From: Aneesh Kumar K.V @ 2021-06-16  4:57 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev
In-Reply-To: <20210616045735.374532-1-aneesh.kumar@linux.ibm.com>

Architectures like ppc64 support faster mremap only with radix
translation. Hence allow a runtime check w.r.t support for fast mremap.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/tlb.h |  6 ++++++
 mm/mremap.c                    | 15 ++++++++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index 160422a439aa..09a9ae5f3656 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -83,5 +83,11 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
 }
 #endif
 
+#define arch_supports_page_table_move arch_supports_page_table_move
+static inline bool arch_supports_page_table_move(void)
+{
+	return radix_enabled();
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_TLB_H */
diff --git a/mm/mremap.c b/mm/mremap.c
index c3cad539a7aa..ca9d345f22e8 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,7 +25,7 @@
 #include <linux/userfaultfd_k.h>
 
 #include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
+#include <asm/tlb.h>
 #include <asm/pgalloc.h>
 
 #include "internal.h"
@@ -210,6 +210,15 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		drop_rmap_locks(vma);
 }
 
+#ifndef arch_supports_page_table_move
+#define arch_supports_page_table_move arch_supports_page_table_move
+static inline bool arch_supports_page_table_move(void)
+{
+	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
+		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
+}
+#endif
+
 #ifdef CONFIG_HAVE_MOVE_PMD
 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
@@ -218,6 +227,8 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	struct mm_struct *mm = vma->vm_mm;
 	pmd_t pmd;
 
+	if (!arch_supports_page_table_move())
+		return false;
 	/*
 	 * The destination pmd shouldn't be established, free_pgtables()
 	 * should have released it.
@@ -284,6 +295,8 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 	struct mm_struct *mm = vma->vm_mm;
 	pud_t pud;
 
+	if (!arch_supports_page_table_move())
+		return false;
 	/*
 	 * The destination pud shouldn't be established, free_pgtables()
 	 * should have released it.
-- 
2.31.1


^ permalink raw reply related

* [PATCH v8 2/3] powerpc/book3s64/mm: Update flush_tlb_range to flush page walk cache
From: Aneesh Kumar K.V @ 2021-06-16  4:57 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev
In-Reply-To: <20210616045735.374532-1-aneesh.kumar@linux.ibm.com>

flush_tlb_range is special in that we don't specify the page size used
for the translation. Hence when flushing TLB we flush the translation cache
for all possible page sizes. The kernel also uses the same interface when
moving page tables around. Such a move requires us to flush the page walk cache.

Instead of adding another interface to force page walk cache flush,
update flush_tlb_range to flush page walk cache if the range flushed
is more than the PMD range. A page table move will always involve an
invalidate range more than PMD_SIZE.

Running microbenchmark with mprotect and parallel memory access
didn't show any observable performance impact.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 .../include/asm/book3s/64/tlbflush-radix.h    |  2 +
 arch/powerpc/mm/book3s64/radix_hugetlbpage.c  |  8 +++-
 arch/powerpc/mm/book3s64/radix_tlb.c          | 44 ++++++++++++-------
 3 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 8b33601cdb9d..ab9d5e535000 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -60,6 +60,8 @@ extern void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma,
 					   unsigned long start, unsigned long end);
 extern void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 					 unsigned long end, int psize);
+void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+				      unsigned long end, int psize);
 extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
 				       unsigned long start, unsigned long end);
 extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
index cb91071eef52..23d3e08911d3 100644
--- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -32,7 +32,13 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st
 	struct hstate *hstate = hstate_file(vma->vm_file);
 
 	psize = hstate_get_psize(hstate);
-	radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
+	/*
+	 * Flush PWC even if we get PUD_SIZE hugetlb invalidate to keep this simpler.
+	 */
+	if (end - start >= PUD_SIZE)
+		radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize);
+	else
+		radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
 }
 
 /*
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 409e61210789..9f1a177f6bb6 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -989,14 +989,13 @@ static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_
 
 static inline void __radix__flush_tlb_range(struct mm_struct *mm,
 					    unsigned long start, unsigned long end)
-
 {
 	unsigned long pid;
 	unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
 	unsigned long page_size = 1UL << page_shift;
 	unsigned long nr_pages = (end - start) >> page_shift;
 	bool fullmm = (end == TLB_FLUSH_ALL);
-	bool flush_pid;
+	bool flush_pid, flush_pwc = false;
 	enum tlb_flush_type type;
 
 	pid = mm->context.id;
@@ -1015,8 +1014,16 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
 		flush_pid = nr_pages > tlb_single_page_flush_ceiling;
 	else
 		flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
+	/*
+	 * full pid flush already does the PWC flush. if it is not full pid
+	 * flush check the range is more than PMD and force a pwc flush
+	 * mremap() depends on this behaviour.
+	 */
+	if (!flush_pid && (end - start) >= PMD_SIZE)
+		flush_pwc = true;
 
 	if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) {
+		unsigned long type = H_RPTI_TYPE_TLB;
 		unsigned long tgt = H_RPTI_TARGET_CMMU;
 		unsigned long pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize);
 
@@ -1024,19 +1031,20 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
 			pg_sizes |= psize_to_rpti_pgsize(MMU_PAGE_2M);
 		if (atomic_read(&mm->context.copros) > 0)
 			tgt |= H_RPTI_TARGET_NMMU;
-		pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, pg_sizes,
-				       start, end);
+		if (flush_pwc)
+			type |= H_RPTI_TYPE_PWC;
+		pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end);
 	} else if (flush_pid) {
+		/*
+		 * We are now flushing a range larger than PMD size force a RIC_FLUSH_ALL
+		 */
 		if (type == FLUSH_TYPE_LOCAL) {
-			_tlbiel_pid(pid, RIC_FLUSH_TLB);
+			_tlbiel_pid(pid, RIC_FLUSH_ALL);
 		} else {
 			if (cputlb_use_tlbie()) {
-				if (mm_needs_flush_escalation(mm))
-					_tlbie_pid(pid, RIC_FLUSH_ALL);
-				else
-					_tlbie_pid(pid, RIC_FLUSH_TLB);
+				_tlbie_pid(pid, RIC_FLUSH_ALL);
 			} else {
-				_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
+				_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
 			}
 		}
 	} else {
@@ -1052,6 +1060,9 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
 
 		if (type == FLUSH_TYPE_LOCAL) {
 			asm volatile("ptesync": : :"memory");
+			if (flush_pwc)
+				/* For PWC, only one flush is needed */
+				__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
 			__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
 			if (hflush)
 				__tlbiel_va_range(hstart, hend, pid,
@@ -1059,6 +1070,8 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
 			ppc_after_tlbiel_barrier();
 		} else if (cputlb_use_tlbie()) {
 			asm volatile("ptesync": : :"memory");
+			if (flush_pwc)
+				__tlbie_pid(pid, RIC_FLUSH_PWC);
 			__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
 			if (hflush)
 				__tlbie_va_range(hstart, hend, pid,
@@ -1066,10 +1079,10 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
 			asm volatile("eieio; tlbsync; ptesync": : :"memory");
 		} else {
 			_tlbiel_va_range_multicast(mm,
-					start, end, pid, page_size, mmu_virtual_psize, false);
+					start, end, pid, page_size, mmu_virtual_psize, flush_pwc);
 			if (hflush)
 				_tlbiel_va_range_multicast(mm,
-					hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false);
+					hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, flush_pwc);
 		}
 	}
 out:
@@ -1143,9 +1156,6 @@ void radix__flush_all_lpid_guest(unsigned int lpid)
 	_tlbie_lpid_guest(lpid, RIC_FLUSH_ALL);
 }
 
-static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
-				  unsigned long end, int psize);
-
 void radix__tlb_flush(struct mmu_gather *tlb)
 {
 	int psize = 0;
@@ -1252,8 +1262,8 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 	return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
 }
 
-static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
-				  unsigned long end, int psize)
+void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+				      unsigned long end, int psize)
 {
 	__radix__flush_tlb_range_psize(mm, start, end, psize, true);
 }
-- 
2.31.1


^ permalink raw reply related

* [PATCH v8 3/3] powerpc/mm: Enable HAVE_MOVE_PMD support
From: Aneesh Kumar K.V @ 2021-06-16  4:57 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev
In-Reply-To: <20210616045735.374532-1-aneesh.kumar@linux.ibm.com>

mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:
1GB mremap - Source PTE-aligned, Destination PTE-aligned
  mremap time:      2292772ns
1GB mremap - Source PMD-aligned, Destination PMD-aligned
  mremap time:      1158928ns
1GB mremap - Source PUD-aligned, Destination PUD-aligned
  mremap time:        63886ns

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/platforms/Kconfig.cputype | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index f998e655b570..be8ceb5bece4 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -101,6 +101,8 @@ config PPC_BOOK3S_64
 	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
 	select ARCH_SUPPORTS_HUGETLBFS
 	select ARCH_SUPPORTS_NUMA_BALANCING
+	select HAVE_MOVE_PMD
+	select HAVE_MOVE_PUD
 	select IRQ_WORK
 	select PPC_MM_SLICES
 	select PPC_HAVE_KUEP
-- 
2.31.1


^ permalink raw reply related

* Re: [PATCH v11 09/12] swiotlb: Add restricted DMA alloc/free support
From: Christoph Hellwig @ 2021-06-16  4:59 UTC (permalink / raw)
  To: Claire Chang
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, Frank Rowand, mingo,
	Marek Szyprowski, sstabellini, Saravana Kannan, Joerg Roedel,
	Rafael J . Wysocki, Christoph Hellwig, Bartosz Golaszewski,
	bskeggs, linux-pci, xen-devel, Thierry Reding, intel-gfx,
	matthew.auld, linux-devicetree, Jianxiong Gao, Daniel Vetter,
	Will Deacon, Konrad Rzeszutek Wilk, maarten.lankhorst, airlied,
	Dan Williams, linuxppc-dev, jani.nikula, Rob Herring,
	rodrigo.vivi, Bjorn Helgaas, boris.ostrovsky, Andy Shevchenko,
	jgross, Nicolas Boichat, Greg KH, Randy Dunlap, lkml, Tomasz Figa,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, xypron.glpk,
	Robin Murphy, bauerman
In-Reply-To: <CALiNf28=3vqAs+8HsjyBGOiPNR2F3yT6OGnLpZH_AkWqgTqgOA@mail.gmail.com>

On Wed, Jun 16, 2021 at 12:04:16PM +0800, Claire Chang wrote:
> Just noticed that after propagating swiotlb_force setting into
> io_tlb_default_mem->force, the memory allocation behavior for
> swiotlb_force will change (i.e. always skipping arch_dma_alloc and
> dma_direct_alloc_from_pool).

Yes, I think we need to split a "use_for_alloc" flag from the force flag.

^ permalink raw reply

* Re: [PATCH v11 09/12] swiotlb: Add restricted DMA alloc/free support
From: Claire Chang @ 2021-06-16  5:10 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, Frank Rowand, mingo,
	Marek Szyprowski, sstabellini, Saravana Kannan, Joerg Roedel,
	Rafael J . Wysocki, Bartosz Golaszewski, bskeggs, linux-pci,
	xen-devel, Thierry Reding, intel-gfx, matthew.auld,
	linux-devicetree, Jianxiong Gao, Daniel Vetter, Will Deacon,
	Konrad Rzeszutek Wilk, maarten.lankhorst, airlied, Dan Williams,
	linuxppc-dev, jani.nikula, Rob Herring, rodrigo.vivi,
	Bjorn Helgaas, boris.ostrovsky, Andy Shevchenko, jgross,
	Nicolas Boichat, Greg KH, Randy Dunlap, lkml, Tomasz Figa,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, xypron.glpk,
	Robin Murphy, bauerman
In-Reply-To: <20210616045918.GA27537@lst.de>

On Wed, Jun 16, 2021 at 12:59 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Wed, Jun 16, 2021 at 12:04:16PM +0800, Claire Chang wrote:
> > Just noticed that after propagating swiotlb_force setting into
> > io_tlb_default_mem->force, the memory allocation behavior for
> > swiotlb_force will change (i.e. always skipping arch_dma_alloc and
> > dma_direct_alloc_from_pool).
>
> Yes, I think we need to split a "use_for_alloc" flag from the force flag.

How about splitting is_dev_swiotlb_force into is_swiotlb_force_bounce
(io_tlb_mem->force_bounce) and is_swiotlb_force_alloc
(io_tlb_mem->force_alloc)?

^ permalink raw reply

* Re: [PATCH v11 09/12] swiotlb: Add restricted DMA alloc/free support
From: Christoph Hellwig @ 2021-06-16  5:18 UTC (permalink / raw)
  To: Claire Chang
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, Frank Rowand, mingo,
	Marek Szyprowski, sstabellini, Saravana Kannan, Joerg Roedel,
	Rafael J . Wysocki, Christoph Hellwig, Bartosz Golaszewski,
	bskeggs, linux-pci, xen-devel, Thierry Reding, intel-gfx,
	matthew.auld, linux-devicetree, Jianxiong Gao, Daniel Vetter,
	Will Deacon, Konrad Rzeszutek Wilk, maarten.lankhorst, airlied,
	Dan Williams, linuxppc-dev, jani.nikula, Rob Herring,
	rodrigo.vivi, Bjorn Helgaas, boris.ostrovsky, Andy Shevchenko,
	jgross, Nicolas Boichat, Greg KH, Randy Dunlap, lkml, Tomasz Figa,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, xypron.glpk,
	Robin Murphy, bauerman
In-Reply-To: <CALiNf2-+vL8rw5fi=DcR=V7d55Ls3-OXoxC87Pvrf1Kz14D_+A@mail.gmail.com>

On Wed, Jun 16, 2021 at 01:10:02PM +0800, Claire Chang wrote:
> On Wed, Jun 16, 2021 at 12:59 PM Christoph Hellwig <hch@lst.de> wrote:
> >
> > On Wed, Jun 16, 2021 at 12:04:16PM +0800, Claire Chang wrote:
> > > Just noticed that after propagating swiotlb_force setting into
> > > io_tlb_default_mem->force, the memory allocation behavior for
> > > swiotlb_force will change (i.e. always skipping arch_dma_alloc and
> > > dma_direct_alloc_from_pool).
> >
> > Yes, I think we need to split a "use_for_alloc" flag from the force flag.
> 
> How about splitting is_dev_swiotlb_force into is_swiotlb_force_bounce
> (io_tlb_mem->force_bounce) and is_swiotlb_force_alloc
> (io_tlb_mem->force_alloc)?

Yes, something like that.  I'd probably not use force for the alloc side
given that we otherwise never allocate from the swiotlb buffer.

^ permalink raw reply

* [powerpc:merge] BUILD SUCCESS 77fe1f3ccbe0cdc6f386aef522b043c52196d4d2
From: kernel test robot @ 2021-06-16  6:00 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git merge
branch HEAD: 77fe1f3ccbe0cdc6f386aef522b043c52196d4d2  Automatic merge of 'next' into merge (2021-06-15 23:52)

elapsed time: 873m

configs tested: 125
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
powerpc                  mpc866_ads_defconfig
ia64                        generic_defconfig
arm                       versatile_defconfig
arm                         lpc32xx_defconfig
arm                         socfpga_defconfig
powerpc                     sbc8548_defconfig
arm                  colibri_pxa270_defconfig
m68k                        stmark2_defconfig
powerpc                        icon_defconfig
powerpc                      chrp32_defconfig
xtensa                    smp_lx200_defconfig
mips                          ath79_defconfig
powerpc                  iss476-smp_defconfig
xtensa                generic_kc705_defconfig
powerpc                 mpc8540_ads_defconfig
arm                        oxnas_v6_defconfig
s390                                defconfig
arm                        multi_v7_defconfig
powerpc                          g5_defconfig
arm                            hisi_defconfig
xtensa                          iss_defconfig
powerpc                     kmeter1_defconfig
nds32                               defconfig
powerpc                   lite5200b_defconfig
arm                        neponset_defconfig
arm                            zeus_defconfig
m68k                          atari_defconfig
arm                        mvebu_v5_defconfig
ia64                            zx1_defconfig
powerpc                     ksi8560_defconfig
ia64                             alldefconfig
powerpc                      ep88xc_defconfig
arm64                            alldefconfig
arc                    vdk_hs38_smp_defconfig
m68k                         amcore_defconfig
microblaze                      mmu_defconfig
mips                     cu1000-neo_defconfig
m68k                       m5275evb_defconfig
arm                        keystone_defconfig
arm                         palmz72_defconfig
powerpc                mpc7448_hpc2_defconfig
powerpc                        fsp2_defconfig
s390                             alldefconfig
riscv             nommu_k210_sdcard_defconfig
s390                          debug_defconfig
powerpc                 mpc832x_mds_defconfig
sparc                       sparc64_defconfig
riscv                          rv32_defconfig
powerpc                 mpc85xx_cds_defconfig
x86_64                            allnoconfig
ia64                             allmodconfig
ia64                                defconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
nios2                            allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
h8300                            allyesconfig
arc                                 defconfig
sh                               allmodconfig
parisc                              defconfig
s390                             allyesconfig
s390                             allmodconfig
parisc                           allyesconfig
i386                             allyesconfig
sparc                            allyesconfig
sparc                               defconfig
i386                                defconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
i386                 randconfig-a002-20210615
i386                 randconfig-a006-20210615
i386                 randconfig-a004-20210615
i386                 randconfig-a001-20210615
i386                 randconfig-a005-20210615
i386                 randconfig-a003-20210615
x86_64               randconfig-a001-20210615
x86_64               randconfig-a004-20210615
x86_64               randconfig-a002-20210615
x86_64               randconfig-a003-20210615
x86_64               randconfig-a006-20210615
x86_64               randconfig-a005-20210615
i386                 randconfig-a015-20210615
i386                 randconfig-a013-20210615
i386                 randconfig-a016-20210615
i386                 randconfig-a012-20210615
i386                 randconfig-a014-20210615
i386                 randconfig-a011-20210615
riscv                    nommu_k210_defconfig
riscv                            allyesconfig
riscv                    nommu_virt_defconfig
riscv                             allnoconfig
riscv                               defconfig
riscv                            allmodconfig
x86_64                    rhel-8.3-kselftests
um                           x86_64_defconfig
um                             i386_defconfig
um                            kunit_defconfig
x86_64                           allyesconfig
x86_64                              defconfig
x86_64                               rhel-8.3
x86_64                      rhel-8.3-kbuiltin
x86_64                                  kexec

clang tested configs:
x86_64               randconfig-b001-20210615
x86_64               randconfig-a015-20210615
x86_64               randconfig-a011-20210615
x86_64               randconfig-a012-20210615
x86_64               randconfig-a014-20210615
x86_64               randconfig-a016-20210615
x86_64               randconfig-a013-20210615

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox