LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v5] ocxl: control via sysfs whether the FPGA is reloaded on a link reset
From: Frederic Barrat @ 2020-06-19 14:04 UTC (permalink / raw)
  To: linuxppc-dev, clombard, ajd, alastair

From: Philippe Bergheaud <felix@linux.ibm.com>

Some opencapi FPGA images allow to control if the FPGA should be reloaded
on the next adapter reset. If it is supported, the image specifies it
through a Vendor Specific DVSEC in the config space of function 0.

Signed-off-by: Philippe Bergheaud <felix@linux.ibm.com>
Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
---
Changelog:
v2:
  - refine ResetReload debug message
  - do not call get_function_0() if pci_dev is for function 0
v3:
  - avoid get_function_0() in ocxl_config_set_reset_reload also
v4:
  - simplify parsing of Vendor Specific DVSEC during AFU init
  - only set/unset bit 0 of the config space register
  - commonize code to fetch the right PCI function and DVSEC offset
  - use kstrtoint() when parsing the sysfs buffer
v5:
  - update documentation (Andrew)


 Documentation/ABI/testing/sysfs-class-ocxl | 11 +++
 drivers/misc/ocxl/config.c                 | 81 ++++++++++++++++++++--
 drivers/misc/ocxl/ocxl_internal.h          |  6 ++
 drivers/misc/ocxl/sysfs.c                  | 35 ++++++++++
 include/misc/ocxl-config.h                 |  1 +
 5 files changed, 129 insertions(+), 5 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-class-ocxl b/Documentation/ABI/testing/sysfs-class-ocxl
index b5b1fa197592..ae1276efa45a 100644
--- a/Documentation/ABI/testing/sysfs-class-ocxl
+++ b/Documentation/ABI/testing/sysfs-class-ocxl
@@ -33,3 +33,14 @@ Date:		January 2018
 Contact:	linuxppc-dev@lists.ozlabs.org
 Description:	read/write
 		Give access the global mmio area for the AFU
+
+What:		/sys/class/ocxl/<afu name>/reload_on_reset
+Date:		February 2020
+Contact:	linuxppc-dev@lists.ozlabs.org
+Description:	read/write
+		Control whether the FPGA is reloaded on a link reset. Enabled
+		through a vendor-specific logic block on the FPGA.
+			0	Do not reload FPGA image from flash
+			1	Reload FPGA image from flash
+			unavailable
+				The device does not support this capability
diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c
index c8e19bfb5ef9..42f7a1298775 100644
--- a/drivers/misc/ocxl/config.c
+++ b/drivers/misc/ocxl/config.c
@@ -71,6 +71,20 @@ static int find_dvsec_afu_ctrl(struct pci_dev *dev, u8 afu_idx)
 	return 0;
 }
 
+/**
+ * get_function_0() - Find a related PCI device (function 0)
+ * @device: PCI device to match
+ *
+ * Returns a pointer to the related device, or null if not found
+ */
+static struct pci_dev *get_function_0(struct pci_dev *dev)
+{
+	unsigned int devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
+
+	return pci_get_domain_bus_and_slot(pci_domain_nr(dev->bus),
+					   dev->bus->number, devfn);
+}
+
 static void read_pasid(struct pci_dev *dev, struct ocxl_fn_config *fn)
 {
 	u16 val;
@@ -159,14 +173,15 @@ static int read_dvsec_afu_info(struct pci_dev *dev, struct ocxl_fn_config *fn)
 static int read_dvsec_vendor(struct pci_dev *dev)
 {
 	int pos;
-	u32 cfg, tlx, dlx;
+	u32 cfg, tlx, dlx, reset_reload;
 
 	/*
-	 * vendor specific DVSEC is optional
+	 * vendor specific DVSEC, for IBM images only. Some older
+	 * images may not have it
 	 *
-	 * It's currently only used on function 0 to specify the
-	 * version of some logic blocks. Some older images may not
-	 * even have it so we ignore any errors
+	 * It's only used on function 0 to specify the version of some
+	 * logic blocks and to give access to special registers to
+	 * enable host-based flashing.
 	 */
 	if (PCI_FUNC(dev->devfn) != 0)
 		return 0;
@@ -178,11 +193,67 @@ static int read_dvsec_vendor(struct pci_dev *dev)
 	pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_CFG_VERS, &cfg);
 	pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_TLX_VERS, &tlx);
 	pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_DLX_VERS, &dlx);
+	pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD,
+			      &reset_reload);
 
 	dev_dbg(&dev->dev, "Vendor specific DVSEC:\n");
 	dev_dbg(&dev->dev, "  CFG version = 0x%x\n", cfg);
 	dev_dbg(&dev->dev, "  TLX version = 0x%x\n", tlx);
 	dev_dbg(&dev->dev, "  DLX version = 0x%x\n", dlx);
+	dev_dbg(&dev->dev, "  ResetReload = 0x%x\n", reset_reload);
+	return 0;
+}
+
+static int get_dvsec_vendor0(struct pci_dev *dev, struct pci_dev **dev0,
+			     int *out_pos)
+{
+	int pos;
+
+	if (PCI_FUNC(dev->devfn) != 0) {
+		dev = get_function_0(dev);
+		if (!dev)
+			return -1;
+	}
+	pos = find_dvsec(dev, OCXL_DVSEC_VENDOR_ID);
+	if (!pos)
+		return -1;
+	*dev0 = dev;
+	*out_pos = pos;
+	return 0;
+}
+
+int ocxl_config_get_reset_reload(struct pci_dev *dev, int *val)
+{
+	struct pci_dev *dev0;
+	u32 reset_reload;
+	int pos;
+
+	if (get_dvsec_vendor0(dev, &dev0, &pos))
+		return -1;
+
+	pci_read_config_dword(dev0, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD,
+			      &reset_reload);
+	*val = !!(reset_reload & BIT(0));
+	return 0;
+}
+
+int ocxl_config_set_reset_reload(struct pci_dev *dev, int val)
+{
+	struct pci_dev *dev0;
+	u32 reset_reload;
+	int pos;
+
+	if (get_dvsec_vendor0(dev, &dev0, &pos))
+		return -1;
+
+	pci_read_config_dword(dev0, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD,
+			      &reset_reload);
+	if (val)
+		reset_reload |= BIT(0);
+	else
+		reset_reload &= ~BIT(0);
+	pci_write_config_dword(dev0, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD,
+			       reset_reload);
 	return 0;
 }
 
diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h
index 345bf843a38e..af9a84aeee6f 100644
--- a/drivers/misc/ocxl/ocxl_internal.h
+++ b/drivers/misc/ocxl/ocxl_internal.h
@@ -112,6 +112,12 @@ void ocxl_actag_afu_free(struct ocxl_fn *fn, u32 start, u32 size);
  */
 int ocxl_config_get_pasid_info(struct pci_dev *dev, int *count);
 
+/*
+ * Control whether the FPGA is reloaded on a link reset
+ */
+int ocxl_config_get_reset_reload(struct pci_dev *dev, int *val);
+int ocxl_config_set_reset_reload(struct pci_dev *dev, int val);
+
 /*
  * Check if an AFU index is valid for the given function.
  *
diff --git a/drivers/misc/ocxl/sysfs.c b/drivers/misc/ocxl/sysfs.c
index 58f1ba264206..25c78df8055d 100644
--- a/drivers/misc/ocxl/sysfs.c
+++ b/drivers/misc/ocxl/sysfs.c
@@ -51,11 +51,46 @@ static ssize_t contexts_show(struct device *device,
 			afu->pasid_count, afu->pasid_max);
 }
 
+static ssize_t reload_on_reset_show(struct device *device,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct ocxl_afu *afu = to_afu(device);
+	struct ocxl_fn *fn = afu->fn;
+	struct pci_dev *pci_dev = to_pci_dev(fn->dev.parent);
+	int val;
+
+	if (ocxl_config_get_reset_reload(pci_dev, &val))
+		return scnprintf(buf, PAGE_SIZE, "unavailable\n");
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static ssize_t reload_on_reset_store(struct device *device,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct ocxl_afu *afu = to_afu(device);
+	struct ocxl_fn *fn = afu->fn;
+	struct pci_dev *pci_dev = to_pci_dev(fn->dev.parent);
+	int rc, val;
+
+	rc = kstrtoint(buf, 0, &val);
+	if (rc || (val != 0 && val != 1))
+		return -EINVAL;
+
+	if (ocxl_config_set_reset_reload(pci_dev, val))
+		return -ENODEV;
+
+	return count;
+}
+
 static struct device_attribute afu_attrs[] = {
 	__ATTR_RO(global_mmio_size),
 	__ATTR_RO(pp_mmio_size),
 	__ATTR_RO(afu_version),
 	__ATTR_RO(contexts),
+	__ATTR_RW(reload_on_reset),
 };
 
 static ssize_t global_mmio_read(struct file *filp, struct kobject *kobj,
diff --git a/include/misc/ocxl-config.h b/include/misc/ocxl-config.h
index 3526fa996a22..ccfd3b463517 100644
--- a/include/misc/ocxl-config.h
+++ b/include/misc/ocxl-config.h
@@ -41,5 +41,6 @@
 #define   OCXL_DVSEC_VENDOR_CFG_VERS            0x0C
 #define   OCXL_DVSEC_VENDOR_TLX_VERS            0x10
 #define   OCXL_DVSEC_VENDOR_DLX_VERS            0x20
+#define   OCXL_DVSEC_VENDOR_RESET_RELOAD        0x38
 
 #endif /* _OCXL_CONFIG_H_ */
-- 
2.26.2


^ permalink raw reply related

* [PATCH v1 3/8] powerpc: Set user/kernel boundary at TASK_SIZE instead of PAGE_OFFSET
From: Christophe Leroy @ 2020-06-19 15:06 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1592578278.git.christophe.leroy@csgroup.eu>

User space stops at TASK_SIZE. At the moment, kernel space starts
at PAGE_OFFSET.

In order to use space between TASK_SIZE and PAGE_OFFSET for modules,
make TASK_SIZE the limit between user and kernel space.

Note that fault.c already considers TASK_SIZE as the boundary between
user and kernel space.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/page.h | 2 +-
 arch/powerpc/mm/ptdump/ptdump.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index a63fe6f3a0ff..352a2b80d505 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -256,7 +256,7 @@ static inline bool pfn_valid(unsigned long pfn)
 #ifdef CONFIG_PPC_BOOK3E_64
 #define is_kernel_addr(x)	((x) >= 0x8000000000000000ul)
 #else
-#define is_kernel_addr(x)	((x) >= PAGE_OFFSET)
+#define is_kernel_addr(x)	((x) >= TASK_SIZE)
 #endif
 
 #ifndef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index b71cc628facd..e995f2e9e9f7 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -351,7 +351,7 @@ static void populate_markers(void)
 {
 	int i = 0;
 
-	address_markers[i++].start_address = PAGE_OFFSET;
+	address_markers[i++].start_address = TASK_SIZE;
 	address_markers[i++].start_address = VMALLOC_START;
 	address_markers[i++].start_address = VMALLOC_END;
 #ifdef CONFIG_PPC64
@@ -388,7 +388,7 @@ static int ptdump_show(struct seq_file *m, void *v)
 	struct pg_state st = {
 		.seq = m,
 		.marker = address_markers,
-		.start_address = PAGE_OFFSET,
+		.start_address = TASK_SIZE,
 	};
 
 #ifdef CONFIG_PPC64
@@ -432,7 +432,7 @@ void ptdump_check_wx(void)
 		.seq = NULL,
 		.marker = address_markers,
 		.check_wx = true,
-		.start_address = PAGE_OFFSET,
+		.start_address = TASK_SIZE,
 	};
 
 #ifdef CONFIG_PPC64
-- 
2.25.0


^ permalink raw reply related

* [PATCH v1 1/8] powerpc/ptdump: Refactor update of st->last_pa
From: Christophe Leroy @ 2020-06-19 15:06 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1592578278.git.christophe.leroy@csgroup.eu>

st->last_pa is always updated in note_page() so it can
be done outside the if/elseif/else block.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/ptdump/ptdump.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index de6e05ef871c..d5e42b958e86 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -207,7 +207,6 @@ static void note_page(struct pg_state *st, unsigned long addr,
 		st->current_flags = flag;
 		st->start_address = addr;
 		st->start_pa = pa;
-		st->last_pa = pa;
 		st->page_size = page_size;
 		pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
 	/*
@@ -247,13 +246,11 @@ static void note_page(struct pg_state *st, unsigned long addr,
 		}
 		st->start_address = addr;
 		st->start_pa = pa;
-		st->last_pa = pa;
 		st->page_size = page_size;
 		st->current_flags = flag;
 		st->level = level;
-	} else {
-		st->last_pa = pa;
 	}
+	st->last_pa = pa;
 }
 
 static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
-- 
2.25.0


^ permalink raw reply related

* [PATCH v1 0/8] powerpc/32s: Allocate modules outside of vmalloc space for STRICT_KERNEL_RWX
From: Christophe Leroy @ 2020-06-19 15:06 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

On book3s32 (hash), exec protection is set per 256Mb segments with NX bit.
Instead of clearing NX bit on vmalloc space when CONFIG_MODULES is selected,
allocate modules in a dedicated segment (0xb0000000-0xbfffffff by default).
This allows to keep exec protection on vmalloc space while allowing exec
on modules.

Christophe Leroy (8):
  powerpc/ptdump: Refactor update of st->last_pa
  powerpc/ptdump: Refactor update of pg_state
  powerpc: Set user/kernel boundary at TASK_SIZE instead of PAGE_OFFSET
  powerpc/lib: Prepare code-patching for modules allocated outside
    vmalloc space
  powerpc: Use MODULES_VADDR if defined
  powerpc/32s: Only leave NX unset on segments used for modules
  powerpc/32s: Kernel space starts at TASK_SIZE
  powerpc/32s: Use dedicated segment for modules with STRICT_KERNEL_RWX

 arch/powerpc/Kconfig                         |  1 +
 arch/powerpc/include/asm/book3s/32/pgtable.h | 15 ++----
 arch/powerpc/include/asm/page.h              |  2 +-
 arch/powerpc/kernel/head_32.S                | 12 ++---
 arch/powerpc/kernel/module.c                 | 11 ++++
 arch/powerpc/lib/code-patching.c             |  2 +-
 arch/powerpc/mm/book3s32/hash_low.S          |  2 +-
 arch/powerpc/mm/book3s32/mmu.c               | 17 +++++--
 arch/powerpc/mm/kasan/kasan_init_32.c        |  6 +++
 arch/powerpc/mm/ptdump/ptdump.c              | 53 ++++++++++++--------
 10 files changed, 78 insertions(+), 43 deletions(-)

-- 
2.25.0


^ permalink raw reply

* [PATCH v1 4/8] powerpc/lib: Prepare code-patching for modules allocated outside vmalloc space
From: Christophe Leroy @ 2020-06-19 15:06 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1592578278.git.christophe.leroy@csgroup.eu>

Use is_vmalloc_or_module_addr() instead of is_vmalloc_addr()

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/lib/code-patching.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 0a051dfeb177..8c3934ea6220 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -93,7 +93,7 @@ static int map_patch_area(void *addr, unsigned long text_poke_addr)
 	unsigned long pfn;
 	int err;
 
-	if (is_vmalloc_addr(addr))
+	if (is_vmalloc_or_module_addr(addr))
 		pfn = vmalloc_to_pfn(addr);
 	else
 		pfn = __pa_symbol(addr) >> PAGE_SHIFT;
-- 
2.25.0


^ permalink raw reply related

* [PATCH v1 2/8] powerpc/ptdump: Refactor update of pg_state
From: Christophe Leroy @ 2020-06-19 15:06 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1592578278.git.christophe.leroy@csgroup.eu>

In note_page(), the pg_state is updated the same way in two places.

Add note_page_update_state() to do it.

Also include the display of boundary markers there as it is missing
"no level" leg, leading to a mismatch when the first two markers
are at the same address and the first displayed area uses that
address.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/ptdump/ptdump.c | 34 +++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index d5e42b958e86..b71cc628facd 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -195,6 +195,24 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
 	st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
 }
 
+static void note_page_update_state(struct pg_state *st, unsigned long addr,
+				   unsigned int level, u64 val, unsigned long page_size)
+{
+	u64 flag = val & pg_level[level].mask;
+	u64 pa = val & PTE_RPN_MASK;
+
+	st->level = level;
+	st->current_flags = flag;
+	st->start_address = addr;
+	st->start_pa = pa;
+	st->page_size = page_size;
+
+	while (addr >= st->marker[1].start_address) {
+		st->marker++;
+		pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+	}
+}
+
 static void note_page(struct pg_state *st, unsigned long addr,
 	       unsigned int level, u64 val, unsigned long page_size)
 {
@@ -203,12 +221,8 @@ static void note_page(struct pg_state *st, unsigned long addr,
 
 	/* At first no level is set */
 	if (!st->level) {
-		st->level = level;
-		st->current_flags = flag;
-		st->start_address = addr;
-		st->start_pa = pa;
-		st->page_size = page_size;
 		pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+		note_page_update_state(st, addr, level, val, page_size);
 	/*
 	 * Dump the section of virtual memory when:
 	 *   - the PTE flags from one entry to the next differs.
@@ -240,15 +254,7 @@ static void note_page(struct pg_state *st, unsigned long addr,
 		 * Address indicates we have passed the end of the
 		 * current section of virtual memory
 		 */
-		while (addr >= st->marker[1].start_address) {
-			st->marker++;
-			pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
-		}
-		st->start_address = addr;
-		st->start_pa = pa;
-		st->page_size = page_size;
-		st->current_flags = flag;
-		st->level = level;
+		note_page_update_state(st, addr, level, val, page_size);
 	}
 	st->last_pa = pa;
 }
-- 
2.25.0


^ permalink raw reply related

* [PATCH v1 8/8] powerpc/32s: Use dedicated segment for modules with STRICT_KERNEL_RWX
From: Christophe Leroy @ 2020-06-19 15:06 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1592578278.git.christophe.leroy@csgroup.eu>

When STRICT_KERNEL_RWX is set, we want to set NX bit on vmalloc
segments. But modules require exec.

Use a dedicated segment for modules. There is not much space
above kernel, and we don't waste vmalloc space to do alignment.
Therefore, we take the segment before PAGE_OFFSET for modules.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/Kconfig                         |  1 +
 arch/powerpc/include/asm/book3s/32/pgtable.h | 15 +++++----------
 arch/powerpc/mm/ptdump/ptdump.c              |  8 ++++++++
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9fa23eb320ff..2ba6ac9da46f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -1198,6 +1198,7 @@ config TASK_SIZE_BOOL
 config TASK_SIZE
 	hex "Size of user task space" if TASK_SIZE_BOOL
 	default "0x80000000" if PPC_8xx
+	default "0xb0000000" if PPC_BOOK3S_32 && STRICT_KERNEL_RWX
 	default "0xc0000000"
 endmenu
 
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 224912432821..36443cda8dcf 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -184,17 +184,7 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
  */
 #define VMALLOC_OFFSET (0x1000000) /* 16M */
 
-/*
- * With CONFIG_STRICT_KERNEL_RWX, kernel segments are set NX. But when modules
- * are used, NX cannot be set on VMALLOC space. So vmalloc VM space and linear
- * memory shall not share segments.
- */
-#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_MODULES)
-#define VMALLOC_START ((ALIGN((long)high_memory, 256L << 20) + VMALLOC_OFFSET) & \
-		       ~(VMALLOC_OFFSET - 1))
-#else
 #define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)))
-#endif
 
 #ifdef CONFIG_KASAN_VMALLOC
 #define VMALLOC_END	ALIGN_DOWN(ioremap_bot, PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
@@ -202,6 +192,11 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
 #define VMALLOC_END	ioremap_bot
 #endif
 
+#ifdef CONFIG_STRICT_KERNEL_RWX
+#define MODULES_END	ALIGN_DOWN(PAGE_OFFSET, SZ_256M)
+#define MODULES_VADDR	(MODULES_END - SZ_256M)
+#endif
+
 #ifndef __ASSEMBLY__
 #include <linux/sched.h>
 #include <linux/threads.h>
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index e995f2e9e9f7..51aab1b7be31 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -74,6 +74,10 @@ struct addr_marker {
 
 static struct addr_marker address_markers[] = {
 	{ 0,	"Start of kernel VM" },
+#ifdef MODULES_VADDR
+	{ 0,	"modules start" },
+	{ 0,	"modules end" },
+#endif
 	{ 0,	"vmalloc() Area" },
 	{ 0,	"vmalloc() End" },
 #ifdef CONFIG_PPC64
@@ -352,6 +356,10 @@ static void populate_markers(void)
 	int i = 0;
 
 	address_markers[i++].start_address = TASK_SIZE;
+#ifdef MODULES_VADDR
+	address_markers[i++].start_address = MODULES_VADDR;
+	address_markers[i++].start_address = MODULES_END;
+#endif
 	address_markers[i++].start_address = VMALLOC_START;
 	address_markers[i++].start_address = VMALLOC_END;
 #ifdef CONFIG_PPC64
-- 
2.25.0


^ permalink raw reply related

* [PATCH v1 7/8] powerpc/32s: Kernel space starts at TASK_SIZE
From: Christophe Leroy @ 2020-06-19 15:06 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1592578278.git.christophe.leroy@csgroup.eu>

Kernel space starts at TASK_SIZE. Select kernel page table
when address is over TASK_SIZE.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/head_32.S       | 12 ++++++------
 arch/powerpc/mm/book3s32/hash_low.S |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 705c042309d8..bbef6ce8322b 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -474,7 +474,7 @@ InstructionTLBMiss:
 	/* Get PTE (linux-style) and check access */
 	mfspr	r3,SPRN_IMISS
 #if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC)
-	lis	r1,PAGE_OFFSET@h		/* check if kernel address */
+	lis	r1, TASK_SIZE@h		/* check if kernel address */
 	cmplw	0,r1,r3
 #endif
 	mfspr	r2, SPRN_SPRG_PGDIR
@@ -484,7 +484,7 @@ InstructionTLBMiss:
 	li	r1,_PAGE_PRESENT | _PAGE_EXEC
 #endif
 #if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC)
-	bge-	112f
+	bgt-	112f
 	lis	r2, (swapper_pg_dir - PAGE_OFFSET)@ha	/* if kernel address, use */
 	addi	r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l	/* kernel page table */
 #endif
@@ -541,7 +541,7 @@ DataLoadTLBMiss:
  */
 	/* Get PTE (linux-style) and check access */
 	mfspr	r3,SPRN_DMISS
-	lis	r1,PAGE_OFFSET@h		/* check if kernel address */
+	lis	r1, TASK_SIZE@h		/* check if kernel address */
 	cmplw	0,r1,r3
 	mfspr	r2, SPRN_SPRG_PGDIR
 #ifdef CONFIG_SWAP
@@ -549,7 +549,7 @@ DataLoadTLBMiss:
 #else
 	li	r1, _PAGE_PRESENT
 #endif
-	bge-	112f
+	bgt-	112f
 	lis	r2, (swapper_pg_dir - PAGE_OFFSET)@ha	/* if kernel address, use */
 	addi	r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l	/* kernel page table */
 112:	rlwimi	r2,r3,12,20,29		/* insert top 10 bits of address */
@@ -621,7 +621,7 @@ DataStoreTLBMiss:
  */
 	/* Get PTE (linux-style) and check access */
 	mfspr	r3,SPRN_DMISS
-	lis	r1,PAGE_OFFSET@h		/* check if kernel address */
+	lis	r1, TASK_SIZE@h		/* check if kernel address */
 	cmplw	0,r1,r3
 	mfspr	r2, SPRN_SPRG_PGDIR
 #ifdef CONFIG_SWAP
@@ -629,7 +629,7 @@ DataStoreTLBMiss:
 #else
 	li	r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT
 #endif
-	bge-	112f
+	bgt-	112f
 	lis	r2, (swapper_pg_dir - PAGE_OFFSET)@ha	/* if kernel address, use */
 	addi	r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l	/* kernel page table */
 112:	rlwimi	r2,r3,12,20,29		/* insert top 10 bits of address */
diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S
index 923ad8f374eb..1690d369688b 100644
--- a/arch/powerpc/mm/book3s32/hash_low.S
+++ b/arch/powerpc/mm/book3s32/hash_low.S
@@ -62,7 +62,7 @@ _GLOBAL(hash_page)
 	isync
 #endif
 	/* Get PTE (linux-style) and check access */
-	lis	r0,KERNELBASE@h		/* check if kernel address */
+	lis	r0, TASK_SIZE@h		/* check if kernel address */
 	cmplw	0,r4,r0
 	ori	r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
 	mfspr	r5, SPRN_SPRG_PGDIR	/* phys page-table root */
-- 
2.25.0


^ permalink raw reply related

* [PATCH v1 5/8] powerpc: Use MODULES_VADDR if defined
From: Christophe Leroy @ 2020-06-19 15:06 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1592578278.git.christophe.leroy@csgroup.eu>

In order to allow allocation of modules outside of vmalloc space,
use MODULES_VADDR and MODULES_END when MODULES_VADDR is defined.

Redefine module_alloc() when MODULES_VADDR defined.
Unmap corresponding KASAN shadow memory.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/module.c          | 11 +++++++++++
 arch/powerpc/mm/kasan/kasan_init_32.c |  6 ++++++
 2 files changed, 17 insertions(+)

diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index df649acb5631..a211b0253cdb 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -86,3 +86,14 @@ int module_finalize(const Elf_Ehdr *hdr,
 
 	return 0;
 }
+
+#ifdef MODULES_VADDR
+void *module_alloc(unsigned long size)
+{
+	BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
+
+	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, GFP_KERNEL,
+				    PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
+				    __builtin_return_address(0));
+}
+#endif
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 0760e1e754e4..f1bc267d42af 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -115,6 +115,12 @@ static void __init kasan_unmap_early_shadow_vmalloc(void)
 	unsigned long k_end = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_END);
 
 	kasan_update_early_region(k_start, k_end, __pte(0));
+
+#ifdef MODULES_VADDR
+	k_start = (unsigned long)kasan_mem_to_shadow((void *)MODULES_VADDR);
+	k_end = (unsigned long)kasan_mem_to_shadow((void *)MODULES_END);
+	kasan_update_early_region(k_start, k_end, __pte(0));
+#endif
 }
 
 static void __init kasan_mmu_init(void)
-- 
2.25.0


^ permalink raw reply related

* [PATCH v1 6/8] powerpc/32s: Only leave NX unset on segments used for modules
From: Christophe Leroy @ 2020-06-19 15:06 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1592578278.git.christophe.leroy@csgroup.eu>

Instead of leaving NX unset on all segments above the start
of vmalloc space, only leave NX unset on segments used for
modules.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/book3s32/mmu.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 03b6ba54460e..c0162911f6cb 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -187,6 +187,17 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
 	return __mmu_mapin_ram(border, top);
 }
 
+static bool is_module_segment(unsigned long addr)
+{
+	if (!IS_ENABLED(CONFIG_MODULES))
+		return false;
+	if (addr < ALIGN_DOWN(VMALLOC_START, SZ_256M))
+		return false;
+	if (addr >= ALIGN(VMALLOC_END, SZ_256M))
+		return false;
+	return true;
+}
+
 void mmu_mark_initmem_nx(void)
 {
 	int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
@@ -223,9 +234,9 @@ void mmu_mark_initmem_nx(void)
 
 	for (i = TASK_SIZE >> 28; i < 16; i++) {
 		/* Do not set NX on VM space for modules */
-		if (IS_ENABLED(CONFIG_MODULES) &&
-		    (VMALLOC_START & 0xf0000000) == i << 28)
-			break;
+		if (is_module_segment(i << 28))
+			continue;
+
 		mtsrin(mfsrin(i << 28) | 0x10000000, i << 28);
 	}
 }
-- 
2.25.0


^ permalink raw reply related

* [PATCH] powerpc/pseries: new lparcfg key/value pair: partition_affinity_score
From: Scott Cheloha @ 2020-06-19 15:34 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nathan Lynch, Tyrel Datwyler

The H_GetPerformanceCounterInfo PHYP hypercall has a subcall,
Affinity_Domain_Info_By_Partition, which returns, among other things,
a "partition affinity score" for a given LPAR.  This score, a value on
[0-100], represents the processor-memory affinity for the LPAR in
question.  A score of 0 indicates the worst possible affinity while a
score of 100 indicates perfect affinity.  The score can be used to
reason about performance.

This patch adds the score for the local LPAR to the lparcfg procfile
under a new 'partition_affinity_score' key.

The H_GetPerformanceCounterInfo hypercall is already used elsewhere in
the kernel, in powerpc/perf/hv-gpci.c.  Refactoring that code and this
code into a more general API might be worthwhile if additional modules
require the hypercall in the future.

Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
---
 arch/powerpc/platforms/pseries/lparcfg.c | 53 ++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
index b8d28ab88178..b75151eee0f0 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -136,6 +136,57 @@ static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
 	return rc;
 }
 
+/*
+ * Based on H_GetPerformanceCounterInfo v1.10.
+ */
+static void show_gpci_data(struct seq_file *m)
+{
+	struct perf_counter_info_params {
+		__be32 counter_request;
+		__be32 starting_index;
+		__be16 secondary_index;
+		__be16 returned_values;
+		__be32 detail_rc;
+		__be16 counter_value_element_size;
+		u8     counter_info_version_in;
+		u8     counter_info_version_out;
+		u8     reserved[0xC];
+	} __packed;
+	struct hv_gpci_request_buffer {
+		struct perf_counter_info_params params;
+		u8 output[4096 - sizeof(struct perf_counter_info_params)];
+	} __packed;
+	struct hv_gpci_request_buffer *buf;
+	long ret;
+	unsigned int affinity_score;
+
+	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+	if (buf == NULL)
+		return;
+
+	/*
+	 * Show the local LPAR's affinity score.
+	 *
+	 * 0xB1 selects the Affinity_Domain_Info_By_Partition subcall.
+	 * The score is at byte 0xB in the output buffer.
+	 */
+	memset(&buf->params, 0, sizeof(buf->params));
+	buf->params.counter_request = cpu_to_be32(0xB1);
+	buf->params.starting_index = cpu_to_be32(-1);	/* local LPAR */
+	buf->params.counter_info_version_in = 0x5;	/* v5+ for score */
+	ret = plpar_hcall_norets(H_GET_PERF_COUNTER_INFO, virt_to_phys(buf),
+				 sizeof(*buf));
+	if (ret != H_SUCCESS) {
+		pr_debug("hcall failed: H_GET_PERF_COUNTER_INFO: %ld, %x\n",
+			 ret, be32_to_cpu(buf->params.detail_rc));
+		goto out;
+	}
+	affinity_score = buf->output[0xB];
+	seq_printf(m, "partition_affinity_score=%u\n", affinity_score);
+out:
+	kfree(buf);
+}
+
 static unsigned h_pic(unsigned long *pool_idle_time,
 		      unsigned long *num_procs)
 {
@@ -487,6 +538,8 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 			   partition_active_processors * 100);
 	}
 
+	show_gpci_data(m);
+
 	seq_printf(m, "partition_active_processors=%d\n",
 		   partition_active_processors);
 
-- 
2.24.1


^ permalink raw reply related

* [PATCH 16/26] mm/powerpc: Use general page fault accounting
From: Peter Xu @ 2020-06-19 16:13 UTC (permalink / raw)
  To: linux-kernel, linux-mm
  Cc: Andrea Arcangeli, linuxppc-dev, Peter Xu, Linus Torvalds,
	Paul Mackerras, Andrew Morton, Will Deacon, Gerald Schaefer
In-Reply-To: <20200619160538.8641-1-peterx@redhat.com>

Use the general page fault accounting by passing regs into handle_mm_fault().

CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 arch/powerpc/mm/fault.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 992b10c3761c..e325d13efaf5 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -563,7 +563,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	fault = handle_mm_fault(vma, address, flags, NULL);
+	fault = handle_mm_fault(vma, address, flags, regs);
 
 #ifdef CONFIG_PPC_MEM_KEYS
 	/*
@@ -604,14 +604,9 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
 	/*
 	 * Major/minor page fault accounting.
 	 */
-	if (major) {
-		current->maj_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+	if (major)
 		cmo_account_page_fault();
-	} else {
-		current->min_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
-	}
+
 	return 0;
 }
 NOKPROBE_SYMBOL(__do_page_fault);
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH] pci: pcie: AER: Fix logging of Correctable errors
From: Sinan Kaya @ 2020-06-19 17:17 UTC (permalink / raw)
  To: Matt Jolly, Russell Currey, Sam Bobroff, Oliver O'Halloran,
	Bjorn Helgaas, linuxppc-dev, linux-pci, linux-kernel
In-Reply-To: <20200618155511.16009-1-Kangie@footclan.ninja>

On 6/18/2020 11:55 AM, Matt Jolly wrote:

> +		pci_warn(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> +			dev->vendor, dev->device,
> +			info->status, info->mask);
> +	} else {

<snip>

> +		pci_err(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> +			dev->vendor, dev->device,
> +			info->status, info->mask);


Function pointers for pci_warn vs. pci_err ?

This looks like a lot of copy/paste.

^ permalink raw reply

* [PATCH] hvc: unify console setup naming
From: Sergey Senozhatsky @ 2020-06-19 17:22 UTC (permalink / raw)
  To: Petr Mladek
  Cc: Greg Kroah-Hartman, linux-kernel, Steven Rostedt,
	Sergey Senozhatsky, Jiri Slaby, Andy Shevchenko, linuxppc-dev

Use the 'common' foo_console_setup() naming scheme. There are 71
foo_console_setup() callbacks and only one foo_setup_console().

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jiri Slaby <jslaby@suse.com>
---
 drivers/tty/hvc/hvc_xen.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c
index 5ef08905fe05..2a0e51a20e34 100644
--- a/drivers/tty/hvc/hvc_xen.c
+++ b/drivers/tty/hvc/hvc_xen.c
@@ -603,7 +603,7 @@ static void xen_hvm_early_write(uint32_t vtermno, const char *str, int len) { }
 #endif
 
 #ifdef CONFIG_EARLY_PRINTK
-static int __init xenboot_setup_console(struct console *console, char *string)
+static int __init xenboot_console_setup(struct console *console, char *string)
 {
 	static struct xencons_info xenboot;
 
@@ -647,7 +647,7 @@ static void xenboot_write_console(struct console *console, const char *string,
 struct console xenboot_console = {
 	.name		= "xenboot",
 	.write		= xenboot_write_console,
-	.setup		= xenboot_setup_console,
+	.setup		= xenboot_console_setup,
 	.flags		= CON_PRINTBUFFER | CON_BOOT | CON_ANYTIME,
 	.index		= -1,
 };
-- 
2.27.0


^ permalink raw reply related

* Re: [PATCH] pci: pcie: AER: Fix logging of Correctable errors
From: Joe Perches @ 2020-06-19 18:09 UTC (permalink / raw)
  To: Sinan Kaya, Matt Jolly, Russell Currey, Sam Bobroff,
	Oliver O'Halloran, Bjorn Helgaas, linuxppc-dev, linux-pci,
	linux-kernel
In-Reply-To: <d611dabd-b943-8492-a29e-0b7fb1980de8@kernel.org>

On Fri, 2020-06-19 at 13:17 -0400, Sinan Kaya wrote:
> On 6/18/2020 11:55 AM, Matt Jolly wrote:
> 
> > +		pci_warn(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> > +			dev->vendor, dev->device,
> > +			info->status, info->mask);
> > +	} else {
> 
> <snip>
> 
> > +		pci_err(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> > +			dev->vendor, dev->device,
> > +			info->status, info->mask);
> 
> Function pointers for pci_warn vs. pci_err ?

Not really possible as both are function-like macros.



^ permalink raw reply

* Re: [PATCH 00/22] ReST conversion patches (final?)
From: Jonathan Corbet @ 2020-06-19 20:20 UTC (permalink / raw)
  To: Mauro Carvalho Chehab
  Cc: Rich Felker, Linux Doc Mailing List, David Airlie,
	Catalin Marinas, Dragan Cvetic, linux-pci, Jarkko Sakkinen,
	Bjorn Andersson, David Howells, linux-mm, Harry Wei,
	Paul Mackerras, Alex Shi, Will Deacon, Javi Merino, Herbert Xu,
	Yoshinori Sato, linux-sh, Anil S Keshavamurthy, Viresh Kumar,
	Naveen N. Rao, Derek Kiernan, linux-crypto, Ohad Ben-Cohen,
	devicetree, Daniel Vetter, Michael Hennerich, linux-pm,
	linux-remoteproc, Maarten Lankhorst, Maxime Ripard, Rob Herring,
	dri-devel, Bjorn Helgaas, Dan Williams, linux-arm-kernel,
	Daniel Lezcano, Greg Kroah-Hartman, Amit Daniel Kachhap,
	linux-kernel, David S. Miller, tee-dev, Vinod Koul, keyrings,
	Arnd Bergmann, Masami Hiramatsu, Thomas Zimmermann, dmaengine,
	Andrew Morton, linuxppc-dev, Jens Wiklander
In-Reply-To: <cover.1592203650.git.mchehab+huawei@kernel.org>

On Mon, 15 Jun 2020 08:50:05 +0200
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:

> That's my final(*) series of conversion patches from .txt to ReST.
> 
> (*) Well, running the script I'm using to check, I noticed a couple of new *.txt files.
> If I have some time, I'll try to address those last pending things for v5.9.

OK, I've applied the set except for parts:

 1: |copy| as mentioned before
 18: because of the license boilerplate
 19: doesn't apply at all (perhaps because of one of the above)
 22: because I don't like the latex markup.

Also, I took the liberty of just reverting the |copy| change in #10.

Getting there..!

Thanks,

jon

^ permalink raw reply

* Re: [PATCH] powerpc/pseries: new lparcfg key/value pair: partition_affinity_score
From: Tyrel Datwyler @ 2020-06-19 20:38 UTC (permalink / raw)
  To: Scott Cheloha, linuxppc-dev; +Cc: Nathan Lynch
In-Reply-To: <20200619153419.3676392-1-cheloha@linux.ibm.com>

On 6/19/20 8:34 AM, Scott Cheloha wrote:
> The H_GetPerformanceCounterInfo PHYP hypercall has a subcall,
> Affinity_Domain_Info_By_Partition, which returns, among other things,
> a "partition affinity score" for a given LPAR.  This score, a value on
> [0-100], represents the processor-memory affinity for the LPAR in
> question.  A score of 0 indicates the worst possible affinity while a
> score of 100 indicates perfect affinity.  The score can be used to
> reason about performance.
> 
> This patch adds the score for the local LPAR to the lparcfg procfile
> under a new 'partition_affinity_score' key.

I expect that you will probably get a NACK from Michael on this. The overall
desire is to move away from these dated /proc interfaces. While its true that I
did add a new value recently it was strictly to facilitate and correct the
calculation of a derived value that was already dependent on a couple other
existing values in lparcfg.

With that said I would expect that you would likely be advised to expose this as
a sysfs attribute. The question is where? We probably should put some thought in
to this as I would like to port each lparcfg value over to sysfs so that we can
move to deprecating lparcfg. Putting everything under something like
/sys/kernel/lparcfg/* maybe. Michael may have a better suggestion.

> 
> The H_GetPerformanceCounterInfo hypercall is already used elsewhere in
> the kernel, in powerpc/perf/hv-gpci.c.  Refactoring that code and this
> code into a more general API might be worthwhile if additional modules
> require the hypercall in the future.

If you are duplicating code its likely you should already be doing this. See the
rest of my comments about below.

> 
> Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
> ---
>  arch/powerpc/platforms/pseries/lparcfg.c | 53 ++++++++++++++++++++++++
>  1 file changed, 53 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
> index b8d28ab88178..b75151eee0f0 100644
> --- a/arch/powerpc/platforms/pseries/lparcfg.c
> +++ b/arch/powerpc/platforms/pseries/lparcfg.c
> @@ -136,6 +136,57 @@ static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
>  	return rc;
>  }
>  
> +/*
> + * Based on H_GetPerformanceCounterInfo v1.10.
> + */
> +static void show_gpci_data(struct seq_file *m)
> +{
> +	struct perf_counter_info_params {
> +		__be32 counter_request;
> +		__be32 starting_index;
> +		__be16 secondary_index;
> +		__be16 returned_values;
> +		__be32 detail_rc;
> +		__be16 counter_value_element_size;
> +		u8     counter_info_version_in;
> +		u8     counter_info_version_out;
> +		u8     reserved[0xC];
> +	} __packed;

This looks to duplicate the hv_get_perf_counter_info_params struct from
arch/powerpc/perf/hv-gpci.h. Maybe this include file needs to move to
arch/powerpc/asm/inlcude so you don't have to redefine this struct.

> +	struct hv_gpci_request_buffer {
> +		struct perf_counter_info_params params;
> +		u8 output[4096 - sizeof(struct perf_counter_info_params)];
> +	} __packed;

This struct is code duplication of the one defined in
arch/powerpc/perf/hv-gpci.c and could be moved into hv-gpci.h along with
HGPCI_MAX_DATA_BYTES so that you can use those versions here.

> +	struct hv_gpci_request_buffer *buf;
> +	long ret;
> +	unsigned int affinity_score;
> +
> +	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
> +	if (buf == NULL)
> +		return;
> +
> +	/*
> +	 * Show the local LPAR's affinity score.
> +	 *
> +	 * 0xB1 selects the Affinity_Domain_Info_By_Partition subcall.
> +	 * The score is at byte 0xB in the output buffer.
> +	 */
> +	memset(&buf->params, 0, sizeof(buf->params));
> +	buf->params.counter_request = cpu_to_be32(0xB1);
> +	buf->params.starting_index = cpu_to_be32(-1);	/* local LPAR */
> +	buf->params.counter_info_version_in = 0x5;	/* v5+ for score */
> +	ret = plpar_hcall_norets(H_GET_PERF_COUNTER_INFO, virt_to_phys(buf),
> +				 sizeof(*buf));
> +	if (ret != H_SUCCESS) {
> +		pr_debug("hcall failed: H_GET_PERF_COUNTER_INFO: %ld, %x\n",
> +			 ret, be32_to_cpu(buf->params.detail_rc));
> +		goto out;
> +	}
> +	affinity_score = buf->output[0xB];
> +	seq_printf(m, "partition_affinity_score=%u\n", affinity_score);
> +out:
> +	kfree(buf);
> +}
> +

IIUC we should already be able to get this value from userspace using perf tool,
right? If thats the case can't we also programatically retrieve it via the
perf_event interface in userspace as well?

-Tyrel

>  static unsigned h_pic(unsigned long *pool_idle_time,
>  		      unsigned long *num_procs)
>  {
> @@ -487,6 +538,8 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
>  			   partition_active_processors * 100);
>  	}
>  
> +	show_gpci_data(m);
> +
>  	seq_printf(m, "partition_active_processors=%d\n",
>  		   partition_active_processors);
>  
> 


^ permalink raw reply

* Re: [PATCH 6/6] kernel: add a kernel_wait helper
From: Luis Chamberlain @ 2020-06-19 21:17 UTC (permalink / raw)
  To: Christoph Hellwig, Andrew Morton, Greg Kroah-Hartman
  Cc: linux-arch, linux-s390, linux-parisc, Arnd Bergmann, Brian Gerst,
	x86, linux-mips, linux-kernel, linux-fsdevel, Al Viro, sparclinux,
	linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200618144627.114057-7-hch@lst.de>

On Thu, Jun 18, 2020 at 04:46:27PM +0200, Christoph Hellwig wrote:
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -1626,6 +1626,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
>  	return ret;
>  }
>  
> +int kernel_wait(pid_t pid, int *stat)
> +{
> +	struct wait_opts wo = {
> +		.wo_type	= PIDTYPE_PID,
> +		.wo_pid		= find_get_pid(pid),
> +		.wo_flags	= WEXITED,
> +	};
> +	int ret;
> +
> +	ret = do_wait(&wo);
> +	if (ret > 0 && wo.wo_stat)
> +		*stat = wo.wo_stat;

Since all we care about is WEXITED, that could be simplified
to something like this:

if (ret > 0 && KWIFEXITED(wo.wo_stat)
 	*stat = KWEXITSTATUS(wo.wo_stat)

Otherwise callers have to use W*() wrappers.

> +	put_pid(wo.wo_pid);
> +	return ret;
> +}

Then we don't get *any* in-kernel code dealing with the W*() crap.
I just unwrapped this for the umh [0], given that otherwise we'd
have to use KW*() callers elsewhere. Doing it upshot one level
further would be even better.

[0] https://lkml.kernel.org/r/20200610154923.27510-1-mcgrof@kernel.org              

  Luis

^ permalink raw reply

* [PATCH v3 0/4] Migrate non-migrated pages of a SVM.
From: Ram Pai @ 2020-06-19 22:43 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david

The time taken to switch a VM to Secure-VM, increases by the size of the VM.  A
100GB VM takes about 7minutes. This is unacceptable.  This linear increase is
caused by a suboptimal behavior by the Ultravisor and the Hypervisor.  The
Ultravisor unnecessarily migrates all the GFN of the VM from normal-memory to
secure-memory. It has to just migrate the necessary and sufficient GFNs.

However when the optimization is incorporated in the Ultravisor, the Hypervisor
starts misbehaving. The Hypervisor has a inbuilt assumption that the Ultravisor
will explicitly request to migrate, each and every GFN of the VM. If only
necessary and sufficient GFNs are requested for migration, the Hypervisor
continues to manage the remaining GFNs as normal GFNs. This leads of memory
corruption, manifested consistently when the SVM reboots.

The same is true, when a memory slot is hotplugged into a SVM. The Hypervisor
expects the ultravisor to request migration of all GFNs to secure-GFN.  But at
the same time, the hypervisor is unable to handle any H_SVM_PAGE_IN requests
from the Ultravisor, done in the context of UV_REGISTER_MEM_SLOT ucall.  This
problem manifests as random errors in the SVM, when a memory-slot is
hotplugged.

This patch series automatically migrates the non-migrated pages of a SVM,
     and thus solves the problem.

Testing: Passed rigorous SVM test using various sized SVMs.

Changelog:

v2: . fixed a bug observed by Laurent. The state of the GFN's associated
	with Secure-VMs were not reset during memslot flush.
    . Re-organized the code, for easier review.
    . Better description of the patch series.

v1: fixed a bug observed by Bharata. Pages that where paged-in and later
paged-out must also be skipped from migration during H_SVM_INIT_DONE.

Laurent Dufour (1):
  KVM: PPC: Book3S HV: migrate hot plugged memory

Ram Pai (3):
  KVM: PPC: Book3S HV: Fix function definition in book3s_hv_uvmem.c
  KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs
  KVM: PPC: Book3S HV: migrate remaining normal-GFNs to secure-GFNs in
    H_SVM_INIT_DONE

 Documentation/powerpc/ultravisor.rst        |   2 +
 arch/powerpc/include/asm/kvm_book3s_uvmem.h |   2 +
 arch/powerpc/kvm/book3s_hv.c                |  10 +-
 arch/powerpc/kvm/book3s_hv_uvmem.c          | 360 +++++++++++++++++++++++-----
 4 files changed, 315 insertions(+), 59 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* [PATCH v3 1/4] KVM: PPC: Book3S HV: Fix function definition in book3s_hv_uvmem.c
From: Ram Pai @ 2020-06-19 22:43 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1592606622-29884-1-git-send-email-linuxram@us.ibm.com>

Without this fix, git is confused. It generates wrong
function context for code changes in subsequent patches.
Weird, but true.

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index ad950f89..3599aaa 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -369,8 +369,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
  * Alloc a PFN from private device memory pool and copy page from normal
  * memory to secure memory using UV_PAGE_IN uvcall.
  */
-static int
-kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
+static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 		   unsigned long end, unsigned long gpa, struct kvm *kvm,
 		   unsigned long page_shift, bool *downgrade)
 {
@@ -437,8 +436,8 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
  * In the former case, uses dev_pagemap_ops.migrate_to_ram handler
  * to unmap the device page from QEMU's page tables.
  */
-static unsigned long
-kvmppc_share_page(struct kvm *kvm, unsigned long gpa, unsigned long page_shift)
+static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
+		unsigned long page_shift)
 {
 
 	int ret = H_PARAMETER;
@@ -487,9 +486,9 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
  * H_PAGE_IN_SHARED flag makes the page shared which means that the same
  * memory in is visible from both UV and HV.
  */
-unsigned long
-kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
-		     unsigned long flags, unsigned long page_shift)
+unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
+		unsigned long flags,
+		unsigned long page_shift)
 {
 	bool downgrade = false;
 	unsigned long start, end;
@@ -546,10 +545,10 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
  * Provision a new page on HV side and copy over the contents
  * from secure memory using UV_PAGE_OUT uvcall.
  */
-static int
-kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start,
-		    unsigned long end, unsigned long page_shift,
-		    struct kvm *kvm, unsigned long gpa)
+static int kvmppc_svm_page_out(struct vm_area_struct *vma,
+		unsigned long start,
+		unsigned long end, unsigned long page_shift,
+		struct kvm *kvm, unsigned long gpa)
 {
 	unsigned long src_pfn, dst_pfn = 0;
 	struct migrate_vma mig;
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v3 2/4] KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs
From: Ram Pai @ 2020-06-19 22:43 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1592606622-29884-1-git-send-email-linuxram@us.ibm.com>

During the life of SVM, its GFNs transition through normal, secure and
shared states. Since the kernel does not track GFNs that are shared, it
is not possible to disambiguate a shared GFN from a GFN whose PFN has
not yet been migrated to a secure-PFN. Also it is not possible to
disambiguate a secure-GFN from a GFN whose GFN has been pagedout from
the ultravisor.

The ability to identify the state of a GFN is needed to skip migration
of its PFN to secure-PFN during ESM transition.

The code is re-organized to track the states of a GFN as explained
below.

************************************************************************
 1. States of a GFN
    ---------------
 The GFN can be in one of the following states.

 (a) Secure - The GFN is secure. The GFN is associated with
 	a Secure VM, the contents of the GFN is not accessible
 	to the Hypervisor.  This GFN can be backed by a secure-PFN,
 	or can be backed by a normal-PFN with contents encrypted.
 	The former is true when the GFN is paged-in into the
 	ultravisor. The latter is true when the GFN is paged-out
 	of the ultravisor.

 (b) Shared - The GFN is shared. The GFN is associated with a
 	a secure VM. The contents of the GFN is accessible to
 	Hypervisor. This GFN is backed by a normal-PFN and its
 	content is un-encrypted.

 (c) Normal - The GFN is a normal. The GFN is associated with
 	a normal VM. The contents of the GFN is accesible to
 	the Hypervisor. Its content is never encrypted.

 2. States of a VM.
    ---------------

 (a) Normal VM:  A VM whose contents are always accessible to
 	the hypervisor.  All its GFNs are normal-GFNs.

 (b) Secure VM: A VM whose contents are not accessible to the
 	hypervisor without the VM's consent.  Its GFNs are
 	either Shared-GFN or Secure-GFNs.

 (c) Transient VM: A Normal VM that is transitioning to secure VM.
 	The transition starts on successful return of
 	H_SVM_INIT_START, and ends on successful return
 	of H_SVM_INIT_DONE. This transient VM, can have GFNs
 	in any of the three states; i.e Secure-GFN, Shared-GFN,
 	and Normal-GFN.	The VM never executes in this state
 	in supervisor-mode.

 3. Memory slot State.
    ------------------
  	The state of a memory slot mirrors the state of the
  	VM the memory slot is associated with.

 4. VM State transition.
    --------------------

  A VM always starts in Normal Mode.

  H_SVM_INIT_START moves the VM into transient state. During this
  time the Ultravisor may request some of its GFNs to be shared or
  secured. So its GFNs can be in one of the three GFN states.

  H_SVM_INIT_DONE moves the VM entirely from transient state to
  secure-state. At this point any left-over normal-GFNs are
  transitioned to Secure-GFN.

  H_SVM_INIT_ABORT moves the transient VM back to normal VM.
  All its GFNs are moved to Normal-GFNs.

  UV_TERMINATE transitions the secure-VM back to normal-VM. All
  the secure-GFN and shared-GFNs are tranistioned to normal-GFN
  Note: The contents of the normal-GFN is undefined at this point.

 5. GFN state implementation:
    -------------------------

 Secure GFN is associated with a secure-PFN; also called uvmem_pfn,
 when the GFN is paged-in. Its pfn[] has KVMPPC_GFN_UVMEM_PFN flag
 set, and contains the value of the secure-PFN.
 It is associated with a normal-PFN; also called mem_pfn, when
 the GFN is pagedout. Its pfn[] has KVMPPC_GFN_MEM_PFN flag set.
 The value of the normal-PFN is not tracked.

 Shared GFN is associated with a normal-PFN. Its pfn[] has
 KVMPPC_UVMEM_SHARED_PFN flag set. The value of the normal-PFN
 is not tracked.

 Normal GFN is associated with normal-PFN. Its pfn[] has
 no flag set. The value of the normal-PFN is not tracked.

 6. Life cycle of a GFN
    --------------------
 --------------------------------------------------------------
 |        |     Share  |  Unshare | SVM       |H_SVM_INIT_DONE|
 |        |operation   |operation | abort/    |               |
 |        |            |          | terminate |               |
 -------------------------------------------------------------
 |        |            |          |           |               |
 | Secure |     Shared | Secure   |Normal     |Secure         |
 |        |            |          |           |               |
 | Shared |     Shared | Secure   |Normal     |Shared         |
 |        |            |          |           |               |
 | Normal |     Shared | Secure   |Normal     |Secure         |
 --------------------------------------------------------------

 7. Life cycle of a VM
    --------------------
 --------------------------------------------------------------------
 |         |  start    |  H_SVM_  |H_SVM_   |H_SVM_     |UV_SVM_    |
 |         |  VM       |INIT_START|INIT_DONE|INIT_ABORT |TERMINATE  |
 |         |           |          |         |           |           |
 --------- ----------------------------------------------------------
 |         |           |          |         |           |           |
 | Normal  | Normal    | Transient|Error    |Error      |Normal     |
 |         |           |          |         |           |           |
 | Secure  |   Error   | Error    |Error    |Error      |Normal     |
 |         |           |          |         |           |           |
 |Transient|   N/A     | Error    |Secure   |Normal     |Normal     |
 --------------------------------------------------------------------

************************************************************************

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 187 +++++++++++++++++++++++++++++++++----
 1 file changed, 168 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 3599aaa..c8c0290 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -98,7 +98,127 @@
 static unsigned long *kvmppc_uvmem_bitmap;
 static DEFINE_SPINLOCK(kvmppc_uvmem_bitmap_lock);
 
-#define KVMPPC_UVMEM_PFN	(1UL << 63)
+/*
+ * States of a GFN
+ * ---------------
+ * The GFN can be in one of the following states.
+ *
+ * (a) Secure - The GFN is secure. The GFN is associated with
+ *	a Secure VM, the contents of the GFN is not accessible
+ *	to the Hypervisor.  This GFN can be backed by a secure-PFN,
+ *	or can be backed by a normal-PFN with contents encrypted.
+ *	The former is true when the GFN is paged-in into the
+ *	ultravisor. The latter is true when the GFN is paged-out
+ *	of the ultravisor.
+ *
+ * (b) Shared - The GFN is shared. The GFN is associated with a
+ *	a secure VM. The contents of the GFN is accessible to
+ *	Hypervisor. This GFN is backed by a normal-PFN and its
+ *	content is un-encrypted.
+ *
+ * (c) Normal - The GFN is a normal. The GFN is associated with
+ *	a normal VM. The contents of the GFN is accesible to
+ *	the Hypervisor. Its content is never encrypted.
+ *
+ * States of a VM.
+ * ---------------
+ *
+ * Normal VM:  A VM whose contents are always accessible to
+ *	the hypervisor.  All its GFNs are normal-GFNs.
+ *
+ * Secure VM: A VM whose contents are not accessible to the
+ *	hypervisor without the VM's consent.  Its GFNs are
+ *	either Shared-GFN or Secure-GFNs.
+ *
+ * Transient VM: A Normal VM that is transitioning to secure VM.
+ *	The transition starts on successful return of
+ *	H_SVM_INIT_START, and ends on successful return
+ *	of H_SVM_INIT_DONE. This transient VM, can have GFNs
+ *	in any of the three states; i.e Secure-GFN, Shared-GFN,
+ *	and Normal-GFN.	The VM never executes in this state
+ *	in supervisor-mode.
+ *
+ * Memory slot State.
+ * -----------------------------
+ *	The state of a memory slot mirrors the state of the
+ *	VM the memory slot is associated with.
+ *
+ * VM State transition.
+ * --------------------
+ *
+ *  A VM always starts in Normal Mode.
+ *
+ *  H_SVM_INIT_START moves the VM into transient state. During this
+ *  time the Ultravisor may request some of its GFNs to be shared or
+ *  secured. So its GFNs can be in one of the three GFN states.
+ *
+ *  H_SVM_INIT_DONE moves the VM entirely from transient state to
+ *  secure-state. At this point any left-over normal-GFNs are
+ *  transitioned to Secure-GFN.
+ *
+ *  H_SVM_INIT_ABORT moves the transient VM back to normal VM.
+ *  All its GFNs are moved to Normal-GFNs.
+ *
+ *  UV_TERMINATE transitions the secure-VM back to normal-VM. All
+ *  the secure-GFN and shared-GFNs are tranistioned to normal-GFN
+ *  Note: The contents of the normal-GFN is undefined at this point.
+ *
+ * GFN state implementation:
+ * -------------------------
+ *
+ * Secure GFN is associated with a secure-PFN; also called uvmem_pfn,
+ * when the GFN is paged-in. Its pfn[] has KVMPPC_GFN_UVMEM_PFN flag
+ * set, and contains the value of the secure-PFN.
+ * It is associated with a normal-PFN; also called mem_pfn, when
+ * the GFN is pagedout. Its pfn[] has KVMPPC_GFN_MEM_PFN flag set.
+ * The value of the normal-PFN is not tracked.
+ *
+ * Shared GFN is associated with a normal-PFN. Its pfn[] has
+ * KVMPPC_UVMEM_SHARED_PFN flag set. The value of the normal-PFN
+ * is not tracked.
+ *
+ * Normal GFN is associated with normal-PFN. Its pfn[] has
+ * no flag set. The value of the normal-PFN is not tracked.
+ *
+ * Life cycle of a GFN
+ * --------------------
+ *
+ * --------------------------------------------------------------
+ * |        |     Share  |  Unshare | SVM       |H_SVM_INIT_DONE|
+ * |        |operation   |operation | abort/    |               |
+ * |        |            |          | terminate |               |
+ * -------------------------------------------------------------
+ * |        |            |          |           |               |
+ * | Secure |     Shared | Secure   |Normal     |Secure         |
+ * |        |            |          |           |               |
+ * | Shared |     Shared | Secure   |Normal     |Shared         |
+ * |        |            |          |           |               |
+ * | Normal |     Shared | Secure   |Normal     |Secure         |
+ * --------------------------------------------------------------
+ *
+ * Life cycle of a VM
+ * --------------------
+ *
+ * --------------------------------------------------------------------
+ * |         |  start    |  H_SVM_  |H_SVM_   |H_SVM_     |UV_SVM_    |
+ * |         |  VM       |INIT_START|INIT_DONE|INIT_ABORT |TERMINATE  |
+ * |         |           |          |         |           |           |
+ * --------- ----------------------------------------------------------
+ * |         |           |          |         |           |           |
+ * | Normal  | Normal    | Transient|Error    |Error      |Normal     |
+ * |         |           |          |         |           |           |
+ * | Secure  |   Error   | Error    |Error    |Error      |Normal     |
+ * |         |           |          |         |           |           |
+ * |Transient|   N/A     | Error    |Secure   |Normal     |Normal     |
+ * --------------------------------------------------------------------
+ */
+
+#define KVMPPC_GFN_UVMEM_PFN	(1UL << 63)
+#define KVMPPC_GFN_MEM_PFN	(1UL << 62)
+#define KVMPPC_GFN_SHARED	(1UL << 61)
+#define KVMPPC_GFN_SECURE	(KVMPPC_GFN_UVMEM_PFN | KVMPPC_GFN_MEM_PFN)
+#define KVMPPC_GFN_FLAG_MASK	(KVMPPC_GFN_SECURE | KVMPPC_GFN_SHARED)
+#define KVMPPC_GFN_PFN_MASK	(~KVMPPC_GFN_FLAG_MASK)
 
 struct kvmppc_uvmem_slot {
 	struct list_head list;
@@ -106,11 +226,11 @@ struct kvmppc_uvmem_slot {
 	unsigned long base_pfn;
 	unsigned long *pfns;
 };
-
 struct kvmppc_uvmem_page_pvt {
 	struct kvm *kvm;
 	unsigned long gpa;
 	bool skip_page_out;
+	bool remove_gfn;
 };
 
 int kvmppc_uvmem_slot_init(struct kvm *kvm, const struct kvm_memory_slot *slot)
@@ -154,8 +274,8 @@ void kvmppc_uvmem_slot_free(struct kvm *kvm, const struct kvm_memory_slot *slot)
 	mutex_unlock(&kvm->arch.uvmem_lock);
 }
 
-static void kvmppc_uvmem_pfn_insert(unsigned long gfn, unsigned long uvmem_pfn,
-				    struct kvm *kvm)
+static void kvmppc_mark_gfn(unsigned long gfn, struct kvm *kvm,
+			unsigned long flag, unsigned long uvmem_pfn)
 {
 	struct kvmppc_uvmem_slot *p;
 
@@ -163,24 +283,41 @@ static void kvmppc_uvmem_pfn_insert(unsigned long gfn, unsigned long uvmem_pfn,
 		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
 			unsigned long index = gfn - p->base_pfn;
 
-			p->pfns[index] = uvmem_pfn | KVMPPC_UVMEM_PFN;
+			if (flag == KVMPPC_GFN_UVMEM_PFN)
+				p->pfns[index] = uvmem_pfn | flag;
+			else
+				p->pfns[index] = flag;
 			return;
 		}
 	}
 }
 
-static void kvmppc_uvmem_pfn_remove(unsigned long gfn, struct kvm *kvm)
+/* mark the GFN as secure-GFN associated with @uvmem pfn device-PFN. */
+static void kvmppc_gfn_secure_uvmem_pfn(unsigned long gfn,
+			unsigned long uvmem_pfn, struct kvm *kvm)
 {
-	struct kvmppc_uvmem_slot *p;
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_UVMEM_PFN, uvmem_pfn);
+}
 
-	list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) {
-		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
-			p->pfns[gfn - p->base_pfn] = 0;
-			return;
-		}
-	}
+/* mark the GFN as secure-GFN associated with a memory-PFN. */
+static void kvmppc_gfn_secure_mem_pfn(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_MEM_PFN, 0);
+}
+
+/* mark the GFN as a shared GFN. */
+static void kvmppc_gfn_shared(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_SHARED, 0);
+}
+
+/* mark the GFN as a non-existent GFN. */
+static void kvmppc_gfn_remove(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, 0, 0);
 }
 
+/* return true, if the GFN is a secure-GFN backed by a secure-PFN */
 static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 				    unsigned long *uvmem_pfn)
 {
@@ -190,10 +327,10 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
 			unsigned long index = gfn - p->base_pfn;
 
-			if (p->pfns[index] & KVMPPC_UVMEM_PFN) {
+			if (p->pfns[index] & KVMPPC_GFN_UVMEM_PFN) {
 				if (uvmem_pfn)
 					*uvmem_pfn = p->pfns[index] &
-						     ~KVMPPC_UVMEM_PFN;
+						     KVMPPC_GFN_PFN_MASK;
 				return true;
 			} else
 				return false;
@@ -271,6 +408,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 
 		mutex_lock(&kvm->arch.uvmem_lock);
 		if (!kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
+			kvmppc_gfn_remove(gfn, kvm);
 			mutex_unlock(&kvm->arch.uvmem_lock);
 			continue;
 		}
@@ -278,6 +416,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 		uvmem_page = pfn_to_page(uvmem_pfn);
 		pvt = uvmem_page->zone_device_data;
 		pvt->skip_page_out = skip_page_out;
+		pvt->remove_gfn = true;
 		mutex_unlock(&kvm->arch.uvmem_lock);
 
 		pfn = gfn_to_pfn(kvm, gfn);
@@ -347,7 +486,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
 		goto out_clear;
 
 	uvmem_pfn = bit + pfn_first;
-	kvmppc_uvmem_pfn_insert(gpa >> PAGE_SHIFT, uvmem_pfn, kvm);
+	kvmppc_gfn_secure_uvmem_pfn(gpa >> PAGE_SHIFT, uvmem_pfn, kvm);
 
 	pvt->gpa = gpa;
 	pvt->kvm = kvm;
@@ -454,6 +593,7 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
 		uvmem_page = pfn_to_page(uvmem_pfn);
 		pvt = uvmem_page->zone_device_data;
 		pvt->skip_page_out = true;
+		pvt->remove_gfn = false;
 	}
 
 retry:
@@ -467,12 +607,16 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
 		uvmem_page = pfn_to_page(uvmem_pfn);
 		pvt = uvmem_page->zone_device_data;
 		pvt->skip_page_out = true;
+		pvt->remove_gfn = false;
 		kvm_release_pfn_clean(pfn);
 		goto retry;
 	}
 
-	if (!uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0, page_shift))
+	if (!uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0,
+				page_shift)) {
+		kvmppc_gfn_shared(gfn, kvm);
 		ret = H_SUCCESS;
+	}
 	kvm_release_pfn_clean(pfn);
 	mutex_unlock(&kvm->arch.uvmem_lock);
 out:
@@ -530,6 +674,7 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift,
 				&downgrade))
 		ret = H_SUCCESS;
+
 out_unlock:
 	mutex_unlock(&kvm->arch.uvmem_lock);
 out:
@@ -640,7 +785,8 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
 /*
  * Release the device PFN back to the pool
  *
- * Gets called when secure page becomes a normal page during H_SVM_PAGE_OUT.
+ * Gets called when secure GFN tranistions from a secure-PFN
+ * to a normal PFN during H_SVM_PAGE_OUT.
  * Gets called with kvm->arch.uvmem_lock held.
  */
 static void kvmppc_uvmem_page_free(struct page *page)
@@ -655,7 +801,10 @@ static void kvmppc_uvmem_page_free(struct page *page)
 
 	pvt = page->zone_device_data;
 	page->zone_device_data = NULL;
-	kvmppc_uvmem_pfn_remove(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
+	if (pvt->remove_gfn)
+		kvmppc_gfn_remove(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
+	else
+		kvmppc_gfn_secure_mem_pfn(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
 	kfree(pvt);
 }
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v3 3/4] KVM: PPC: Book3S HV: migrate remaining normal-GFNs to secure-GFNs in H_SVM_INIT_DONE
From: Ram Pai @ 2020-06-19 22:43 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1592606622-29884-1-git-send-email-linuxram@us.ibm.com>

H_SVM_INIT_DONE incorrectly assumes that the Ultravisor has explicitly
called H_SVM_PAGE_IN for all secure pages. These GFNs continue to be
normal GFNs associated with normal PFNs; when infact, these GFNs should
have been secure GFNs, associated with device PFNs.

Move all the PFNs associated with the SVM's GFNs, to secure-PFNs, in
H_SVM_INIT_DONE. Skip the GFNs that are already Paged-in or Shared
through H_SVM_PAGE_IN, or Paged-in followed by a Paged-out through
UV_PAGE_OUT.

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 Documentation/powerpc/ultravisor.rst        |   2 +
 arch/powerpc/include/asm/kvm_book3s_uvmem.h |   2 +
 arch/powerpc/kvm/book3s_hv_uvmem.c          | 154 +++++++++++++++++++++++-----
 3 files changed, 132 insertions(+), 26 deletions(-)

diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
index 363736d..3bc8957 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -933,6 +933,8 @@ Return values
 	* H_UNSUPPORTED		if called from the wrong context (e.g.
 				from an SVM or before an H_SVM_INIT_START
 				hypercall).
+	* H_STATE		if the hypervisor could not successfully
+                                transition the VM to Secure VM.
 
 Description
 ~~~~~~~~~~~
diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
index 5a9834e..b9cd7eb 100644
--- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h
+++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
@@ -22,6 +22,8 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
 unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm);
 void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 			     struct kvm *kvm, bool skip_page_out);
+int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+			const struct kvm_memory_slot *memslot);
 #else
 static inline int kvmppc_uvmem_init(void)
 {
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index c8c0290..449e8a7 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -93,6 +93,7 @@
 #include <asm/ultravisor.h>
 #include <asm/mman.h>
 #include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s_uvmem.h>
 
 static struct dev_pagemap kvmppc_uvmem_pgmap;
 static unsigned long *kvmppc_uvmem_bitmap;
@@ -339,6 +340,21 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 	return false;
 }
 
+/* return true, if the GFN is a shared-GFN, or a secure-GFN */
+bool kvmppc_gfn_has_transitioned(unsigned long gfn, struct kvm *kvm)
+{
+	struct kvmppc_uvmem_slot *p;
+
+	list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) {
+		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
+			unsigned long index = gfn - p->base_pfn;
+
+			return (p->pfns[index] & KVMPPC_GFN_FLAG_MASK);
+		}
+	}
+	return false;
+}
+
 unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 {
 	struct kvm_memslots *slots;
@@ -379,12 +395,31 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 
 unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
 {
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int srcu_idx;
+	long ret = H_SUCCESS;
+
 	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
 		return H_UNSUPPORTED;
 
+	/* migrate any unmoved normal pfn to device pfns*/
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots) {
+		ret = kvmppc_uv_migrate_mem_slot(kvm, memslot);
+		if (ret) {
+			ret = H_STATE;
+			goto out;
+		}
+	}
+
 	kvm->arch.secure_guest |= KVMPPC_SECURE_INIT_DONE;
 	pr_info("LPID %d went secure\n", kvm->arch.lpid);
-	return H_SUCCESS;
+
+out:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return ret;
 }
 
 /*
@@ -505,12 +540,14 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
 }
 
 /*
- * Alloc a PFN from private device memory pool and copy page from normal
- * memory to secure memory using UV_PAGE_IN uvcall.
+ * Alloc a PFN from private device memory pool. If @pagein is true,
+ * copy page from normal memory to secure memory using UV_PAGE_IN uvcall.
  */
-static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
-		   unsigned long end, unsigned long gpa, struct kvm *kvm,
-		   unsigned long page_shift, bool *downgrade)
+static int kvmppc_svm_migrate_page(struct vm_area_struct *vma,
+		unsigned long start,
+		unsigned long end, unsigned long gpa, struct kvm *kvm,
+		unsigned long page_shift,
+		bool pagein)
 {
 	unsigned long src_pfn, dst_pfn = 0;
 	struct migrate_vma mig;
@@ -526,18 +563,6 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 	mig.src = &src_pfn;
 	mig.dst = &dst_pfn;
 
-	/*
-	 * We come here with mmap_sem write lock held just for
-	 * ksm_madvise(), otherwise we only need read mmap_sem.
-	 * Hence downgrade to read lock once ksm_madvise() is done.
-	 */
-	ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
-			  MADV_UNMERGEABLE, &vma->vm_flags);
-	downgrade_write(&kvm->mm->mmap_sem);
-	*downgrade = true;
-	if (ret)
-		return ret;
-
 	ret = migrate_vma_setup(&mig);
 	if (ret)
 		return ret;
@@ -553,11 +578,16 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 		goto out_finalize;
 	}
 
-	pfn = *mig.src >> MIGRATE_PFN_SHIFT;
-	spage = migrate_pfn_to_page(*mig.src);
-	if (spage)
-		uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0,
-			   page_shift);
+	if (pagein) {
+		pfn = *mig.src >> MIGRATE_PFN_SHIFT;
+		spage = migrate_pfn_to_page(*mig.src);
+		if (spage) {
+			ret = uv_page_in(kvm->arch.lpid, pfn << page_shift,
+					gpa, 0, page_shift);
+			if (ret)
+				goto out_finalize;
+		}
+	}
 
 	*mig.dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
 	migrate_vma_pages(&mig);
@@ -566,6 +596,66 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 	return ret;
 }
 
+int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot)
+{
+	unsigned long gfn = memslot->base_gfn;
+	unsigned long end;
+	bool downgrade = false;
+	struct vm_area_struct *vma;
+	int i, ret = 0;
+	unsigned long start = gfn_to_hva(kvm, gfn);
+
+	if (kvm_is_error_hva(start))
+		return H_STATE;
+
+	end = start + (memslot->npages << PAGE_SHIFT);
+
+	down_write(&kvm->mm->mmap_sem);
+
+	mutex_lock(&kvm->arch.uvmem_lock);
+	vma = find_vma_intersection(kvm->mm, start, end);
+	if (!vma || vma->vm_start > start || vma->vm_end < end) {
+		ret = H_STATE;
+		goto out_unlock;
+	}
+
+	ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+			  MADV_UNMERGEABLE, &vma->vm_flags);
+	downgrade_write(&kvm->mm->mmap_sem);
+	downgrade = true;
+	if (ret) {
+		ret = H_STATE;
+		goto out_unlock;
+	}
+
+	for (i = 0; i < memslot->npages; i++, ++gfn) {
+		/*
+		 * skip GFNs that have already tranistioned.
+		 * paged-in GFNs, shared GFNs, paged-in GFNs
+		 * that were later paged-out.
+		 */
+		if (kvmppc_gfn_has_transitioned(gfn, kvm))
+			continue;
+
+		start = gfn_to_hva(kvm, gfn);
+		end = start + (1UL << PAGE_SHIFT);
+		ret = kvmppc_svm_migrate_page(vma, start, end,
+			(gfn << PAGE_SHIFT), kvm, PAGE_SHIFT, false);
+
+		if (ret)
+			goto out_unlock;
+	}
+
+out_unlock:
+	mutex_unlock(&kvm->arch.uvmem_lock);
+	if (downgrade)
+		up_read(&kvm->mm->mmap_sem);
+	else
+		up_write(&kvm->mm->mmap_sem);
+	return ret;
+}
+
 /*
  * Shares the page with HV, thus making it a normal page.
  *
@@ -671,9 +761,21 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	if (!vma || vma->vm_start > start || vma->vm_end < end)
 		goto out_unlock;
 
-	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift,
-				&downgrade))
-		ret = H_SUCCESS;
+	ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+			  MADV_UNMERGEABLE, &vma->vm_flags);
+	downgrade_write(&kvm->mm->mmap_sem);
+	downgrade = true;
+	if (ret) {
+		ret = H_PARAMETER;
+		goto out_unlock;
+	}
+
+	ret = H_PARAMETER;
+	if (kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, page_shift,
+				true))
+		goto out_unlock;
+
+	ret = H_SUCCESS;
 
 out_unlock:
 	mutex_unlock(&kvm->arch.uvmem_lock);
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v3 4/4] KVM: PPC: Book3S HV: migrate hot plugged memory
From: Ram Pai @ 2020-06-19 22:43 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1592606622-29884-1-git-send-email-linuxram@us.ibm.com>

From: Laurent Dufour <ldufour@linux.ibm.com>

When a memory slot is hot plugged to a SVM, PFNs associated with the
GFNs in that slot must be migrated to the secure-PFNs, aka device-PFNs.

kvmppc_uv_migrate_mem_slot() is called to accomplish this. UV_PAGE_IN
ucall is skipped, since the ultravisor does not trust the content of
those pages and hence ignores it.

Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
	[resolved conflicts, and modified the commit log]
---
 arch/powerpc/kvm/book3s_hv.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6717d24..fcea41c 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4531,10 +4531,12 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 	case KVM_MR_CREATE:
 		if (kvmppc_uvmem_slot_init(kvm, new))
 			return;
-		uv_register_mem_slot(kvm->arch.lpid,
-				     new->base_gfn << PAGE_SHIFT,
-				     new->npages * PAGE_SIZE,
-				     0, new->id);
+		if (uv_register_mem_slot(kvm->arch.lpid,
+					 new->base_gfn << PAGE_SHIFT,
+					 new->npages * PAGE_SIZE,
+					 0, new->id))
+			return;
+		kvmppc_uv_migrate_mem_slot(kvm, new);
 		break;
 	case KVM_MR_DELETE:
 		uv_unregister_mem_slot(kvm->arch.lpid, old->id);
-- 
1.8.3.1


^ permalink raw reply related

* Re: [V2 PATCH 1/3] Refactoring powerpc code for carrying over IMA measurement logs, to move non architecture specific code to security/ima.
From: Thiago Jung Bauermann @ 2020-06-20  0:19 UTC (permalink / raw)
  To: Prakhar Srivastava
  Cc: kstewart, mark.rutland, gregkh, bhsharma, tao.li, zohar, paulus,
	vincenzo.frascino, will, nramas, frowand.list, masahiroy, jmorris,
	takahiro.akashi, linux-arm-kernel, catalin.marinas, serge,
	devicetree, pasha.tatashin, robh+dt, hsinyi, tusharsu, tglx,
	allison, christophe.leroy, mbrugger, balajib, dmitry.kasatkin,
	linux-kernel, linux-security-module, james.morse, linux-integrity,
	linuxppc-dev
In-Reply-To: <20200618071045.471131-2-prsriva@linux.microsoft.com>

Prakhar Srivastava <prsriva@linux.microsoft.com> writes:

> Powerpc has support to carry over the IMA measurement logs. Refatoring the
> non-architecture specific code out of arch/powerpc and into security/ima.
>
> The code adds support for reserving and freeing up of memory for IMA measurement
> logs.

Last week, Mimi provided this feedback:

"From your patch description, this patch should be broken up.  Moving
the non-architecture specific code out of powerpc should be one patch.
 Additional support should be in another patch.  After each patch, the
code should work properly."

That's not what you do here. You move the code, but you also make other
changes at the same time. This has two problems:

1. It makes the patch harder to review, because it's very easy to miss a
   change.

2. If in the future a git bisect later points to this patch, it's not
   clear whether the problem is because of the code movement, or because
   of the other changes.

When you move code, ideally the patch should only make the changes
necessary to make the code work at its new location. The patch which
does code movement should not cause any change in behavior.

Other changes should go in separate patches, either before or after the
one moving the code.

More comments below.

>
> ---
>  arch/powerpc/include/asm/ima.h     |  10 ---
>  arch/powerpc/kexec/ima.c           | 126 ++---------------------------
>  security/integrity/ima/ima_kexec.c | 116 ++++++++++++++++++++++++++
>  3 files changed, 124 insertions(+), 128 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/ima.h b/arch/powerpc/include/asm/ima.h
> index ead488cf3981..c29ec86498f8 100644
> --- a/arch/powerpc/include/asm/ima.h
> +++ b/arch/powerpc/include/asm/ima.h
> @@ -4,15 +4,6 @@
>
>  struct kimage;
>
> -int ima_get_kexec_buffer(void **addr, size_t *size);
> -int ima_free_kexec_buffer(void);
> -
> -#ifdef CONFIG_IMA
> -void remove_ima_buffer(void *fdt, int chosen_node);
> -#else
> -static inline void remove_ima_buffer(void *fdt, int chosen_node) {}
> -#endif
> -
>  #ifdef CONFIG_IMA_KEXEC
>  int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
>  			      size_t size);
> @@ -22,7 +13,6 @@ int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node);
>  static inline int setup_ima_buffer(const struct kimage *image, void *fdt,
>  				   int chosen_node)
>  {
> -	remove_ima_buffer(fdt, chosen_node);
>  	return 0;
>  }

This is wrong. Even if the currently running kernel doesn't have
CONFIG_IMA_KEXEC, it should remove the IMA buffer property and memory
reservation from the FDT that is being prepared for the next kernel.

This is because the IMA kexec buffer is useless for the next kernel,
regardless of whether the current kernel supports CONFIG_IMA_KEXEC or
not. Keeping it around would be a waste of memory.

> @@ -179,13 +64,18 @@ int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node)
>  	int ret, addr_cells, size_cells, entry_size;
>  	u8 value[16];
>
> -	remove_ima_buffer(fdt, chosen_node);

This is wrong, for the same reason stated above.

>  	if (!image->arch.ima_buffer_size)
>  		return 0;
>
> -	ret = get_addr_size_cells(&addr_cells, &size_cells);
> -	if (ret)
> +	ret = fdt_address_cells(fdt, chosen_node);
> +	if (ret < 0)
> +		return ret;
> +	addr_cells = ret;
> +
> +	ret = fdt_size_cells(fdt, chosen_node);
> +	if (ret < 0)
>  		return ret;
> +	size_cells = ret;
>
>  	entry_size = 4 * (addr_cells + size_cells);
>

I liked this change. Thanks! I agree it's better to use
fdt_address_cells() and fdt_size_cells() here.

But it should be in a separate patch. Either before or after the one
moving the code.

> diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
> index 121de3e04af2..e1e6d6154015 100644
> --- a/security/integrity/ima/ima_kexec.c
> +++ b/security/integrity/ima/ima_kexec.c
> @@ -10,8 +10,124 @@
>  #include <linux/seq_file.h>
>  #include <linux/vmalloc.h>
>  #include <linux/kexec.h>
> +#include <linux/of.h>
> +#include <linux/memblock.h>
> +#include <linux/libfdt.h>
>  #include "ima.h"
>
> +static int get_addr_size_cells(int *addr_cells, int *size_cells)
> +{
> +	struct device_node *root;
> +
> +	root = of_find_node_by_path("/");
> +	if (!root)
> +		return -EINVAL;
> +
> +	*addr_cells = of_n_addr_cells(root);
> +	*size_cells = of_n_size_cells(root);
> +
> +	of_node_put(root);
> +
> +	return 0;
> +}
> +
> +static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr,
> +			       size_t *size)
> +{
> +	int ret, addr_cells, size_cells;
> +
> +	ret = get_addr_size_cells(&addr_cells, &size_cells);
> +	if (ret)
> +		return ret;
> +
> +	if (len < 4 * (addr_cells + size_cells))
> +		return -ENOENT;
> +
> +	*addr = of_read_number(prop, addr_cells);
> +	*size = of_read_number(prop + 4 * addr_cells, size_cells);
> +
> +	return 0;
> +}
> +
> +/**
> + * ima_get_kexec_buffer - get IMA buffer from the previous kernel
> + * @addr:	On successful return, set to point to the buffer contents.
> + * @size:	On successful return, set to the buffer size.
> + *
> + * Return: 0 on success, negative errno on error.
> + */
> +int ima_get_kexec_buffer(void **addr, size_t *size)
> +{
> +	int ret, len;
> +	unsigned long tmp_addr;
> +	size_t tmp_size;
> +	const void *prop;
> +
> +	prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len);
> +	if (!prop)
> +		return -ENOENT;
> +
> +	ret = do_get_kexec_buffer(prop, len, &tmp_addr, &tmp_size);
> +	if (ret)
> +		return ret;
> +
> +	*addr = __va(tmp_addr);
> +	*size = tmp_size;
> +
> +	return 0;
> +}

The functions above were moved without being changed. Good.

> +/**
> + * ima_free_kexec_buffer - free memory used by the IMA buffer
> + */
> +int ima_free_kexec_buffer(void)
> +{
> +	int ret;
> +	unsigned long addr;
> +	size_t size;
> +	struct property *prop;
> +
> +	prop = of_find_property(of_chosen, "linux,ima-kexec-buffer", NULL);
> +	if (!prop)
> +		return -ENOENT;
> +
> +	ret = do_get_kexec_buffer(prop->value, prop->length, &addr, &size);
> +	if (ret)
> +		return ret;
> +
> +	ret = of_remove_property(of_chosen, prop);
> +	if (ret)
> +		return ret;
> +
> +	return memblock_free(__pa(addr), size);

Here you added a __pa() call. Do you store a virtual address in
linux,ima-kexec-buffer property? Doesn't it make more sense to store a
physical address?

Even if making this change is the correct thing to do, it should be a
separate patch, unless it can't be avoided. And if that is the case,
then it should be explained in the commit message.

> +
> +}
> +
> +/**
> + * remove_ima_buffer - remove the IMA buffer property and reservation from @fdt
> + *
> + * The IMA measurement buffer is of no use to a subsequent kernel, so we always
> + * remove it from the device tree.
> + */
> +void remove_ima_buffer(void *fdt, int chosen_node)
> +{
> +	int ret, len;
> +	unsigned long addr;
> +	size_t size;
> +	const void *prop;
> +
> +	prop = fdt_getprop(fdt, chosen_node, "linux,ima-kexec-buffer", &len);
> +	if (!prop)
> +		return;
> +
> +	do_get_kexec_buffer(prop, len, &addr, &size);
> +	ret = fdt_delprop(fdt, chosen_node, "linux,ima-kexec-buffer");
> +	if (ret < 0)
> +		return;
> +
> +	memblock_free(addr, size);
> +}

Here is another function that changed when moved. This one I know to be
wrong. You're confusing the purposes of remove_ima_buffer() and
ima_free_kexec_buffer().

You did send me a question about them nearly three weeks ago which I
only answered today, so I apologize. Also, their names could more
clearly reflect their differences, so it's bad naming on my part.

With IMA kexec buffers, there are two kernels (and thus their two
respective, separate device trees) to be concerned about:

1. the currently running kernel, which uses the live device tree
(accessed with the of_* functions) and the memblock subsystem;

2. the kernel which is being loaded by kexec, which will use the FDT
blob being passed around as argument to these functions, and the memory
reservations in the memory reservation table of the FDT blob.

ima_free_kexec_buffer() is used by IMA in the currently running kernel.
Therefore the device tree it is concerned about is the live one, and
thus uses the of_* functions to access it. And uses memblock to change
the memory reservation.

remove_ima_buffer() on the other hand is used by the kexec code to
prepare an FDT blob for the kernel that is being loaded. It should not
make any changes to live device tree, nor to memblock allocations. It
should only make changes to the FDT blob.

--
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH] powerpc/pseries: new lparcfg key/value pair: partition_affinity_score
From: Nathan Lynch @ 2020-06-20  0:32 UTC (permalink / raw)
  To: Tyrel Datwyler, Scott Cheloha, linuxppc-dev
In-Reply-To: <10febdd6-4672-24ca-03e9-553468c8852c@linux.ibm.com>

Hi Tyrel,

Tyrel Datwyler <tyreld@linux.ibm.com> writes:
> On 6/19/20 8:34 AM, Scott Cheloha wrote:
>> The H_GetPerformanceCounterInfo PHYP hypercall has a subcall,
>> Affinity_Domain_Info_By_Partition, which returns, among other things,
>> a "partition affinity score" for a given LPAR.  This score, a value on
>> [0-100], represents the processor-memory affinity for the LPAR in
>> question.  A score of 0 indicates the worst possible affinity while a
>> score of 100 indicates perfect affinity.  The score can be used to
>> reason about performance.
>> 
>> This patch adds the score for the local LPAR to the lparcfg procfile
>> under a new 'partition_affinity_score' key.
>
> I expect that you will probably get a NACK from Michael on this. The overall
> desire is to move away from these dated /proc interfaces. While its true that I
> did add a new value recently it was strictly to facilitate and correct the
> calculation of a derived value that was already dependent on a couple other
> existing values in lparcfg.
>
> With that said I would expect that you would likely be advised to expose this as
> a sysfs attribute. The question is where? We probably should put some thought in
> to this as I would like to port each lparcfg value over to sysfs so that we can
> move to deprecating lparcfg. Putting everything under something like
> /sys/kernel/lparcfg/* maybe. Michael may have a better suggestion.

I think this score fits pretty naturally in lparcfg: it's a simple
metric that is specific to the pseries/papr platform, like everything
else in there.

A few dozen key=value pairs contained in a single file is simple and
efficient, unlike sysfs with its rather inconsistently applied
one-value-per-file convention. Surely it's OK if lparcfg gains a line
every few years?


>> The H_GetPerformanceCounterInfo hypercall is already used elsewhere in
>> the kernel, in powerpc/perf/hv-gpci.c.  Refactoring that code and this
>> code into a more general API might be worthwhile if additional modules
>> require the hypercall in the future.
>
> If you are duplicating code its likely you should already be doing this. See the
> rest of my comments about below.
>
>> 
>> Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
>> ---
>>  arch/powerpc/platforms/pseries/lparcfg.c | 53 ++++++++++++++++++++++++
>>  1 file changed, 53 insertions(+)
>> 
>> diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
>> index b8d28ab88178..b75151eee0f0 100644
>> --- a/arch/powerpc/platforms/pseries/lparcfg.c
>> +++ b/arch/powerpc/platforms/pseries/lparcfg.c
>> @@ -136,6 +136,57 @@ static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
>>  	return rc;
>>  }
>>  
>> +/*
>> + * Based on H_GetPerformanceCounterInfo v1.10.
>> + */
>> +static void show_gpci_data(struct seq_file *m)
>> +{
>> +	struct perf_counter_info_params {
>> +		__be32 counter_request;
>> +		__be32 starting_index;
>> +		__be16 secondary_index;
>> +		__be16 returned_values;
>> +		__be32 detail_rc;
>> +		__be16 counter_value_element_size;
>> +		u8     counter_info_version_in;
>> +		u8     counter_info_version_out;
>> +		u8     reserved[0xC];
>> +	} __packed;
>
> This looks to duplicate the hv_get_perf_counter_info_params struct from
> arch/powerpc/perf/hv-gpci.h. Maybe this include file needs to move to
> arch/powerpc/asm/inlcude so you don't have to redefine this struct.
>
>> +	struct hv_gpci_request_buffer {
>> +		struct perf_counter_info_params params;
>> +		u8 output[4096 - sizeof(struct perf_counter_info_params)];
>> +	} __packed;
>
> This struct is code duplication of the one defined in
> arch/powerpc/perf/hv-gpci.c and could be moved into hv-gpci.h along with
> HGPCI_MAX_DATA_BYTES so that you can use those versions here.

I tend to agree with these comments.


>> +	struct hv_gpci_request_buffer *buf;
>> +	long ret;
>> +	unsigned int affinity_score;
>> +
>> +	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
>> +	if (buf == NULL)
>> +		return;
>> +
>> +	/*
>> +	 * Show the local LPAR's affinity score.
>> +	 *
>> +	 * 0xB1 selects the Affinity_Domain_Info_By_Partition subcall.
>> +	 * The score is at byte 0xB in the output buffer.
>> +	 */
>> +	memset(&buf->params, 0, sizeof(buf->params));
>> +	buf->params.counter_request = cpu_to_be32(0xB1);
>> +	buf->params.starting_index = cpu_to_be32(-1);	/* local LPAR */
>> +	buf->params.counter_info_version_in = 0x5;	/* v5+ for score */
>> +	ret = plpar_hcall_norets(H_GET_PERF_COUNTER_INFO, virt_to_phys(buf),
>> +				 sizeof(*buf));
>> +	if (ret != H_SUCCESS) {
>> +		pr_debug("hcall failed: H_GET_PERF_COUNTER_INFO: %ld, %x\n",
>> +			 ret, be32_to_cpu(buf->params.detail_rc));
>> +		goto out;
>> +	}
>> +	affinity_score = buf->output[0xB];
>> +	seq_printf(m, "partition_affinity_score=%u\n", affinity_score);
>> +out:
>> +	kfree(buf);
>> +}
>> +
>
> IIUC we should already be able to get this value from userspace using perf tool,
> right? If thats the case can't we also programatically retrieve it via the
> perf_event interface in userspace as well?

No... I had the same thought when I first looked at this, but perf is
for sampling counters, and the partition affinity score is not a
counter.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox