LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH][next] powerpc: Use fallthrough pseudo-keyword
From: Gustavo A. R. Silva @ 2020-07-27 22:42 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel, Gustavo A. R. Silva

Replace the existing /* fall through */ comments and their variants with
the new pseudo-keyword macro fallthrough[1]. Also, remove unnecessary
fall-through markings where applicable.

[1] https://www.kernel.org/doc/html/v5.7/process/deprecated.html?highlight=fallthrough#implicit-switch-case-fall-through

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 arch/powerpc/kernel/align.c                 | 8 ++++----
 arch/powerpc/platforms/powermac/feature.c   | 2 +-
 arch/powerpc/platforms/powernv/opal-async.c | 2 +-
 arch/powerpc/platforms/pseries/hvcserver.c  | 2 +-
 arch/powerpc/xmon/xmon.c                    | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 1f1ce8b86d5b..c7797eb958c7 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -178,11 +178,11 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg,
 			ret |= __get_user_inatomic(temp.v[1], p++);
 			ret |= __get_user_inatomic(temp.v[2], p++);
 			ret |= __get_user_inatomic(temp.v[3], p++);
-			/* fall through */
+			fallthrough;
 		case 4:
 			ret |= __get_user_inatomic(temp.v[4], p++);
 			ret |= __get_user_inatomic(temp.v[5], p++);
-			/* fall through */
+			fallthrough;
 		case 2:
 			ret |= __get_user_inatomic(temp.v[6], p++);
 			ret |= __get_user_inatomic(temp.v[7], p++);
@@ -263,11 +263,11 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg,
 			ret |= __put_user_inatomic(data.v[1], p++);
 			ret |= __put_user_inatomic(data.v[2], p++);
 			ret |= __put_user_inatomic(data.v[3], p++);
-			/* fall through */
+			fallthrough;
 		case 4:
 			ret |= __put_user_inatomic(data.v[4], p++);
 			ret |= __put_user_inatomic(data.v[5], p++);
-			/* fall through */
+			fallthrough;
 		case 2:
 			ret |= __put_user_inatomic(data.v[6], p++);
 			ret |= __put_user_inatomic(data.v[7], p++);
diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c
index 181caa3f6717..5c77b9a24c0e 100644
--- a/arch/powerpc/platforms/powermac/feature.c
+++ b/arch/powerpc/platforms/powermac/feature.c
@@ -1465,7 +1465,7 @@ static long g5_i2s_enable(struct device_node *node, long param, long value)
 	case 2:
 		if (macio->type == macio_shasta)
 			break;
-		/* fall through */
+		fallthrough;
 	default:
 		return -ENODEV;
 	}
diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c
index 1656e8965d6b..c094fdf5825c 100644
--- a/arch/powerpc/platforms/powernv/opal-async.c
+++ b/arch/powerpc/platforms/powernv/opal-async.c
@@ -104,7 +104,7 @@ static int __opal_async_release_token(int token)
 	 */
 	case ASYNC_TOKEN_DISPATCHED:
 		opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED;
-		/* Fall through */
+		fallthrough;
 	default:
 		rc = 1;
 	}
diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c
index 267139b13530..96e18d3b2fcf 100644
--- a/arch/powerpc/platforms/pseries/hvcserver.c
+++ b/arch/powerpc/platforms/pseries/hvcserver.c
@@ -45,7 +45,7 @@ static int hvcs_convert(long to_convert)
 		case H_LONG_BUSY_ORDER_10_SEC:
 		case H_LONG_BUSY_ORDER_100_SEC:
 			return -EBUSY;
-		case H_FUNCTION: /* fall through */
+		case H_FUNCTION:
 		default:
 			return -EPERM;
 	}
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 8fb1f857c11c..ed1a9f43709d 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -4278,7 +4278,7 @@ static int do_spu_cmd(void)
 		subcmd = inchar();
 		if (isxdigit(subcmd) || subcmd == '\n')
 			termch = subcmd;
-		/* fall through */
+		fallthrough;
 	case 'f':
 		scanhex(&num);
 		if (num >= XMON_NUM_SPUS || !spu_info[num].spu) {
-- 
2.27.0


^ permalink raw reply related

* [PATCH][next] dmaengine: Use fallthrough pseudo-keyword
From: Gustavo A. R. Silva @ 2020-07-27 20:34 UTC (permalink / raw)
  To: Vinod Koul, Dan Williams, Li Yang, Zhang Wei, Shawn Guo,
	Sascha Hauer, Pengutronix Kernel Team, Fabio Estevam,
	NXP Linux Team, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, Andrii Nakryiko,
	John Fastabend, KP Singh
  Cc: netdev, linux-kernel, Gustavo A. R. Silva, dmaengine, bpf,
	linuxppc-dev, linux-arm-kernel

Replace the existing /* fall through */ comments and their variants with
the new pseudo-keyword macro fallthrough[1]. Also, remove unnecessary
fall-through markings where applicable.

[1] https://www.kernel.org/doc/html/v5.7/process/deprecated.html?highlight=fallthrough#implicit-switch-case-fall-through

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 drivers/dma/amba-pl08x.c    | 10 +++++-----
 drivers/dma/fsldma.c        |  2 +-
 drivers/dma/imx-dma.c       |  2 +-
 drivers/dma/iop-adma.h      | 12 ++++++------
 drivers/dma/nbpfaxi.c       |  2 +-
 drivers/dma/pl330.c         | 10 +++-------
 drivers/dma/sh/shdma-base.c |  2 +-
 7 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c
index 9adc7a2fa3d3..a24882ba3764 100644
--- a/drivers/dma/amba-pl08x.c
+++ b/drivers/dma/amba-pl08x.c
@@ -1767,7 +1767,7 @@ static u32 pl08x_memcpy_cctl(struct pl08x_driver_data *pl08x)
 	default:
 		dev_err(&pl08x->adev->dev,
 			"illegal burst size for memcpy, set to 1\n");
-		/* Fall through */
+		fallthrough;
 	case PL08X_BURST_SZ_1:
 		cctl |= PL080_BSIZE_1 << PL080_CONTROL_SB_SIZE_SHIFT |
 			PL080_BSIZE_1 << PL080_CONTROL_DB_SIZE_SHIFT;
@@ -1806,7 +1806,7 @@ static u32 pl08x_memcpy_cctl(struct pl08x_driver_data *pl08x)
 	default:
 		dev_err(&pl08x->adev->dev,
 			"illegal bus width for memcpy, set to 8 bits\n");
-		/* Fall through */
+		fallthrough;
 	case PL08X_BUS_WIDTH_8_BITS:
 		cctl |= PL080_WIDTH_8BIT << PL080_CONTROL_SWIDTH_SHIFT |
 			PL080_WIDTH_8BIT << PL080_CONTROL_DWIDTH_SHIFT;
@@ -1850,7 +1850,7 @@ static u32 pl08x_ftdmac020_memcpy_cctl(struct pl08x_driver_data *pl08x)
 	default:
 		dev_err(&pl08x->adev->dev,
 			"illegal bus width for memcpy, set to 8 bits\n");
-		/* Fall through */
+		fallthrough;
 	case PL08X_BUS_WIDTH_8_BITS:
 		cctl |= PL080_WIDTH_8BIT << FTDMAC020_LLI_SRC_WIDTH_SHIFT |
 			PL080_WIDTH_8BIT << FTDMAC020_LLI_DST_WIDTH_SHIFT;
@@ -2612,7 +2612,7 @@ static int pl08x_of_probe(struct amba_device *adev,
 	switch (val) {
 	default:
 		dev_err(&adev->dev, "illegal burst size for memcpy, set to 1\n");
-		/* Fall through */
+		fallthrough;
 	case 1:
 		pd->memcpy_burst_size = PL08X_BURST_SZ_1;
 		break;
@@ -2647,7 +2647,7 @@ static int pl08x_of_probe(struct amba_device *adev,
 	switch (val) {
 	default:
 		dev_err(&adev->dev, "illegal bus width for memcpy, set to 8 bits\n");
-		/* Fall through */
+		fallthrough;
 	case 8:
 		pd->memcpy_bus_width = PL08X_BUS_WIDTH_8_BITS;
 		break;
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index ad72b3f42ffa..e342cf52d296 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -1163,7 +1163,7 @@ static int fsl_dma_chan_probe(struct fsldma_device *fdev,
 	switch (chan->feature & FSL_DMA_IP_MASK) {
 	case FSL_DMA_IP_85XX:
 		chan->toggle_ext_pause = fsl_chan_toggle_ext_pause;
-		/* Fall through */
+		fallthrough;
 	case FSL_DMA_IP_83XX:
 		chan->toggle_ext_start = fsl_chan_toggle_ext_start;
 		chan->set_src_loop_size = fsl_chan_set_src_loop_size;
diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c
index 5c0fb3134825..88717506c1f6 100644
--- a/drivers/dma/imx-dma.c
+++ b/drivers/dma/imx-dma.c
@@ -556,7 +556,7 @@ static int imxdma_xfer_desc(struct imxdma_desc *d)
 		 * We fall-through here intentionally, since a 2D transfer is
 		 * similar to MEMCPY just adding the 2D slot configuration.
 		 */
-		/* Fall through */
+		fallthrough;
 	case IMXDMA_DESC_MEMCPY:
 		imx_dmav1_writel(imxdma, d->src, DMA_SAR(imxdmac->channel));
 		imx_dmav1_writel(imxdma, d->dest, DMA_DAR(imxdmac->channel));
diff --git a/drivers/dma/iop-adma.h b/drivers/dma/iop-adma.h
index c499c9578f00..d44eabb6f5eb 100644
--- a/drivers/dma/iop-adma.h
+++ b/drivers/dma/iop-adma.h
@@ -496,7 +496,7 @@ iop3xx_desc_init_xor(struct iop3xx_desc_aau *hw_desc, int src_cnt,
 		}
 		hw_desc->src_edc[AAU_EDCR2_IDX].e_desc_ctrl = edcr;
 		src_cnt = 24;
-		/* fall through */
+		fallthrough;
 	case 17 ... 24:
 		if (!u_desc_ctrl.field.blk_ctrl) {
 			hw_desc->src_edc[AAU_EDCR2_IDX].e_desc_ctrl = 0;
@@ -510,7 +510,7 @@ iop3xx_desc_init_xor(struct iop3xx_desc_aau *hw_desc, int src_cnt,
 		}
 		hw_desc->src_edc[AAU_EDCR1_IDX].e_desc_ctrl = edcr;
 		src_cnt = 16;
-		/* fall through */
+		fallthrough;
 	case 9 ... 16:
 		if (!u_desc_ctrl.field.blk_ctrl)
 			u_desc_ctrl.field.blk_ctrl = 0x2; /* use EDCR0 */
@@ -522,7 +522,7 @@ iop3xx_desc_init_xor(struct iop3xx_desc_aau *hw_desc, int src_cnt,
 		}
 		hw_desc->src_edc[AAU_EDCR0_IDX].e_desc_ctrl = edcr;
 		src_cnt = 8;
-		/* fall through */
+		fallthrough;
 	case 2 ... 8:
 		shift = 1;
 		for (i = 0; i < src_cnt; i++) {
@@ -602,19 +602,19 @@ iop_desc_init_null_xor(struct iop_adma_desc_slot *desc, int src_cnt,
 	case 25 ... 32:
 		u_desc_ctrl.field.blk_ctrl = 0x3; /* use EDCR[2:0] */
 		hw_desc->src_edc[AAU_EDCR2_IDX].e_desc_ctrl = 0;
-		/* fall through */
+		fallthrough;
 	case 17 ... 24:
 		if (!u_desc_ctrl.field.blk_ctrl) {
 			hw_desc->src_edc[AAU_EDCR2_IDX].e_desc_ctrl = 0;
 			u_desc_ctrl.field.blk_ctrl = 0x3; /* use EDCR[2:0] */
 		}
 		hw_desc->src_edc[AAU_EDCR1_IDX].e_desc_ctrl = 0;
-		/* fall through */
+		fallthrough;
 	case 9 ... 16:
 		if (!u_desc_ctrl.field.blk_ctrl)
 			u_desc_ctrl.field.blk_ctrl = 0x2; /* use EDCR0 */
 		hw_desc->src_edc[AAU_EDCR0_IDX].e_desc_ctrl = 0;
-		/* fall through */
+		fallthrough;
 	case 1 ... 8:
 		if (!u_desc_ctrl.field.blk_ctrl && src_cnt > 4)
 			u_desc_ctrl.field.blk_ctrl = 0x1; /* use mini-desc */
diff --git a/drivers/dma/nbpfaxi.c b/drivers/dma/nbpfaxi.c
index 74df621402e1..ca4e0930207a 100644
--- a/drivers/dma/nbpfaxi.c
+++ b/drivers/dma/nbpfaxi.c
@@ -483,7 +483,7 @@ static size_t nbpf_xfer_size(struct nbpf_device *nbpf,
 
 	default:
 		pr_warn("%s(): invalid bus width %u\n", __func__, width);
-		/* fall through */
+		fallthrough;
 	case DMA_SLAVE_BUSWIDTH_1_BYTE:
 		size = burst;
 	}
diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c
index 2c508ee672b9..9b69716172a4 100644
--- a/drivers/dma/pl330.c
+++ b/drivers/dma/pl330.c
@@ -1061,16 +1061,16 @@ static bool _start(struct pl330_thread *thrd)
 
 		if (_state(thrd) == PL330_STATE_KILLING)
 			UNTIL(thrd, PL330_STATE_STOPPED)
-		/* fall through */
+		fallthrough;
 
 	case PL330_STATE_FAULTING:
 		_stop(thrd);
-		/* fall through */
+		fallthrough;
 
 	case PL330_STATE_KILLING:
 	case PL330_STATE_COMPLETING:
 		UNTIL(thrd, PL330_STATE_STOPPED)
-		/* fall through */
+		fallthrough;
 
 	case PL330_STATE_STOPPED:
 		return _trigger(thrd);
@@ -1121,7 +1121,6 @@ static u32 _emit_load(unsigned int dry_run, u8 buf[],
 
 	switch (direction) {
 	case DMA_MEM_TO_MEM:
-		/* fall through */
 	case DMA_MEM_TO_DEV:
 		off += _emit_LD(dry_run, &buf[off], cond);
 		break;
@@ -1155,7 +1154,6 @@ static inline u32 _emit_store(unsigned int dry_run, u8 buf[],
 
 	switch (direction) {
 	case DMA_MEM_TO_MEM:
-		/* fall through */
 	case DMA_DEV_TO_MEM:
 		off += _emit_ST(dry_run, &buf[off], cond);
 		break;
@@ -1216,7 +1214,6 @@ static int _bursts(struct pl330_dmac *pl330, unsigned dry_run, u8 buf[],
 
 	switch (pxs->desc->rqtype) {
 	case DMA_MEM_TO_DEV:
-		/* fall through */
 	case DMA_DEV_TO_MEM:
 		off += _ldst_peripheral(pl330, dry_run, &buf[off], pxs, cyc,
 			cond);
@@ -1266,7 +1263,6 @@ static int _dregs(struct pl330_dmac *pl330, unsigned int dry_run, u8 buf[],
 
 	switch (pxs->desc->rqtype) {
 	case DMA_MEM_TO_DEV:
-		/* fall through */
 	case DMA_DEV_TO_MEM:
 		off += _emit_MOV(dry_run, &buf[off], CCR, dregs_ccr);
 		off += _ldst_peripheral(pl330, dry_run, &buf[off], pxs, 1,
diff --git a/drivers/dma/sh/shdma-base.c b/drivers/dma/sh/shdma-base.c
index 2deeaab078a4..788d696323bb 100644
--- a/drivers/dma/sh/shdma-base.c
+++ b/drivers/dma/sh/shdma-base.c
@@ -383,7 +383,7 @@ static dma_async_tx_callback __ld_cleanup(struct shdma_chan *schan, bool all)
 			switch (desc->mark) {
 			case DESC_COMPLETED:
 				desc->mark = DESC_WAITING;
-				/* Fall through */
+				fallthrough;
 			case DESC_WAITING:
 				if (head_acked)
 					async_tx_ack(&desc->async_tx);
-- 
2.27.0


^ permalink raw reply related

* [PATCH v2 2/2] KVM: PPC: Book3S HV: rework secure mem slot dropping
From: Ram Pai @ 2020-07-27 19:24 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1595877869-2746-1-git-send-email-linuxram@us.ibm.com>

From: Laurent Dufour <ldufour@linux.ibm.com>

When a secure memslot is dropped, all the pages backed in the secure
device (aka really backed by secure memory by the Ultravisor)
should be paged out to a normal page. Previously, this was
achieved by triggering the page fault mechanism, which calls
kvmppc_svm_page_out() on each page.

This can't work when hot unplugging a memory slot because the memory
slot is flagged as invalid and gfn_to_pfn() is then not trying to access
the page, so the page fault mechanism is not triggered.

Since the final goal is to make a call to kvmppc_svm_page_out() it seems
simpler to call it directly instead of triggering such a mechanism. This
way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a
memslot.

Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock,
the call is made to __kvmppc_svm_page_out(), which does not take that
lock itself.  As __kvmppc_svm_page_out() needs the vma pointer to migrate
the pages, the VMA is fetched in a lazy way, to not trigger find_vma() all
the time. In addition, the mmap_sem is held in read mode during
that time, not in write mode since the virtual memory layout is not
impacted, and kvm->arch.uvmem_lock prevents concurrent operation
on the secure device.

Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Paul Mackerras <paulus@ozlabs.org>
Reviewed-by: Bharata B Rao <bharata@linux.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
	[modified the changelog description]
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
        [modified check on the VMA in kvmppc_uvmem_drop_pages]
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 52 +++++++++++++++++++++++++-------------
 1 file changed, 35 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 565f24b..0d49e34 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -594,35 +594,53 @@ static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
  * fault on them, do fault time migration to replace the device PTEs in
  * QEMU page table with normal PTEs from newly allocated pages.
  */
-void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
+void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
 			     struct kvm *kvm, bool skip_page_out)
 {
 	int i;
 	struct kvmppc_uvmem_page_pvt *pvt;
-	unsigned long pfn, uvmem_pfn;
-	unsigned long gfn = free->base_gfn;
+	struct page *uvmem_page;
+	struct vm_area_struct *vma = NULL;
+	unsigned long uvmem_pfn, gfn;
+	unsigned long addr;
+
+	mmap_read_lock(kvm->mm);
+
+	addr = slot->userspace_addr;
 
-	for (i = free->npages; i; --i, ++gfn) {
-		struct page *uvmem_page;
+	gfn = slot->base_gfn;
+	for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) {
+
+		/* Fetch the VMA if addr is not in the latest fetched one */
+		if (!vma || addr >= vma->vm_end) {
+			vma = find_vma_intersection(kvm->mm, addr, addr+1);
+			if (!vma) {
+				pr_err("Can't find VMA for gfn:0x%lx\n", gfn);
+				break;
+			}
+		}
 
 		mutex_lock(&kvm->arch.uvmem_lock);
-		if (!kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
+
+		if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
+			uvmem_page = pfn_to_page(uvmem_pfn);
+			pvt = uvmem_page->zone_device_data;
+			pvt->skip_page_out = skip_page_out;
+			pvt->remove_gfn = true;
+
+			if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE,
+						  PAGE_SHIFT, kvm, pvt->gpa))
+				pr_err("Can't page out gpa:0x%lx addr:0x%lx\n",
+				       pvt->gpa, addr);
+		} else {
+			/* Remove the shared flag if any */
 			kvmppc_gfn_remove(gfn, kvm);
-			mutex_unlock(&kvm->arch.uvmem_lock);
-			continue;
 		}
 
-		uvmem_page = pfn_to_page(uvmem_pfn);
-		pvt = uvmem_page->zone_device_data;
-		pvt->skip_page_out = skip_page_out;
-		pvt->remove_gfn = true;
 		mutex_unlock(&kvm->arch.uvmem_lock);
-
-		pfn = gfn_to_pfn(kvm, gfn);
-		if (is_error_noslot_pfn(pfn))
-			continue;
-		kvm_release_pfn_clean(pfn);
 	}
+
+	mmap_read_unlock(kvm->mm);
 }
 
 unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm)
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v2 1/2] KVM: PPC: Book3S HV: move kvmppc_svm_page_out up
From: Ram Pai @ 2020-07-27 19:24 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1595877869-2746-1-git-send-email-linuxram@us.ibm.com>

From: Laurent Dufour <ldufour@linux.ibm.com>

kvmppc_svm_page_out() will need to be called by kvmppc_uvmem_drop_pages()
so move it higher up in this file.

Furthermore it will be useful to call this function when already
holding the kvm->arch.uvmem_lock, so prefix the original function with __
and remove the locking in it, and introduce a wrapper which calls that
function with the lock held.

There is no functional change.

Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Paul Mackerras <paulus@ozlabs.org>
Reviewed-by: Bharata B Rao <bharata@linux.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 166 ++++++++++++++++++++-----------------
 1 file changed, 90 insertions(+), 76 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 5b917ea..565f24b 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -497,6 +497,96 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 }
 
 /*
+ * Provision a new page on HV side and copy over the contents
+ * from secure memory using UV_PAGE_OUT uvcall.
+ * Caller must hold kvm->arch.uvmem_lock.
+ */
+static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
+		unsigned long start,
+		unsigned long end, unsigned long page_shift,
+		struct kvm *kvm, unsigned long gpa)
+{
+	unsigned long src_pfn, dst_pfn = 0;
+	struct migrate_vma mig;
+	struct page *dpage, *spage;
+	struct kvmppc_uvmem_page_pvt *pvt;
+	unsigned long pfn;
+	int ret = U_SUCCESS;
+
+	memset(&mig, 0, sizeof(mig));
+	mig.vma = vma;
+	mig.start = start;
+	mig.end = end;
+	mig.src = &src_pfn;
+	mig.dst = &dst_pfn;
+	mig.src_owner = &kvmppc_uvmem_pgmap;
+
+	/* The requested page is already paged-out, nothing to do */
+	if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
+		return ret;
+
+	ret = migrate_vma_setup(&mig);
+	if (ret)
+		return -1;
+
+	spage = migrate_pfn_to_page(*mig.src);
+	if (!spage || !(*mig.src & MIGRATE_PFN_MIGRATE))
+		goto out_finalize;
+
+	if (!is_zone_device_page(spage))
+		goto out_finalize;
+
+	dpage = alloc_page_vma(GFP_HIGHUSER, vma, start);
+	if (!dpage) {
+		ret = -1;
+		goto out_finalize;
+	}
+
+	lock_page(dpage);
+	pvt = spage->zone_device_data;
+	pfn = page_to_pfn(dpage);
+
+	/*
+	 * This function is used in two cases:
+	 * - When HV touches a secure page, for which we do UV_PAGE_OUT
+	 * - When a secure page is converted to shared page, we *get*
+	 *   the page to essentially unmap the device page. In this
+	 *   case we skip page-out.
+	 */
+	if (!pvt->skip_page_out)
+		ret = uv_page_out(kvm->arch.lpid, pfn << page_shift,
+				  gpa, 0, page_shift);
+
+	if (ret == U_SUCCESS)
+		*mig.dst = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
+	else {
+		unlock_page(dpage);
+		__free_page(dpage);
+		goto out_finalize;
+	}
+
+	migrate_vma_pages(&mig);
+
+out_finalize:
+	migrate_vma_finalize(&mig);
+	return ret;
+}
+
+static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
+				      unsigned long start, unsigned long end,
+				      unsigned long page_shift,
+				      struct kvm *kvm, unsigned long gpa)
+{
+	int ret;
+
+	mutex_lock(&kvm->arch.uvmem_lock);
+	ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa);
+	mutex_unlock(&kvm->arch.uvmem_lock);
+
+	return ret;
+}
+
+/*
  * Drop device pages that we maintain for the secure guest
  *
  * We first mark the pages to be skipped from UV_PAGE_OUT when there
@@ -866,82 +956,6 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	return ret;
 }
 
-/*
- * Provision a new page on HV side and copy over the contents
- * from secure memory using UV_PAGE_OUT uvcall.
- */
-static int kvmppc_svm_page_out(struct vm_area_struct *vma,
-		unsigned long start,
-		unsigned long end, unsigned long page_shift,
-		struct kvm *kvm, unsigned long gpa)
-{
-	unsigned long src_pfn, dst_pfn = 0;
-	struct migrate_vma mig;
-	struct page *dpage, *spage;
-	struct kvmppc_uvmem_page_pvt *pvt;
-	unsigned long pfn;
-	int ret = U_SUCCESS;
-
-	memset(&mig, 0, sizeof(mig));
-	mig.vma = vma;
-	mig.start = start;
-	mig.end = end;
-	mig.src = &src_pfn;
-	mig.dst = &dst_pfn;
-	mig.src_owner = &kvmppc_uvmem_pgmap;
-
-	mutex_lock(&kvm->arch.uvmem_lock);
-	/* The requested page is already paged-out, nothing to do */
-	if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
-		goto out;
-
-	ret = migrate_vma_setup(&mig);
-	if (ret)
-		goto out;
-
-	spage = migrate_pfn_to_page(*mig.src);
-	if (!spage || !(*mig.src & MIGRATE_PFN_MIGRATE))
-		goto out_finalize;
-
-	if (!is_zone_device_page(spage))
-		goto out_finalize;
-
-	dpage = alloc_page_vma(GFP_HIGHUSER, vma, start);
-	if (!dpage) {
-		ret = -1;
-		goto out_finalize;
-	}
-
-	lock_page(dpage);
-	pvt = spage->zone_device_data;
-	pfn = page_to_pfn(dpage);
-
-	/*
-	 * This function is used in two cases:
-	 * - When HV touches a secure page, for which we do UV_PAGE_OUT
-	 * - When a secure page is converted to shared page, we *get*
-	 *   the page to essentially unmap the device page. In this
-	 *   case we skip page-out.
-	 */
-	if (!pvt->skip_page_out)
-		ret = uv_page_out(kvm->arch.lpid, pfn << page_shift,
-				  gpa, 0, page_shift);
-
-	if (ret == U_SUCCESS)
-		*mig.dst = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
-	else {
-		unlock_page(dpage);
-		__free_page(dpage);
-		goto out_finalize;
-	}
-
-	migrate_vma_pages(&mig);
-out_finalize:
-	migrate_vma_finalize(&mig);
-out:
-	mutex_unlock(&kvm->arch.uvmem_lock);
-	return ret;
-}
 
 /*
  * Fault handler callback that gets called when HV touches any page that
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v2 0/2] Rework secure memslot dropping
From: Ram Pai @ 2020-07-27 19:24 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david

From: Laurent Dufour <ldufour@linux.ibm.com>

When doing memory hotplug on a secure VM, the secure pages are not properly
cleaned from the secure device when dropping the memslot.  This silent
error then prevents the SVM from rebooting properly after the following
sequence of commands is run in the Qemu monitor:

device_add pc-dimm,id=dimm1,memdev=mem1
device_del dimm1
device_add pc-dimm,id=dimm1,memdev=mem1

At reboot time, when the kernel is booting again and switching to the
secure mode, the page_in fails for the pages in the memslot because
the cleanup was not done properly: the memslot is flagged as
invalid during the hot unplug and thus the page fault mechanism is not
triggered.

To prevent that during the memslot dropping, instead of relying on the
page fault mechanism to trigger the page out of the secured pages, it seems
simpler to directly call the function doing the page out. This way the
state of the memslot does not interfere with the page out process.

This series applies on top of Ram's series titled:
"[v6 0/5] Migrate non-migrated pages of a SVM."


Changes since V2:
 - fix to vma boundary check in kvmppc_uvmem_drop_pages().

Changes since V1:
 - Rebase on top of Ram's V4 series
 - Address Bharata's comment to use mmap_read_*lock().

Laurent Dufour (2):
  KVM: PPC: Book3S HV: move kvmppc_svm_page_out up
  KVM: PPC: Book3S HV: rework secure mem slot dropping

 arch/powerpc/kvm/book3s_hv_uvmem.c | 218 +++++++++++++++++++++----------------
 1 file changed, 125 insertions(+), 93 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* Re: [PATCH 0/5] cpuidle-pseries: Parse extended CEDE information for idle.
From: Gautham R Shenoy @ 2020-07-27 18:55 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Nathan Lynch, Gautham R. Shenoy, Michael Neuling,
	Vaidyanathan Srinivasan, Linux PM, Linux Kernel Mailing List,
	Nicholas Piggin, linuxppc-dev
In-Reply-To: <CAJZ5v0jA20TJyxRwtBu31zF5otkqbTW9R03Na3LgJsWB3nDmoQ@mail.gmail.com>

Hello Rafael,

On Mon, Jul 27, 2020 at 04:14:12PM +0200, Rafael J. Wysocki wrote:
> On Tue, Jul 7, 2020 at 1:32 PM Gautham R Shenoy <ego@linux.vnet.ibm.com> wrote:
> >
> > Hi,
> >
> > On Tue, Jul 07, 2020 at 04:41:34PM +0530, Gautham R. Shenoy wrote:
> > > From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
> > >
> > > Hi,
> > >
> > >
> > >
> > >
> > > Gautham R. Shenoy (5):
> > >   cpuidle-pseries: Set the latency-hint before entering CEDE
> > >   cpuidle-pseries: Add function to parse extended CEDE records
> > >   cpuidle-pseries : Fixup exit latency for CEDE(0)
> > >   cpuidle-pseries : Include extended CEDE states in cpuidle framework
> > >   cpuidle-pseries: Block Extended CEDE(1) which adds no additional
> > >     value.
> >
> > Forgot to mention that these patches are on top of Nathan's series to
> > remove extended CEDE offline and bogus topology update code :
> > https://lore.kernel.org/linuxppc-dev/20200612051238.1007764-1-nathanl@linux.ibm.com/
> 
> OK, so this is targeted at the powerpc maintainers, isn't it?

Yes, the code is powerpc specific.

Also, I noticed that Nathan's patches have been merged by Michael
Ellerman in the powerpc/merge tree. I will rebase and post a v2 of
this patch series.

--
Thanks and Regards
gautham.

^ permalink raw reply

* Re: [PATCH v4 09/10] Powerpc/smp: Create coregroup domain
From: Gautham R Shenoy @ 2020-07-27 18:52 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Michael Neuling, Peter Zijlstra,
	LKML, Nicholas Piggin, Valentin Schneider, Oliver O'Halloran,
	Jordan Niethe, linuxppc-dev, Ingo Molnar
In-Reply-To: <20200727053230.19753-10-srikar@linux.vnet.ibm.com>

Hi Srikar,

On Mon, Jul 27, 2020 at 11:02:29AM +0530, Srikar Dronamraju wrote:
> Add percpu coregroup maps and masks to create coregroup domain.
> If a coregroup doesn't exist, the coregroup domain will be degenerated
> in favour of SMT/CACHE domain.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: LKML <linux-kernel@vger.kernel.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: Anton Blanchard <anton@ozlabs.org>
> Cc: Oliver O'Halloran <oohall@gmail.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@neuling.org>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Valentin Schneider <valentin.schneider@arm.com>
> Cc: Jordan Niethe <jniethe5@gmail.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

This version looks good to me.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>


> ---
> Changelog v3 ->v4:
> 	if coregroup_support doesn't exist, update MC mask to the next
> 	smaller domain mask.
> 
> Changelog v2 -> v3:
> 	Add optimization for mask updation under coregroup_support
> 
> Changelog v1 -> v2:
> 	Moved coregroup topology fixup to fixup_topology (Gautham)
> 
>  arch/powerpc/include/asm/topology.h | 10 +++++++
>  arch/powerpc/kernel/smp.c           | 44 +++++++++++++++++++++++++++++
>  arch/powerpc/mm/numa.c              |  5 ++++
>  3 files changed, 59 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
> index f0b6300e7dd3..6609174918ab 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -88,12 +88,22 @@ static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> 
>  #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
>  extern int find_and_online_cpu_nid(int cpu);
> +extern int cpu_to_coregroup_id(int cpu);
>  #else
>  static inline int find_and_online_cpu_nid(int cpu)
>  {
>  	return 0;
>  }
> 
> +static inline int cpu_to_coregroup_id(int cpu)
> +{
> +#ifdef CONFIG_SMP
> +	return cpu_to_core_id(cpu);
> +#else
> +	return 0;
> +#endif
> +}
> +
>  #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
> 
>  #include <asm-generic/topology.h>
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index dab96a1203ec..95f0bf72e283 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -80,6 +80,7 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
> +DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map);
> 
>  EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
>  EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
> @@ -91,6 +92,7 @@ enum {
>  	smt_idx,
>  #endif
>  	bigcore_idx,
> +	mc_idx,
>  	die_idx,
>  };
> 
> @@ -869,6 +871,21 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> +static struct cpumask *cpu_coregroup_mask(int cpu)
> +{
> +	return per_cpu(cpu_coregroup_map, cpu);
> +}
> +
> +static bool has_coregroup_support(void)
> +{
> +	return coregroup_enabled;
> +}
> +
> +static const struct cpumask *cpu_mc_mask(int cpu)
> +{
> +	return cpu_coregroup_mask(cpu);
> +}
> +
>  static const struct cpumask *cpu_bigcore_mask(int cpu)
>  {
>  	return per_cpu(cpu_sibling_map, cpu);
> @@ -879,6 +896,7 @@ static struct sched_domain_topology_level powerpc_topology[] = {
>  	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
>  	{ cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
> +	{ cpu_mc_mask, SD_INIT_NAME(MC) },
>  	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
>  	{ NULL, },
>  };
> @@ -925,6 +943,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>  					GFP_KERNEL, cpu_to_node(cpu));
>  		zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
>  					GFP_KERNEL, cpu_to_node(cpu));
> +		if (has_coregroup_support())
> +			zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu),
> +						GFP_KERNEL, cpu_to_node(cpu));
> +
>  #ifdef CONFIG_NEED_MULTIPLE_NODES
>  		/*
>  		 * numa_node_id() works after this.
> @@ -942,6 +964,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>  	cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
>  	cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
> 
> +	if (has_coregroup_support())
> +		cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
> +
>  	init_big_cores();
>  	if (has_big_cores) {
>  		cpumask_set_cpu(boot_cpuid,
> @@ -1233,6 +1258,8 @@ static void remove_cpu_from_masks(int cpu)
>  		set_cpus_unrelated(cpu, i, cpu_sibling_mask);
>  		if (has_big_cores)
>  			set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
> +		if (has_coregroup_support())
> +			set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
>  	}
>  }
>  #endif
> @@ -1293,6 +1320,20 @@ static void add_cpu_to_masks(int cpu)
>  	add_cpu_to_smallcore_masks(cpu);
>  	update_mask_by_l2(cpu, cpu_l2_cache_mask);
> 
> +	if (has_coregroup_support()) {
> +		int coregroup_id = cpu_to_coregroup_id(cpu);
> +
> +		cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
> +		for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
> +			int fcpu = cpu_first_thread_sibling(i);
> +
> +			if (fcpu == first_thread)
> +				set_cpus_related(cpu, i, cpu_coregroup_mask);
> +			else if (coregroup_id == cpu_to_coregroup_id(i))
> +				set_cpus_related(cpu, i, cpu_coregroup_mask);
> +		}
> +	}
> +
>  	if (pkg_id == -1) {
>  		struct cpumask *(*mask)(int) = cpu_sibling_mask;
> 
> @@ -1398,6 +1439,9 @@ static void fixup_topology(void)
>  		powerpc_topology[bigcore_idx].name = "CACHE";
>  #endif
>  	}
> +
> +	if (!has_coregroup_support())
> +		powerpc_topology[mc_idx].mask = powerpc_topology[bigcore_idx].mask;
>  }
> 
>  void __init smp_cpus_done(unsigned int max_cpus)
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 51cb672f113b..0d57779e7942 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1216,6 +1216,11 @@ int find_and_online_cpu_nid(int cpu)
>  	return new_nid;
>  }
> 
> +int cpu_to_coregroup_id(int cpu)
> +{
> +	return cpu_to_core_id(cpu);
> +}
> +
>  static int topology_update_init(void)
>  {
>  	topology_inited = 1;
> -- 
> 2.17.1
> 

^ permalink raw reply

* [PATCH v2 2/2] powerpc/pseries: new lparcfg key/value pair: partition_affinity_score
From: Scott Cheloha @ 2020-07-27 18:46 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nathan Lynch, Tyrel Datwylder
In-Reply-To: <20200727184605.2945095-1-cheloha@linux.ibm.com>

The H_GetPerformanceCounterInfo (GPCI) PHYP hypercall has a subcall,
Affinity_Domain_Info_By_Partition, which returns, among other things,
a "partition affinity score" for a given LPAR.  This score, a value on
[0-100], represents the processor-memory affinity for the LPAR in
question.  A score of 0 indicates the worst possible affinity while a
score of 100 indicates perfect affinity.  The score can be used to
reason about performance.

This patch adds the score for the local LPAR to the lparcfg procfile
under a new 'partition_affinity_score' key.

Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
---
 arch/powerpc/platforms/pseries/lparcfg.c | 35 ++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
index b8d28ab88178..e278390ab28d 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -136,6 +136,39 @@ static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
 	return rc;
 }
 
+static void show_gpci_data(struct seq_file *m)
+{
+	struct hv_gpci_request_buffer *buf;
+	unsigned int affinity_score;
+	long ret;
+
+	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+	if (buf == NULL)
+		return;
+
+	/*
+	 * Show the local LPAR's affinity score.
+	 *
+	 * 0xB1 selects the Affinity_Domain_Info_By_Partition subcall.
+	 * The score is at byte 0xB in the output buffer.
+	 */
+	memset(&buf->params, 0, sizeof(buf->params));
+	buf->params.counter_request = cpu_to_be32(0xB1);
+	buf->params.starting_index = cpu_to_be32(-1);	/* local LPAR */
+	buf->params.counter_info_version_in = 0x5;	/* v5+ for score */
+	ret = plpar_hcall_norets(H_GET_PERF_COUNTER_INFO, virt_to_phys(buf),
+				 sizeof(*buf));
+	if (ret != H_SUCCESS) {
+		pr_debug("hcall failed: H_GET_PERF_COUNTER_INFO: %ld, %x\n",
+			 ret, be32_to_cpu(buf->params.detail_rc));
+		goto out;
+	}
+	affinity_score = buf->bytes[0xB];
+	seq_printf(m, "partition_affinity_score=%u\n", affinity_score);
+out:
+	kfree(buf);
+}
+
 static unsigned h_pic(unsigned long *pool_idle_time,
 		      unsigned long *num_procs)
 {
@@ -487,6 +520,8 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 			   partition_active_processors * 100);
 	}
 
+	show_gpci_data(m);
+
 	seq_printf(m, "partition_active_processors=%d\n",
 		   partition_active_processors);
 
-- 
2.24.1


^ permalink raw reply related

* [PATCH v2 1/2] powerpc/perf: consolidate GPCI hcall structs into asm/hvcall.h
From: Scott Cheloha @ 2020-07-27 18:46 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nathan Lynch, Tyrel Datwylder

The H_GetPerformanceCounterInfo (GPCI) hypercall input/output structs are
useful to modules outside of perf/, so move them into asm/hvcall.h to live
alongside the other powerpc hypercall structs.

Leave the perf-specific GPCI stuff in perf/hv-gpci.h.

Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
---
 arch/powerpc/include/asm/hvcall.h | 36 +++++++++++++++++++++++++++++++
 arch/powerpc/perf/hv-gpci.c       |  9 --------
 arch/powerpc/perf/hv-gpci.h       | 27 -----------------------
 3 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index e90c073e437e..c338480b4551 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -527,6 +527,42 @@ struct hv_guest_state {
 /* Latest version of hv_guest_state structure */
 #define HV_GUEST_STATE_VERSION	1
 
+/*
+ * From the document "H_GetPerformanceCounterInfo Interface" v1.07
+ *
+ * H_GET_PERF_COUNTER_INFO argument
+ */
+struct hv_get_perf_counter_info_params {
+	__be32 counter_request; /* I */
+	__be32 starting_index;  /* IO */
+	__be16 secondary_index; /* IO */
+	__be16 returned_values; /* O */
+	__be32 detail_rc; /* O, only needed when called via *_norets() */
+
+	/*
+	 * O, size each of counter_value element in bytes, only set for version
+	 * >= 0x3
+	 */
+	__be16 cv_element_size;
+
+	/* I, 0 (zero) for versions < 0x3 */
+	__u8 counter_info_version_in;
+
+	/* O, 0 (zero) if version < 0x3. Must be set to 0 when making hcall */
+	__u8 counter_info_version_out;
+	__u8 reserved[0xC];
+	__u8 counter_value[];
+} __packed;
+
+#define HGPCI_REQ_BUFFER_SIZE	4096
+#define HGPCI_MAX_DATA_BYTES \
+	(HGPCI_REQ_BUFFER_SIZE - sizeof(struct hv_get_perf_counter_info_params))
+
+struct hv_gpci_request_buffer {
+	struct hv_get_perf_counter_info_params params;
+	uint8_t bytes[HGPCI_MAX_DATA_BYTES];
+} __packed;
+
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_HVCALL_H */
diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c
index 6884d16ec19b..1667315b82e9 100644
--- a/arch/powerpc/perf/hv-gpci.c
+++ b/arch/powerpc/perf/hv-gpci.c
@@ -123,17 +123,8 @@ static const struct attribute_group *attr_groups[] = {
 	NULL,
 };
 
-#define HGPCI_REQ_BUFFER_SIZE	4096
-#define HGPCI_MAX_DATA_BYTES \
-	(HGPCI_REQ_BUFFER_SIZE - sizeof(struct hv_get_perf_counter_info_params))
-
 static DEFINE_PER_CPU(char, hv_gpci_reqb[HGPCI_REQ_BUFFER_SIZE]) __aligned(sizeof(uint64_t));
 
-struct hv_gpci_request_buffer {
-	struct hv_get_perf_counter_info_params params;
-	uint8_t bytes[HGPCI_MAX_DATA_BYTES];
-} __packed;
-
 static unsigned long single_gpci_request(u32 req, u32 starting_index,
 		u16 secondary_index, u8 version_in, u32 offset, u8 length,
 		u64 *value)
diff --git a/arch/powerpc/perf/hv-gpci.h b/arch/powerpc/perf/hv-gpci.h
index a3053eda5dcc..4d108262bed7 100644
--- a/arch/powerpc/perf/hv-gpci.h
+++ b/arch/powerpc/perf/hv-gpci.h
@@ -2,33 +2,6 @@
 #ifndef LINUX_POWERPC_PERF_HV_GPCI_H_
 #define LINUX_POWERPC_PERF_HV_GPCI_H_
 
-#include <linux/types.h>
-
-/* From the document "H_GetPerformanceCounterInfo Interface" v1.07 */
-
-/* H_GET_PERF_COUNTER_INFO argument */
-struct hv_get_perf_counter_info_params {
-	__be32 counter_request; /* I */
-	__be32 starting_index;  /* IO */
-	__be16 secondary_index; /* IO */
-	__be16 returned_values; /* O */
-	__be32 detail_rc; /* O, only needed when called via *_norets() */
-
-	/*
-	 * O, size each of counter_value element in bytes, only set for version
-	 * >= 0x3
-	 */
-	__be16 cv_element_size;
-
-	/* I, 0 (zero) for versions < 0x3 */
-	__u8 counter_info_version_in;
-
-	/* O, 0 (zero) if version < 0x3. Must be set to 0 when making hcall */
-	__u8 counter_info_version_out;
-	__u8 reserved[0xC];
-	__u8 counter_value[];
-} __packed;
-
 /*
  * counter info version => fw version/reference (spec version)
  *
-- 
2.24.1


^ permalink raw reply related

* [PATCH v6 5/5] KVM: PPC: Book3S HV: migrate hot plugged memory
From: Ram Pai @ 2020-07-27 18:07 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1595873238-26184-1-git-send-email-linuxram@us.ibm.com>

From: Laurent Dufour <ldufour@linux.ibm.com>

When a memory slot is hot plugged to a SVM, PFNs associated with the
GFNs in that slot must be migrated to the secure-PFNs, aka device-PFNs.

Call kvmppc_uv_migrate_mem_slot() to accomplish this.
Disable page-merge for all pages in the memory slot.

Reviewed-by: Bharata B Rao <bharata@linux.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
[rearranged the code, and modified the commit log]
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
 arch/powerpc/include/asm/kvm_book3s_uvmem.h | 14 ++++++++++++++
 arch/powerpc/kvm/book3s_hv.c                | 14 ++++++--------
 arch/powerpc/kvm/book3s_hv_uvmem.c          | 23 +++++++++++++++++++----
 3 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
index 9cb7d8b..0a63194 100644
--- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h
+++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
@@ -23,6 +23,10 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
 unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm);
 void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 			     struct kvm *kvm, bool skip_page_out);
+int kvmppc_uvmem_memslot_create(struct kvm *kvm,
+		const struct kvm_memory_slot *new);
+void kvmppc_uvmem_memslot_delete(struct kvm *kvm,
+		const struct kvm_memory_slot *old);
 #else
 static inline int kvmppc_uvmem_init(void)
 {
@@ -82,5 +86,15 @@ static inline int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn)
 static inline void
 kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 			struct kvm *kvm, bool skip_page_out) { }
+
+static inline int  kvmppc_uvmem_memslot_create(struct kvm *kvm,
+		const struct kvm_memory_slot *new)
+{
+	return H_UNSUPPORTED;
+}
+
+static inline void  kvmppc_uvmem_memslot_delete(struct kvm *kvm,
+		const struct kvm_memory_slot *old) { }
+
 #endif /* CONFIG_PPC_UV */
 #endif /* __ASM_KVM_BOOK3S_UVMEM_H__ */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d331b46..a93bc65 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4515,16 +4515,14 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 
 	switch (change) {
 	case KVM_MR_CREATE:
-		if (kvmppc_uvmem_slot_init(kvm, new))
-			return;
-		uv_register_mem_slot(kvm->arch.lpid,
-				     new->base_gfn << PAGE_SHIFT,
-				     new->npages * PAGE_SIZE,
-				     0, new->id);
+		/*
+		 * @TODO kvmppc_uvmem_memslot_create() can fail and
+		 * return error. Fix this.
+		 */
+		kvmppc_uvmem_memslot_create(kvm, new);
 		break;
 	case KVM_MR_DELETE:
-		uv_unregister_mem_slot(kvm->arch.lpid, old->id);
-		kvmppc_uvmem_slot_free(kvm, old);
+		kvmppc_uvmem_memslot_delete(kvm, old);
 		break;
 	default:
 		/* TODO: Handle KVM_MR_MOVE */
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index a1664ae..5b917ea 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -418,7 +418,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm,
 	return ret;
 }
 
-static void kvmppc_uvmem_memslot_delete(struct kvm *kvm,
+static void __kvmppc_uvmem_memslot_delete(struct kvm *kvm,
 		const struct kvm_memory_slot *memslot)
 {
 	uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
@@ -426,7 +426,7 @@ static void kvmppc_uvmem_memslot_delete(struct kvm *kvm,
 	kvmppc_memslot_page_merge(kvm, memslot, true);
 }
 
-static int kvmppc_uvmem_memslot_create(struct kvm *kvm,
+static int __kvmppc_uvmem_memslot_create(struct kvm *kvm,
 		const struct kvm_memory_slot *memslot)
 {
 	int ret = H_PARAMETER;
@@ -478,7 +478,7 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 	/* register the memslot */
 	slots = kvm_memslots(kvm);
 	kvm_for_each_memslot(memslot, slots) {
-		ret = kvmppc_uvmem_memslot_create(kvm, memslot);
+		ret = __kvmppc_uvmem_memslot_create(kvm, memslot);
 		if (ret)
 			break;
 	}
@@ -488,7 +488,7 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 		kvm_for_each_memslot(m, slots) {
 			if (m == memslot)
 				break;
-			kvmppc_uvmem_memslot_delete(kvm, memslot);
+			__kvmppc_uvmem_memslot_delete(kvm, memslot);
 		}
 	}
 
@@ -1057,6 +1057,21 @@ int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn)
 	return (ret == U_SUCCESS) ? RESUME_GUEST : -EFAULT;
 }
 
+int kvmppc_uvmem_memslot_create(struct kvm *kvm, const struct kvm_memory_slot *new)
+{
+	int ret = __kvmppc_uvmem_memslot_create(kvm, new);
+
+	if (!ret)
+		ret = kvmppc_uv_migrate_mem_slot(kvm, new);
+
+	return ret;
+}
+
+void kvmppc_uvmem_memslot_delete(struct kvm *kvm, const struct kvm_memory_slot *old)
+{
+	__kvmppc_uvmem_memslot_delete(kvm, old);
+}
+
 static u64 kvmppc_get_secmem_size(void)
 {
 	struct device_node *np;
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v6 4/5] KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs.
From: Ram Pai @ 2020-07-27 18:07 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1595873238-26184-1-git-send-email-linuxram@us.ibm.com>

The Ultravisor is expected to explicitly call H_SVM_PAGE_IN for all the
pages of the SVM before calling H_SVM_INIT_DONE. This causes a huge
delay in transitioning the VM to SVM. The Ultravisor is only interested
in the pages that contain the kernel, initrd and other important data
structures. The rest contain throw-away content.

However if not all pages are requested by the Ultravisor, the Hypervisor
continues to consider the GFNs corresponding to the non-requested pages
as normal GFNs. This can lead to data-corruption and undefined behavior.

In H_SVM_INIT_DONE handler, move all the PFNs associated with the SVM's
GFNs to secure-PFNs. Skip the GFNs that are already Paged-in or Shared
or Paged-in followed by a Paged-out.

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Reviewed-by: Bharata B Rao <bharata@linux.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 Documentation/powerpc/ultravisor.rst |   2 +
 arch/powerpc/kvm/book3s_hv_uvmem.c   | 154 ++++++++++++++++++++++++++++++-----
 2 files changed, 134 insertions(+), 22 deletions(-)

diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
index a1c8c37..ba6b1bf 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -934,6 +934,8 @@ Return values
 	* H_UNSUPPORTED		if called from the wrong context (e.g.
 				from an SVM or before an H_SVM_INIT_START
 				hypercall).
+	* H_STATE		if the hypervisor could not successfully
+                                transition the VM to Secure VM.
 
 Description
 ~~~~~~~~~~~
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 1b2b029..a1664ae 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -93,6 +93,7 @@
 #include <asm/ultravisor.h>
 #include <asm/mman.h>
 #include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s_uvmem.h>
 
 static struct dev_pagemap kvmppc_uvmem_pgmap;
 static unsigned long *kvmppc_uvmem_bitmap;
@@ -348,6 +349,41 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 	return false;
 }
 
+/*
+ * starting from *gfn search for the next available GFN that is not yet
+ * transitioned to a secure GFN.  return the value of that GFN in *gfn.  If a
+ * GFN is found, return true, else return false
+ *
+ * Must be called with kvm->arch.uvmem_lock  held.
+ */
+static bool kvmppc_next_nontransitioned_gfn(const struct kvm_memory_slot *memslot,
+		struct kvm *kvm, unsigned long *gfn)
+{
+	struct kvmppc_uvmem_slot *p;
+	bool ret = false;
+	unsigned long i;
+
+	list_for_each_entry(p, &kvm->arch.uvmem_pfns, list)
+		if (*gfn >= p->base_pfn && *gfn < p->base_pfn + p->nr_pfns)
+			break;
+	if (!p)
+		return ret;
+	/*
+	 * The code below assumes, one to one correspondence between
+	 * kvmppc_uvmem_slot and memslot.
+	 */
+	for (i = *gfn; i < p->base_pfn + p->nr_pfns; i++) {
+		unsigned long index = i - p->base_pfn;
+
+		if (!(p->pfns[index] & KVMPPC_GFN_FLAG_MASK)) {
+			*gfn = i;
+			ret = true;
+			break;
+		}
+	}
+	return ret;
+}
+
 static int kvmppc_memslot_page_merge(struct kvm *kvm,
 		const struct kvm_memory_slot *memslot, bool merge)
 {
@@ -460,16 +496,6 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 	return ret;
 }
 
-unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
-{
-	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
-		return H_UNSUPPORTED;
-
-	kvm->arch.secure_guest |= KVMPPC_SECURE_INIT_DONE;
-	pr_info("LPID %d went secure\n", kvm->arch.lpid);
-	return H_SUCCESS;
-}
-
 /*
  * Drop device pages that we maintain for the secure guest
  *
@@ -588,12 +614,14 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
 }
 
 /*
- * Alloc a PFN from private device memory pool and copy page from normal
- * memory to secure memory using UV_PAGE_IN uvcall.
+ * Alloc a PFN from private device memory pool. If @pagein is true,
+ * copy page from normal memory to secure memory using UV_PAGE_IN uvcall.
  */
-static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
-		   unsigned long end, unsigned long gpa, struct kvm *kvm,
-		   unsigned long page_shift)
+static int kvmppc_svm_page_in(struct vm_area_struct *vma,
+		unsigned long start,
+		unsigned long end, unsigned long gpa, struct kvm *kvm,
+		unsigned long page_shift,
+		bool pagein)
 {
 	unsigned long src_pfn, dst_pfn = 0;
 	struct migrate_vma mig;
@@ -624,11 +652,16 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 		goto out_finalize;
 	}
 
-	pfn = *mig.src >> MIGRATE_PFN_SHIFT;
-	spage = migrate_pfn_to_page(*mig.src);
-	if (spage)
-		uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0,
-			   page_shift);
+	if (pagein) {
+		pfn = *mig.src >> MIGRATE_PFN_SHIFT;
+		spage = migrate_pfn_to_page(*mig.src);
+		if (spage) {
+			ret = uv_page_in(kvm->arch.lpid, pfn << page_shift,
+					gpa, 0, page_shift);
+			if (ret)
+				goto out_finalize;
+		}
+	}
 
 	*mig.dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
 	migrate_vma_pages(&mig);
@@ -637,6 +670,80 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 	return ret;
 }
 
+static int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot)
+{
+	unsigned long gfn = memslot->base_gfn;
+	struct vm_area_struct *vma;
+	unsigned long start, end;
+	int ret = 0;
+
+	mmap_read_lock(kvm->mm);
+	mutex_lock(&kvm->arch.uvmem_lock);
+	while (kvmppc_next_nontransitioned_gfn(memslot, kvm, &gfn)) {
+		ret = H_STATE;
+		start = gfn_to_hva(kvm, gfn);
+		if (kvm_is_error_hva(start))
+			break;
+
+		end = start + (1UL << PAGE_SHIFT);
+		vma = find_vma_intersection(kvm->mm, start, end);
+		if (!vma || vma->vm_start > start || vma->vm_end < end)
+			break;
+
+		ret = kvmppc_svm_page_in(vma, start, end,
+				(gfn << PAGE_SHIFT), kvm, PAGE_SHIFT, false);
+		if (ret) {
+			ret = H_STATE;
+			break;
+		}
+
+		/* relinquish the cpu if needed */
+		cond_resched();
+	}
+	mutex_unlock(&kvm->arch.uvmem_lock);
+	mmap_read_unlock(kvm->mm);
+	return ret;
+}
+
+unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int srcu_idx;
+	long ret = H_SUCCESS;
+
+	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
+		return H_UNSUPPORTED;
+
+	/* migrate any unmoved normal pfn to device pfns*/
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots) {
+		ret = kvmppc_uv_migrate_mem_slot(kvm, memslot);
+		if (ret) {
+			/*
+			 * The pages will remain transitioned.
+			 * Its the callers responsibility to
+			 * terminate the VM, which will undo
+			 * all state of the VM. Till then
+			 * this VM is in a erroneous state.
+			 * Its KVMPPC_SECURE_INIT_DONE will
+			 * remain unset.
+			 */
+			ret = H_STATE;
+			goto out;
+		}
+	}
+
+	kvm->arch.secure_guest |= KVMPPC_SECURE_INIT_DONE;
+	pr_info("LPID %d went secure\n", kvm->arch.lpid);
+
+out:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return ret;
+}
+
 /*
  * Shares the page with HV, thus making it a normal page.
  *
@@ -745,8 +852,11 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	if (!vma || vma->vm_start > start || vma->vm_end < end)
 		goto out_unlock;
 
-	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift))
-		ret = H_SUCCESS;
+	if (kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift,
+				true))
+		goto out_unlock;
+
+	ret = H_SUCCESS;
 
 out_unlock:
 	mutex_unlock(&kvm->arch.uvmem_lock);
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v6 2/5] KVM: PPC: Book3S HV: Disable page merging in H_SVM_INIT_START
From: Ram Pai @ 2020-07-27 18:07 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1595873238-26184-1-git-send-email-linuxram@us.ibm.com>

Page-merging of pages in memory-slots associated with a Secure VM,
is disabled in H_SVM_PAGE_IN handler.

This operation should have been done much earlier: the moment the VM
is initiated for secure-transition. Delaying this operation increases
the probability for those pages to acquire new references, making it
impossible to migrate those pages in H_SVM_PAGE_IN handler.

Disable page-merging in H_SVM_INIT_START handling.

Reviewed-by: Bharata B Rao <bharata@linux.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 Documentation/powerpc/ultravisor.rst |   1 +
 arch/powerpc/kvm/book3s_hv_uvmem.c   | 123 +++++++++++++++++++++++++----------
 2 files changed, 89 insertions(+), 35 deletions(-)

diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
index df136c8..a1c8c37 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -895,6 +895,7 @@ Return values
     One of the following values:
 
 	* H_SUCCESS	 on success.
+        * H_STATE        if the VM is not in a position to switch to secure.
 
 Description
 ~~~~~~~~~~~
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index e6f76bc..533b608 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -211,10 +211,79 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 	return false;
 }
 
+static int kvmppc_memslot_page_merge(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot, bool merge)
+{
+	unsigned long gfn = memslot->base_gfn;
+	unsigned long end, start = gfn_to_hva(kvm, gfn);
+	int ret = 0;
+	struct vm_area_struct *vma;
+	int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE;
+
+	if (kvm_is_error_hva(start))
+		return H_STATE;
+
+	end = start + (memslot->npages << PAGE_SHIFT);
+
+	mmap_write_lock(kvm->mm);
+	do {
+		vma = find_vma_intersection(kvm->mm, start, end);
+		if (!vma) {
+			ret = H_STATE;
+			break;
+		}
+		ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+			  merge_flag, &vma->vm_flags);
+		if (ret) {
+			ret = H_STATE;
+			break;
+		}
+		start = vma->vm_end;
+	} while (end > vma->vm_end);
+
+	mmap_write_unlock(kvm->mm);
+	return ret;
+}
+
+static void kvmppc_uvmem_memslot_delete(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot)
+{
+	uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
+	kvmppc_uvmem_slot_free(kvm, memslot);
+	kvmppc_memslot_page_merge(kvm, memslot, true);
+}
+
+static int kvmppc_uvmem_memslot_create(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot)
+{
+	int ret = H_PARAMETER;
+
+	if (kvmppc_memslot_page_merge(kvm, memslot, false))
+		return ret;
+
+	if (kvmppc_uvmem_slot_init(kvm, memslot))
+		goto out1;
+
+	ret = uv_register_mem_slot(kvm->arch.lpid,
+				   memslot->base_gfn << PAGE_SHIFT,
+				   memslot->npages * PAGE_SIZE,
+				   0, memslot->id);
+	if (ret < 0) {
+		ret = H_PARAMETER;
+		goto out;
+	}
+	return 0;
+out:
+	kvmppc_uvmem_slot_free(kvm, memslot);
+out1:
+	kvmppc_memslot_page_merge(kvm, memslot, true);
+	return ret;
+}
+
 unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 {
 	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
+	struct kvm_memory_slot *memslot, *m;
 	int ret = H_SUCCESS;
 	int srcu_idx;
 
@@ -232,23 +301,24 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 		return H_AUTHORITY;
 
 	srcu_idx = srcu_read_lock(&kvm->srcu);
+
+	/* register the memslot */
 	slots = kvm_memslots(kvm);
 	kvm_for_each_memslot(memslot, slots) {
-		if (kvmppc_uvmem_slot_init(kvm, memslot)) {
-			ret = H_PARAMETER;
-			goto out;
-		}
-		ret = uv_register_mem_slot(kvm->arch.lpid,
-					   memslot->base_gfn << PAGE_SHIFT,
-					   memslot->npages * PAGE_SIZE,
-					   0, memslot->id);
-		if (ret < 0) {
-			kvmppc_uvmem_slot_free(kvm, memslot);
-			ret = H_PARAMETER;
-			goto out;
+		ret = kvmppc_uvmem_memslot_create(kvm, memslot);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		slots = kvm_memslots(kvm);
+		kvm_for_each_memslot(m, slots) {
+			if (m == memslot)
+				break;
+			kvmppc_uvmem_memslot_delete(kvm, memslot);
 		}
 	}
-out:
+
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	return ret;
 }
@@ -384,7 +454,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
  */
 static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 		   unsigned long end, unsigned long gpa, struct kvm *kvm,
-		   unsigned long page_shift, bool *downgrade)
+		   unsigned long page_shift)
 {
 	unsigned long src_pfn, dst_pfn = 0;
 	struct migrate_vma mig;
@@ -400,18 +470,6 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 	mig.src = &src_pfn;
 	mig.dst = &dst_pfn;
 
-	/*
-	 * We come here with mmap_lock write lock held just for
-	 * ksm_madvise(), otherwise we only need read mmap_lock.
-	 * Hence downgrade to read lock once ksm_madvise() is done.
-	 */
-	ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
-			  MADV_UNMERGEABLE, &vma->vm_flags);
-	mmap_write_downgrade(kvm->mm);
-	*downgrade = true;
-	if (ret)
-		return ret;
-
 	ret = migrate_vma_setup(&mig);
 	if (ret)
 		return ret;
@@ -503,7 +561,6 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 		unsigned long flags,
 		unsigned long page_shift)
 {
-	bool downgrade = false;
 	unsigned long start, end;
 	struct vm_area_struct *vma;
 	int srcu_idx;
@@ -524,7 +581,7 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 
 	ret = H_PARAMETER;
 	srcu_idx = srcu_read_lock(&kvm->srcu);
-	mmap_write_lock(kvm->mm);
+	mmap_read_lock(kvm->mm);
 
 	start = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(start))
@@ -540,16 +597,12 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	if (!vma || vma->vm_start > start || vma->vm_end < end)
 		goto out_unlock;
 
-	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift,
-				&downgrade))
+	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift))
 		ret = H_SUCCESS;
 out_unlock:
 	mutex_unlock(&kvm->arch.uvmem_lock);
 out:
-	if (downgrade)
-		mmap_read_unlock(kvm->mm);
-	else
-		mmap_write_unlock(kvm->mm);
+	mmap_read_unlock(kvm->mm);
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	return ret;
 }
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v6 3/5] KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs
From: Ram Pai @ 2020-07-27 18:07 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1595873238-26184-1-git-send-email-linuxram@us.ibm.com>

During the life of SVM, its GFNs transition through normal, secure and
shared states. Since the kernel does not track GFNs that are shared, it
is not possible to disambiguate a shared GFN from a GFN whose PFN has
not yet been migrated to a secure-PFN. Also it is not possible to
disambiguate a secure-GFN from a GFN that has been paged out of
the ultravisor.

The ability to identify the state of a GFN is needed to skip migration
of its PFN to secure-PFN during ESM transition.

The code is re-organized to track the states of a GFN as explained
below.

************************************************************************
 1. States of a GFN
    ---------------
 The GFN can be in one of the following states.

 (a) Secure - The GFN is secure. The GFN is associated with
 	a Secure VM, the contents of the GFN is not accessible
 	to the Hypervisor.  This GFN can be backed by a secure-PFN,
 	or can be backed by a normal-PFN with contents encrypted.
 	The former is true when the GFN is paged-in into the
 	ultravisor. The latter is true when the GFN is paged-out
 	of the ultravisor.

 (b) Shared - The GFN is shared. The GFN is associated with
 	a secure VM. The contents of the GFN is accessible to
 	Hypervisor. This GFN is backed by a normal-PFN and its
 	content is un-encrypted.

 (c) Normal - The GFN is normal. The GFN is associated with
 	a normal VM. The contents of the GFN is accessible to
 	the Hypervisor. Its content is never encrypted.

 2. States of a VM.
    ---------------

 (a) Normal VM:  A VM whose contents are always accessible to
 	the hypervisor.  All its GFNs are normal-GFNs.

 (b) Secure VM: A VM whose contents are not accessible to the
 	hypervisor without the VM's consent.  Its GFNs are
 	either Shared-GFN or Secure-GFNs.

 (c) Transient VM: A Normal VM that is transitioning to secure VM.
 	The transition starts on successful return of
 	H_SVM_INIT_START, and ends on successful return
 	of H_SVM_INIT_DONE. This transient VM, can have GFNs
 	in any of the three states; i.e Secure-GFN, Shared-GFN,
 	and Normal-GFN.	The VM never executes in this state
 	in supervisor-mode.

 3. Memory slot State.
    ------------------
  	The state of a memory slot mirrors the state of the
  	VM the memory slot is associated with.

 4. VM State transition.
    --------------------

  A VM always starts in Normal Mode.

  H_SVM_INIT_START moves the VM into transient state. During this
  time the Ultravisor may request some of its GFNs to be shared or
  secured. So its GFNs can be in one of the three GFN states.

  H_SVM_INIT_DONE moves the VM entirely from transient state to
  secure-state. At this point any left-over normal-GFNs are
  transitioned to Secure-GFN.

  H_SVM_INIT_ABORT moves the transient VM back to normal VM.
  All its GFNs are moved to Normal-GFNs.

  UV_TERMINATE transitions the secure-VM back to normal-VM. All
  the secure-GFN and shared-GFNs are transitioned to normal-GFN.
  Note: The contents of the normal-GFN is undefined at this point.

 5. GFN state implementation:
    -------------------------

 Secure GFN is associated with a secure-PFN; also called uvmem_pfn,
 when the GFN is paged-in. Its pfn[] has KVMPPC_GFN_UVMEM_PFN flag
 set, and contains the value of the secure-PFN.
 It is associated with a normal-PFN; also called mem_pfn, when
 the GFN is pagedout. Its pfn[] has KVMPPC_GFN_MEM_PFN flag set.
 The value of the normal-PFN is not tracked.

 Shared GFN is associated with a normal-PFN. Its pfn[] has
 KVMPPC_UVMEM_SHARED_PFN flag set. The value of the normal-PFN
 is not tracked.

 Normal GFN is associated with normal-PFN. Its pfn[] has
 no flag set. The value of the normal-PFN is not tracked.

 6. Life cycle of a GFN
    --------------------
 --------------------------------------------------------------
 |        |     Share  |  Unshare | SVM       |H_SVM_INIT_DONE|
 |        |operation   |operation | abort/    |               |
 |        |            |          | terminate |               |
 -------------------------------------------------------------
 |        |            |          |           |               |
 | Secure |     Shared | Secure   |Normal     |Secure         |
 |        |            |          |           |               |
 | Shared |     Shared | Secure   |Normal     |Shared         |
 |        |            |          |           |               |
 | Normal |     Shared | Secure   |Normal     |Secure         |
 --------------------------------------------------------------

 7. Life cycle of a VM
    --------------------
 --------------------------------------------------------------------
 |         |  start    |  H_SVM_  |H_SVM_   |H_SVM_     |UV_SVM_    |
 |         |  VM       |INIT_START|INIT_DONE|INIT_ABORT |TERMINATE  |
 |         |           |          |         |           |           |
 --------- ----------------------------------------------------------
 |         |           |          |         |           |           |
 | Normal  | Normal    | Transient|Error    |Error      |Normal     |
 |         |           |          |         |           |           |
 | Secure  |   Error   | Error    |Error    |Error      |Normal     |
 |         |           |          |         |           |           |
 |Transient|   N/A     | Error    |Secure   |Normal     |Normal     |
 --------------------------------------------------------------------

************************************************************************

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org

Reviewed-by: Bharata B Rao <bharata@linux.ibm.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 191 +++++++++++++++++++++++++++++++++----
 1 file changed, 172 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 533b608..1b2b029 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -98,7 +98,127 @@
 static unsigned long *kvmppc_uvmem_bitmap;
 static DEFINE_SPINLOCK(kvmppc_uvmem_bitmap_lock);
 
-#define KVMPPC_UVMEM_PFN	(1UL << 63)
+/*
+ * States of a GFN
+ * ---------------
+ * The GFN can be in one of the following states.
+ *
+ * (a) Secure - The GFN is secure. The GFN is associated with
+ *	a Secure VM, the contents of the GFN is not accessible
+ *	to the Hypervisor.  This GFN can be backed by a secure-PFN,
+ *	or can be backed by a normal-PFN with contents encrypted.
+ *	The former is true when the GFN is paged-in into the
+ *	ultravisor. The latter is true when the GFN is paged-out
+ *	of the ultravisor.
+ *
+ * (b) Shared - The GFN is shared. The GFN is associated with
+ *	a secure VM. The contents of the GFN is accessible to
+ *	Hypervisor. This GFN is backed by a normal-PFN and its
+ *	content is un-encrypted.
+ *
+ * (c) Normal - The GFN is a normal. The GFN is associated with
+ *	a normal VM. The contents of the GFN is accessible to
+ *	the Hypervisor. Its content is never encrypted.
+ *
+ * States of a VM.
+ * ---------------
+ *
+ * Normal VM:  A VM whose contents are always accessible to
+ *	the hypervisor.  All its GFNs are normal-GFNs.
+ *
+ * Secure VM: A VM whose contents are not accessible to the
+ *	hypervisor without the VM's consent.  Its GFNs are
+ *	either Shared-GFN or Secure-GFNs.
+ *
+ * Transient VM: A Normal VM that is transitioning to secure VM.
+ *	The transition starts on successful return of
+ *	H_SVM_INIT_START, and ends on successful return
+ *	of H_SVM_INIT_DONE. This transient VM, can have GFNs
+ *	in any of the three states; i.e Secure-GFN, Shared-GFN,
+ *	and Normal-GFN.	The VM never executes in this state
+ *	in supervisor-mode.
+ *
+ * Memory slot State.
+ * -----------------------------
+ *	The state of a memory slot mirrors the state of the
+ *	VM the memory slot is associated with.
+ *
+ * VM State transition.
+ * --------------------
+ *
+ *  A VM always starts in Normal Mode.
+ *
+ *  H_SVM_INIT_START moves the VM into transient state. During this
+ *  time the Ultravisor may request some of its GFNs to be shared or
+ *  secured. So its GFNs can be in one of the three GFN states.
+ *
+ *  H_SVM_INIT_DONE moves the VM entirely from transient state to
+ *  secure-state. At this point any left-over normal-GFNs are
+ *  transitioned to Secure-GFN.
+ *
+ *  H_SVM_INIT_ABORT moves the transient VM back to normal VM.
+ *  All its GFNs are moved to Normal-GFNs.
+ *
+ *  UV_TERMINATE transitions the secure-VM back to normal-VM. All
+ *  the secure-GFN and shared-GFNs are transitioned to normal-GFN.
+ *  Note: The contents of the normal-GFN is undefined at this point.
+ *
+ * GFN state implementation:
+ * -------------------------
+ *
+ * Secure GFN is associated with a secure-PFN; also called uvmem_pfn,
+ * when the GFN is paged-in. Its pfn[] has KVMPPC_GFN_UVMEM_PFN flag
+ * set, and contains the value of the secure-PFN.
+ * It is associated with a normal-PFN; also called mem_pfn, when
+ * the GFN is pagedout. Its pfn[] has KVMPPC_GFN_MEM_PFN flag set.
+ * The value of the normal-PFN is not tracked.
+ *
+ * Shared GFN is associated with a normal-PFN. Its pfn[] has
+ * KVMPPC_GFN_SHARED flag set. The value of the normal-PFN
+ * is not tracked.
+ *
+ * Normal GFN is associated with normal-PFN. Its pfn[] has
+ * no flag set. The value of the normal-PFN is not tracked.
+ *
+ * Life cycle of a GFN
+ * --------------------
+ *
+ * --------------------------------------------------------------
+ * |        |     Share  |  Unshare | SVM       |H_SVM_INIT_DONE|
+ * |        |operation   |operation | abort/    |               |
+ * |        |            |          | terminate |               |
+ * -------------------------------------------------------------
+ * |        |            |          |           |               |
+ * | Secure |     Shared | Secure   |Normal     |Secure         |
+ * |        |            |          |           |               |
+ * | Shared |     Shared | Secure   |Normal     |Shared         |
+ * |        |            |          |           |               |
+ * | Normal |     Shared | Secure   |Normal     |Secure         |
+ * --------------------------------------------------------------
+ *
+ * Life cycle of a VM
+ * --------------------
+ *
+ * --------------------------------------------------------------------
+ * |         |  start    |  H_SVM_  |H_SVM_   |H_SVM_     |UV_SVM_    |
+ * |         |  VM       |INIT_START|INIT_DONE|INIT_ABORT |TERMINATE  |
+ * |         |           |          |         |           |           |
+ * --------- ----------------------------------------------------------
+ * |         |           |          |         |           |           |
+ * | Normal  | Normal    | Transient|Error    |Error      |Normal     |
+ * |         |           |          |         |           |           |
+ * | Secure  |   Error   | Error    |Error    |Error      |Normal     |
+ * |         |           |          |         |           |           |
+ * |Transient|   N/A     | Error    |Secure   |Normal     |Normal     |
+ * --------------------------------------------------------------------
+ */
+
+#define KVMPPC_GFN_UVMEM_PFN	(1UL << 63)
+#define KVMPPC_GFN_MEM_PFN	(1UL << 62)
+#define KVMPPC_GFN_SHARED	(1UL << 61)
+#define KVMPPC_GFN_SECURE	(KVMPPC_GFN_UVMEM_PFN | KVMPPC_GFN_MEM_PFN)
+#define KVMPPC_GFN_FLAG_MASK	(KVMPPC_GFN_SECURE | KVMPPC_GFN_SHARED)
+#define KVMPPC_GFN_PFN_MASK	(~KVMPPC_GFN_FLAG_MASK)
 
 struct kvmppc_uvmem_slot {
 	struct list_head list;
@@ -106,11 +226,11 @@ struct kvmppc_uvmem_slot {
 	unsigned long base_pfn;
 	unsigned long *pfns;
 };
-
 struct kvmppc_uvmem_page_pvt {
 	struct kvm *kvm;
 	unsigned long gpa;
 	bool skip_page_out;
+	bool remove_gfn;
 };
 
 bool kvmppc_uvmem_available(void)
@@ -163,8 +283,8 @@ void kvmppc_uvmem_slot_free(struct kvm *kvm, const struct kvm_memory_slot *slot)
 	mutex_unlock(&kvm->arch.uvmem_lock);
 }
 
-static void kvmppc_uvmem_pfn_insert(unsigned long gfn, unsigned long uvmem_pfn,
-				    struct kvm *kvm)
+static void kvmppc_mark_gfn(unsigned long gfn, struct kvm *kvm,
+			unsigned long flag, unsigned long uvmem_pfn)
 {
 	struct kvmppc_uvmem_slot *p;
 
@@ -172,24 +292,41 @@ static void kvmppc_uvmem_pfn_insert(unsigned long gfn, unsigned long uvmem_pfn,
 		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
 			unsigned long index = gfn - p->base_pfn;
 
-			p->pfns[index] = uvmem_pfn | KVMPPC_UVMEM_PFN;
+			if (flag == KVMPPC_GFN_UVMEM_PFN)
+				p->pfns[index] = uvmem_pfn | flag;
+			else
+				p->pfns[index] = flag;
 			return;
 		}
 	}
 }
 
-static void kvmppc_uvmem_pfn_remove(unsigned long gfn, struct kvm *kvm)
+/* mark the GFN as secure-GFN associated with @uvmem pfn device-PFN. */
+static void kvmppc_gfn_secure_uvmem_pfn(unsigned long gfn,
+			unsigned long uvmem_pfn, struct kvm *kvm)
 {
-	struct kvmppc_uvmem_slot *p;
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_UVMEM_PFN, uvmem_pfn);
+}
 
-	list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) {
-		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
-			p->pfns[gfn - p->base_pfn] = 0;
-			return;
-		}
-	}
+/* mark the GFN as secure-GFN associated with a memory-PFN. */
+static void kvmppc_gfn_secure_mem_pfn(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_MEM_PFN, 0);
 }
 
+/* mark the GFN as a shared GFN. */
+static void kvmppc_gfn_shared(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_SHARED, 0);
+}
+
+/* mark the GFN as a non-existent GFN. */
+static void kvmppc_gfn_remove(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, 0, 0);
+}
+
+/* return true, if the GFN is a secure-GFN backed by a secure-PFN */
 static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 				    unsigned long *uvmem_pfn)
 {
@@ -199,10 +336,10 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
 			unsigned long index = gfn - p->base_pfn;
 
-			if (p->pfns[index] & KVMPPC_UVMEM_PFN) {
+			if (p->pfns[index] & KVMPPC_GFN_UVMEM_PFN) {
 				if (uvmem_pfn)
 					*uvmem_pfn = p->pfns[index] &
-						     ~KVMPPC_UVMEM_PFN;
+						     KVMPPC_GFN_PFN_MASK;
 				return true;
 			} else
 				return false;
@@ -354,6 +491,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 
 		mutex_lock(&kvm->arch.uvmem_lock);
 		if (!kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
+			kvmppc_gfn_remove(gfn, kvm);
 			mutex_unlock(&kvm->arch.uvmem_lock);
 			continue;
 		}
@@ -361,6 +499,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 		uvmem_page = pfn_to_page(uvmem_pfn);
 		pvt = uvmem_page->zone_device_data;
 		pvt->skip_page_out = skip_page_out;
+		pvt->remove_gfn = true;
 		mutex_unlock(&kvm->arch.uvmem_lock);
 
 		pfn = gfn_to_pfn(kvm, gfn);
@@ -430,7 +569,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
 		goto out_clear;
 
 	uvmem_pfn = bit + pfn_first;
-	kvmppc_uvmem_pfn_insert(gpa >> PAGE_SHIFT, uvmem_pfn, kvm);
+	kvmppc_gfn_secure_uvmem_pfn(gpa >> PAGE_SHIFT, uvmem_pfn, kvm);
 
 	pvt->gpa = gpa;
 	pvt->kvm = kvm;
@@ -525,6 +664,11 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
 		uvmem_page = pfn_to_page(uvmem_pfn);
 		pvt = uvmem_page->zone_device_data;
 		pvt->skip_page_out = true;
+		/*
+		 * do not drop the GFN. It is a valid GFN
+		 * that is transitioned to a shared GFN.
+		 */
+		pvt->remove_gfn = false;
 	}
 
 retry:
@@ -538,12 +682,16 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
 		uvmem_page = pfn_to_page(uvmem_pfn);
 		pvt = uvmem_page->zone_device_data;
 		pvt->skip_page_out = true;
+		pvt->remove_gfn = false; /* it continues to be a valid GFN */
 		kvm_release_pfn_clean(pfn);
 		goto retry;
 	}
 
-	if (!uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0, page_shift))
+	if (!uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0,
+				page_shift)) {
+		kvmppc_gfn_shared(gfn, kvm);
 		ret = H_SUCCESS;
+	}
 	kvm_release_pfn_clean(pfn);
 	mutex_unlock(&kvm->arch.uvmem_lock);
 out:
@@ -599,6 +747,7 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 
 	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift))
 		ret = H_SUCCESS;
+
 out_unlock:
 	mutex_unlock(&kvm->arch.uvmem_lock);
 out:
@@ -707,7 +856,8 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
 /*
  * Release the device PFN back to the pool
  *
- * Gets called when secure page becomes a normal page during H_SVM_PAGE_OUT.
+ * Gets called when secure GFN transitions from a secure-PFN
+ * to a normal PFN during H_SVM_PAGE_OUT.
  * Gets called with kvm->arch.uvmem_lock held.
  */
 static void kvmppc_uvmem_page_free(struct page *page)
@@ -722,7 +872,10 @@ static void kvmppc_uvmem_page_free(struct page *page)
 
 	pvt = page->zone_device_data;
 	page->zone_device_data = NULL;
-	kvmppc_uvmem_pfn_remove(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
+	if (pvt->remove_gfn)
+		kvmppc_gfn_remove(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
+	else
+		kvmppc_gfn_secure_mem_pfn(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
 	kfree(pvt);
 }
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v6 0/5] Migrate non-migrated pages of a SVM.
From: Ram Pai @ 2020-07-27 18:07 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david

The time to switch a VM to a Secure-VM increases with the size of the VM.
A 100GB VM takes about 7 minutes. This is unacceptable.  This linear
increase is caused by a suboptimal behavior by the Ultravisor and the
Hypervisor.  The Ultravisor unnecessarily migrates all the GFN of the
VM from normal-memory to secure-memory. It has to just migrate the
necessary and sufficient GFNs.

However when the optimization is incorporated in the Ultravisor, the
Hypervisor starts misbehaving. The Hypervisor has an inbuilt assumption
that the Ultravisor will explicitly request to migrate, each and every
GFN of the VM. If only necessary and sufficient GFNs are requested for
migration, the Hypervisor continues to manage the remaining GFNs as
normal GFNs. This leads to memory corruption; manifested
consistently when the SVM reboots.

The same is true, when a memory slot is hotplugged into a SVM. The
Hypervisor expects the ultravisor to request migration of all GFNs to
secure-GFN.  But the hypervisor cannot handle any H_SVM_PAGE_IN
requests from the Ultravisor, done in the context of
UV_REGISTER_MEM_SLOT ucall.  This problem manifests as random errors
in the SVM, when a memory-slot is hotplugged.

This patch series automatically migrates the non-migrated pages of a
SVM, and thus solves the problem.

Testing: Passed rigorous testing using various sized SVMs.

Changelog:

v6:     . rearrangement of functions in book3s_hv_uvmem.c. No functional
		change.
	. decoupling this patch series from Laurent's memory-hotplug/unplug,
		since the memhotplug/unplug/hotplug/reboot test is failing.

v5:  .  This patch series includes Laurent's fix for memory hotplug/unplug
	  . drop pages first and then delete the memslot. Otherwise
	  	the memslot does not get cleanly deleted, causing
		problems during reboot.
	  . recreatable through the following set of commands
	     . device_add pc-dimm,id=dimm1,memdev=mem1
	     . device_del dimm1
	     . device_add pc-dimm,id=dimm1,memdev=mem1
	Further incorporates comments from Bharata:
	. fix for off-by-one while disabling migration.
	. code-reorganized to maximize sharing in init_start path
       		and in memory-hotplug path
	. locking adjustments in mass-page migration during H_SVM_INIT_DONE.
	. improved recovery on error paths.
	. additional comments in the code for better understanding.
	. removed the retry-on-migration-failure code.
	. re-added the initial patch that adjust some prototype to overcome
	   a git problem, where it messes up the code context. Had
		accidentally dropped the patch in the last version.

v4:  .  Incorporated Bharata's comments:
	- Optimization -- replace write mmap semaphore with read mmap semaphore.
	- disable page-merge during memory hotplug.
	- rearranged the patches. consolidated the page-migration-retry logic
		in a single patch.

v3: . Optimized the page-migration retry-logic. 
    . Relax and relinquish the cpu regularly while bulk migrating
    	the non-migrated pages. This issue was causing soft-lockups.
	Fixed it.
    . Added a new patch, to retry page-migration a couple of times
    	before returning H_BUSY in H_SVM_PAGE_IN. This issue was
	seen a few times in a 24hour continuous reboot test of the SVMs.

v2: . fixed a bug observed by Laurent. The state of the GFN's associated
	with Secure-VMs were not reset during memslot flush.
    . Re-organized the code, for easier review.
    . Better description of the patch series.

v1: . fixed a bug observed by Bharata. Pages that were paged-in and later
	paged-out must also be skipped from migration during H_SVM_INIT_DONE.


Laurent Dufour (1):
  KVM: PPC: Book3S HV: migrate hot plugged memory

Ram Pai (4):
  KVM: PPC: Book3S HV: Fix function definition in book3s_hv_uvmem.c
  KVM: PPC: Book3S HV: Disable page merging in H_SVM_INIT_START
  KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs
  KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs
    to secure-GFNs.

 Documentation/powerpc/ultravisor.rst        |   3 +
 arch/powerpc/include/asm/kvm_book3s_uvmem.h |  14 +
 arch/powerpc/kvm/book3s_hv.c                |  14 +-
 arch/powerpc/kvm/book3s_hv_uvmem.c          | 498 +++++++++++++++++++++++-----
 4 files changed, 437 insertions(+), 92 deletions(-)

-- 
1.8.3.1


^ permalink raw reply

* [PATCH v6 1/5] KVM: PPC: Book3S HV: Fix function definition in book3s_hv_uvmem.c
From: Ram Pai @ 2020-07-27 18:07 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1595873238-26184-1-git-send-email-linuxram@us.ibm.com>

Without this fix, git is confused. It generates wrong
function context for code changes in subsequent patches.
Weird, but true.

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 09d8119..e6f76bc 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -382,8 +382,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
  * Alloc a PFN from private device memory pool and copy page from normal
  * memory to secure memory using UV_PAGE_IN uvcall.
  */
-static int
-kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
+static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 		   unsigned long end, unsigned long gpa, struct kvm *kvm,
 		   unsigned long page_shift, bool *downgrade)
 {
@@ -450,8 +449,8 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
  * In the former case, uses dev_pagemap_ops.migrate_to_ram handler
  * to unmap the device page from QEMU's page tables.
  */
-static unsigned long
-kvmppc_share_page(struct kvm *kvm, unsigned long gpa, unsigned long page_shift)
+static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
+		unsigned long page_shift)
 {
 
 	int ret = H_PARAMETER;
@@ -500,9 +499,9 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
  * H_PAGE_IN_SHARED flag makes the page shared which means that the same
  * memory in is visible from both UV and HV.
  */
-unsigned long
-kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
-		     unsigned long flags, unsigned long page_shift)
+unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
+		unsigned long flags,
+		unsigned long page_shift)
 {
 	bool downgrade = false;
 	unsigned long start, end;
@@ -559,10 +558,10 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
  * Provision a new page on HV side and copy over the contents
  * from secure memory using UV_PAGE_OUT uvcall.
  */
-static int
-kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start,
-		    unsigned long end, unsigned long page_shift,
-		    struct kvm *kvm, unsigned long gpa)
+static int kvmppc_svm_page_out(struct vm_area_struct *vma,
+		unsigned long start,
+		unsigned long end, unsigned long page_shift,
+		struct kvm *kvm, unsigned long gpa)
 {
 	unsigned long src_pfn, dst_pfn = 0;
 	struct migrate_vma mig;
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH] powerpc/64s/hash: Fix hash_preload running with interrupts enabled
From: Athira Rajeev @ 2020-07-27 17:21 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Aneesh Kumar K . V, linuxppc-dev, Nicholas Piggin
In-Reply-To: <87k0yp6sqh.fsf@mpe.ellerman.id.au>



> On 27-Jul-2020, at 6:05 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
> 
> Athira Rajeev <atrajeev@linux.vnet.ibm.com> writes:
>>> On 27-Jul-2020, at 11:39 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>> 
>>> Commit 2f92447f9f96 ("powerpc/book3s64/hash: Use the pte_t address from the
>>> caller") removed the local_irq_disable from hash_preload, but it was
>>> required for more than just the page table walk: the hash pte busy bit is
>>> effectively a lock which may be taken in interrupt context, and the local
>>> update flag test must not be preempted before it's used.
>>> 
>>> This solves apparent lockups with perf interrupting __hash_page_64K. If
>>> get_perf_callchain then also takes a hash fault on the same page while it
>>> is already locked, it will loop forever taking hash faults, which looks like
>>> this:
>>> 
>>> cpu 0x49e: Vector: 100 (System Reset) at [c00000001a4f7d70]
>>>   pc: c000000000072dc8: hash_page_mm+0x8/0x800
>>>   lr: c00000000000c5a4: do_hash_page+0x24/0x38
>>>   sp: c0002ac1cc69ac70
>>>  msr: 8000000000081033
>>> current = 0xc0002ac1cc602e00
>>> paca    = 0xc00000001de1f280   irqmask: 0x03   irq_happened: 0x01
>>>   pid   = 20118, comm = pread2_processe
>>> Linux version 5.8.0-rc6-00345-g1fad14f18bc6
>>> 49e:mon> t
>>> [c0002ac1cc69ac70] c00000000000c5a4 do_hash_page+0x24/0x38 (unreliable)
>>> --- Exception: 300 (Data Access) at c00000000008fa60 __copy_tofrom_user_power7+0x20c/0x7ac
>>> [link register   ] c000000000335d10 copy_from_user_nofault+0xf0/0x150
>>> [c0002ac1cc69af70] c00032bf9fa3c880 (unreliable)
>>> [c0002ac1cc69afa0] c000000000109df0 read_user_stack_64+0x70/0xf0
>>> [c0002ac1cc69afd0] c000000000109fcc perf_callchain_user_64+0x15c/0x410
>>> [c0002ac1cc69b060] c000000000109c00 perf_callchain_user+0x20/0x40
>>> [c0002ac1cc69b080] c00000000031c6cc get_perf_callchain+0x25c/0x360
>>> [c0002ac1cc69b120] c000000000316b50 perf_callchain+0x70/0xa0
>>> [c0002ac1cc69b140] c000000000316ddc perf_prepare_sample+0x25c/0x790
>>> [c0002ac1cc69b1a0] c000000000317350 perf_event_output_forward+0x40/0xb0
>>> [c0002ac1cc69b220] c000000000306138 __perf_event_overflow+0x88/0x1a0
>>> [c0002ac1cc69b270] c00000000010cf70 record_and_restart+0x230/0x750
>>> [c0002ac1cc69b620] c00000000010d69c perf_event_interrupt+0x20c/0x510
>>> [c0002ac1cc69b730] c000000000027d9c performance_monitor_exception+0x4c/0x60
>>> [c0002ac1cc69b750] c00000000000b2f8 performance_monitor_common_virt+0x1b8/0x1c0
>>> --- Exception: f00 (Performance Monitor) at c0000000000cb5b0 pSeries_lpar_hpte_insert+0x0/0x160
>>> [link register   ] c0000000000846f0 __hash_page_64K+0x210/0x540
>>> [c0002ac1cc69ba50] 0000000000000000 (unreliable)
>>> [c0002ac1cc69bb00] c000000000073ae0 update_mmu_cache+0x390/0x3a0
>>> [c0002ac1cc69bb70] c00000000037f024 wp_page_copy+0x364/0xce0
>>> [c0002ac1cc69bc20] c00000000038272c do_wp_page+0xdc/0xa60
>>> [c0002ac1cc69bc70] c0000000003857bc handle_mm_fault+0xb9c/0x1b60
>>> [c0002ac1cc69bd50] c00000000006c434 __do_page_fault+0x314/0xc90
>>> [c0002ac1cc69be20] c00000000000c5c8 handle_page_fault+0x10/0x2c
>>> --- Exception: 300 (Data Access) at 00007fff8c861fe8
>>> SP (7ffff6b19660) is in userspace
>>> 
>>> Reported-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
>>> Reported-by: Anton Blanchard <anton@ozlabs.org>
>>> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>> Fixes: 2f92447f9f96 ("powerpc/book3s64/hash: Use the pte_t address from the
>>> caller")
>>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> 
>> 
>> Hi,
>> 
>> Tested with the patch and it fixes the lockups I was seeing with my test run.
>> Thanks for the fix.
>> 
>> Tested-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
> 
> Thanks for testing.
> 
> What test are you running?

Hi Michael

I was running  “perf record”  and Unixbench tests ( https://github.com/kdlucas/byte-unixbench ) in parallel where we were getting soft lockups

1. Perf command run:
# perf record -a -g -c 10000000 -o <data_file> sleep 60

2. Unixbench tests
# Run -q -c <nr_threads> spawn

With the fix, perf completes successfully.

Thanks
Athira

> 
> cheers


^ permalink raw reply

* [PATCH V5 4/4] tools/perf: Add perf tools support for extended regs in power10
From: Athira Rajeev @ 2020-07-27 17:16 UTC (permalink / raw)
  To: mpe, acme, jolsa; +Cc: ravi.bangoria, mikey, maddy, linuxppc-dev, kjain
In-Reply-To: <1595870184-1460-1-git-send-email-atrajeev@linux.vnet.ibm.com>

Added support for supported regs which are new in power10
( MMCR3, SIER2, SIER3 ) to sample_reg_mask in the tool side
to use with `-I?` option. Also added PVR check to send extended
mask for power10 at kernel while capturing extended regs in
each sample.

Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
 tools/arch/powerpc/include/uapi/asm/perf_regs.h | 6 ++++++
 tools/perf/arch/powerpc/include/perf_regs.h     | 3 +++
 tools/perf/arch/powerpc/util/perf_regs.c        | 6 ++++++
 3 files changed, 15 insertions(+)

diff --git a/tools/arch/powerpc/include/uapi/asm/perf_regs.h b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
index 225c64c..bdf5f10 100644
--- a/tools/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -52,6 +52,9 @@ enum perf_event_powerpc_regs {
 	PERF_REG_POWERPC_MMCR0,
 	PERF_REG_POWERPC_MMCR1,
 	PERF_REG_POWERPC_MMCR2,
+	PERF_REG_POWERPC_MMCR3,
+	PERF_REG_POWERPC_SIER2,
+	PERF_REG_POWERPC_SIER3,
 	/* Max regs without the extended regs */
 	PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
 };
@@ -60,6 +63,9 @@ enum perf_event_powerpc_regs {
 
 /* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
 #define PERF_REG_PMU_MASK_300   (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) - PERF_REG_PMU_MASK)
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_31 */
+#define PERF_REG_PMU_MASK_31   (((1ULL << (PERF_REG_POWERPC_SIER3 + 1)) - 1) - PERF_REG_PMU_MASK)
 
 #define PERF_REG_MAX_ISA_300   (PERF_REG_POWERPC_MMCR2 + 1)
+#define PERF_REG_MAX_ISA_31    (PERF_REG_POWERPC_SIER3 + 1)
 #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h
index 46ed00d..63f3ac9 100644
--- a/tools/perf/arch/powerpc/include/perf_regs.h
+++ b/tools/perf/arch/powerpc/include/perf_regs.h
@@ -68,6 +68,9 @@
 	[PERF_REG_POWERPC_MMCR0] = "mmcr0",
 	[PERF_REG_POWERPC_MMCR1] = "mmcr1",
 	[PERF_REG_POWERPC_MMCR2] = "mmcr2",
+	[PERF_REG_POWERPC_MMCR3] = "mmcr3",
+	[PERF_REG_POWERPC_SIER2] = "sier2",
+	[PERF_REG_POWERPC_SIER3] = "sier3",
 };
 
 static inline const char *perf_reg_name(int id)
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
index d64ba0c..2b6d470 100644
--- a/tools/perf/arch/powerpc/util/perf_regs.c
+++ b/tools/perf/arch/powerpc/util/perf_regs.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 
 #define PVR_POWER9		0x004E
+#define PVR_POWER10		0x0080
 
 const struct sample_reg sample_reg_masks[] = {
 	SMPL_REG(r0, PERF_REG_POWERPC_R0),
@@ -64,6 +65,9 @@
 	SMPL_REG(mmcr0, PERF_REG_POWERPC_MMCR0),
 	SMPL_REG(mmcr1, PERF_REG_POWERPC_MMCR1),
 	SMPL_REG(mmcr2, PERF_REG_POWERPC_MMCR2),
+	SMPL_REG(mmcr3, PERF_REG_POWERPC_MMCR3),
+	SMPL_REG(sier2, PERF_REG_POWERPC_SIER2),
+	SMPL_REG(sier3, PERF_REG_POWERPC_SIER3),
 	SMPL_REG_END
 };
 
@@ -194,6 +198,8 @@ uint64_t arch__intr_reg_mask(void)
 	version = (((mfspr(SPRN_PVR)) >>  16) & 0xFFFF);
 	if (version == PVR_POWER9)
 		extended_mask = PERF_REG_PMU_MASK_300;
+	else if (version == PVR_POWER10)
+		extended_mask = PERF_REG_PMU_MASK_31;
 	else
 		return mask;
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH V5 3/4] tools/perf: Add perf tools support for extended register capability in powerpc
From: Athira Rajeev @ 2020-07-27 17:16 UTC (permalink / raw)
  To: mpe, acme, jolsa; +Cc: ravi.bangoria, mikey, maddy, linuxppc-dev, kjain
In-Reply-To: <1595870184-1460-1-git-send-email-atrajeev@linux.vnet.ibm.com>

From: Anju T Sudhakar <anju@linux.vnet.ibm.com>

Add extended regs to sample_reg_mask in the tool side to use
with `-I?` option. Perf tools side uses extended mask to display
the platform supported register names (with -I? option) to the user
and also send this mask to the kernel to capture the extended registers
in each sample. Hence decide the mask value based on the processor
version.

Currently definitions for `mfspr`, `SPRN_PVR` are part of
`arch/powerpc/util/header.c`. Move this to a header file so that
these definitions can be re-used in other source files as well.

Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
[Decide extended mask at run time based on platform]
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
 tools/arch/powerpc/include/uapi/asm/perf_regs.h | 14 ++++++-
 tools/perf/arch/powerpc/include/perf_regs.h     |  5 ++-
 tools/perf/arch/powerpc/util/header.c           |  9 +----
 tools/perf/arch/powerpc/util/perf_regs.c        | 49 +++++++++++++++++++++++++
 tools/perf/arch/powerpc/util/utils_header.h     | 15 ++++++++
 5 files changed, 82 insertions(+), 10 deletions(-)
 create mode 100644 tools/perf/arch/powerpc/util/utils_header.h

diff --git a/tools/arch/powerpc/include/uapi/asm/perf_regs.h b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
index f599064..225c64c 100644
--- a/tools/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -48,6 +48,18 @@ enum perf_event_powerpc_regs {
 	PERF_REG_POWERPC_DSISR,
 	PERF_REG_POWERPC_SIER,
 	PERF_REG_POWERPC_MMCRA,
-	PERF_REG_POWERPC_MAX,
+	/* Extended registers */
+	PERF_REG_POWERPC_MMCR0,
+	PERF_REG_POWERPC_MMCR1,
+	PERF_REG_POWERPC_MMCR2,
+	/* Max regs without the extended regs */
+	PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
 };
+
+#define PERF_REG_PMU_MASK	((1ULL << PERF_REG_POWERPC_MAX) - 1)
+
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
+#define PERF_REG_PMU_MASK_300   (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) - PERF_REG_PMU_MASK)
+
+#define PERF_REG_MAX_ISA_300   (PERF_REG_POWERPC_MMCR2 + 1)
 #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h
index e18a355..46ed00d 100644
--- a/tools/perf/arch/powerpc/include/perf_regs.h
+++ b/tools/perf/arch/powerpc/include/perf_regs.h
@@ -64,7 +64,10 @@
 	[PERF_REG_POWERPC_DAR] = "dar",
 	[PERF_REG_POWERPC_DSISR] = "dsisr",
 	[PERF_REG_POWERPC_SIER] = "sier",
-	[PERF_REG_POWERPC_MMCRA] = "mmcra"
+	[PERF_REG_POWERPC_MMCRA] = "mmcra",
+	[PERF_REG_POWERPC_MMCR0] = "mmcr0",
+	[PERF_REG_POWERPC_MMCR1] = "mmcr1",
+	[PERF_REG_POWERPC_MMCR2] = "mmcr2",
 };
 
 static inline const char *perf_reg_name(int id)
diff --git a/tools/perf/arch/powerpc/util/header.c b/tools/perf/arch/powerpc/util/header.c
index d487007..1a95017 100644
--- a/tools/perf/arch/powerpc/util/header.c
+++ b/tools/perf/arch/powerpc/util/header.c
@@ -7,17 +7,10 @@
 #include <string.h>
 #include <linux/stringify.h>
 #include "header.h"
+#include "utils_header.h"
 #include "metricgroup.h"
 #include <api/fs/fs.h>
 
-#define mfspr(rn)       ({unsigned long rval; \
-			 asm volatile("mfspr %0," __stringify(rn) \
-				      : "=r" (rval)); rval; })
-
-#define SPRN_PVR        0x11F	/* Processor Version Register */
-#define PVR_VER(pvr)    (((pvr) >>  16) & 0xFFFF) /* Version field */
-#define PVR_REV(pvr)    (((pvr) >>   0) & 0xFFFF) /* Revison field */
-
 int
 get_cpuid(char *buffer, size_t sz)
 {
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
index 0a52429..d64ba0c 100644
--- a/tools/perf/arch/powerpc/util/perf_regs.c
+++ b/tools/perf/arch/powerpc/util/perf_regs.c
@@ -6,9 +6,15 @@
 
 #include "../../../util/perf_regs.h"
 #include "../../../util/debug.h"
+#include "../../../util/event.h"
+#include "../../../util/header.h"
+#include "../../../perf-sys.h"
+#include "utils_header.h"
 
 #include <linux/kernel.h>
 
+#define PVR_POWER9		0x004E
+
 const struct sample_reg sample_reg_masks[] = {
 	SMPL_REG(r0, PERF_REG_POWERPC_R0),
 	SMPL_REG(r1, PERF_REG_POWERPC_R1),
@@ -55,6 +61,9 @@
 	SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR),
 	SMPL_REG(sier, PERF_REG_POWERPC_SIER),
 	SMPL_REG(mmcra, PERF_REG_POWERPC_MMCRA),
+	SMPL_REG(mmcr0, PERF_REG_POWERPC_MMCR0),
+	SMPL_REG(mmcr1, PERF_REG_POWERPC_MMCR1),
+	SMPL_REG(mmcr2, PERF_REG_POWERPC_MMCR2),
 	SMPL_REG_END
 };
 
@@ -163,3 +172,43 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
 
 	return SDT_ARG_VALID;
 }
+
+uint64_t arch__intr_reg_mask(void)
+{
+	struct perf_event_attr attr = {
+		.type                   = PERF_TYPE_HARDWARE,
+		.config                 = PERF_COUNT_HW_CPU_CYCLES,
+		.sample_type            = PERF_SAMPLE_REGS_INTR,
+		.precise_ip             = 1,
+		.disabled               = 1,
+		.exclude_kernel         = 1,
+	};
+	int fd;
+	u32 version;
+	u64 extended_mask = 0, mask = PERF_REGS_MASK;
+
+	/*
+	 * Get the PVR value to set the extended
+	 * mask specific to platform.
+	 */
+	version = (((mfspr(SPRN_PVR)) >>  16) & 0xFFFF);
+	if (version == PVR_POWER9)
+		extended_mask = PERF_REG_PMU_MASK_300;
+	else
+		return mask;
+
+	attr.sample_regs_intr = extended_mask;
+	attr.sample_period = 1;
+	event_attr_init(&attr);
+
+	/*
+	 * check if the pmu supports perf extended regs, before
+	 * returning the register mask to sample.
+	 */
+	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+	if (fd != -1) {
+		close(fd);
+		mask |= extended_mask;
+	}
+	return mask;
+}
diff --git a/tools/perf/arch/powerpc/util/utils_header.h b/tools/perf/arch/powerpc/util/utils_header.h
new file mode 100644
index 0000000..5788eb1
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/utils_header.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PERF_UTIL_HEADER_H
+#define __PERF_UTIL_HEADER_H
+
+#include <linux/stringify.h>
+
+#define mfspr(rn)       ({unsigned long rval; \
+			asm volatile("mfspr %0," __stringify(rn) \
+				: "=r" (rval)); rval; })
+
+#define SPRN_PVR        0x11F   /* Processor Version Register */
+#define PVR_VER(pvr)    (((pvr) >>  16) & 0xFFFF) /* Version field */
+#define PVR_REV(pvr)    (((pvr) >>   0) & 0xFFFF) /* Revison field */
+
+#endif /* __PERF_UTIL_HEADER_H */
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH V5 2/4] powerpc/perf: Add extended regs support for power10 platform
From: Athira Rajeev @ 2020-07-27 17:16 UTC (permalink / raw)
  To: mpe, acme, jolsa; +Cc: ravi.bangoria, mikey, maddy, linuxppc-dev, kjain
In-Reply-To: <1595870184-1460-1-git-send-email-atrajeev@linux.vnet.ibm.com>

Include the capability flag `PERF_PMU_CAP_EXTENDED_REGS` for power10
and expose the MMCR3, SIER2 and SIER3 registers as part of extended regs.
Also introduce `PERF_REG_PMU_MASK_31` to define the extended mask
value at runtime for power10.

Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
[Fix build failure on PPC32 platform]
Suggested-by: Ryan Grimm <grimm@linux.ibm.com>
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Tested-by: Nageswara R Sastry <nasastry@in.ibm.com>
Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
 arch/powerpc/include/uapi/asm/perf_regs.h |  6 ++++++
 arch/powerpc/perf/perf_regs.c             | 12 +++++++++++-
 arch/powerpc/perf/power10-pmu.c           |  6 ++++++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h
index 225c64c..bdf5f10 100644
--- a/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -52,6 +52,9 @@ enum perf_event_powerpc_regs {
 	PERF_REG_POWERPC_MMCR0,
 	PERF_REG_POWERPC_MMCR1,
 	PERF_REG_POWERPC_MMCR2,
+	PERF_REG_POWERPC_MMCR3,
+	PERF_REG_POWERPC_SIER2,
+	PERF_REG_POWERPC_SIER3,
 	/* Max regs without the extended regs */
 	PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
 };
@@ -60,6 +63,9 @@ enum perf_event_powerpc_regs {
 
 /* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
 #define PERF_REG_PMU_MASK_300   (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) - PERF_REG_PMU_MASK)
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_31 */
+#define PERF_REG_PMU_MASK_31   (((1ULL << (PERF_REG_POWERPC_SIER3 + 1)) - 1) - PERF_REG_PMU_MASK)
 
 #define PERF_REG_MAX_ISA_300   (PERF_REG_POWERPC_MMCR2 + 1)
+#define PERF_REG_MAX_ISA_31    (PERF_REG_POWERPC_SIER3 + 1)
 #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
index 9301e68..8e53f2f 100644
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -81,6 +81,14 @@ static u64 get_ext_regs_value(int idx)
 		return mfspr(SPRN_MMCR1);
 	case PERF_REG_POWERPC_MMCR2:
 		return mfspr(SPRN_MMCR2);
+#ifdef CONFIG_PPC64
+	case PERF_REG_POWERPC_MMCR3:
+		return mfspr(SPRN_MMCR3);
+	case PERF_REG_POWERPC_SIER2:
+		return mfspr(SPRN_SIER2);
+	case PERF_REG_POWERPC_SIER3:
+		return mfspr(SPRN_SIER3);
+#endif
 	default: return 0;
 	}
 }
@@ -89,7 +97,9 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
 {
 	u64 perf_reg_extended_max = PERF_REG_POWERPC_MAX;
 
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		perf_reg_extended_max = PERF_REG_MAX_ISA_31;
+	else if (cpu_has_feature(CPU_FTR_ARCH_300))
 		perf_reg_extended_max = PERF_REG_MAX_ISA_300;
 
 	if (idx == PERF_REG_POWERPC_SIER &&
diff --git a/arch/powerpc/perf/power10-pmu.c b/arch/powerpc/perf/power10-pmu.c
index f7cff7f..8314865 100644
--- a/arch/powerpc/perf/power10-pmu.c
+++ b/arch/powerpc/perf/power10-pmu.c
@@ -87,6 +87,8 @@
 #define POWER10_MMCRA_IFM3		0x00000000C0000000UL
 #define POWER10_MMCRA_BHRB_MASK		0x00000000C0000000UL
 
+extern u64 PERF_REG_EXTENDED_MASK;
+
 /* Table of alternatives, sorted by column 0 */
 static const unsigned int power10_event_alternatives[][MAX_ALT] = {
 	{ PM_RUN_CYC_ALT,		PM_RUN_CYC },
@@ -397,6 +399,7 @@ static void power10_config_bhrb(u64 pmu_bhrb_filter)
 	.cache_events		= &power10_cache_events,
 	.attr_groups		= power10_pmu_attr_groups,
 	.bhrb_nr		= 32,
+	.capabilities           = PERF_PMU_CAP_EXTENDED_REGS,
 };
 
 int init_power10_pmu(void)
@@ -408,6 +411,9 @@ int init_power10_pmu(void)
 	    strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power10"))
 		return -ENODEV;
 
+	/* Set the PERF_REG_EXTENDED_MASK here */
+	PERF_REG_EXTENDED_MASK = PERF_REG_PMU_MASK_31;
+
 	rc = register_power_pmu(&power10_pmu);
 	if (rc)
 		return rc;
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH V5 1/4] powerpc/perf: Add support for outputting extended regs in perf intr_regs
From: Athira Rajeev @ 2020-07-27 17:16 UTC (permalink / raw)
  To: mpe, acme, jolsa; +Cc: ravi.bangoria, mikey, maddy, linuxppc-dev, kjain
In-Reply-To: <1595870184-1460-1-git-send-email-atrajeev@linux.vnet.ibm.com>

From: Anju T Sudhakar <anju@linux.vnet.ibm.com>

Add support for perf extended register capability in powerpc.
The capability flag PERF_PMU_CAP_EXTENDED_REGS is used to indicate that
a PMU supports extended registers. The generic code defines the mask
of extended registers as 0 for unsupported architectures.

Patch adds extended regs support for power9 platform by
exposing MMCR0, MMCR1 and MMCR2 registers.

The REG_RESERVED mask needs to be updated to include the extended regs.
`PERF_REG_EXTENDED_MASK`, which contains the mask value of the supported
registers, is defined at runtime in the kernel based on the platform,
since the supported registers (and hence the mask value) may differ from
one processor version to another.

with patch
----------

available registers: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11
r12 r13 r14 r15 r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r26
r27 r28 r29 r30 r31 nip msr orig_r3 ctr link xer ccr softe
trap dar dsisr sier mmcra mmcr0 mmcr1 mmcr2

PERF_RECORD_SAMPLE(IP, 0x1): 4784/4784: 0 period: 1 addr: 0
... intr regs: mask 0xffffffffffff ABI 64-bit
.... r0    0xc00000000012b77c
.... r1    0xc000003fe5e03930
.... r2    0xc000000001b0e000
.... r3    0xc000003fdcddf800
.... r4    0xc000003fc7880000
.... r5    0x9c422724be
.... r6    0xc000003fe5e03908
.... r7    0xffffff63bddc8706
.... r8    0x9e4
.... r9    0x0
.... r10   0x1
.... r11   0x0
.... r12   0xc0000000001299c0
.... r13   0xc000003ffffc4800
.... r14   0x0
.... r15   0x7fffdd8b8b00
.... r16   0x0
.... r17   0x7fffdd8be6b8
.... r18   0x7e7076607730
.... r19   0x2f
.... r20   0xc00000001fc26c68
.... r21   0xc0002041e4227e00
.... r22   0xc00000002018fb60
.... r23   0x1
.... r24   0xc000003ffec4d900
.... r25   0x80000000
.... r26   0x0
.... r27   0x1
.... r28   0x1
.... r29   0xc000000001be1260
.... r30   0x6008010
.... r31   0xc000003ffebb7218
.... nip   0xc00000000012b910
.... msr   0x9000000000009033
.... orig_r3 0xc00000000012b86c
.... ctr   0xc0000000001299c0
.... link  0xc00000000012b77c
.... xer   0x0
.... ccr   0x28002222
.... softe 0x1
.... trap  0xf00
.... dar   0x0
.... dsisr 0x80000000000
.... sier  0x0
.... mmcra 0x80000000000
.... mmcr0 0x82008090
.... mmcr1 0x1e000000
.... mmcr2 0x0
 ... thread: perf:4784

Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
[Defined PERF_REG_EXTENDED_MASK at run time to add support for different platforms ]
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
[Fix build issue using CONFIG_PERF_EVENTS without CONFIG_PPC_PERF_CTRS]
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Tested-by: Nageswara R Sastry <nasastry@in.ibm.com>
Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
 arch/powerpc/include/asm/perf_event.h        |  3 +++
 arch/powerpc/include/asm/perf_event_server.h |  5 ++++
 arch/powerpc/include/uapi/asm/perf_regs.h    | 14 +++++++++++-
 arch/powerpc/perf/core-book3s.c              |  1 +
 arch/powerpc/perf/perf_regs.c                | 34 +++++++++++++++++++++++++---
 arch/powerpc/perf/power9-pmu.c               |  6 +++++
 6 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h
index 1e8b2e1..daec64d 100644
--- a/arch/powerpc/include/asm/perf_event.h
+++ b/arch/powerpc/include/asm/perf_event.h
@@ -40,4 +40,7 @@
 
 /* To support perf_regs sier update */
 extern bool is_sier_available(void);
+/* To define perf extended regs mask value */
+extern u64 PERF_REG_EXTENDED_MASK;
+#define PERF_REG_EXTENDED_MASK	PERF_REG_EXTENDED_MASK
 #endif
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 86c9eb06..f6acabb 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -62,6 +62,11 @@ struct power_pmu {
 	int 		*blacklist_ev;
 	/* BHRB entries in the PMU */
 	int		bhrb_nr;
+	/*
+	 * set this flag with `PERF_PMU_CAP_EXTENDED_REGS` if
+	 * the pmu supports extended perf regs capability
+	 */
+	int		capabilities;
 };
 
 /*
diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h
index f599064..225c64c 100644
--- a/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -48,6 +48,18 @@ enum perf_event_powerpc_regs {
 	PERF_REG_POWERPC_DSISR,
 	PERF_REG_POWERPC_SIER,
 	PERF_REG_POWERPC_MMCRA,
-	PERF_REG_POWERPC_MAX,
+	/* Extended registers */
+	PERF_REG_POWERPC_MMCR0,
+	PERF_REG_POWERPC_MMCR1,
+	PERF_REG_POWERPC_MMCR2,
+	/* Max regs without the extended regs */
+	PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
 };
+
+#define PERF_REG_PMU_MASK	((1ULL << PERF_REG_POWERPC_MAX) - 1)
+
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
+#define PERF_REG_PMU_MASK_300   (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) - PERF_REG_PMU_MASK)
+
+#define PERF_REG_MAX_ISA_300   (PERF_REG_POWERPC_MMCR2 + 1)
 #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index e29c846..65a0b76 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2317,6 +2317,7 @@ int register_power_pmu(struct power_pmu *pmu)
 		pmu->name);
 
 	power_pmu.attr_groups = ppmu->attr_groups;
+	power_pmu.capabilities |= (ppmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS);
 
 #ifdef MSR_HV
 	/*
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
index a213a0a..9301e68 100644
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -13,9 +13,11 @@
 #include <asm/ptrace.h>
 #include <asm/perf_regs.h>
 
+u64 PERF_REG_EXTENDED_MASK;
+
 #define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
 
-#define REG_RESERVED (~((1ULL << PERF_REG_POWERPC_MAX) - 1))
+#define REG_RESERVED (~(PERF_REG_EXTENDED_MASK | PERF_REG_PMU_MASK))
 
 static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = {
 	PT_REGS_OFFSET(PERF_REG_POWERPC_R0,  gpr[0]),
@@ -69,10 +71,26 @@
 	PT_REGS_OFFSET(PERF_REG_POWERPC_MMCRA, dsisr),
 };
 
+/* Function to return the extended register values */
+static u64 get_ext_regs_value(int idx)
+{
+	switch (idx) {
+	case PERF_REG_POWERPC_MMCR0:
+		return mfspr(SPRN_MMCR0);
+	case PERF_REG_POWERPC_MMCR1:
+		return mfspr(SPRN_MMCR1);
+	case PERF_REG_POWERPC_MMCR2:
+		return mfspr(SPRN_MMCR2);
+	default: return 0;
+	}
+}
+
 u64 perf_reg_value(struct pt_regs *regs, int idx)
 {
-	if (WARN_ON_ONCE(idx >= PERF_REG_POWERPC_MAX))
-		return 0;
+	u64 perf_reg_extended_max = PERF_REG_POWERPC_MAX;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		perf_reg_extended_max = PERF_REG_MAX_ISA_300;
 
 	if (idx == PERF_REG_POWERPC_SIER &&
 	   (IS_ENABLED(CONFIG_FSL_EMB_PERF_EVENT) ||
@@ -85,6 +103,16 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
 	    IS_ENABLED(CONFIG_PPC32)))
 		return 0;
 
+	if (idx >= PERF_REG_POWERPC_MAX && idx < perf_reg_extended_max)
+		return get_ext_regs_value(idx);
+
+	/*
+	 * If the idx is referring to value beyond the
+	 * supported registers, return 0 with a warning
+	 */
+	if (WARN_ON_ONCE(idx >= perf_reg_extended_max))
+		return 0;
+
 	return regs_get_register(regs, pt_regs_offset[idx]);
 }
 
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 05dae38..2a57e93 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -90,6 +90,8 @@ enum {
 #define POWER9_MMCRA_IFM3		0x00000000C0000000UL
 #define POWER9_MMCRA_BHRB_MASK		0x00000000C0000000UL
 
+extern u64 PERF_REG_EXTENDED_MASK;
+
 /* Nasty Power9 specific hack */
 #define PVR_POWER9_CUMULUS		0x00002000
 
@@ -434,6 +436,7 @@ static void power9_config_bhrb(u64 pmu_bhrb_filter)
 	.cache_events		= &power9_cache_events,
 	.attr_groups		= power9_pmu_attr_groups,
 	.bhrb_nr		= 32,
+	.capabilities           = PERF_PMU_CAP_EXTENDED_REGS,
 };
 
 int init_power9_pmu(void)
@@ -457,6 +460,9 @@ int init_power9_pmu(void)
 		}
 	}
 
+	/* Set the PERF_REG_EXTENDED_MASK here */
+	PERF_REG_EXTENDED_MASK = PERF_REG_PMU_MASK_300;
+
 	rc = register_power_pmu(&power9_pmu);
 	if (rc)
 		return rc;
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH V5 0/4] powerpc/perf: Add support for perf extended regs in powerpc
From: Athira Rajeev @ 2020-07-27 17:16 UTC (permalink / raw)
  To: mpe, acme, jolsa; +Cc: ravi.bangoria, mikey, maddy, linuxppc-dev, kjain

Patch set to add support for perf extended register capability in
powerpc. The capability flag PERF_PMU_CAP_EXTENDED_REGS is used to
indicate that a PMU supports extended registers. The generic code
defines the mask of extended registers as 0 for unsupported architectures.

Patches 1 and 2 are the kernel side changes needed to include
base support for extended regs in powerpc and in power10.
Patches 3 and 4 are the perf tools side changes needed to support the
extended registers.

Patch 1/4 defines the PERF_PMU_CAP_EXTENDED_REGS mask to output the
values of mmcr0, mmcr1 and mmcr2 for POWER9. It also defines
`PERF_REG_EXTENDED_MASK` at runtime, which contains the mask value of
the supported registers under extended regs.

patch 2/4 adds the extended regs support for power10 and exposes
MMCR3, SIER2, SIER3 registers as part of extended regs.

Patches 3/4 and 4/4 add the extended regs to sample_reg_masks on the tools
side, for use with the `-I?` option on power9 and power10 respectively.

Ravi Bangoria found an issue with `perf record -I` while testing the
changes. The same issue is currently being worked on here:
https://lkml.org/lkml/2020/7/19/413 and will be resolved once the fix
from Jin Yao is merged.

This patch series is based on powerpc/next

Changelog:

Changes from v4 -> v5
- initialize `perf_reg_extended_max` to work on
  all platforms as suggested by Ravi Bangoria
- Added Reviewed-and-Tested-by from Ravi Bangoria

Changes from v3 -> v4
- Split the series and send extended regs as separate patch set here.
  Link to previous series :
  https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=190462&state=*
  Other PMU patches are already merged in powerpc/next.

- Fixed kernel build issue when using config having
  CONFIG_PERF_EVENTS set and without CONFIG_PPC_PERF_CTRS
  reported by kernel build bot.
- Included Reviewed-by from Kajol Jain.
- Addressed review comments from Ravi Bangoria to initialize `perf_reg_extended_max`
  and define it in lowercase since it is local variable.

Anju T Sudhakar (2):
  powerpc/perf: Add support for outputting extended regs in perf
    intr_regs
  tools/perf: Add perf tools support for extended register capability in
    powerpc

Athira Rajeev (2):
  powerpc/perf: Add extended regs support for power10 platform
  tools/perf: Add perf tools support for extended regs in power10

 arch/powerpc/include/asm/perf_event.h           |  3 ++
 arch/powerpc/include/asm/perf_event_server.h    |  5 +++
 arch/powerpc/include/uapi/asm/perf_regs.h       | 20 ++++++++-
 arch/powerpc/perf/core-book3s.c                 |  1 +
 arch/powerpc/perf/perf_regs.c                   | 44 ++++++++++++++++++--
 arch/powerpc/perf/power10-pmu.c                 |  6 +++
 arch/powerpc/perf/power9-pmu.c                  |  6 +++
 tools/arch/powerpc/include/uapi/asm/perf_regs.h | 20 ++++++++-
 tools/perf/arch/powerpc/include/perf_regs.h     |  8 +++-
 tools/perf/arch/powerpc/util/header.c           |  9 +---
 tools/perf/arch/powerpc/util/perf_regs.c        | 55 +++++++++++++++++++++++++
 tools/perf/arch/powerpc/util/utils_header.h     | 15 +++++++
 12 files changed, 178 insertions(+), 14 deletions(-)
 create mode 100644 tools/perf/arch/powerpc/util/utils_header.h

-- 
1.8.3.1


^ permalink raw reply

* [PATCH -next] powerpc/powernv/sriov: Remove unused but set variable 'phb'
From: Wei Yongjun @ 2020-07-27 17:11 UTC (permalink / raw)
  To: Hulk Robot, Michael Ellerman, Oliver O'Halloran,
	Alexey Kardashevskiy
  Cc: linuxppc-dev, Wei Yongjun, linux-kernel

GCC reports the following warning:

arch/powerpc/platforms/powernv/pci-sriov.c:602:25: warning:
 variable 'phb' set but not used [-Wunused-but-set-variable]
  602 |  struct pnv_phb        *phb;
      |                         ^~~

This variable is not used, so this commit removes it.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
---
 arch/powerpc/platforms/powernv/pci-sriov.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
index 8404d8c3901d..7894745fd4f8 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -599,10 +599,8 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 static void pnv_pci_sriov_disable(struct pci_dev *pdev)
 {
 	u16                    num_vfs, base_pe;
-	struct pnv_phb        *phb;
 	struct pnv_iov_data   *iov;
 
-	phb = pci_bus_to_pnvhb(pdev->bus);
 	iov = pnv_iov_get(pdev);
 	num_vfs = iov->num_vfs;
 	base_pe = iov->vf_pe_arr[0].pe_number;


^ permalink raw reply related

* Re: [PATCH 0/5] cpuidle-pseries: Parse extended CEDE information for idle.
From: Rafael J. Wysocki @ 2020-07-27 14:14 UTC (permalink / raw)
  To: Gautham R. Shenoy
  Cc: Nathan Lynch, Michael Neuling, Vaidyanathan Srinivasan, Linux PM,
	Linux Kernel Mailing List, Nicholas Piggin, linuxppc-dev
In-Reply-To: <20200707113235.GM14120@in.ibm.com>

On Tue, Jul 7, 2020 at 1:32 PM Gautham R Shenoy <ego@linux.vnet.ibm.com> wrote:
>
> Hi,
>
> On Tue, Jul 07, 2020 at 04:41:34PM +0530, Gautham R. Shenoy wrote:
> > From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
> >
> > Hi,
> >
> >
> >
> >
> > Gautham R. Shenoy (5):
> >   cpuidle-pseries: Set the latency-hint before entering CEDE
> >   cpuidle-pseries: Add function to parse extended CEDE records
> >   cpuidle-pseries : Fixup exit latency for CEDE(0)
> >   cpuidle-pseries : Include extended CEDE states in cpuidle framework
> >   cpuidle-pseries: Block Extended CEDE(1) which adds no additional
> >     value.
>
> Forgot to mention that these patches are on top of Nathan's series to
> remove extended CEDE offline and bogus topology update code :
> https://lore.kernel.org/linuxppc-dev/20200612051238.1007764-1-nathanl@linux.ibm.com/

OK, so this is targeted at the powerpc maintainers, isn't it?

^ permalink raw reply

* Re: [PATCH v2 4/5] powerpc/mm: Remove custom stack expansion checking
From: Daniel Axtens @ 2020-07-27 13:48 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: linux-kernel
In-Reply-To: <20200724092528.1578671-4-mpe@ellerman.id.au>

Hi Michael,

I tested v1 of this. I ran the test from the bug with a range of stack
sizes, in a loop, for several hours and didn't see any crashes/signal
delivery failures.

I retested v2 for a few minutes just to be sure, and I ran stress-ng's
stack, stackmmap and bad-altstack stressors to make sure no obvious
kernel bugs were exposed. Nothing crashed.

All tests done on a P8 LE guest under KVM.

On that basis:

Tested-by: Daniel Axtens <dja@axtens.net>

The more I look at this the less qualified I feel to Review it, but
certainly it looks better than my ugly hack from late last year.

Kind regards,
Daniel

> We have powerpc specific logic in our page fault handling to decide if
> an access to an unmapped address below the stack pointer should expand
> the stack VMA.
>
> The logic aims to prevent userspace from doing bad accesses below the
> stack pointer. However as long as the stack is < 1MB in size, we allow
> all accesses without further checks. Adding some debug I see that I
> can do a full kernel build and LTP run, and not a single process has
> used more than 1MB of stack. So for the majority of processes the
> logic never even fires.
>
> We also recently found a nasty bug in this code which could cause
> userspace programs to be killed during signal delivery. It went
> unnoticed presumably because most processes use < 1MB of stack.
>
> The generic mm code has also grown support for stack guard pages since
> this code was originally written, so the most heinous case of the
> stack expanding into other mappings is now handled for us.
>
> Finally although some other arches have special logic in this path,
> from what I can tell none of x86, arm64, arm and s390 impose any extra
> checks other than those in expand_stack().
>
> So drop our complicated logic and like other architectures just let
> the stack expand as long as it's within the rlimit.
>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
>  arch/powerpc/mm/fault.c | 109 ++--------------------------------------
>  1 file changed, 5 insertions(+), 104 deletions(-)
>
> v2: no change just rebased.
>
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index 3ebb1792e636..925a7231abb3 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -42,39 +42,7 @@
>  #include <asm/kup.h>
>  #include <asm/inst.h>
>  
> -/*
> - * Check whether the instruction inst is a store using
> - * an update addressing form which will update r1.
> - */
> -static bool store_updates_sp(struct ppc_inst inst)
> -{
> -	/* check for 1 in the rA field */
> -	if (((ppc_inst_val(inst) >> 16) & 0x1f) != 1)
> -		return false;
> -	/* check major opcode */
> -	switch (ppc_inst_primary_opcode(inst)) {
> -	case OP_STWU:
> -	case OP_STBU:
> -	case OP_STHU:
> -	case OP_STFSU:
> -	case OP_STFDU:
> -		return true;
> -	case OP_STD:	/* std or stdu */
> -		return (ppc_inst_val(inst) & 3) == 1;
> -	case OP_31:
> -		/* check minor opcode */
> -		switch ((ppc_inst_val(inst) >> 1) & 0x3ff) {
> -		case OP_31_XOP_STDUX:
> -		case OP_31_XOP_STWUX:
> -		case OP_31_XOP_STBUX:
> -		case OP_31_XOP_STHUX:
> -		case OP_31_XOP_STFSUX:
> -		case OP_31_XOP_STFDUX:
> -			return true;
> -		}
> -	}
> -	return false;
> -}
> +
>  /*
>   * do_page_fault error handling helpers
>   */
> @@ -267,57 +235,6 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
>  	return false;
>  }
>  
> -// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
> -#define SIGFRAME_MAX_SIZE	(4096 + 128)
> -
> -static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
> -				struct vm_area_struct *vma, unsigned int flags,
> -				bool *must_retry)
> -{
> -	/*
> -	 * N.B. The POWER/Open ABI allows programs to access up to
> -	 * 288 bytes below the stack pointer.
> -	 * The kernel signal delivery code writes a bit over 4KB
> -	 * below the stack pointer (r1) before decrementing it.
> -	 * The exec code can write slightly over 640kB to the stack
> -	 * before setting the user r1.  Thus we allow the stack to
> -	 * expand to 1MB without further checks.
> -	 */
> -	if (address + 0x100000 < vma->vm_end) {
> -		struct ppc_inst __user *nip = (struct ppc_inst __user *)regs->nip;
> -		/* get user regs even if this fault is in kernel mode */
> -		struct pt_regs *uregs = current->thread.regs;
> -		if (uregs == NULL)
> -			return true;
> -
> -		/*
> -		 * A user-mode access to an address a long way below
> -		 * the stack pointer is only valid if the instruction
> -		 * is one which would update the stack pointer to the
> -		 * address accessed if the instruction completed,
> -		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
> -		 * (or the byte, halfword, float or double forms).
> -		 *
> -		 * If we don't check this then any write to the area
> -		 * between the last mapped region and the stack will
> -		 * expand the stack rather than segfaulting.
> -		 */
> -		if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
> -			return false;
> -
> -		if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
> -		    access_ok(nip, sizeof(*nip))) {
> -			struct ppc_inst inst;
> -
> -			if (!probe_user_read_inst(&inst, nip))
> -				return !store_updates_sp(inst);
> -			*must_retry = true;
> -		}
> -		return true;
> -	}
> -	return false;
> -}
> -
>  #ifdef CONFIG_PPC_MEM_KEYS
>  static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey,
>  			      struct vm_area_struct *vma)
> @@ -483,7 +400,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>  	int is_user = user_mode(regs);
>  	int is_write = page_fault_is_write(error_code);
>  	vm_fault_t fault, major = 0;
> -	bool must_retry = false;
>  	bool kprobe_fault = kprobe_page_fault(regs, 11);
>  
>  	if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
> @@ -572,30 +488,15 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>  	vma = find_vma(mm, address);
>  	if (unlikely(!vma))
>  		return bad_area(regs, address);
> -	if (likely(vma->vm_start <= address))
> -		goto good_area;
> -	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
> -		return bad_area(regs, address);
>  
> -	/* The stack is being expanded, check if it's valid */
> -	if (unlikely(bad_stack_expansion(regs, address, vma, flags,
> -					 &must_retry))) {
> -		if (!must_retry)
> +	if (unlikely(vma->vm_start > address)) {
> +		if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
>  			return bad_area(regs, address);
>  
> -		mmap_read_unlock(mm);
> -		if (fault_in_pages_readable((const char __user *)regs->nip,
> -					    sizeof(unsigned int)))
> -			return bad_area_nosemaphore(regs, address);
> -		goto retry;
> +		if (unlikely(expand_stack(vma, address)))
> +			return bad_area(regs, address);
>  	}
>  
> -	/* Try to expand it */
> -	if (unlikely(expand_stack(vma, address)))
> -		return bad_area(regs, address);
> -
> -good_area:
> -
>  #ifdef CONFIG_PPC_MEM_KEYS
>  	if (unlikely(access_pkey_error(is_write, is_exec,
>  				       (error_code & DSISR_KEYFAULT), vma)))
> -- 
> 2.25.1

^ permalink raw reply

* Re: [PATCH v3 1/2] cpuidle: Trace IPI based and timer based wakeup latency from idle states
From: Rafael J. Wysocki @ 2020-07-27 13:42 UTC (permalink / raw)
  To: Pratik Rajesh Sampat
  Cc: Gautham R. Shenoy, pratik.r.sampat, Linux PM, Daniel Lezcano,
	Rafael J. Wysocki, linuxppc-dev, Nicholas Piggin, Paul Mackerras,
	linux-kselftest, Shuah Khan, srivatsa, Linux Kernel Mailing List
In-Reply-To: <20200721124300.65615-2-psampat@linux.ibm.com>

On Tue, Jul 21, 2020 at 2:43 PM Pratik Rajesh Sampat
<psampat@linux.ibm.com> wrote:
>
> Fire directed smp_call_function_single IPIs from a specified source
> CPU to the specified target CPU to reduce the noise we have to wade
> through in the trace log.

And what's the purpose of it?

> The module is based on the idea written by Srivatsa Bhat and maintained
> by Vaidyanathan Srinivasan internally.
>
> Queue HR timer and measure jitter. Wakeup latency measurement for idle
> states using hrtimer.  Echo a value in ns to timer_test_function and
> watch trace. An HRtimer will be queued and when it fires the expected
> wakeup vs actual wakeup is computed and the delay printed in ns.
>
> Implemented as a module which utilizes debugfs so that it can be
> integrated with selftests.
>
> To include the module, check option and include as module
> kernel hacking -> Cpuidle latency selftests
>
> [srivatsa.bhat@linux.vnet.ibm.com: Initial implementation in
>  cpuidle/sysfs]
>
> [svaidy@linux.vnet.ibm.com: wakeup latency measurements using hrtimer
>  and fix some of the time calculation]
>
> [ego@linux.vnet.ibm.com: Fix some whitespace and tab errors and
>  increase the resolution of IPI wakeup]
>
> Signed-off-by: Pratik Rajesh Sampat <psampat@linux.ibm.com>
> Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
> ---
>  drivers/cpuidle/Makefile               |   1 +
>  drivers/cpuidle/test-cpuidle_latency.c | 150 +++++++++++++++++++++++++
>  lib/Kconfig.debug                      |  10 ++
>  3 files changed, 161 insertions(+)
>  create mode 100644 drivers/cpuidle/test-cpuidle_latency.c
>
> diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
> index f07800cbb43f..2ae05968078c 100644
> --- a/drivers/cpuidle/Makefile
> +++ b/drivers/cpuidle/Makefile
> @@ -8,6 +8,7 @@ obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
>  obj-$(CONFIG_DT_IDLE_STATES)             += dt_idle_states.o
>  obj-$(CONFIG_ARCH_HAS_CPU_RELAX)         += poll_state.o
>  obj-$(CONFIG_HALTPOLL_CPUIDLE)           += cpuidle-haltpoll.o
> +obj-$(CONFIG_IDLE_LATENCY_SELFTEST)      += test-cpuidle_latency.o
>
>  ##################################################################################
>  # ARM SoC drivers
> diff --git a/drivers/cpuidle/test-cpuidle_latency.c b/drivers/cpuidle/test-cpuidle_latency.c
> new file mode 100644
> index 000000000000..61574665e972
> --- /dev/null
> +++ b/drivers/cpuidle/test-cpuidle_latency.c
> @@ -0,0 +1,150 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Module-based API test facility for cpuidle latency using IPIs and timers

I'd like to see a more detailed description of what it does and how it
works here.

> + */
> +
> +#include <linux/debugfs.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +
> +/* IPI based wakeup latencies */
> +struct latency {
> +       unsigned int src_cpu;
> +       unsigned int dest_cpu;
> +       ktime_t time_start;
> +       ktime_t time_end;
> +       u64 latency_ns;
> +} ipi_wakeup;
> +
> +static void measure_latency(void *info)
> +{
> +       struct latency *v;
> +       ktime_t time_diff;
> +
> +       v = (struct latency *)info;
> +       v->time_end = ktime_get();
> +       time_diff = ktime_sub(v->time_end, v->time_start);
> +       v->latency_ns = ktime_to_ns(time_diff);
> +}
> +
> +void run_smp_call_function_test(unsigned int cpu)
> +{
> +       ipi_wakeup.src_cpu = smp_processor_id();
> +       ipi_wakeup.dest_cpu = cpu;
> +       ipi_wakeup.time_start = ktime_get();
> +       smp_call_function_single(cpu, measure_latency, &ipi_wakeup, 1);
> +}
> +
> +/* Timer based wakeup latencies */
> +struct timer_data {
> +       unsigned int src_cpu;
> +       u64 timeout;
> +       ktime_t time_start;
> +       ktime_t time_end;
> +       struct hrtimer timer;
> +       u64 timeout_diff_ns;
> +} timer_wakeup;
> +
> +static enum hrtimer_restart timer_called(struct hrtimer *hrtimer)
> +{
> +       struct timer_data *w;
> +       ktime_t time_diff;
> +
> +       w = container_of(hrtimer, struct timer_data, timer);
> +       w->time_end = ktime_get();
> +
> +       time_diff = ktime_sub(w->time_end, w->time_start);
> +       time_diff = ktime_sub(time_diff, ns_to_ktime(w->timeout));
> +       w->timeout_diff_ns = ktime_to_ns(time_diff);
> +       return HRTIMER_NORESTART;
> +}
> +
> +static void run_timer_test(unsigned int ns)
> +{
> +       hrtimer_init(&timer_wakeup.timer, CLOCK_MONOTONIC,
> +                    HRTIMER_MODE_REL);
> +       timer_wakeup.timer.function = timer_called;
> +       timer_wakeup.time_start = ktime_get();
> +       timer_wakeup.src_cpu = smp_processor_id();
> +       timer_wakeup.timeout = ns;
> +
> +       hrtimer_start(&timer_wakeup.timer, ns_to_ktime(ns),
> +                     HRTIMER_MODE_REL_PINNED);
> +}
> +
> +static struct dentry *dir;
> +
> +static int cpu_read_op(void *data, u64 *value)
> +{
> +       *value = ipi_wakeup.dest_cpu;
> +       return 0;
> +}
> +
> +static int cpu_write_op(void *data, u64 value)
> +{
> +       run_smp_call_function_test(value);
> +       return 0;
> +}
> +DEFINE_SIMPLE_ATTRIBUTE(ipi_ops, cpu_read_op, cpu_write_op, "%llu\n");
> +
> +static int timeout_read_op(void *data, u64 *value)
> +{
> +       *value = timer_wakeup.timeout;
> +       return 0;
> +}
> +
> +static int timeout_write_op(void *data, u64 value)
> +{
> +       run_timer_test(value);
> +       return 0;
> +}
> +DEFINE_SIMPLE_ATTRIBUTE(timeout_ops, timeout_read_op, timeout_write_op, "%llu\n");
> +
> +static int __init latency_init(void)
> +{
> +       struct dentry *temp;
> +
> +       dir = debugfs_create_dir("latency_test", 0);
> +       if (!dir) {
> +               pr_alert("latency_test: failed to create /sys/kernel/debug/latency_test\n");
> +               return -1;
> +       }
> +       temp = debugfs_create_file("ipi_cpu_dest",
> +                                  0666,
> +                                  dir,
> +                                  NULL,
> +                                  &ipi_ops);
> +       if (!temp) {
> +               pr_alert("latency_test: failed to create /sys/kernel/debug/ipi_cpu_dest\n");
> +               return -1;
> +       }
> +       debugfs_create_u64("ipi_latency_ns", 0444, dir, &ipi_wakeup.latency_ns);
> +       debugfs_create_u32("ipi_cpu_src", 0444, dir, &ipi_wakeup.src_cpu);
> +
> +       temp = debugfs_create_file("timeout_expected_ns",
> +                                  0666,
> +                                  dir,
> +                                  NULL,
> +                                  &timeout_ops);
> +       if (!temp) {
> +               pr_alert("latency_test: failed to create /sys/kernel/debug/timeout_expected_ns\n");
> +               return -1;
> +       }
> +       debugfs_create_u64("timeout_diff_ns", 0444, dir, &timer_wakeup.timeout_diff_ns);
> +       debugfs_create_u32("timeout_cpu_src", 0444, dir, &timer_wakeup.src_cpu);
> +       pr_info("Latency Test module loaded\n");
> +       return 0;
> +}
> +
> +static void __exit latency_cleanup(void)
> +{
> +       pr_info("Cleaning up Latency Test module.\n");
> +       debugfs_remove_recursive(dir);
> +}
> +
> +module_init(latency_init);
> +module_exit(latency_cleanup);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("IBM Corporation");
> +MODULE_DESCRIPTION("Measuring idle latency for IPIs and Timers");
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index d74ac0fd6b2d..e2283790245a 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -1375,6 +1375,16 @@ config DEBUG_KOBJECT
>           If you say Y here, some extra kobject debugging messages will be sent
>           to the syslog.
>
> +config IDLE_LATENCY_SELFTEST
> +       tristate "Cpuidle latency selftests"
> +       depends on CPU_IDLE
> +       help
> +         This option provides a kernel module that runs tests using the IPI and
> +         timers to measure latency.

What latency does it measure?

> +
> +         Say M if you want these self tests to build as a module.
> +         Say N if you are unsure.
> +
>  config DEBUG_KOBJECT_RELEASE
>         bool "kobject release debugging"
>         depends on DEBUG_OBJECTS_TIMERS
> --
> 2.25.4
>

^ permalink raw reply


This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.