LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [v4 3/5] KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs.
From: Ram Pai @ 2020-07-17  8:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1594972827-13928-1-git-send-email-linuxram@us.ibm.com>

The Ultravisor is expected to explicitly call H_SVM_PAGE_IN for all the
pages of the SVM before calling H_SVM_INIT_DONE. This causes a huge
delay in tranistioning the VM to SVM. The Ultravisor is only interested
in the pages that contain the kernel, initrd and other important data
structures. The rest contain throw-away content.

However if not all pages are requested by the Ultravisor, the Hypervisor
continues to consider the GFNs corresponding to the non-requested pages
as normal GFNs. This can lead to data-corruption and undefined behavior.

In H_SVM_INIT_DONE handler, move all the PFNs associated with the SVM's
GFNs to secure-PFNs. Skip the GFNs that are already Paged-in or Shared
or Paged-in followed by a Paged-out.

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 Documentation/powerpc/ultravisor.rst        |   2 +
 arch/powerpc/include/asm/kvm_book3s_uvmem.h |   2 +
 arch/powerpc/kvm/book3s_hv_uvmem.c          | 135 +++++++++++++++++++++++++---
 3 files changed, 125 insertions(+), 14 deletions(-)

diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
index a1c8c37..ba6b1bf 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -934,6 +934,8 @@ Return values
 	* H_UNSUPPORTED		if called from the wrong context (e.g.
 				from an SVM or before an H_SVM_INIT_START
 				hypercall).
+	* H_STATE		if the hypervisor could not successfully
+                                transition the VM to Secure VM.
 
 Description
 ~~~~~~~~~~~
diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
index 9cb7d8b..f229ab5 100644
--- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h
+++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
@@ -23,6 +23,8 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
 unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm);
 void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 			     struct kvm *kvm, bool skip_page_out);
+int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+			const struct kvm_memory_slot *memslot);
 #else
 static inline int kvmppc_uvmem_init(void)
 {
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index df2e272..3274663 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -93,6 +93,7 @@
 #include <asm/ultravisor.h>
 #include <asm/mman.h>
 #include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s_uvmem.h>
 
 static struct dev_pagemap kvmppc_uvmem_pgmap;
 static unsigned long *kvmppc_uvmem_bitmap;
@@ -348,8 +349,45 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 	return false;
 }
 
+/*
+ * starting from *gfn search for the next available GFN that is not yet
+ * transitioned to a secure GFN.  return the value of that GFN in *gfn.  If a
+ * GFN is found, return true, else return false
+ */
+static bool kvmppc_next_nontransitioned_gfn(const struct kvm_memory_slot *memslot,
+		struct kvm *kvm, unsigned long *gfn)
+{
+	struct kvmppc_uvmem_slot *p;
+	bool ret = false;
+	unsigned long i;
+
+	mutex_lock(&kvm->arch.uvmem_lock);
+
+	list_for_each_entry(p, &kvm->arch.uvmem_pfns, list)
+		if (*gfn >= p->base_pfn && *gfn < p->base_pfn + p->nr_pfns)
+			break;
+	if (!p)
+		goto out;
+	/*
+	 * The code below assumes, one to one correspondence between
+	 * kvmppc_uvmem_slot and memslot.
+	 */
+	for (i = *gfn; i < p->base_pfn + p->nr_pfns; i++) {
+		unsigned long index = i - p->base_pfn;
+
+		if (!(p->pfns[index] & KVMPPC_GFN_FLAG_MASK)) {
+			*gfn = i;
+			ret = true;
+			break;
+		}
+	}
+out:
+	mutex_unlock(&kvm->arch.uvmem_lock);
+	return ret;
+}
+
 static int kvmppc_memslot_page_merge(struct kvm *kvm,
-		struct kvm_memory_slot *memslot, bool merge)
+		const struct kvm_memory_slot *memslot, bool merge)
 {
 	unsigned long gfn = memslot->base_gfn;
 	unsigned long end, start = gfn_to_hva(kvm, gfn);
@@ -461,12 +499,31 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 
 unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
 {
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int srcu_idx;
+	long ret = H_SUCCESS;
+
 	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
 		return H_UNSUPPORTED;
 
+	/* migrate any unmoved normal pfn to device pfns*/
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots) {
+		ret = kvmppc_uv_migrate_mem_slot(kvm, memslot);
+		if (ret) {
+			ret = H_STATE;
+			goto out;
+		}
+	}
+
 	kvm->arch.secure_guest |= KVMPPC_SECURE_INIT_DONE;
 	pr_info("LPID %d went secure\n", kvm->arch.lpid);
-	return H_SUCCESS;
+
+out:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return ret;
 }
 
 /*
@@ -587,12 +644,14 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
 }
 
 /*
- * Alloc a PFN from private device memory pool and copy page from normal
- * memory to secure memory using UV_PAGE_IN uvcall.
+ * Alloc a PFN from private device memory pool. If @pagein is true,
+ * copy page from normal memory to secure memory using UV_PAGE_IN uvcall.
  */
-static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
-		   unsigned long end, unsigned long gpa, struct kvm *kvm,
-		   unsigned long page_shift)
+static int kvmppc_svm_migrate_page(struct vm_area_struct *vma,
+		unsigned long start,
+		unsigned long end, unsigned long gpa, struct kvm *kvm,
+		unsigned long page_shift,
+		bool pagein)
 {
 	unsigned long src_pfn, dst_pfn = 0;
 	struct migrate_vma mig;
@@ -623,11 +682,16 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 		goto out_finalize;
 	}
 
-	pfn = *mig.src >> MIGRATE_PFN_SHIFT;
-	spage = migrate_pfn_to_page(*mig.src);
-	if (spage)
-		uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0,
-			   page_shift);
+	if (pagein) {
+		pfn = *mig.src >> MIGRATE_PFN_SHIFT;
+		spage = migrate_pfn_to_page(*mig.src);
+		if (spage) {
+			ret = uv_page_in(kvm->arch.lpid, pfn << page_shift,
+					gpa, 0, page_shift);
+			if (ret)
+				goto out_finalize;
+		}
+	}
 
 	*mig.dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
 	migrate_vma_pages(&mig);
@@ -636,6 +700,46 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 	return ret;
 }
 
+int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot)
+{
+	unsigned long gfn = memslot->base_gfn;
+	struct vm_area_struct *vma;
+	unsigned long start, end;
+	int ret = 0;
+
+	while (kvmppc_next_nontransitioned_gfn(memslot, kvm, &gfn)) {
+
+		mmap_read_lock(kvm->mm);
+		start = gfn_to_hva(kvm, gfn);
+		if (kvm_is_error_hva(start)) {
+			ret = H_STATE;
+			goto next;
+		}
+
+		end = start + (1UL << PAGE_SHIFT);
+		vma = find_vma_intersection(kvm->mm, start, end);
+		if (!vma || vma->vm_start > start || vma->vm_end < end) {
+			ret = H_STATE;
+			goto next;
+		}
+
+		mutex_lock(&kvm->arch.uvmem_lock);
+		ret = kvmppc_svm_migrate_page(vma, start, end,
+				(gfn << PAGE_SHIFT), kvm, PAGE_SHIFT, false);
+		mutex_unlock(&kvm->arch.uvmem_lock);
+		if (ret)
+			ret = H_STATE;
+
+next:
+		mmap_read_unlock(kvm->mm);
+
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
 /*
  * Shares the page with HV, thus making it a normal page.
  *
@@ -740,8 +844,11 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	if (!vma || vma->vm_start > start || vma->vm_end < end)
 		goto out_unlock;
 
-	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift))
-		ret = H_SUCCESS;
+	if (kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, page_shift,
+				true))
+		goto out_unlock;
+
+	ret = H_SUCCESS;
 
 out_unlock:
 	mutex_unlock(&kvm->arch.uvmem_lock);
-- 
1.8.3.1


^ permalink raw reply related

* [v4 4/5] KVM: PPC: Book3S HV: retry page migration before erroring-out
From: Ram Pai @ 2020-07-17  8:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1594972827-13928-1-git-send-email-linuxram@us.ibm.com>

The page requested for page-in; sometimes, can have transient
references, and hence cannot migrate immediately. Retry a few times
before returning error.

The same is true for non-migrated pages that are migrated in
H_SVM_INIT_DONE hanlder.  Retry a few times before returning error.

H_SVM_PAGE_IN interface is enhanced to return H_BUSY if the page is
not in a migratable state.

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 Documentation/powerpc/ultravisor.rst |   1 +
 arch/powerpc/kvm/book3s_hv_uvmem.c   | 106 ++++++++++++++++++++++++-----------
 2 files changed, 74 insertions(+), 33 deletions(-)

diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
index ba6b1bf..fe533ad 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -1035,6 +1035,7 @@ Return values
 	* H_PARAMETER	if ``guest_pa`` is invalid.
 	* H_P2		if ``flags`` is invalid.
 	* H_P3		if ``order`` of page is invalid.
+	* H_BUSY	if ``page`` is not in a state to pagein
 
 Description
 ~~~~~~~~~~~
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 3274663..a206984 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -672,7 +672,7 @@ static int kvmppc_svm_migrate_page(struct vm_area_struct *vma,
 		return ret;
 
 	if (!(*mig.src & MIGRATE_PFN_MIGRATE)) {
-		ret = -1;
+		ret = -2;
 		goto out_finalize;
 	}
 
@@ -700,43 +700,73 @@ static int kvmppc_svm_migrate_page(struct vm_area_struct *vma,
 	return ret;
 }
 
-int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
-		const struct kvm_memory_slot *memslot)
+/*
+ * return 1, if some page migration failed because of transient error,
+ * while the remaining pages migrated successfully.
+ * The caller can use this as a hint to retry.
+ *
+ * return 0 otherwise. *ret indicates the success status
+ * of this call.
+ */
+static bool __kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot, int *ret)
 {
 	unsigned long gfn = memslot->base_gfn;
 	struct vm_area_struct *vma;
 	unsigned long start, end;
-	int ret = 0;
+	bool retry = false;
 
+	*ret = 0;
 	while (kvmppc_next_nontransitioned_gfn(memslot, kvm, &gfn)) {
 
 		mmap_read_lock(kvm->mm);
 		start = gfn_to_hva(kvm, gfn);
 		if (kvm_is_error_hva(start)) {
-			ret = H_STATE;
+			*ret = H_STATE;
 			goto next;
 		}
 
 		end = start + (1UL << PAGE_SHIFT);
 		vma = find_vma_intersection(kvm->mm, start, end);
 		if (!vma || vma->vm_start > start || vma->vm_end < end) {
-			ret = H_STATE;
+			*ret = H_STATE;
 			goto next;
 		}
 
 		mutex_lock(&kvm->arch.uvmem_lock);
-		ret = kvmppc_svm_migrate_page(vma, start, end,
+		*ret = kvmppc_svm_migrate_page(vma, start, end,
 				(gfn << PAGE_SHIFT), kvm, PAGE_SHIFT, false);
 		mutex_unlock(&kvm->arch.uvmem_lock);
-		if (ret)
-			ret = H_STATE;
 
 next:
 		mmap_read_unlock(kvm->mm);
+		if (*ret == -2) {
+			retry = true;
+			continue;
+		}
 
-		if (ret)
-			break;
+		if (*ret)
+			return false;
 	}
+	return retry;
+}
+
+#define REPEAT_COUNT 10
+
+int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot)
+{
+	int ret = 0, repeat_count = REPEAT_COUNT;
+
+	/*
+	 * try migration of pages in the memslot 'repeat_count' number of
+	 * times, provided each time migration fails because of transient
+	 * errors only.
+	 */
+	while (__kvmppc_uv_migrate_mem_slot(kvm, memslot, &ret) &&
+		repeat_count--)
+		;
+
 	return ret;
 }
 
@@ -812,7 +842,7 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	struct vm_area_struct *vma;
 	int srcu_idx;
 	unsigned long gfn = gpa >> page_shift;
-	int ret;
+	int ret, repeat_count = REPEAT_COUNT;
 
 	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
 		return H_UNSUPPORTED;
@@ -826,34 +856,44 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	if (flags & H_PAGE_IN_SHARED)
 		return kvmppc_share_page(kvm, gpa, page_shift);
 
-	ret = H_PARAMETER;
 	srcu_idx = srcu_read_lock(&kvm->srcu);
-	mmap_read_lock(kvm->mm);
 
-	start = gfn_to_hva(kvm, gfn);
-	if (kvm_is_error_hva(start))
-		goto out;
-
-	mutex_lock(&kvm->arch.uvmem_lock);
 	/* Fail the page-in request of an already paged-in page */
-	if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL))
-		goto out_unlock;
+	mutex_lock(&kvm->arch.uvmem_lock);
+	ret = kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL);
+	mutex_unlock(&kvm->arch.uvmem_lock);
+	if (ret) {
+		srcu_read_unlock(&kvm->srcu, srcu_idx);
+		return H_PARAMETER;
+	}
 
-	end = start + (1UL << page_shift);
-	vma = find_vma_intersection(kvm->mm, start, end);
-	if (!vma || vma->vm_start > start || vma->vm_end < end)
-		goto out_unlock;
+	do {
+		ret = H_PARAMETER;
+		mmap_read_lock(kvm->mm);
 
-	if (kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, page_shift,
-				true))
-		goto out_unlock;
+		start = gfn_to_hva(kvm, gfn);
+		if (kvm_is_error_hva(start)) {
+			mmap_read_unlock(kvm->mm);
+			break;
+		}
 
-	ret = H_SUCCESS;
+		end = start + (1UL << page_shift);
+		vma = find_vma_intersection(kvm->mm, start, end);
+		if (!vma || vma->vm_start > start || vma->vm_end < end) {
+			mmap_read_unlock(kvm->mm);
+			break;
+		}
+
+		mutex_lock(&kvm->arch.uvmem_lock);
+		ret = kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, page_shift, true);
+		mutex_unlock(&kvm->arch.uvmem_lock);
+
+		mmap_read_unlock(kvm->mm);
+	} while (ret == -2 && repeat_count--);
+
+	if (ret == -2)
+		ret = H_BUSY;
 
-out_unlock:
-	mutex_unlock(&kvm->arch.uvmem_lock);
-out:
-	mmap_read_unlock(kvm->mm);
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	return ret;
 }
-- 
1.8.3.1


^ permalink raw reply related

* [v4 5/5] KVM: PPC: Book3S HV: migrate hot plugged memory
From: Ram Pai @ 2020-07-17  8:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1594972827-13928-1-git-send-email-linuxram@us.ibm.com>

From: Laurent Dufour <ldufour@linux.ibm.com>

When a memory slot is hot plugged to a SVM, PFNs associated with the
GFNs in that slot must be migrated to secure-PFNs, aka device-PFNs.

Call kvmppc_uv_migrate_mem_slot() to accomplish this.
Disable page-merge for all pages in the memory slot.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
[rearranged the code, and modified the commit log]
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
 arch/powerpc/include/asm/kvm_book3s_uvmem.h | 10 ++++++++++
 arch/powerpc/kvm/book3s_hv.c                | 10 ++--------
 arch/powerpc/kvm/book3s_hv_uvmem.c          | 22 ++++++++++++++++++++++
 3 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
index f229ab5..6f7da00 100644
--- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h
+++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
@@ -25,6 +25,9 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 			     struct kvm *kvm, bool skip_page_out);
 int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
 			const struct kvm_memory_slot *memslot);
+void kvmppc_memslot_create(struct kvm *kvm, const struct kvm_memory_slot *new);
+void kvmppc_memslot_delete(struct kvm *kvm, const struct kvm_memory_slot *old);
+
 #else
 static inline int kvmppc_uvmem_init(void)
 {
@@ -84,5 +87,12 @@ static inline int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn)
 static inline void
 kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 			struct kvm *kvm, bool skip_page_out) { }
+
+static inline void  kvmppc_memslot_create(struct kvm *kvm,
+		const struct kvm_memory_slot *new) { }
+
+static inline void  kvmppc_memslot_delete(struct kvm *kvm,
+		const struct kvm_memory_slot *old) { }
+
 #endif /* CONFIG_PPC_UV */
 #endif /* __ASM_KVM_BOOK3S_UVMEM_H__ */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d331b46..bf3be3b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4515,16 +4515,10 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 
 	switch (change) {
 	case KVM_MR_CREATE:
-		if (kvmppc_uvmem_slot_init(kvm, new))
-			return;
-		uv_register_mem_slot(kvm->arch.lpid,
-				     new->base_gfn << PAGE_SHIFT,
-				     new->npages * PAGE_SIZE,
-				     0, new->id);
+		kvmppc_memslot_create(kvm, new);
 		break;
 	case KVM_MR_DELETE:
-		uv_unregister_mem_slot(kvm->arch.lpid, old->id);
-		kvmppc_uvmem_slot_free(kvm, old);
+		kvmppc_memslot_delete(kvm, old);
 		break;
 	default:
 		/* TODO: Handle KVM_MR_MOVE */
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index a206984..a2b4d25 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -1089,6 +1089,28 @@ int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn)
 	return (ret == U_SUCCESS) ? RESUME_GUEST : -EFAULT;
 }
 
+void kvmppc_memslot_create(struct kvm *kvm, const struct kvm_memory_slot *new)
+{
+	if (kvmppc_uvmem_slot_init(kvm, new))
+		return;
+
+	if (kvmppc_memslot_page_merge(kvm, new, false))
+		return;
+
+	if (uv_register_mem_slot(kvm->arch.lpid, new->base_gfn << PAGE_SHIFT,
+			new->npages * PAGE_SIZE, 0, new->id))
+		return;
+
+	kvmppc_uv_migrate_mem_slot(kvm, new);
+}
+
+void kvmppc_memslot_delete(struct kvm *kvm, const struct kvm_memory_slot *old)
+{
+	uv_unregister_mem_slot(kvm->arch.lpid, old->id);
+	kvmppc_memslot_page_merge(kvm, old, true);
+	kvmppc_uvmem_slot_free(kvm, old);
+}
+
 static u64 kvmppc_get_secmem_size(void)
 {
 	struct device_node *np;
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH 07/11] Powerpc/numa: Detect support for coregroup
From: Gautham R Shenoy @ 2020-07-17  8:08 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-8-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:20AM +0530, Srikar Dronamraju wrote:
> Add support for grouping cores based on the device-tree classification.
> - The last domain in the associativity domains always refers to the
> core.
> - If primary reference domain happens to be the penultimate domain in
> the associativity domains device-tree property, then there are no
> coregroups. However if its not a penultimate domain, then there are
> coregroups. There can be more than one coregroup. For now we would be
> interested in the last or the smallest coregroups.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

This looks good to me from the discovery of the core-group point of
view.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>

> ---
>  arch/powerpc/include/asm/smp.h |  1 +
>  arch/powerpc/kernel/smp.c      |  1 +
>  arch/powerpc/mm/numa.c         | 34 +++++++++++++++++++++-------------
>  3 files changed, 23 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
> index 49a25e2400f2..5bdc17a7049f 100644
> --- a/arch/powerpc/include/asm/smp.h
> +++ b/arch/powerpc/include/asm/smp.h
> @@ -28,6 +28,7 @@
>  extern int boot_cpuid;
>  extern int spinning_secondaries;
>  extern u32 *cpu_to_phys_id;
> +extern bool coregroup_enabled;
> 
>  extern void cpu_die(void);
>  extern int cpu_to_chip_id(int cpu);
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index f8faf75135af..96e47450d9b3 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -74,6 +74,7 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
> 
>  struct task_struct *secondary_current;
>  bool has_big_cores;
> +bool coregroup_enabled;
> 
>  DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index fc7b0505bdd8..a43eab455be4 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -887,7 +887,9 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
>  static void __init find_possible_nodes(void)
>  {
>  	struct device_node *rtas;
> -	u32 numnodes, i;
> +	const __be32 *domains;
> +	int prop_length, max_nodes;
> +	u32 i;
> 
>  	if (!numa_enabled)
>  		return;
> @@ -896,25 +898,31 @@ static void __init find_possible_nodes(void)
>  	if (!rtas)
>  		return;
> 
> -	if (of_property_read_u32_index(rtas, "ibm,current-associativity-domains",
> -				min_common_depth, &numnodes)) {
> -		/*
> -		 * ibm,current-associativity-domains is a fairly recent
> -		 * property. If it doesn't exist, then fallback on
> -		 * ibm,max-associativity-domains. Current denotes what the
> -		 * platform can support compared to max which denotes what the
> -		 * Hypervisor can support.
> -		 */
> -		if (of_property_read_u32_index(rtas, "ibm,max-associativity-domains",
> -				min_common_depth, &numnodes))
> +	/*
> +	 * ibm,current-associativity-domains is a fairly recent property. If
> +	 * it doesn't exist, then fallback on ibm,max-associativity-domains.
> +	 * Current denotes what the platform can support compared to max
> +	 * which denotes what the Hypervisor can support.
> +	 */
> +	domains = of_get_property(rtas, "ibm,current-associativity-domains",
> +					&prop_length);
> +	if (!domains) {
> +		domains = of_get_property(rtas, "ibm,max-associativity-domains",
> +					&prop_length);
> +		if (!domains)
>  			goto out;
>  	}
> 
> -	for (i = 0; i < numnodes; i++) {
> +	max_nodes = of_read_number(&domains[min_common_depth], 1);
> +	for (i = 0; i < max_nodes; i++) {
>  		if (!node_possible(i))
>  			node_set(i, node_possible_map);
>  	}
> 
> +	prop_length /= sizeof(int);
> +	if (prop_length > min_common_depth + 2)
> +		coregroup_enabled = 1;
> +
>  out:
>  	of_node_put(rtas);
>  }
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH 08/11] powerpc/smp: Allocate cpumask only after searching thread group
From: Gautham R Shenoy @ 2020-07-17  8:08 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-9-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:21AM +0530, Srikar Dronamraju wrote:
> If allocated earlier and the search fails, then cpumask need to be
> freed. However cpu_l1_cache_map can be allocated after we search thread
> group.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

Good fix.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>

> ---
>  arch/powerpc/kernel/smp.c | 7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 96e47450d9b3..ef19eeccd21e 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -797,10 +797,6 @@ static int init_cpu_l1_cache_map(int cpu)
>  	if (err)
>  		goto out;
> 
> -	zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
> -				GFP_KERNEL,
> -				cpu_to_node(cpu));
> -
>  	cpu_group_start = get_cpu_thread_group_start(cpu, &tg);
> 
>  	if (unlikely(cpu_group_start == -1)) {
> @@ -809,6 +805,9 @@ static int init_cpu_l1_cache_map(int cpu)
>  		goto out;
>  	}
> 
> +	zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
> +				GFP_KERNEL, cpu_to_node(cpu));
> +
>  	for (i = first_thread; i < first_thread + threads_per_core; i++) {
>  		int i_group_start = get_cpu_thread_group_start(i, &tg);
> 
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH 09/11] Powerpc/smp: Create coregroup domain
From: Gautham R Shenoy @ 2020-07-17  8:19 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-10-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:22AM +0530, Srikar Dronamraju wrote:
> Add percpu coregroup maps and masks to create coregroup domain.
> If a coregroup doesn't exist, the coregroup domain will be degenerated
> in favour of SMT/CACHE domain.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/include/asm/topology.h | 10 ++++++++
>  arch/powerpc/kernel/smp.c           | 37 +++++++++++++++++++++++++++++
>  arch/powerpc/mm/numa.c              |  5 ++++
>  3 files changed, 52 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
> index 2db7ba789720..34812c35018e 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -98,6 +98,7 @@ extern int stop_topology_update(void);
>  extern int prrn_is_enabled(void);
>  extern int find_and_online_cpu_nid(int cpu);
>  extern int timed_topology_update(int nsecs);
> +extern int cpu_to_coregroup_id(int cpu);
>  #else
>  static inline int start_topology_update(void)
>  {
> @@ -120,6 +121,15 @@ static inline int timed_topology_update(int nsecs)
>  	return 0;
>  }
> 
> +static inline int cpu_to_coregroup_id(int cpu)
> +{
> +#ifdef CONFIG_SMP
> +	return cpu_to_core_id(cpu);
> +#else
> +	return 0;
> +#endif
> +}
> +
>  #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
> 
>  #include <asm-generic/topology.h>
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index ef19eeccd21e..bb25c13bbb79 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -80,6 +80,7 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
> +DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map);
> 
>  EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
>  EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
> @@ -91,6 +92,7 @@ enum {
>  	smt_idx,
>  #endif
>  	bigcore_idx,
> +	mc_idx,
>  	die_idx,
>  };
> 
> @@ -869,6 +871,21 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> +static struct cpumask *cpu_coregroup_mask(int cpu)
> +{
> +	return per_cpu(cpu_coregroup_map, cpu);
> +}
> +
> +static bool has_coregroup_support(void)
> +{
> +	return coregroup_enabled;
> +}
> +
> +static const struct cpumask *cpu_mc_mask(int cpu)
> +{
> +	return cpu_coregroup_mask(cpu);
> +}
> +
>  static const struct cpumask *cpu_bigcore_mask(int cpu)
>  {
>  	return cpu_core_mask(cpu);
> @@ -879,6 +896,7 @@ static struct sched_domain_topology_level powerpc_topology[] = {
>  	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
>  	{ cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
> +	{ cpu_mc_mask, SD_INIT_NAME(MC) },
>  	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
>  	{ NULL, },
>  };
> @@ -933,6 +951,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>  					GFP_KERNEL, node);
>  		zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
>  					GFP_KERNEL, node);
> +		if (has_coregroup_support())
> +			zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu),
> +						GFP_KERNEL, node);
> +
>  #ifdef CONFIG_NEED_MULTIPLE_NODES
>  		/*
>  		 * numa_node_id() works after this.
> @@ -950,6 +972,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>  	cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
>  	cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
> 
> +	if (has_coregroup_support())
> +		cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
> +	else
> +		powerpc_topology[mc_idx].mask = cpu_bigcore_mask;
> +

The else part could be moved to the common function where we are
modifying the other attributes of the topology array.


>  	init_big_cores();
>  	if (has_big_cores) {
>  		cpumask_set_cpu(boot_cpuid,
> @@ -1241,6 +1268,8 @@ static void remove_cpu_from_masks(int cpu)
>  		set_cpus_unrelated(cpu, i, cpu_sibling_mask);
>  		if (has_big_cores)
>  			set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
> +		if (has_coregroup_support())
> +			set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
>  	}
>  }
>  #endif
> @@ -1301,6 +1330,14 @@ static void add_cpu_to_masks(int cpu)
>  	add_cpu_to_smallcore_masks(cpu);
>  	update_mask_by_l2(cpu, cpu_l2_cache_mask);
> 
> +	if (has_coregroup_support()) {
> +		cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
> +		for_each_cpu(i, cpu_online_mask) {
> +			if (cpu_to_coregroup_id(cpu) == cpu_to_coregroup_id(i))
> +				set_cpus_related(cpu, i, cpu_coregroup_mask);
> +		}
> +	}
> +
>  	if (pkg_id == -1) {
>  		struct cpumask *(*mask)(int) = cpu_sibling_mask;
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index a43eab455be4..d9ab9da85eab 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1695,6 +1695,11 @@ static const struct proc_ops topology_proc_ops = {
>  	.proc_release	= single_release,
>  };
> 
> +int cpu_to_coregroup_id(int cpu)
> +{
> +	return cpu_to_core_id(cpu);
> +}


So, if has_coregroup_support() returns true, then since the core_group
identification is currently done through the core-id, the
coregroup_mask is going to be the same as the
cpu_core_mask/cpu_cpu_mask. Thus, we will be degenerating the DIE
domain. Right ? Instead we could have kept the core-group to be a
single bigcore by default, so that those domains can get degenerated
preserving the legacy SMT, DIE, NUMA hierarchy.




> +
>  static int topology_update_init(void)
>  {
>  	start_topology_update();
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH 09/11] Powerpc/smp: Create coregroup domain
From: Gautham R Shenoy @ 2020-07-17  8:23 UTC (permalink / raw)
  To: Gautham R Shenoy
  Cc: Nathan Lynch, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200717081926.GD32531@in.ibm.com>

On Fri, Jul 17, 2020 at 01:49:26PM +0530, Gautham R Shenoy wrote:

> > +int cpu_to_coregroup_id(int cpu)
> > +{
> > +	return cpu_to_core_id(cpu);
> > +}
> 
> 
> So, if has_coregroup_support() returns true, then since the core_group
> identification is currently done through the core-id, the
> coregroup_mask is going to be the same as the
> cpu_core_mask/cpu_cpu_mask. Thus, we will be degenerating the DIE
> domain. Right ? Instead we could have kept the core-group to be a
> single bigcore by default, so that those domains can get degenerated
> preserving the legacy SMT, DIE, NUMA hierarchy.

Never mind, got confused with core_id and cpu_core_mask (which
corresponds to the cores of a chip).  cpu_to_core_id() does exactly
what we have described above. Sorry for the noise.

I am ok with this patch, except that I would request if all the
changes to the topology structure be done within a single function for
ease of tracking it instead of distributing it all over the code.


> 
> 
> 
> 
> > +
> >  static int topology_update_init(void)
> >  {
> >  	start_topology_update();
> > -- 
> > 2.17.1
> > 

^ permalink raw reply

* Re: [PATCH 10/11] powerpc/smp: Implement cpu_to_coregroup_id
From: Gautham R Shenoy @ 2020-07-17  8:26 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-11-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:23AM +0530, Srikar Dronamraju wrote:
> Lookup the coregroup id from the associativity array.
> 
> If unable to detect the coregroup id, fallback on the core id.
> This way, ensure sched_domain degenerates and an extra sched domain is
> not created.
> 
> Ideally this function should have been implemented in
> arch/powerpc/kernel/smp.c. However if its implemented in mm/numa.c, we
> don't need to find the primary domain again.
> 
> If the device-tree mentions more than one coregroup, then kernel
> implements only the last or the smallest coregroup, which currently
> corresponds to the penultimate domain in the device-tree.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/mm/numa.c | 17 +++++++++++++++++
>  1 file changed, 17 insertions(+)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index d9ab9da85eab..4e85564ef62a 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1697,6 +1697,23 @@ static const struct proc_ops topology_proc_ops = {
> 
>  int cpu_to_coregroup_id(int cpu)
>  {
> +	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
> +	int index;
> +

It would be good to have an assert here to ensure that we are calling
this function only when coregroups are enabled.

Else, we may end up returning the penultimate index which maps to the
chip-id.



> +	if (cpu < 0 || cpu > nr_cpu_ids)
> +		return -1;
> +
> +	if (!firmware_has_feature(FW_FEATURE_VPHN))
> +		goto out;
> +
> +	if (vphn_get_associativity(cpu, associativity))
> +		goto out;
> +
> +	index = of_read_number(associativity, 1);
> +	if ((index > min_common_depth + 1) && coregroup_enabled)
> +		return of_read_number(&associativity[index - 1], 1);
> +
> +out:
>  	return cpu_to_core_id(cpu);
>  }
> 
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH 11/11] powerpc/smp: Provide an ability to disable coregroup
From: Gautham R Shenoy @ 2020-07-17  8:28 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-12-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:24AM +0530, Srikar Dronamraju wrote:
> If user wants to enable coregroup sched_domain then they can boot with
> kernel parameter "coregroup_support=on"
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>


We need this documented in the
Documentation/admin-guide/kernel-parameters.txt

Other than that, the patch looks good to me.
Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>

> ---
>  arch/powerpc/kernel/smp.c | 19 ++++++++++++++++++-
>  1 file changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index bb25c13bbb79..c43909e6e8e9 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -118,6 +118,23 @@ struct smp_ops_t *smp_ops;
>  volatile unsigned int cpu_callin_map[NR_CPUS];
> 
>  int smt_enabled_at_boot = 1;
> +int coregroup_support;
> +
> +static int __init early_coregroup(char *p)
> +{
> +	if (!p)
> +		return 0;
> +
> +	if (strstr(p, "on"))
> +		coregroup_support = 1;
> +
> +	if (strstr(p, "1"))
> +		coregroup_support = 1;
> +
> +	return 0;
> +}
> +
> +early_param("coregroup_support", early_coregroup);
> 
>  /*
>   * Returns 1 if the specified cpu should be brought up during boot.
> @@ -878,7 +895,7 @@ static struct cpumask *cpu_coregroup_mask(int cpu)
> 
>  static bool has_coregroup_support(void)
>  {
> -	return coregroup_enabled;
> +	return coregroup_enabled && coregroup_support;
>  }
> 
>  static const struct cpumask *cpu_mc_mask(int cpu)
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH v3] powerpc/pseries: detect secure and trusted boot state of the system.
From: Michal Suchánek @ 2020-07-17  8:35 UTC (permalink / raw)
  To: Daniel Axtens; +Cc: linuxppc-dev, Nayna Jain, linux-kernel, Mimi Zohar
In-Reply-To: <87pn8uu1hy.fsf@dja-thinkpad.axtens.net>

On Fri, Jul 17, 2020 at 03:58:01PM +1000, Daniel Axtens wrote:
> Michal Suchánek <msuchanek@suse.de> writes:
> 
> > On Wed, Jul 15, 2020 at 07:52:01AM -0400, Nayna Jain wrote:
> >> The device-tree property to check secure and trusted boot state is
> >> different for guests(pseries) compared to baremetal(powernv).
> >> 
> >> This patch updates the existing is_ppc_secureboot_enabled() and
> >> is_ppc_trustedboot_enabled() functions to add support for pseries.
> >> 
> >> The secureboot and trustedboot state are exposed via device-tree property:
> >> /proc/device-tree/ibm,secure-boot and /proc/device-tree/ibm,trusted-boot
> >> 
> >> The values of ibm,secure-boot under pseries are interpreted as:
> >                                       ^^^
> >> 
> >> 0 - Disabled
> >> 1 - Enabled in Log-only mode. This patch interprets this value as
> >> disabled, since audit mode is currently not supported for Linux.
> >> 2 - Enabled and enforced.
> >> 3-9 - Enabled and enforcing; requirements are at the discretion of the
> >> operating system.
> >> 
> >> The values of ibm,trusted-boot under pseries are interpreted as:
> >                                        ^^^
> > These two should be different I suppose?
> 
> I'm not quite sure what you mean? They'll be documented in a future
> revision of the PAPR, once I get my act together and submit the
> relevant internal paperwork.

Nevermind, one talks about secure boot, the other about trusted boot.

Thanks

Michal

^ permalink raw reply

* Re: [PATCH v2 2/5] powerpc/lib: Initialize a temporary mm for code patching
From: kernel test robot @ 2020-07-17  8:57 UTC (permalink / raw)
  To: Christopher M. Riedl, linuxppc-dev
  Cc: clang-built-linux, kbuild-all, kernel-hardening
In-Reply-To: <20200709040316.12789-3-cmr@informatik.wtf>

[-- Attachment #1: Type: text/plain, Size: 2895 bytes --]

Hi "Christopher,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on powerpc/next]
[also build test WARNING on char-misc/char-misc-testing v5.8-rc5 next-20200716]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Christopher-M-Riedl/Use-per-CPU-temporary-mappings-for-patching/20200709-123827
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-randconfig-r013-20200717 (attached as .config)
compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project ed6b578040a85977026c93bf4188f996148f3218)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install powerpc cross compiling tool for clang build
        # apt-get install binutils-powerpc-linux-gnu
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> arch/powerpc/lib/code-patching.c:53:13: warning: no previous prototype for function 'poking_init' [-Wmissing-prototypes]
   void __init poking_init(void)
               ^
   arch/powerpc/lib/code-patching.c:53:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void __init poking_init(void)
   ^
   static 
   1 warning generated.

vim +/poking_init +53 arch/powerpc/lib/code-patching.c

    52	
  > 53	void __init poking_init(void)
    54	{
    55		spinlock_t *ptl; /* for protecting pte table */
    56		pte_t *ptep;
    57	
    58		/*
    59		 * Some parts of the kernel (static keys for example) depend on
    60		 * successful code patching. Code patching under STRICT_KERNEL_RWX
    61		 * requires this setup - otherwise we cannot patch at all. We use
    62		 * BUG_ON() here and later since an early failure is preferred to
    63		 * buggy behavior and/or strange crashes later.
    64		 */
    65		patching_mm = copy_init_mm();
    66		BUG_ON(!patching_mm);
    67	
    68		/*
    69		 * In hash we cannot go above DEFAULT_MAP_WINDOW easily.
    70		 * XXX: Do we want additional bits of entropy for radix?
    71		 */
    72		patching_addr = (get_random_long() & PAGE_MASK) %
    73			(DEFAULT_MAP_WINDOW - PAGE_SIZE);
    74	
    75		ptep = get_locked_pte(patching_mm, patching_addr, &ptl);
    76		BUG_ON(!ptep);
    77		pte_unmap_unlock(ptep, ptl);
    78	}
    79	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 30733 bytes --]

^ permalink raw reply

* linux-next: manual merge of the set_fs tree with the powerpc tree
From: Stephen Rothwell @ 2020-07-17  9:09 UTC (permalink / raw)
  To: Christoph Hellwig, Michael Ellerman, PowerPC
  Cc: Nathan Lynch, Linux Next Mailing List, Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 819 bytes --]

Hi all,

Today's linux-next merge of the set_fs tree got a conflict in:

  arch/powerpc/mm/numa.c

between commit:

  c30f931e891e ("powerpc/numa: remove ability to enable topology updates")

from the powerpc tree and commit:

  16a04bde8169 ("proc: switch over direct seq_read method calls to seq_read_iter")

from the set_fs tree.

I fixed it up (the former removed the code updated by the latter, so I
just did that) and can carry the fix as necessary. This is now fixed as
far as linux-next is concerned, but any non trivial conflicts should be
mentioned to your upstream maintainer when your tree is submitted for
merging.  You may also want to consider cooperating with the maintainer
of the conflicting tree to minimise any particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* [PATCH v2 0/2] Selftest for cpuidle latency measurement
From: Pratik Rajesh Sampat @ 2020-07-17  9:17 UTC (permalink / raw)
  To: rjw, daniel.lezcano, mpe, benh, paulus, srivatsa, shuah, npiggin,
	ego, svaidy, linux-pm, linuxppc-dev, linux-kernel,
	linux-kselftest

v1: https://lkml.org/lkml/2020/7/7/1036
Changelog v1 --> v2
1. Based on Shuah Khan's comment, changed exit code to ksft_skip to
   indicate the test is being skipped
2. Change the busy workload for baseline measurement from
   "yes > /dev/null" to "cat /dev/random to /dev/null", based on
   observed CPU utilization for "yes" consuming ~60% CPU while the
   latter consumes 100% of CPUs, giving more accurate baseline numbers
---

The patch series introduces a mechanism to measure wakeup latency for
IPI and timer based interrupts
The motivation behind this series is to find significant deviations
behind advertised latency and resisdency values

To achieve this, we introduce a kernel module and expose its control
knobs through the debugfs interface that the selftests can engage with.

The kernel module provides the following interfaces within
/sys/kernel/debug/latency_test/ for,
1. IPI test:
  ipi_cpu_dest   # Destination CPU for the IPI
  ipi_cpu_src    # Origin of the IPI
  ipi_latency_ns # Measured latency time in ns
2. Timeout test:
  timeout_cpu_src     # CPU on which the timer to be queued
  timeout_expected_ns # Timer duration
  timeout_diff_ns     # Difference of actual duration vs expected timer
To include the module, check option and include as module
kernel hacking -> Cpuidle latency selftests

The selftest inserts the module, disables all the idle states and
enables them one by one testing the following:
1. Keeping source CPU constant, iterates through all the CPUS measuring
   IPI latency for baseline (CPU is busy with
   "cat /dev/random > /dev/null" workload) and the when the CPU is
   allowed to be at rest
2. Iterating through all the CPUs, sending expected timer durations to
   be equivalent to the residency of the the deepest idle state
   enabled and extracting the difference in time between the time of
   wakeup and the expected timer duration

Usage
-----
Can be used in conjuction to the rest of the selftests.
Default Output location in: tools/testing/cpuidle/cpuidle.log

To run this test specifically:
$ make -C tools/testing/selftests TARGETS="cpuidle" run_tests

There are a few optinal arguments too that the script can take
	[-h <help>]
	[-m <location of the module>]
	[-o <location of the output>]

Sample output snippet
---------------------
--IPI Latency Test---
--Baseline IPI Latency measurement: CPU Busy--
SRC_CPU   DEST_CPU IPI_Latency(ns)
...
0            8         1996
0            9         2125
0           10         1264
0           11         1788
0           12         2045
Baseline Average IPI latency(ns): 1843
---Enabling state: 5---
SRC_CPU   DEST_CPU IPI_Latency(ns)
0            8       621719
0            9       624752
0           10       622218
0           11       623968
0           12       621303
Expected IPI latency(ns): 100000
Observed Average IPI latency(ns): 622792

--Timeout Latency Test--
--Baseline Timeout Latency measurement: CPU Busy--
Wakeup_src Baseline_delay(ns) 
...
8            2249
9            2226
10           2211
11           2183
12           2263
Baseline Average timeout diff(ns): 2226
---Enabling state: 5---
8           10749                   
9           10911                   
10          10912                   
11          12100                   
12          73276                   
Expected timeout(ns): 10000200
Observed Average timeout diff(ns): 23589

Pratik Rajesh Sampat (2):
  cpuidle: Trace IPI based and timer based wakeup latency from idle
    states
  selftest/cpuidle: Add support for cpuidle latency measurement

 drivers/cpuidle/Makefile                   |   1 +
 drivers/cpuidle/test-cpuidle_latency.c     | 150 ++++++++++++
 lib/Kconfig.debug                          |  10 +
 tools/testing/selftests/Makefile           |   1 +
 tools/testing/selftests/cpuidle/Makefile   |   6 +
 tools/testing/selftests/cpuidle/cpuidle.sh | 257 +++++++++++++++++++++
 tools/testing/selftests/cpuidle/settings   |   1 +
 7 files changed, 426 insertions(+)
 create mode 100644 drivers/cpuidle/test-cpuidle_latency.c
 create mode 100644 tools/testing/selftests/cpuidle/Makefile
 create mode 100755 tools/testing/selftests/cpuidle/cpuidle.sh
 create mode 100644 tools/testing/selftests/cpuidle/settings

-- 
2.25.4


^ permalink raw reply

* [PATCH v2 1/2] cpuidle: Trace IPI based and timer based wakeup latency from idle states
From: Pratik Rajesh Sampat @ 2020-07-17  9:18 UTC (permalink / raw)
  To: rjw, daniel.lezcano, mpe, benh, paulus, srivatsa, shuah, npiggin,
	ego, svaidy, linux-pm, linuxppc-dev, linux-kernel,
	linux-kselftest
In-Reply-To: <20200717091801.29289-1-psampat@linux.ibm.com>

Fire directed smp_call_function_single IPIs from a specified source
CPU to the specified target CPU to reduce the noise we have to wade
through in the trace log.
The module is based on the idea written by Srivatsa Bhat and maintained
by Vaidyanathan Srinivasan internally.

Queue HR timer and measure jitter. Wakeup latency measurement for idle
states using hrtimer.  Echo a value in ns to timer_test_function and
watch trace. A HRtimer will be queued and when it fires the expected
wakeup vs actual wakeup is computes and delay printed in ns.

Implemented as a module which utilizes debugfs so that it can be
integrated with selftests.

To include the module, check option and include as module
kernel hacking -> Cpuidle latency selftests

[srivatsa.bhat@linux.vnet.ibm.com: Initial implementation in
 cpidle/sysfs]

[svaidy@linux.vnet.ibm.com: wakeup latency measurements using hrtimer
 and fix some of the time calculation]

[ego@linux.vnet.ibm.com: Fix some whitespace and tab errors and
 increase the resolution of IPI wakeup]

Signed-off-by: Pratik Rajesh Sampat <psampat@linux.ibm.com>
---
 drivers/cpuidle/Makefile               |   1 +
 drivers/cpuidle/test-cpuidle_latency.c | 150 +++++++++++++++++++++++++
 lib/Kconfig.debug                      |  10 ++
 3 files changed, 161 insertions(+)
 create mode 100644 drivers/cpuidle/test-cpuidle_latency.c

diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
index f07800cbb43f..2ae05968078c 100644
--- a/drivers/cpuidle/Makefile
+++ b/drivers/cpuidle/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
 obj-$(CONFIG_DT_IDLE_STATES)		  += dt_idle_states.o
 obj-$(CONFIG_ARCH_HAS_CPU_RELAX)	  += poll_state.o
 obj-$(CONFIG_HALTPOLL_CPUIDLE)		  += cpuidle-haltpoll.o
+obj-$(CONFIG_IDLE_LATENCY_SELFTEST)	  += test-cpuidle_latency.o
 
 ##################################################################################
 # ARM SoC drivers
diff --git a/drivers/cpuidle/test-cpuidle_latency.c b/drivers/cpuidle/test-cpuidle_latency.c
new file mode 100644
index 000000000000..61574665e972
--- /dev/null
+++ b/drivers/cpuidle/test-cpuidle_latency.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module-based API test facility for cpuidle latency using IPIs and timers
+ */
+
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+/* IPI based wakeup latencies */
+struct latency {
+	unsigned int src_cpu;
+	unsigned int dest_cpu;
+	ktime_t time_start;
+	ktime_t time_end;
+	u64 latency_ns;
+} ipi_wakeup;
+
+static void measure_latency(void *info)
+{
+	struct latency *v;
+	ktime_t time_diff;
+
+	v = (struct latency *)info;
+	v->time_end = ktime_get();
+	time_diff = ktime_sub(v->time_end, v->time_start);
+	v->latency_ns = ktime_to_ns(time_diff);
+}
+
+void run_smp_call_function_test(unsigned int cpu)
+{
+	ipi_wakeup.src_cpu = smp_processor_id();
+	ipi_wakeup.dest_cpu = cpu;
+	ipi_wakeup.time_start = ktime_get();
+	smp_call_function_single(cpu, measure_latency, &ipi_wakeup, 1);
+}
+
+/* Timer based wakeup latencies */
+struct timer_data {
+	unsigned int src_cpu;
+	u64 timeout;
+	ktime_t time_start;
+	ktime_t time_end;
+	struct hrtimer timer;
+	u64 timeout_diff_ns;
+} timer_wakeup;
+
+static enum hrtimer_restart timer_called(struct hrtimer *hrtimer)
+{
+	struct timer_data *w;
+	ktime_t time_diff;
+
+	w = container_of(hrtimer, struct timer_data, timer);
+	w->time_end = ktime_get();
+
+	time_diff = ktime_sub(w->time_end, w->time_start);
+	time_diff = ktime_sub(time_diff, ns_to_ktime(w->timeout));
+	w->timeout_diff_ns = ktime_to_ns(time_diff);
+	return HRTIMER_NORESTART;
+}
+
+static void run_timer_test(unsigned int ns)
+{
+	hrtimer_init(&timer_wakeup.timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL);
+	timer_wakeup.timer.function = timer_called;
+	timer_wakeup.time_start = ktime_get();
+	timer_wakeup.src_cpu = smp_processor_id();
+	timer_wakeup.timeout = ns;
+
+	hrtimer_start(&timer_wakeup.timer, ns_to_ktime(ns),
+		      HRTIMER_MODE_REL_PINNED);
+}
+
+static struct dentry *dir;
+
+static int cpu_read_op(void *data, u64 *value)
+{
+	*value = ipi_wakeup.dest_cpu;
+	return 0;
+}
+
+static int cpu_write_op(void *data, u64 value)
+{
+	run_smp_call_function_test(value);
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(ipi_ops, cpu_read_op, cpu_write_op, "%llu\n");
+
+static int timeout_read_op(void *data, u64 *value)
+{
+	*value = timer_wakeup.timeout;
+	return 0;
+}
+
+static int timeout_write_op(void *data, u64 value)
+{
+	run_timer_test(value);
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(timeout_ops, timeout_read_op, timeout_write_op, "%llu\n");
+
+static int __init latency_init(void)
+{
+	struct dentry *temp;
+
+	dir = debugfs_create_dir("latency_test", 0);
+	if (!dir) {
+		pr_alert("latency_test: failed to create /sys/kernel/debug/latency_test\n");
+		return -1;
+	}
+	temp = debugfs_create_file("ipi_cpu_dest",
+				   0666,
+				   dir,
+				   NULL,
+				   &ipi_ops);
+	if (!temp) {
+		pr_alert("latency_test: failed to create /sys/kernel/debug/ipi_cpu_dest\n");
+		return -1;
+	}
+	debugfs_create_u64("ipi_latency_ns", 0444, dir, &ipi_wakeup.latency_ns);
+	debugfs_create_u32("ipi_cpu_src", 0444, dir, &ipi_wakeup.src_cpu);
+
+	temp = debugfs_create_file("timeout_expected_ns",
+				   0666,
+				   dir,
+				   NULL,
+				   &timeout_ops);
+	if (!temp) {
+		pr_alert("latency_test: failed to create /sys/kernel/debug/timeout_expected_ns\n");
+		return -1;
+	}
+	debugfs_create_u64("timeout_diff_ns", 0444, dir, &timer_wakeup.timeout_diff_ns);
+	debugfs_create_u32("timeout_cpu_src", 0444, dir, &timer_wakeup.src_cpu);
+	pr_info("Latency Test module loaded\n");
+	return 0;
+}
+
+static void __exit latency_cleanup(void)
+{
+	pr_info("Cleaning up Latency Test module.\n");
+	debugfs_remove_recursive(dir);
+}
+
+module_init(latency_init);
+module_exit(latency_cleanup);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("IBM Corporation");
+MODULE_DESCRIPTION("Measuring idle latency for IPIs and Timers");
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d74ac0fd6b2d..e2283790245a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1375,6 +1375,16 @@ config DEBUG_KOBJECT
 	  If you say Y here, some extra kobject debugging messages will be sent
 	  to the syslog.
 
+config IDLE_LATENCY_SELFTEST
+	tristate "Cpuidle latency selftests"
+	depends on CPU_IDLE
+	help
+	  This option provides a kernel module that runs tests using the IPI and
+	  timers to measure latency.
+
+	  Say M if you want these self tests to build as a module.
+	  Say N if you are unsure.
+
 config DEBUG_KOBJECT_RELEASE
 	bool "kobject release debugging"
 	depends on DEBUG_OBJECTS_TIMERS
-- 
2.25.4


^ permalink raw reply related

* [PATCH v2 2/2] selftest/cpuidle: Add support for cpuidle latency measurement
From: Pratik Rajesh Sampat @ 2020-07-17  9:18 UTC (permalink / raw)
  To: rjw, daniel.lezcano, mpe, benh, paulus, srivatsa, shuah, npiggin,
	ego, svaidy, linux-pm, linuxppc-dev, linux-kernel,
	linux-kselftest
In-Reply-To: <20200717091801.29289-1-psampat@linux.ibm.com>

This patch adds support to trace IPI based and timer based wakeup
latency from idle states

Latches onto the test-cpuidle_latency kernel module using the debugfs
interface to send IPIs or schedule a timer based event, which in-turn
populates the debugfs with the latency measurements.

Currently for the IPI and timer tests; first disable all idle states
and then test for latency measurements incrementally enabling each state

Signed-off-by: Pratik Rajesh Sampat <psampat@linux.ibm.com>
---
 tools/testing/selftests/Makefile           |   1 +
 tools/testing/selftests/cpuidle/Makefile   |   6 +
 tools/testing/selftests/cpuidle/cpuidle.sh | 257 +++++++++++++++++++++
 tools/testing/selftests/cpuidle/settings   |   1 +
 4 files changed, 265 insertions(+)
 create mode 100644 tools/testing/selftests/cpuidle/Makefile
 create mode 100755 tools/testing/selftests/cpuidle/cpuidle.sh
 create mode 100644 tools/testing/selftests/cpuidle/settings

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 1195bd85af38..ab6cf51f3518 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -7,6 +7,7 @@ TARGETS += capabilities
 TARGETS += cgroup
 TARGETS += clone3
 TARGETS += cpufreq
+TARGETS += cpuidle
 TARGETS += cpu-hotplug
 TARGETS += drivers/dma-buf
 TARGETS += efivarfs
diff --git a/tools/testing/selftests/cpuidle/Makefile b/tools/testing/selftests/cpuidle/Makefile
new file mode 100644
index 000000000000..72fd5d2e974d
--- /dev/null
+++ b/tools/testing/selftests/cpuidle/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+all:
+
+TEST_PROGS := cpuidle.sh
+
+include ../lib.mk
diff --git a/tools/testing/selftests/cpuidle/cpuidle.sh b/tools/testing/selftests/cpuidle/cpuidle.sh
new file mode 100755
index 000000000000..5c313a3abb0c
--- /dev/null
+++ b/tools/testing/selftests/cpuidle/cpuidle.sh
@@ -0,0 +1,257 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+LOG=cpuidle.log
+MODULE=/lib/modules/$(uname -r)/kernel/drivers/cpuidle/test-cpuidle_latency.ko
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+helpme()
+{
+	printf "Usage: $0 [-h] [-todg args]
+	[-h <help>]
+	[-m <location of the module>]
+	[-o <location of the output>]
+	\n"
+	exit 2
+}
+
+parse_arguments()
+{
+	while getopts ht:m:o: arg
+	do
+		case $arg in
+			h) # --help
+				helpme
+				;;
+			m) # --mod-file
+				MODULE=$OPTARG
+				;;
+			o) # output log files
+				LOG=$OPTARG
+				;;
+			\?)
+				helpme
+				;;
+		esac
+	done
+}
+
+ins_mod()
+{
+	if [ ! -f "$MODULE" ]; then
+		printf "$MODULE module does not exist. Exitting\n"
+		exit $ksft_skip
+	fi
+	printf "Inserting $MODULE module\n\n"
+	insmod $MODULE
+	if [ $? != 0 ]; then
+		printf "Insmod $MODULE failed\n"
+		exit $ksft_skip
+	fi
+}
+
+compute_average()
+{
+	arr=("$@")
+	sum=0
+	size=${#arr[@]}
+	for i in "${arr[@]}"
+	do
+		sum=$((sum + i))
+	done
+	avg=$((sum/size))
+}
+
+# Disable all stop states
+disable_idle()
+{
+	for ((cpu=0; cpu<NUM_CPUS; cpu++))
+	do
+		for ((state=0; state<NUM_STATES; state++))
+		do
+			echo 1 > /sys/devices/system/cpu/cpu$cpu/cpuidle/state$state/disable
+		done
+	done
+}
+
+# Perform operation on each CPU for the given state
+# $1 - Operation: enable (0) / disable (1)
+# $2 - State to enable
+op_state()
+{
+	for ((cpu=0; cpu<NUM_CPUS; cpu++))
+	do
+		echo $1 > /sys/devices/system/cpu/cpu$cpu/cpuidle/state$2/disable
+	done
+}
+
+# Extract latency in microseconds and convert to nanoseconds
+extract_latency()
+{
+	for ((state=0; state<NUM_STATES; state++))
+	do
+		latency=$(($(cat /sys/devices/system/cpu/cpu0/cpuidle/state$state/latency) * 1000))
+		latency_arr+=($latency)
+	done
+}
+
+# Run the IPI test
+# $1 run for baseline - busy cpu or regular environment
+# $2 destination cpu
+ipi_test_once()
+{
+        dest_cpu=$2
+        if [ "$1" = "baseline" ]; then
+			# Keep the CPU busy
+			taskset -c $dest_cpu cat /dev/random > /dev/null &
+			task_pid=$!
+			# Wait for the workload to achieve 100% CPU usage
+			sleep 1
+        fi
+        taskset 0x1 echo $dest_cpu > /sys/kernel/debug/latency_test/ipi_cpu_dest
+        ipi_latency=$(cat /sys/kernel/debug/latency_test/ipi_latency_ns)
+        src_cpu=$(cat /sys/kernel/debug/latency_test/ipi_cpu_src)
+        if [ "$1" = "baseline" ]; then
+			kill $task_pid
+			wait $task_pid 2>/dev/null
+        fi
+}
+
+# Incrementally Enable idle states one by one and compute the latency
+run_ipi_tests()
+{
+        extract_latency
+        disable_idle
+        declare -a avg_arr
+        echo -e "--IPI Latency Test---" >> $LOG
+
+		echo -e "--Baseline IPI Latency measurement: CPU Busy--" >> $LOG
+		printf "%s %10s %12s\n" "SRC_CPU" "DEST_CPU" "IPI_Latency(ns)" >> $LOG
+		for ((cpu=0; cpu<NUM_CPUS; cpu++))
+		do
+			ipi_test_once "baseline" $cpu
+			printf "%-3s %10s %12s\n" $src_cpu $cpu $ipi_latency >> $LOG
+			avg_arr+=($ipi_latency)
+		done
+		compute_average "${avg_arr[@]}"
+		echo -e "Baseline Average IPI latency(ns): $avg" >> $LOG
+
+        for ((state=0; state<NUM_STATES; state++))
+        do
+			unset avg_arr
+			echo -e "---Enabling state: $state---" >> $LOG
+			op_state 0 $state
+			printf "%s %10s %12s\n" "SRC_CPU" "DEST_CPU" "IPI_Latency(ns)" >> $LOG
+			for ((cpu=0; cpu<NUM_CPUS; cpu++))
+			do
+				# Running IPI test and logging results
+				sleep 1
+				ipi_test_once "test" $cpu
+				printf "%-3s %10s %12s\n" $src_cpu $cpu $ipi_latency >> $LOG
+				avg_arr+=($ipi_latency)
+			done
+			compute_average "${avg_arr[@]}"
+			echo -e "Expected IPI latency(ns): ${latency_arr[$state]}" >> $LOG
+			echo -e "Observed Average IPI latency(ns): $avg" >> $LOG
+			op_state 1 $state
+        done
+}
+
+# Extract the residency in microseconds and convert to nanoseconds.
+# Add 100 ns so that the timer stays for a little longer than the residency
+extract_residency()
+{
+	for ((state=0; state<NUM_STATES; state++))
+	do
+		residency=$(($(cat /sys/devices/system/cpu/cpu0/cpuidle/state$state/residency) * 1000 + 200))
+		residency_arr+=($residency)
+	done
+}
+
+# Run the Timeout test
+# $1 run for baseline - busy cpu or regular environment
+# $2 destination cpu
+# $3 timeout
+timeout_test_once()
+{
+	dest_cpu=$2
+	if [ "$1" = "baseline" ]; then
+		# Keep the CPU busy
+		taskset -c $dest_cpu cat /dev/random > /dev/null &
+		task_pid=$!
+		# Wait for the workload to achieve 100% CPU usage
+		sleep 1
+	fi
+	taskset -c $dest_cpu echo $3 > /sys/kernel/debug/latency_test/timeout_expected_ns
+	# Wait for the result to populate
+	sleep 0.1
+	timeout_diff=$(cat /sys/kernel/debug/latency_test/timeout_diff_ns)
+	src_cpu=$(cat /sys/kernel/debug/latency_test/timeout_cpu_src)
+	if [ "$1" = "baseline" ]; then
+		kill $task_pid
+		wait $task_pid 2>/dev/null
+	fi
+}
+
+run_timeout_tests()
+{
+	extract_residency
+	disable_idle
+	declare -a avg_arr
+	echo -e "\n--Timeout Latency Test--" >> $LOG
+
+	echo -e "--Baseline Timeout Latency measurement: CPU Busy--" >> $LOG
+	printf "%s %10s %10s\n" "Wakeup_src" "Baseline_delay(ns)">> $LOG
+	for ((cpu=0; cpu<NUM_CPUS; cpu++))
+	do
+		timeout_test_once "baseline" $cpu ${residency_arr[0]}
+		printf "%-3s %13s\n" $src_cpu $timeout_diff >> $LOG
+		avg_arr+=($timeout_diff)
+	done
+	compute_average "${avg_arr[@]}"
+	echo -e "Baseline Average timeout diff(ns): $avg" >> $LOG
+
+	for ((state=0; state<NUM_STATES; state++))
+	do
+		echo -e "---Enabling state: $state---" >> $LOG
+		op_state 0 $state
+		printf "%s %10s %10s\n" "Wakeup_src" "Baseline_delay(ns)" "Delay(ns)" >> $LOG
+		unset avg_arr
+		for ((cpu=0; cpu<NUM_CPUS; cpu++))
+		do
+			timeout_test_once "test" $cpu ${residency_arr[$state]}
+			printf "%-3s %13s %18s\n" $src_cpu $baseline_timeout_diff $timeout_diff >> $LOG
+			avg_arr+=($timeout_diff)
+		done
+		compute_average "${avg_arr[@]}"
+		echo -e "Expected timeout(ns): ${residency_arr[$state]}" >> $LOG
+		echo -e "Observed Average timeout diff(ns): $avg" >> $LOG
+		op_state 1 $state
+	done
+}
+
+declare -a residency_arr
+declare -a latency_arr
+
+# Parse arguments
+parse_arguments $@
+
+rm -f $LOG
+touch $LOG
+NUM_CPUS=$(nproc --all)
+NUM_STATES=$(ls -1 /sys/devices/system/cpu/cpu0/cpuidle/ | wc -l)
+
+# Insert the module
+ins_mod $MODULE
+
+printf "Started IPI latency tests\n"
+run_ipi_tests
+
+printf "Started Timer latency tests\n"
+run_timeout_tests
+
+printf "Removing $MODULE module\n"
+printf "Output logged at: $LOG\n"
+rmmod $MODULE
diff --git a/tools/testing/selftests/cpuidle/settings b/tools/testing/selftests/cpuidle/settings
new file mode 100644
index 000000000000..e7b9417537fb
--- /dev/null
+++ b/tools/testing/selftests/cpuidle/settings
@@ -0,0 +1 @@
+timeout=0
-- 
2.25.4


^ permalink raw reply related

* Re: ASMedia USB 3.x host controllers triggering EEH on POWER9
From: Forest Crossman @ 2020-07-17 10:10 UTC (permalink / raw)
  To: Oliver O'Halloran; +Cc: linux-pci, linux-usb, linuxppc-dev
In-Reply-To: <CAOSf1CEOu5jX6KXKWiConBf=TDwF3qdASMxtfDjqSR6vOKN4XA@mail.gmail.com>

> In the future you can use this script to automate some of the tedium
> of parsing the eeh dumps:
> https://patchwork.ozlabs.org/project/skiboot/patch/20200717044243.1195833-1-oohall@gmail.com/

Ah, nice, thanks for showing me this! I had written my own parser that
just dumped a few register names and what bits were set in each, but
your script seems much more feature-complete.

> Anyway, for background the way PHB3 and PHB4 handle incoming DMAs goes
> as follows:
>
> 1. Map the 16bit <bus><devfn> number to an IOMMU context, we call
> those PEs. PE means "partitionable endpoint", but for the purpose of
> processing DMAs you can ignore that and just treat it as an IOMMU
> context ID.
> 2. Use the PE number and some of the upper bits of the DMA address to
> form the index into the Translation Validation Table.
> 3. Use the table entry to validate the DMA address is within bounds
> and whether it should be translated by the IOMMU or used as-is.
>
> If the table entry says the DMA needs to be translated by the IOMMU we'll also:
> 4. Walk the IOMMU table to get the relevant IOMMU table entry.
> 5. Validate the device has permission to read/write to that address.
>
> The "TVT Address Range Error" you're seeing means that the bounds
> checks done in 3) is failing. OPAL configures the PHB so there's two
> TVT entries (TVEs for short) assigned to each PE. Bit 59 of the DMA
> address is used to select which TVE to use. We typically configure
> TVE#0 to map 0x0...0x8000_0000 so there's a 2GB 32bit DMA window.
> TVE#1 is configured for no-translate (bypass) mode so you can convert
> from a system physical address to a DMA address by ORing in bit 59.

Thanks for the in-depth explanation, I find these low-level details
really fascinating.

> From word 2 of the PEST entry the faulting DMA address is:
> 0x0000203974c00000. That address is interesting since it looks a lot
> like a memory address on the 2nd chip, but it doesn't have bit 59 set
> so TVE#0 is used to validate it. Obviously that address is above 2GB
> so we get the error.

Ah, I see. Do you know if the information on the PEST registers is
documented publicly somewhere? I tried searching for what those
registers meant in the PHB4 spec but it basically just said, "the PEST
registers contain PEST data," which isn't particularly helpful.

> What's probably happening is that the ASmedia controller doesn't
> actually implement all 64 address bits and truncates the upper bits of
> the DMA address. Doing that is a blatant violation of the PCIe (and
> probably the xHCI) spec, but it's also pretty common since "it works
> on x86." Something to try would be booting with the iommu=nobypass in
> the kernel command line. That'll disable TVE#1 and force all DMAs to
> go through TVE#0.

Thanks, iommu=nobypass fixed it! Plugging in one or more USB devices
no longer triggers any EEH errors.

> Assuming the nobypass trick above works, what you really need to do is
> have the driver report that it can't address all 64bits by setting its
> DMA mask accordingly. For the xhci driver it looks like this is done
> in xhci_gen_setup(), there might be a quirks-style interface for
> working around bugs in specific controllers that you can use. Have a
> poke around and see what you can find :)

Yup, the xhci driver has a quirks system, and conveniently one of
those is XHCI_NO_64BIT_SUPPORT. After making a 3-line patch to
xhci-pci.c to add that quirk for this chip, the host controller is now
able to work without setting iommu=nobypass in the kernel arguments.

Thank you so much for your help! You've likely saved me several hours
of reading documentation, as well as several more hours of fiddling
around with the xhci driver. I'm almost disappointed the fix was so
simple, but the time savings alone more than makes up for it. I'll
submit the patch to the USB ML shortly.

Thanks again!

Forest

^ permalink raw reply

* Re: ASMedia USB 3.x host controllers triggering EEH on POWER9
From: Oliver O'Halloran @ 2020-07-17 10:43 UTC (permalink / raw)
  To: Forest Crossman; +Cc: linux-pci, linux-usb, linuxppc-dev
In-Reply-To: <CAO3ALPwZPN1vdqHaFsuSpgj63o6Z69VUg0LngmCSnvESrO4kNg@mail.gmail.com>

On Fri, Jul 17, 2020 at 8:10 PM Forest Crossman <cyrozap@gmail.com> wrote:
>
> > From word 2 of the PEST entry the faulting DMA address is:
> > 0x0000203974c00000. That address is interesting since it looks a lot
> > like a memory address on the 2nd chip, but it doesn't have bit 59 set
> > so TVE#0 is used to validate it. Obviously that address is above 2GB
> > so we get the error.
>
> Ah, I see. Do you know if the information on the PEST registers is
> documented publicly somewhere? I tried searching for what those
> registers meant in the PHB4 spec but it basically just said, "the PEST
> registers contain PEST data," which isn't particularly helpful.

Most of it is in the IODA spec. See Sec 2.2.6 "Partitionable-Endpoint
State Table", the only part that isn't documented there is the top
bits of each word which are documented in the PHB spec for some
reason. For word one the top bit (PPCBIT(0)) means MMIO is frozen and
for word two (PPCBIT(64)) the top bit indicates DMA is frozen.

> > What's probably happening is that the ASmedia controller doesn't
> > actually implement all 64 address bits and truncates the upper bits of
> > the DMA address. Doing that is a blatant violation of the PCIe (and
> > probably the xHCI) spec, but it's also pretty common since "it works
> > on x86." Something to try would be booting with the iommu=nobypass in
> > the kernel command line. That'll disable TVE#1 and force all DMAs to
> > go through TVE#0.
>
> Thanks, iommu=nobypass fixed it! Plugging in one or more USB devices
> no longer triggers any EEH errors.
>
> > Assuming the nobypass trick above works, what you really need to do is
> > have the driver report that it can't address all 64bits by setting its
> > DMA mask accordingly. For the xhci driver it looks like this is done
> > in xhci_gen_setup(), there might be a quirks-style interface for
> > working around bugs in specific controllers that you can use. Have a
> > poke around and see what you can find :)
>
> Yup, the xhci driver has a quirks system, and conveniently one of
> those is XHCI_NO_64BIT_SUPPORT. After making a 3-line patch to
> xhci-pci.c to add that quirk for this chip, the host controller is now
> able to work without setting iommu=nobypass in the kernel arguments.

Cool

> Thank you so much for your help! You've likely saved me several hours
> of reading documentation, as well as several more hours of fiddling
> around with the xhci driver.

> I'm almost disappointed the fix was so
> simple, but the time savings alone more than makes up for it. I'll
> submit the patch to the USB ML shortly.

It might be only three lines, but it doesn't mean it's trivial. Most
bugs like that require a lot of context to understand, let alone fix.
Thanks for looking into it!

Oliver

^ permalink raw reply

* Re: [PATCH 0/4] ASoC: fsl_asrc: allow selecting arbitrary clocks
From: Arnaud Ferraris @ 2020-07-17 11:16 UTC (permalink / raw)
  To: Nicolin Chen
  Cc: devicetree, alsa-devel, linuxppc-dev, Timur Tabi, Xiubo Li,
	linux-kernel, Takashi Iwai, Liam Girdwood, Rob Herring,
	Jaroslav Kysela, Mark Brown, kernel, Fabio Estevam
In-Reply-To: <20200702184226.GA23935@Asurada-Nvidia>

Hi Nic,

Le 02/07/2020 à 20:42, Nicolin Chen a écrit :
> Hi Arnaud,
> 
> On Thu, Jul 02, 2020 at 04:22:31PM +0200, Arnaud Ferraris wrote:
>> The current ASRC driver hardcodes the input and output clocks used for
>> sample rate conversions. In order to allow greater flexibility and to
>> cover more use cases, it would be preferable to select the clocks using
>> device-tree properties.
> 
> We recent just merged a new change that auto-selecting internal
> clocks based on sample rates as the first option -- ideal ratio
> mode is the fallback mode now. Please refer to:
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?h=next-20200702&id=d0250cf4f2abfbea64ed247230f08f5ae23979f0

While working on fixing the automatic clock selection (see my v3), I
came across another potential issue, which would be better explained
with an example:
  - Input has sample rate 8kHz and uses clock SSI1 with rate 512kHz
  - Output has sample rate 16kHz and uses clock SSI2 with rate 1024kHz

Let's say my v3 patch is merged, then the selected input clock will be
SSI1, while the selected output clock will be SSI2. In that case, it's
all good, as the driver will calculate the dividers right.

Now, suppose a similar board has the input wired to SSI2 and output to
SSI1, meaning we're now in the following case:
  - Input has sample rate 8kHz and uses clock SSI2 with rate 512kHz
  - Output has sample rate 16kHz and uses clock SSI1 with rate 1024kHz
(the same result is achieved during capture with the initial example
setup, as input and output properties are then swapped)

In that case, the selected clocks will still be SSI1 for input (just
because it appears first in the clock table), and SSI2 for output,
meaning the calculated dividers will be:
  - input: 512 / 16 => 32 (should be 64)
  - output: 1024 / 8 => 128 (should be 64 here too)

---

I can't see how the clock selection algorithm could be made smart enough
to cover cases such as this one, as it would need to be aware of the
exact relationship between the sample rate and the clock rate (my
example demonstrates a case where the "sample rate to clock rate"
multiplier is identical for both input and output, but this can't be
assumed to be always the case).

Therefore, I still believe being able to force clock selection using
optional DT properties would make sense, while still using the
auto-selection by default.

Regards,
Arnaud

> 
> Having a quick review at your changes, I think the DT part may
> not be necessary as it's more likely a software configuration.
> I personally like the new auto-selecting solution more.
> 
>> This series also fix register configuration and clock assignment so
>> conversion can be conducted effectively in both directions with a good
>> quality.
> 
> If there's any further change that you feel you can improve on
> the top of mentioned change after rebasing, I'd like to review.
> 
> Thanks
> Nic
> 

^ permalink raw reply

* [PATCH -next] powerpc: Remove unneeded inline functions
From: YueHaibing @ 2020-07-17 11:27 UTC (permalink / raw)
  To: mpe, benh, paulus, haren, dave.hansen, npiggin
  Cc: YueHaibing, linuxppc-dev, linux-kernel
In-Reply-To: <20200715031939.31968-1-yuehaibing@huawei.com>

Both of those functions are only called from 64-bit only code, so the
stubs should not be needed at all.

Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 arch/powerpc/include/asm/mmu_context.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 1a474f6b1992..7f3658a97384 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -218,8 +218,6 @@ static inline void inc_mm_active_cpus(struct mm_struct *mm) { }
 static inline void dec_mm_active_cpus(struct mm_struct *mm) { }
 static inline void mm_context_add_copro(struct mm_struct *mm) { }
 static inline void mm_context_remove_copro(struct mm_struct *mm) { }
-static inline void mm_context_add_vas_windows(struct mm_struct *mm) { }
-static inline void mm_context_remove_vas_windows(struct mm_struct *mm) { }
 #endif
 
 
-- 
2.17.1



^ permalink raw reply related

* Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Mathieu Desnoyers @ 2020-07-17 13:39 UTC (permalink / raw)
  To: Alan Stern
  Cc: linux-arch, paulmck, Arnd Bergmann, Peter Zijlstra, x86,
	linux-kernel, Nicholas Piggin, linux-mm, Andy Lutomirski,
	linuxppc-dev
In-Reply-To: <20200716212416.GA1126458@rowland.harvard.edu>

----- On Jul 16, 2020, at 5:24 PM, Alan Stern stern@rowland.harvard.edu wrote:

> On Thu, Jul 16, 2020 at 02:58:41PM -0400, Mathieu Desnoyers wrote:
>> ----- On Jul 16, 2020, at 12:03 PM, Mathieu Desnoyers
>> mathieu.desnoyers@efficios.com wrote:
>> 
>> > ----- On Jul 16, 2020, at 11:46 AM, Mathieu Desnoyers
>> > mathieu.desnoyers@efficios.com wrote:
>> > 
>> >> ----- On Jul 16, 2020, at 12:42 AM, Nicholas Piggin npiggin@gmail.com wrote:
>> >>> I should be more complete here, especially since I was complaining
>> >>> about unclear barrier comment :)
>> >>> 
>> >>> 
>> >>> CPU0                     CPU1
>> >>> a. user stuff            1. user stuff
>> >>> b. membarrier()          2. enter kernel
>> >>> c. smp_mb()              3. smp_mb__after_spinlock(); // in __schedule
>> >>> d. read rq->curr         4. rq->curr switched to kthread
>> >>> e. is kthread, skip IPI  5. switch_to kthread
>> >>> f. return to user        6. rq->curr switched to user thread
>> >>> g. user stuff            7. switch_to user thread
>> >>>                         8. exit kernel
>> >>>                         9. more user stuff
>> >>> 
>> >>> What you're really ordering is a, g vs 1, 9 right?
>> >>> 
>> >>> In other words, 9 must see a if it sees g, g must see 1 if it saw 9,
>> >>> etc.
>> >>> 
>> >>> Userspace does not care where the barriers are exactly or what kernel
>> >>> memory accesses might be being ordered by them, so long as there is a
>> >>> mb somewhere between a and g, and 1 and 9. Right?
>> >> 
>> >> This is correct.
>> > 
>> > Actually, sorry, the above is not quite right. It's been a while
>> > since I looked into the details of membarrier.
>> > 
>> > The smp_mb() at the beginning of membarrier() needs to be paired with a
>> > smp_mb() _after_ rq->curr is switched back to the user thread, so the
>> > memory barrier is between store to rq->curr and following user-space
>> > accesses.
>> > 
>> > The smp_mb() at the end of membarrier() needs to be paired with the
>> > smp_mb__after_spinlock() at the beginning of schedule, which is
>> > between accesses to userspace memory and switching rq->curr to kthread.
>> > 
>> > As to *why* this ordering is needed, I'd have to dig through additional
>> > scenarios from https://lwn.net/Articles/573436/. Or maybe Paul remembers ?
>> 
>> Thinking further about this, I'm beginning to consider that maybe we have been
>> overly cautious by requiring memory barriers before and after store to rq->curr.
>> 
>> If CPU0 observes a CPU1's rq->curr->mm which differs from its own process
>> (current)
>> while running the membarrier system call, it necessarily means that CPU1 had
>> to issue smp_mb__after_spinlock when entering the scheduler, between any
>> user-space
>> loads/stores and update of rq->curr.
>> 
>> Requiring a memory barrier between update of rq->curr (back to current process's
>> thread) and following user-space memory accesses does not seem to guarantee
>> anything more than what the initial barrier at the beginning of __schedule
>> already
>> provides, because the guarantees are only about accesses to user-space memory.
>> 
>> Therefore, with the memory barrier at the beginning of __schedule, just
>> observing that
>> CPU1's rq->curr differs from current should guarantee that a memory barrier was
>> issued
>> between any sequentially consistent instructions belonging to the current
>> process on
>> CPU1.
>> 
>> Or am I missing/misremembering an important point here ?
> 
> Is it correct to say that the switch_to operations in 5 and 7 include
> memory barriers?  If they do, then skipping the IPI should be okay.
> 
> The reason is as follows: The guarantee you need to enforce is that
> anything written by CPU0 before the membarrier() will be visible to CPU1
> after it returns to user mode.  Let's say that a writes to X and 9
> reads from X.
> 
> Then we have an instance of the Store Buffer pattern:
> 
>	CPU0			CPU1
>	a. Write X		6. Write rq->curr for user thread
>	c. smp_mb()		7. switch_to memory barrier
>	d. Read rq->curr	9. Read X
> 
> In this pattern, the memory barriers make it impossible for both reads
> to miss their corresponding writes.  Since d does fail to read 6 (it
> sees the earlier value stored by 4), 9 must read a.
> 
> The other guarantee you need is that g on CPU0 will observe anything
> written by CPU1 in 1.  This is easier to see, using the fact that 3 is a
> memory barrier and d reads from 4.

Right, and Nick's reply involving pairs of loads/stores on each side
clarifies the situation even further.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Mathieu Desnoyers @ 2020-07-17 13:42 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, linux-kernel,
	Andy Lutomirski, linux-mm, Andy Lutomirski, linuxppc-dev
In-Reply-To: <1594906688.ikv6r4gznx.astroid@bobo.none>

----- On Jul 16, 2020, at 7:26 PM, Nicholas Piggin npiggin@gmail.com wrote:
[...]
> 
> membarrier does replace barrier instructions on remote CPUs, which do
> order accesses performed by the kernel on the user address space. So
> membarrier should too I guess.
> 
> Normal process context accesses like read(2) will do so because they
> don't get filtered out from IPIs, but kernel threads using the mm may
> not.

But it should not be an issue, because membarrier's ordering is only with respect
to submit and completion of io_uring requests, which are performed through
system calls from the context of user-space threads, which are called from the
right mm.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* [v3 00/15] powerpc/perf: Add support for power10 PMU Hardware
From: Athira Rajeev @ 2020-07-17 14:38 UTC (permalink / raw)
  To: mpe; +Cc: ego, mikey, maddy, kvm, kvm-ppc, svaidyan, acme, jolsa,
	linuxppc-dev

The patch series adds support for power10 PMU hardware.

Patches 1..3 are the clean up patches which refactors the way how
PMU SPR's are stored in core-book3s and in KVM book3s, as well as update
data type for PMU cache_events.

Patches 12 and 13 adds base support for perf extended register
capability in powerpc. Support for extended regs in power10 is
covered in patches 14,15

Other patches includes main changes to support for power10 PMU.

Anju T Sudhakar (2):
  powerpc/perf: Add support for outputting extended regs in perf
    intr_regs
  tools/perf: Add perf tools support for extended register capability in
    powerpc

Athira Rajeev (10):
  powerpc/perf: Update cpu_hw_event to use `struct` for storing MMCR
    registers
  KVM: PPC: Book3S HV: Cleanup updates for kvm vcpu MMCR
  powerpc/perf: Update Power PMU cache_events to u64 type
  KVM: PPC: Book3S HV: Save/restore new PMU registers
  powerpc/perf: power10 Performance Monitoring support
  powerpc/perf: Ignore the BHRB kernel address filtering for P10
  powerpc/perf: Add Power10 BHRB filter support for
    PERF_SAMPLE_BRANCH_IND_CALL/COND
  powerpc/perf: BHRB control to disable BHRB logic when not used
  powerpc/perf: Add extended regs support for power10 platform
  tools/perf: Add perf tools support for extended regs in power10

Madhavan Srinivasan (3):
  powerpc/perf: Add support for ISA3.1 PMU SPRs
  powerpc/xmon: Add PowerISA v3.1 PMU SPRs
  powerpc/perf: Add power10_feat to dt_cpu_ftrs

---
Changes from v2 -> v3
- Addressed review comments from Michael Neuling,
  Michael Ellerman, Gautham Shenoy and Paul Mackerras

Changes from v1 -> v2
- Added support for extended regs in powerpc
  for power9/power10 platform ( patches 12 to 15)
- Addressed change/removal of some event codes
  in the PMU driver
---

 Documentation/virt/kvm/api.rst                  |   3 +
 arch/powerpc/include/asm/kvm_book3s_asm.h       |   2 +-
 arch/powerpc/include/asm/kvm_host.h             |   6 +-
 arch/powerpc/include/asm/perf_event_server.h    |  22 +-
 arch/powerpc/include/asm/processor.h            |   4 +
 arch/powerpc/include/asm/reg.h                  |   9 +
 arch/powerpc/include/uapi/asm/kvm.h             |   9 +-
 arch/powerpc/include/uapi/asm/perf_regs.h       |  20 +-
 arch/powerpc/kernel/asm-offsets.c               |   5 +
 arch/powerpc/kernel/cpu_setup_power.S           |   8 +
 arch/powerpc/kernel/dt_cpu_ftrs.c               |  26 ++
 arch/powerpc/kernel/sysfs.c                     |   8 +
 arch/powerpc/kvm/book3s_hv.c                    |  38 ++-
 arch/powerpc/kvm/book3s_hv_interrupts.S         |   8 +
 arch/powerpc/kvm/book3s_hv_rmhandlers.S         |  36 +-
 arch/powerpc/perf/Makefile                      |   2 +-
 arch/powerpc/perf/core-book3s.c                 | 108 ++++--
 arch/powerpc/perf/generic-compat-pmu.c          |   2 +-
 arch/powerpc/perf/internal.h                    |   1 +
 arch/powerpc/perf/isa207-common.c               |  91 +++--
 arch/powerpc/perf/isa207-common.h               |  37 ++-
 arch/powerpc/perf/mpc7450-pmu.c                 |  23 +-
 arch/powerpc/perf/perf_regs.c                   |  44 ++-
 arch/powerpc/perf/power10-events-list.h         |  70 ++++
 arch/powerpc/perf/power10-pmu.c                 | 425 ++++++++++++++++++++++++
 arch/powerpc/perf/power5+-pmu.c                 |  19 +-
 arch/powerpc/perf/power5-pmu.c                  |  19 +-
 arch/powerpc/perf/power6-pmu.c                  |  18 +-
 arch/powerpc/perf/power7-pmu.c                  |  19 +-
 arch/powerpc/perf/power8-pmu.c                  |   2 +-
 arch/powerpc/perf/power9-pmu.c                  |   8 +-
 arch/powerpc/perf/ppc970-pmu.c                  |  26 +-
 arch/powerpc/platforms/powernv/idle.c           |  22 +-
 arch/powerpc/xmon/xmon.c                        |  13 +
 tools/arch/powerpc/include/uapi/asm/kvm.h       |   9 +-
 tools/arch/powerpc/include/uapi/asm/perf_regs.h |  20 +-
 tools/perf/arch/powerpc/include/perf_regs.h     |   8 +-
 tools/perf/arch/powerpc/util/header.c           |   9 +-
 tools/perf/arch/powerpc/util/perf_regs.c        |  55 +++
 tools/perf/arch/powerpc/util/utils_header.h     |  15 +
 40 files changed, 1117 insertions(+), 152 deletions(-)
 create mode 100644 arch/powerpc/perf/power10-events-list.h
 create mode 100644 arch/powerpc/perf/power10-pmu.c
 create mode 100644 tools/perf/arch/powerpc/util/utils_header.h

-- 
1.8.3.1


^ permalink raw reply

* [v3 01/15] powerpc/perf: Update cpu_hw_event to use `struct` for storing MMCR registers
From: Athira Rajeev @ 2020-07-17 14:38 UTC (permalink / raw)
  To: mpe; +Cc: ego, mikey, maddy, kvm, kvm-ppc, svaidyan, acme, jolsa,
	linuxppc-dev
In-Reply-To: <1594996707-3727-1-git-send-email-atrajeev@linux.vnet.ibm.com>

core-book3s currently uses array to store the MMCR registers as part
of per-cpu `cpu_hw_events`. This patch does a clean up to use `struct`
to store mmcr regs instead of array. This will make code easier to read
and reduces chance of any subtle bug that may come in the future, say
when new registers are added. Patch updates all relevant code that was
using MMCR array ( cpuhw->mmcr[x]) to use newly introduced `struct`.
This includes the PMU driver code for supported platforms (power5
to power9) and ISA macros for counter support functions.

Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/perf_event_server.h | 10 ++++--
 arch/powerpc/perf/core-book3s.c              | 53 +++++++++++++---------------
 arch/powerpc/perf/isa207-common.c            | 20 +++++------
 arch/powerpc/perf/isa207-common.h            |  4 +--
 arch/powerpc/perf/mpc7450-pmu.c              | 21 +++++++----
 arch/powerpc/perf/power5+-pmu.c              | 17 ++++-----
 arch/powerpc/perf/power5-pmu.c               | 17 ++++-----
 arch/powerpc/perf/power6-pmu.c               | 16 ++++-----
 arch/powerpc/perf/power7-pmu.c               | 17 ++++-----
 arch/powerpc/perf/ppc970-pmu.c               | 24 ++++++-------
 10 files changed, 105 insertions(+), 94 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 3e9703f..f9a3668 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -17,6 +17,12 @@
 
 struct perf_event;
 
+struct mmcr_regs {
+	unsigned long mmcr0;
+	unsigned long mmcr1;
+	unsigned long mmcr2;
+	unsigned long mmcra;
+};
 /*
  * This struct provides the constants and functions needed to
  * describe the PMU on a particular POWER-family CPU.
@@ -28,7 +34,7 @@ struct power_pmu {
 	unsigned long	add_fields;
 	unsigned long	test_adder;
 	int		(*compute_mmcr)(u64 events[], int n_ev,
-				unsigned int hwc[], unsigned long mmcr[],
+				unsigned int hwc[], struct mmcr_regs *mmcr,
 				struct perf_event *pevents[]);
 	int		(*get_constraint)(u64 event_id, unsigned long *mskp,
 				unsigned long *valp);
@@ -41,7 +47,7 @@ struct power_pmu {
 	unsigned long	group_constraint_val;
 	u64             (*bhrb_filter_map)(u64 branch_sample_type);
 	void            (*config_bhrb)(u64 pmu_bhrb_filter);
-	void		(*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
+	void		(*disable_pmc)(unsigned int pmc, struct mmcr_regs *mmcr);
 	int		(*limited_pmc_event)(u64 event_id);
 	u32		flags;
 	const struct attribute_group	**attr_groups;
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index cd6a742..18b1b6a 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -37,12 +37,7 @@ struct cpu_hw_events {
 	struct perf_event *event[MAX_HWEVENTS];
 	u64 events[MAX_HWEVENTS];
 	unsigned int flags[MAX_HWEVENTS];
-	/*
-	 * The order of the MMCR array is:
-	 *  - 64-bit, MMCR0, MMCR1, MMCRA, MMCR2
-	 *  - 32-bit, MMCR0, MMCR1, MMCR2
-	 */
-	unsigned long mmcr[4];
+	struct mmcr_regs mmcr;
 	struct perf_event *limited_counter[MAX_LIMITED_HWCOUNTERS];
 	u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
 	u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
@@ -121,7 +116,7 @@ static void ebb_event_add(struct perf_event *event) { }
 static void ebb_switch_out(unsigned long mmcr0) { }
 static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)
 {
-	return cpuhw->mmcr[0];
+	return cpuhw->mmcr.mmcr0;
 }
 
 static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
@@ -590,7 +585,7 @@ static void ebb_switch_out(unsigned long mmcr0)
 
 static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)
 {
-	unsigned long mmcr0 = cpuhw->mmcr[0];
+	unsigned long mmcr0 = cpuhw->mmcr.mmcr0;
 
 	if (!ebb)
 		goto out;
@@ -624,7 +619,7 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)
 	 * unfreeze counters, it should not set exclude_xxx in its events and
 	 * instead manage the MMCR2 entirely by itself.
 	 */
-	mtspr(SPRN_MMCR2, cpuhw->mmcr[3] | current->thread.mmcr2);
+	mtspr(SPRN_MMCR2, cpuhw->mmcr.mmcr2 | current->thread.mmcr2);
 out:
 	return mmcr0;
 }
@@ -1232,9 +1227,9 @@ static void power_pmu_disable(struct pmu *pmu)
 		/*
 		 * Disable instruction sampling if it was enabled
 		 */
-		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+		if (cpuhw->mmcr.mmcra & MMCRA_SAMPLE_ENABLE) {
 			mtspr(SPRN_MMCRA,
-			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+			      cpuhw->mmcr.mmcra & ~MMCRA_SAMPLE_ENABLE);
 			mb();
 			isync();
 		}
@@ -1308,18 +1303,18 @@ static void power_pmu_enable(struct pmu *pmu)
 	 * (possibly updated for removal of events).
 	 */
 	if (!cpuhw->n_added) {
-		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
-		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+		mtspr(SPRN_MMCRA, cpuhw->mmcr.mmcra & ~MMCRA_SAMPLE_ENABLE);
+		mtspr(SPRN_MMCR1, cpuhw->mmcr.mmcr1);
 		goto out_enable;
 	}
 
 	/*
 	 * Clear all MMCR settings and recompute them for the new set of events.
 	 */
-	memset(cpuhw->mmcr, 0, sizeof(cpuhw->mmcr));
+	memset(&cpuhw->mmcr, 0, sizeof(cpuhw->mmcr));
 
 	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index,
-			       cpuhw->mmcr, cpuhw->event)) {
+			       &cpuhw->mmcr, cpuhw->event)) {
 		/* shouldn't ever get here */
 		printk(KERN_ERR "oops compute_mmcr failed\n");
 		goto out;
@@ -1333,11 +1328,11 @@ static void power_pmu_enable(struct pmu *pmu)
 		 */
 		event = cpuhw->event[0];
 		if (event->attr.exclude_user)
-			cpuhw->mmcr[0] |= MMCR0_FCP;
+			cpuhw->mmcr.mmcr0 |= MMCR0_FCP;
 		if (event->attr.exclude_kernel)
-			cpuhw->mmcr[0] |= freeze_events_kernel;
+			cpuhw->mmcr.mmcr0 |= freeze_events_kernel;
 		if (event->attr.exclude_hv)
-			cpuhw->mmcr[0] |= MMCR0_FCHV;
+			cpuhw->mmcr.mmcr0 |= MMCR0_FCHV;
 	}
 
 	/*
@@ -1346,12 +1341,12 @@ static void power_pmu_enable(struct pmu *pmu)
 	 * Then unfreeze the events.
 	 */
 	ppc_set_pmu_inuse(1);
-	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
-	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
-	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
+	mtspr(SPRN_MMCRA, cpuhw->mmcr.mmcra & ~MMCRA_SAMPLE_ENABLE);
+	mtspr(SPRN_MMCR1, cpuhw->mmcr.mmcr1);
+	mtspr(SPRN_MMCR0, (cpuhw->mmcr.mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
 				| MMCR0_FC);
 	if (ppmu->flags & PPMU_ARCH_207S)
-		mtspr(SPRN_MMCR2, cpuhw->mmcr[3]);
+		mtspr(SPRN_MMCR2, cpuhw->mmcr.mmcr2);
 
 	/*
 	 * Read off any pre-existing events that need to move
@@ -1402,7 +1397,7 @@ static void power_pmu_enable(struct pmu *pmu)
 		perf_event_update_userpage(event);
 	}
 	cpuhw->n_limited = n_lim;
-	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
+	cpuhw->mmcr.mmcr0 |= MMCR0_PMXE | MMCR0_FCECE;
 
  out_enable:
 	pmao_restore_workaround(ebb);
@@ -1418,9 +1413,9 @@ static void power_pmu_enable(struct pmu *pmu)
 	/*
 	 * Enable instruction sampling if necessary
 	 */
-	if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+	if (cpuhw->mmcr.mmcra & MMCRA_SAMPLE_ENABLE) {
 		mb();
-		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+		mtspr(SPRN_MMCRA, cpuhw->mmcr.mmcra);
 	}
 
  out:
@@ -1550,7 +1545,7 @@ static void power_pmu_del(struct perf_event *event, int ef_flags)
 				cpuhw->flags[i-1] = cpuhw->flags[i];
 			}
 			--cpuhw->n_events;
-			ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr);
+			ppmu->disable_pmc(event->hw.idx - 1, &cpuhw->mmcr);
 			if (event->hw.idx) {
 				write_pmc(event->hw.idx, 0);
 				event->hw.idx = 0;
@@ -1571,7 +1566,7 @@ static void power_pmu_del(struct perf_event *event, int ef_flags)
 	}
 	if (cpuhw->n_events == 0) {
 		/* disable exceptions if no events are running */
-		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
+		cpuhw->mmcr.mmcr0 &= ~(MMCR0_PMXE | MMCR0_FCECE);
 	}
 
 	if (has_branch_stack(event))
@@ -2240,7 +2235,7 @@ static void __perf_event_interrupt(struct pt_regs *regs)
 	 * XXX might want to use MSR.PM to keep the events frozen until
 	 * we get back out of this interrupt.
 	 */
-	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+	write_mmcr0(cpuhw, cpuhw->mmcr.mmcr0);
 
 	if (nmi)
 		nmi_exit();
@@ -2262,7 +2257,7 @@ static int power_pmu_prepare_cpu(unsigned int cpu)
 
 	if (ppmu) {
 		memset(cpuhw, 0, sizeof(*cpuhw));
-		cpuhw->mmcr[0] = MMCR0_FC;
+		cpuhw->mmcr.mmcr0 = MMCR0_FC;
 	}
 	return 0;
 }
diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c
index 4c86da5..2fe63f2 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -363,7 +363,7 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
 }
 
 int isa207_compute_mmcr(u64 event[], int n_ev,
-			       unsigned int hwc[], unsigned long mmcr[],
+			       unsigned int hwc[], struct mmcr_regs *mmcr,
 			       struct perf_event *pevents[])
 {
 	unsigned long mmcra, mmcr1, mmcr2, unit, combine, psel, cache, val;
@@ -464,30 +464,30 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
 	}
 
 	/* Return MMCRx values */
-	mmcr[0] = 0;
+	mmcr->mmcr0 = 0;
 
 	/* pmc_inuse is 1-based */
 	if (pmc_inuse & 2)
-		mmcr[0] = MMCR0_PMC1CE;
+		mmcr->mmcr0 = MMCR0_PMC1CE;
 
 	if (pmc_inuse & 0x7c)
-		mmcr[0] |= MMCR0_PMCjCE;
+		mmcr->mmcr0 |= MMCR0_PMCjCE;
 
 	/* If we're not using PMC 5 or 6, freeze them */
 	if (!(pmc_inuse & 0x60))
-		mmcr[0] |= MMCR0_FC56;
+		mmcr->mmcr0 |= MMCR0_FC56;
 
-	mmcr[1] = mmcr1;
-	mmcr[2] = mmcra;
-	mmcr[3] = mmcr2;
+	mmcr->mmcr1 = mmcr1;
+	mmcr->mmcra = mmcra;
+	mmcr->mmcr2 = mmcr2;
 
 	return 0;
 }
 
-void isa207_disable_pmc(unsigned int pmc, unsigned long mmcr[])
+void isa207_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr)
 {
 	if (pmc <= 3)
-		mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SHIFT(pmc + 1));
+		mmcr->mmcr1 &= ~(0xffUL << MMCR1_PMCSEL_SHIFT(pmc + 1));
 }
 
 static int find_alternative(u64 event, const unsigned int ev_alt[][MAX_ALT], int size)
diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h
index 63fd4f3..df968fd 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -217,9 +217,9 @@
 
 int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp);
 int isa207_compute_mmcr(u64 event[], int n_ev,
-				unsigned int hwc[], unsigned long mmcr[],
+				unsigned int hwc[], struct mmcr_regs *mmcr,
 				struct perf_event *pevents[]);
-void isa207_disable_pmc(unsigned int pmc, unsigned long mmcr[]);
+void isa207_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr);
 int isa207_get_alternatives(u64 event, u64 alt[], int size, unsigned int flags,
 					const unsigned int ev_alt[][MAX_ALT]);
 void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
diff --git a/arch/powerpc/perf/mpc7450-pmu.c b/arch/powerpc/perf/mpc7450-pmu.c
index 4d5ef92..826de25 100644
--- a/arch/powerpc/perf/mpc7450-pmu.c
+++ b/arch/powerpc/perf/mpc7450-pmu.c
@@ -257,7 +257,7 @@ static int mpc7450_get_alternatives(u64 event, unsigned int flags, u64 alt[])
  * Compute MMCR0/1/2 values for a set of events.
  */
 static int mpc7450_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[],
-				unsigned long mmcr[],
+				struct mmcr_regs *mmcr,
 				struct perf_event *pevents[])
 {
 	u8 event_index[N_CLASSES][N_COUNTER];
@@ -321,9 +321,16 @@ static int mpc7450_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[],
 		mmcr0 |= MMCR0_PMCnCE;
 
 	/* Return MMCRx values */
-	mmcr[0] = mmcr0;
-	mmcr[1] = mmcr1;
-	mmcr[2] = mmcr2;
+	mmcr->mmcr0 = mmcr0;
+	mmcr->mmcr1 = mmcr1;
+	mmcr->mmcr2 = mmcr2;
+	/*
+	 * 32-bit doesn't have an MMCRA and uses SPRN_MMCR2 to define
+	 * SPRN_MMCRA. So assign mmcra of cpu_hw_events with `mmcr2`
+	 * value to ensure that any write to this SPRN_MMCRA will
+	 * use mmcr2 value.
+	 */
+	mmcr->mmcra = mmcr2;
 	return 0;
 }
 
@@ -331,12 +338,12 @@ static int mpc7450_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[],
  * Disable counting by a PMC.
  * Note that the pmc argument is 0-based here, not 1-based.
  */
-static void mpc7450_disable_pmc(unsigned int pmc, unsigned long mmcr[])
+static void mpc7450_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr)
 {
 	if (pmc <= 1)
-		mmcr[0] &= ~(pmcsel_mask[pmc] << pmcsel_shift[pmc]);
+		mmcr->mmcr0 &= ~(pmcsel_mask[pmc] << pmcsel_shift[pmc]);
 	else
-		mmcr[1] &= ~(pmcsel_mask[pmc] << pmcsel_shift[pmc]);
+		mmcr->mmcr1 &= ~(pmcsel_mask[pmc] << pmcsel_shift[pmc]);
 }
 
 static int mpc7450_generic_events[] = {
diff --git a/arch/powerpc/perf/power5+-pmu.c b/arch/powerpc/perf/power5+-pmu.c
index f857454..5f0821e 100644
--- a/arch/powerpc/perf/power5+-pmu.c
+++ b/arch/powerpc/perf/power5+-pmu.c
@@ -448,7 +448,8 @@ static int power5p_marked_instr_event(u64 event)
 }
 
 static int power5p_compute_mmcr(u64 event[], int n_ev,
-				unsigned int hwc[], unsigned long mmcr[], struct perf_event *pevents[])
+				unsigned int hwc[], struct mmcr_regs *mmcr,
+				struct perf_event *pevents[])
 {
 	unsigned long mmcr1 = 0;
 	unsigned long mmcra = 0;
@@ -586,20 +587,20 @@ static int power5p_compute_mmcr(u64 event[], int n_ev,
 	}
 
 	/* Return MMCRx values */
-	mmcr[0] = 0;
+	mmcr->mmcr0 = 0;
 	if (pmc_inuse & 1)
-		mmcr[0] = MMCR0_PMC1CE;
+		mmcr->mmcr0 = MMCR0_PMC1CE;
 	if (pmc_inuse & 0x3e)
-		mmcr[0] |= MMCR0_PMCjCE;
-	mmcr[1] = mmcr1;
-	mmcr[2] = mmcra;
+		mmcr->mmcr0 |= MMCR0_PMCjCE;
+	mmcr->mmcr1 = mmcr1;
+	mmcr->mmcra = mmcra;
 	return 0;
 }
 
-static void power5p_disable_pmc(unsigned int pmc, unsigned long mmcr[])
+static void power5p_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr)
 {
 	if (pmc <= 3)
-		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+		mmcr->mmcr1 &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
 }
 
 static int power5p_generic_events[] = {
diff --git a/arch/powerpc/perf/power5-pmu.c b/arch/powerpc/perf/power5-pmu.c
index da52eca..426021d 100644
--- a/arch/powerpc/perf/power5-pmu.c
+++ b/arch/powerpc/perf/power5-pmu.c
@@ -379,7 +379,8 @@ static int power5_marked_instr_event(u64 event)
 }
 
 static int power5_compute_mmcr(u64 event[], int n_ev,
-			       unsigned int hwc[], unsigned long mmcr[], struct perf_event *pevents[])
+			       unsigned int hwc[], struct mmcr_regs *mmcr,
+			       struct perf_event *pevents[])
 {
 	unsigned long mmcr1 = 0;
 	unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
@@ -528,20 +529,20 @@ static int power5_compute_mmcr(u64 event[], int n_ev,
 	}
 
 	/* Return MMCRx values */
-	mmcr[0] = 0;
+	mmcr->mmcr0 = 0;
 	if (pmc_inuse & 1)
-		mmcr[0] = MMCR0_PMC1CE;
+		mmcr->mmcr0 = MMCR0_PMC1CE;
 	if (pmc_inuse & 0x3e)
-		mmcr[0] |= MMCR0_PMCjCE;
-	mmcr[1] = mmcr1;
-	mmcr[2] = mmcra;
+		mmcr->mmcr0 |= MMCR0_PMCjCE;
+	mmcr->mmcr1 = mmcr1;
+	mmcr->mmcra = mmcra;
 	return 0;
 }
 
-static void power5_disable_pmc(unsigned int pmc, unsigned long mmcr[])
+static void power5_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr)
 {
 	if (pmc <= 3)
-		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+		mmcr->mmcr1 &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
 }
 
 static int power5_generic_events[] = {
diff --git a/arch/powerpc/perf/power6-pmu.c b/arch/powerpc/perf/power6-pmu.c
index 3929cac..e343a51 100644
--- a/arch/powerpc/perf/power6-pmu.c
+++ b/arch/powerpc/perf/power6-pmu.c
@@ -171,7 +171,7 @@ static int power6_marked_instr_event(u64 event)
  * Assign PMC numbers and compute MMCR1 value for a set of events
  */
 static int p6_compute_mmcr(u64 event[], int n_ev,
-			   unsigned int hwc[], unsigned long mmcr[], struct perf_event *pevents[])
+			   unsigned int hwc[], struct mmcr_regs *mmcr, struct perf_event *pevents[])
 {
 	unsigned long mmcr1 = 0;
 	unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
@@ -243,13 +243,13 @@ static int p6_compute_mmcr(u64 event[], int n_ev,
 		if (pmc < 4)
 			mmcr1 |= (unsigned long)psel << MMCR1_PMCSEL_SH(pmc);
 	}
-	mmcr[0] = 0;
+	mmcr->mmcr0 = 0;
 	if (pmc_inuse & 1)
-		mmcr[0] = MMCR0_PMC1CE;
+		mmcr->mmcr0 = MMCR0_PMC1CE;
 	if (pmc_inuse & 0xe)
-		mmcr[0] |= MMCR0_PMCjCE;
-	mmcr[1] = mmcr1;
-	mmcr[2] = mmcra;
+		mmcr->mmcr0 |= MMCR0_PMCjCE;
+	mmcr->mmcr1 = mmcr1;
+	mmcr->mmcra = mmcra;
 	return 0;
 }
 
@@ -457,11 +457,11 @@ static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 	return nalt;
 }
 
-static void p6_disable_pmc(unsigned int pmc, unsigned long mmcr[])
+static void p6_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr)
 {
 	/* Set PMCxSEL to 0 to disable PMCx */
 	if (pmc <= 3)
-		mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
+		mmcr->mmcr1 &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
 }
 
 static int power6_generic_events[] = {
diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c
index a137813..3152336 100644
--- a/arch/powerpc/perf/power7-pmu.c
+++ b/arch/powerpc/perf/power7-pmu.c
@@ -242,7 +242,8 @@ static int power7_marked_instr_event(u64 event)
 }
 
 static int power7_compute_mmcr(u64 event[], int n_ev,
-			       unsigned int hwc[], unsigned long mmcr[], struct perf_event *pevents[])
+			       unsigned int hwc[], struct mmcr_regs *mmcr,
+			       struct perf_event *pevents[])
 {
 	unsigned long mmcr1 = 0;
 	unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
@@ -298,20 +299,20 @@ static int power7_compute_mmcr(u64 event[], int n_ev,
 	}
 
 	/* Return MMCRx values */
-	mmcr[0] = 0;
+	mmcr->mmcr0 = 0;
 	if (pmc_inuse & 1)
-		mmcr[0] = MMCR0_PMC1CE;
+		mmcr->mmcr0 = MMCR0_PMC1CE;
 	if (pmc_inuse & 0x3e)
-		mmcr[0] |= MMCR0_PMCjCE;
-	mmcr[1] = mmcr1;
-	mmcr[2] = mmcra;
+		mmcr->mmcr0 |= MMCR0_PMCjCE;
+	mmcr->mmcr1 = mmcr1;
+	mmcr->mmcra = mmcra;
 	return 0;
 }
 
-static void power7_disable_pmc(unsigned int pmc, unsigned long mmcr[])
+static void power7_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr)
 {
 	if (pmc <= 3)
-		mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
+		mmcr->mmcr1 &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
 }
 
 static int power7_generic_events[] = {
diff --git a/arch/powerpc/perf/ppc970-pmu.c b/arch/powerpc/perf/ppc970-pmu.c
index 4035d93..89a90ab 100644
--- a/arch/powerpc/perf/ppc970-pmu.c
+++ b/arch/powerpc/perf/ppc970-pmu.c
@@ -253,7 +253,8 @@ static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 }
 
 static int p970_compute_mmcr(u64 event[], int n_ev,
-			     unsigned int hwc[], unsigned long mmcr[], struct perf_event *pevents[])
+			     unsigned int hwc[], struct mmcr_regs *mmcr,
+			     struct perf_event *pevents[])
 {
 	unsigned long mmcr0 = 0, mmcr1 = 0, mmcra = 0;
 	unsigned int pmc, unit, byte, psel;
@@ -393,27 +394,26 @@ static int p970_compute_mmcr(u64 event[], int n_ev,
 	mmcra |= 0x2000;	/* mark only one IOP per PPC instruction */
 
 	/* Return MMCRx values */
-	mmcr[0] = mmcr0;
-	mmcr[1] = mmcr1;
-	mmcr[2] = mmcra;
+	mmcr->mmcr0 = mmcr0;
+	mmcr->mmcr1 = mmcr1;
+	mmcr->mmcra = mmcra;
 	return 0;
 }
 
-static void p970_disable_pmc(unsigned int pmc, unsigned long mmcr[])
+static void p970_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr)
 {
-	int shift, i;
+	int shift;
 
+	/*
+	 * Setting the PMCxSEL field to 0x08 disables PMC x.
+	 */
 	if (pmc <= 1) {
 		shift = MMCR0_PMC1SEL_SH - 7 * pmc;
-		i = 0;
+		mmcr->mmcr0 = (mmcr->mmcr0 & ~(0x1fUL << shift)) | (0x08UL << shift);
 	} else {
 		shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
-		i = 1;
+		mmcr->mmcr1 = (mmcr->mmcr1 & ~(0x1fUL << shift)) | (0x08UL << shift);
 	}
-	/*
-	 * Setting the PMCxSEL field to 0x08 disables PMC x.
-	 */
-	mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
 }
 
 static int ppc970_generic_events[] = {
-- 
1.8.3.1


^ permalink raw reply related

* [v3 02/15] KVM: PPC: Book3S HV: Cleanup updates for kvm vcpu MMCR
From: Athira Rajeev @ 2020-07-17 14:38 UTC (permalink / raw)
  To: mpe; +Cc: ego, mikey, maddy, kvm, kvm-ppc, svaidyan, acme, jolsa,
	linuxppc-dev
In-Reply-To: <1594996707-3727-1-git-send-email-atrajeev@linux.vnet.ibm.com>

Currently `kvm_vcpu_arch` stores all Monitor Mode Control registers
in a flat array in order: mmcr0, mmcr1, mmcra, mmcr2, mmcrs
Split this to give mmcra and mmcrs its own entries in vcpu and
use a flat array for mmcr0 to mmcr2. This patch implements this
cleanup to make code easier to read.

Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/kvm_host.h       |  4 +++-
 arch/powerpc/include/uapi/asm/kvm.h       |  4 ++--
 arch/powerpc/kernel/asm-offsets.c         |  2 ++
 arch/powerpc/kvm/book3s_hv.c              | 16 ++++++++++++++--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 12 ++++++------
 tools/arch/powerpc/include/uapi/asm/kvm.h |  4 ++--
 6 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 7e2d061..fc115e2 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -637,7 +637,9 @@ struct kvm_vcpu_arch {
 	u32 ccr1;
 	u32 dbsr;
 
-	u64 mmcr[5];
+	u64 mmcr[3];
+	u64 mmcra;
+	u64 mmcrs;
 	u32 pmc[8];
 	u32 spmc[2];
 	u64 siar;
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 264e266..e55d847 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -510,8 +510,8 @@ struct kvm_ppc_cpu_char {
 
 #define KVM_REG_PPC_MMCR0	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10)
 #define KVM_REG_PPC_MMCR1	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11)
-#define KVM_REG_PPC_MMCRA	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
-#define KVM_REG_PPC_MMCR2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
+#define KVM_REG_PPC_MMCR2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
+#define KVM_REG_PPC_MMCRA	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
 #define KVM_REG_PPC_MMCRS	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x14)
 #define KVM_REG_PPC_SIAR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x15)
 #define KVM_REG_PPC_SDAR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x16)
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 6657dc6..6fa4853 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -559,6 +559,8 @@ int main(void)
 	OFFSET(VCPU_IRQ_PENDING, kvm_vcpu, arch.irq_pending);
 	OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request);
 	OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr);
+	OFFSET(VCPU_MMCRA, kvm_vcpu, arch.mmcra);
+	OFFSET(VCPU_MMCRS, kvm_vcpu, arch.mmcrs);
 	OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc);
 	OFFSET(VCPU_SPMC, kvm_vcpu, arch.spmc);
 	OFFSET(VCPU_SIAR, kvm_vcpu, arch.siar);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6bf66649..3f90eee 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1679,10 +1679,16 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_UAMOR:
 		*val = get_reg_val(id, vcpu->arch.uamor);
 		break;
-	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
+	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR2:
 		i = id - KVM_REG_PPC_MMCR0;
 		*val = get_reg_val(id, vcpu->arch.mmcr[i]);
 		break;
+	case KVM_REG_PPC_MMCRA:
+		*val = get_reg_val(id, vcpu->arch.mmcra);
+		break;
+	case KVM_REG_PPC_MMCRS:
+		*val = get_reg_val(id, vcpu->arch.mmcrs);
+		break;
 	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
 		i = id - KVM_REG_PPC_PMC1;
 		*val = get_reg_val(id, vcpu->arch.pmc[i]);
@@ -1900,10 +1906,16 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_UAMOR:
 		vcpu->arch.uamor = set_reg_val(id, *val);
 		break;
-	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
+	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR2:
 		i = id - KVM_REG_PPC_MMCR0;
 		vcpu->arch.mmcr[i] = set_reg_val(id, *val);
 		break;
+	case KVM_REG_PPC_MMCRA:
+		vcpu->arch.mmcra = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MMCRS:
+		vcpu->arch.mmcrs = set_reg_val(id, *val);
+		break;
 	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
 		i = id - KVM_REG_PPC_PMC1;
 		vcpu->arch.pmc[i] = set_reg_val(id, *val);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 7194389..702eaa2 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -3428,7 +3428,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
 	mtspr	SPRN_PMC6, r9
 	ld	r3, VCPU_MMCR(r4)
 	ld	r5, VCPU_MMCR + 8(r4)
-	ld	r6, VCPU_MMCR + 16(r4)
+	ld	r6, VCPU_MMCRA(r4)
 	ld	r7, VCPU_SIAR(r4)
 	ld	r8, VCPU_SDAR(r4)
 	mtspr	SPRN_MMCR1, r5
@@ -3436,14 +3436,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
 	mtspr	SPRN_SIAR, r7
 	mtspr	SPRN_SDAR, r8
 BEGIN_FTR_SECTION
-	ld	r5, VCPU_MMCR + 24(r4)
+	ld	r5, VCPU_MMCR + 16(r4)
 	ld	r6, VCPU_SIER(r4)
 	mtspr	SPRN_MMCR2, r5
 	mtspr	SPRN_SIER, r6
 BEGIN_FTR_SECTION_NESTED(96)
 	lwz	r7, VCPU_PMC + 24(r4)
 	lwz	r8, VCPU_PMC + 28(r4)
-	ld	r9, VCPU_MMCR + 32(r4)
+	ld	r9, VCPU_MMCRS(r4)
 	mtspr	SPRN_SPMC1, r7
 	mtspr	SPRN_SPMC2, r8
 	mtspr	SPRN_MMCRS, r9
@@ -3551,9 +3551,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	mfspr	r8, SPRN_SDAR
 	std	r4, VCPU_MMCR(r9)
 	std	r5, VCPU_MMCR + 8(r9)
-	std	r6, VCPU_MMCR + 16(r9)
+	std	r6, VCPU_MMCRA(r9)
 BEGIN_FTR_SECTION
-	std	r10, VCPU_MMCR + 24(r9)
+	std	r10, VCPU_MMCR + 16(r9)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	std	r7, VCPU_SIAR(r9)
 	std	r8, VCPU_SDAR(r9)
@@ -3578,7 +3578,7 @@ BEGIN_FTR_SECTION_NESTED(96)
 	mfspr	r8, SPRN_MMCRS
 	stw	r6, VCPU_PMC + 24(r9)
 	stw	r7, VCPU_PMC + 28(r9)
-	std	r8, VCPU_MMCR + 32(r9)
+	std	r8, VCPU_MMCRS(r9)
 	lis	r4, 0x8000
 	mtspr	SPRN_MMCRS, r4
 END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h
index 264e266..e55d847 100644
--- a/tools/arch/powerpc/include/uapi/asm/kvm.h
+++ b/tools/arch/powerpc/include/uapi/asm/kvm.h
@@ -510,8 +510,8 @@ struct kvm_ppc_cpu_char {
 
 #define KVM_REG_PPC_MMCR0	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10)
 #define KVM_REG_PPC_MMCR1	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11)
-#define KVM_REG_PPC_MMCRA	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
-#define KVM_REG_PPC_MMCR2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
+#define KVM_REG_PPC_MMCR2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
+#define KVM_REG_PPC_MMCRA	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
 #define KVM_REG_PPC_MMCRS	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x14)
 #define KVM_REG_PPC_SIAR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x15)
 #define KVM_REG_PPC_SDAR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x16)
-- 
1.8.3.1


^ permalink raw reply related

* [v3 03/15] powerpc/perf: Update Power PMU cache_events to u64 type
From: Athira Rajeev @ 2020-07-17 14:38 UTC (permalink / raw)
  To: mpe; +Cc: ego, mikey, maddy, kvm, kvm-ppc, svaidyan, acme, jolsa,
	linuxppc-dev
In-Reply-To: <1594996707-3727-1-git-send-email-atrajeev@linux.vnet.ibm.com>

Events of type PERF_TYPE_HW_CACHE was described for Power PMU
as: int (*cache_events)[type][op][result];

where type, op, result values unpacked from the event attribute config
value is used to generate the raw event code at runtime.

So far the event code values which used to create these cache-related
events were within 32 bit and `int` type worked. In power10,
some of the event codes are of 64-bit value and hence update the
Power PMU cache_events to `u64` type in `power_pmu` struct.
Also propagate this change to existing all PMU driver code paths
which are using ppmu->cache_events.

Signed-off-by: Athira Rajeev<atrajeev@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/perf_event_server.h | 2 +-
 arch/powerpc/perf/core-book3s.c              | 2 +-
 arch/powerpc/perf/generic-compat-pmu.c       | 2 +-
 arch/powerpc/perf/mpc7450-pmu.c              | 2 +-
 arch/powerpc/perf/power5+-pmu.c              | 2 +-
 arch/powerpc/perf/power5-pmu.c               | 2 +-
 arch/powerpc/perf/power6-pmu.c               | 2 +-
 arch/powerpc/perf/power7-pmu.c               | 2 +-
 arch/powerpc/perf/power8-pmu.c               | 2 +-
 arch/powerpc/perf/power9-pmu.c               | 2 +-
 arch/powerpc/perf/ppc970-pmu.c               | 2 +-
 11 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index f9a3668..14b8dc1 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -53,7 +53,7 @@ struct power_pmu {
 	const struct attribute_group	**attr_groups;
 	int		n_generic;
 	int		*generic_events;
-	int		(*cache_events)[PERF_COUNT_HW_CACHE_MAX]
+	u64		(*cache_events)[PERF_COUNT_HW_CACHE_MAX]
 			       [PERF_COUNT_HW_CACHE_OP_MAX]
 			       [PERF_COUNT_HW_CACHE_RESULT_MAX];
 
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 18b1b6a..f4d07b5 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -1790,7 +1790,7 @@ static void hw_perf_event_destroy(struct perf_event *event)
 static int hw_perf_cache_event(u64 config, u64 *eventp)
 {
 	unsigned long type, op, result;
-	int ev;
+	u64 ev;
 
 	if (!ppmu->cache_events)
 		return -EINVAL;
diff --git a/arch/powerpc/perf/generic-compat-pmu.c b/arch/powerpc/perf/generic-compat-pmu.c
index 5e5a54d..eb8a6aaf 100644
--- a/arch/powerpc/perf/generic-compat-pmu.c
+++ b/arch/powerpc/perf/generic-compat-pmu.c
@@ -101,7 +101,7 @@ enum {
  * 0 means not supported, -1 means nonsensical, other values
  * are event codes.
  */
-static int generic_compat_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+static u64 generic_compat_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	[ C(L1D) ] = {
 		[ C(OP_READ) ] = {
 			[ C(RESULT_ACCESS) ] = 0,
diff --git a/arch/powerpc/perf/mpc7450-pmu.c b/arch/powerpc/perf/mpc7450-pmu.c
index 826de25..1919e9d 100644
--- a/arch/powerpc/perf/mpc7450-pmu.c
+++ b/arch/powerpc/perf/mpc7450-pmu.c
@@ -361,7 +361,7 @@ static void mpc7450_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr)
  * 0 means not supported, -1 means nonsensical, other values
  * are event codes.
  */
-static int mpc7450_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+static u64 mpc7450_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0x225	},
 		[C(OP_WRITE)] = {	0,		0x227	},
diff --git a/arch/powerpc/perf/power5+-pmu.c b/arch/powerpc/perf/power5+-pmu.c
index 5f0821e..a62b2cd 100644
--- a/arch/powerpc/perf/power5+-pmu.c
+++ b/arch/powerpc/perf/power5+-pmu.c
@@ -619,7 +619,7 @@ static void power5p_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr)
  * 0 means not supported, -1 means nonsensical, other values
  * are event codes.
  */
-static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+static u64 power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0x1c10a8,	0x3c1088	},
 		[C(OP_WRITE)] = {	0x2c10a8,	0xc10c3		},
diff --git a/arch/powerpc/perf/power5-pmu.c b/arch/powerpc/perf/power5-pmu.c
index 426021d..8732b58 100644
--- a/arch/powerpc/perf/power5-pmu.c
+++ b/arch/powerpc/perf/power5-pmu.c
@@ -561,7 +561,7 @@ static void power5_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr)
  * 0 means not supported, -1 means nonsensical, other values
  * are event codes.
  */
-static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+static u64 power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0x4c1090,	0x3c1088	},
 		[C(OP_WRITE)] = {	0x3c1090,	0xc10c3		},
diff --git a/arch/powerpc/perf/power6-pmu.c b/arch/powerpc/perf/power6-pmu.c
index e343a51..0e318cf 100644
--- a/arch/powerpc/perf/power6-pmu.c
+++ b/arch/powerpc/perf/power6-pmu.c
@@ -481,7 +481,7 @@ static void p6_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr)
  * are event codes.
  * The "DTLB" and "ITLB" events relate to the DERAT and IERAT.
  */
-static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+static u64 power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0x280030,	0x80080		},
 		[C(OP_WRITE)] = {	0x180032,	0x80088		},
diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c
index 3152336..5e0bf09 100644
--- a/arch/powerpc/perf/power7-pmu.c
+++ b/arch/powerpc/perf/power7-pmu.c
@@ -333,7 +333,7 @@ static void power7_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr)
  * 0 means not supported, -1 means nonsensical, other values
  * are event codes.
  */
-static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+static u64 power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0xc880,		0x400f0	},
 		[C(OP_WRITE)] = {	0,		0x300f0	},
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index 3a5fcc2..5282e84 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -253,7 +253,7 @@ static void power8_config_bhrb(u64 pmu_bhrb_filter)
  * 0 means not supported, -1 means nonsensical, other values
  * are event codes.
  */
-static int power8_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+static u64 power8_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	[ C(L1D) ] = {
 		[ C(OP_READ) ] = {
 			[ C(RESULT_ACCESS) ] = PM_LD_REF_L1,
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 08c3ef7..05dae38 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -310,7 +310,7 @@ static void power9_config_bhrb(u64 pmu_bhrb_filter)
  * 0 means not supported, -1 means nonsensical, other values
  * are event codes.
  */
-static int power9_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+static u64 power9_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	[ C(L1D) ] = {
 		[ C(OP_READ) ] = {
 			[ C(RESULT_ACCESS) ] = PM_LD_REF_L1,
diff --git a/arch/powerpc/perf/ppc970-pmu.c b/arch/powerpc/perf/ppc970-pmu.c
index 89a90ab..d35223f 100644
--- a/arch/powerpc/perf/ppc970-pmu.c
+++ b/arch/powerpc/perf/ppc970-pmu.c
@@ -432,7 +432,7 @@ static void p970_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr)
  * 0 means not supported, -1 means nonsensical, other values
  * are event codes.
  */
-static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+static u64 ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0x8810,		0x3810	},
 		[C(OP_WRITE)] = {	0x7810,		0x813	},
-- 
1.8.3.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox