LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [v4 3/5] KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs.
From: Ram Pai @ 2020-07-17  8:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1594972827-13928-1-git-send-email-linuxram@us.ibm.com>

The Ultravisor is expected to explicitly call H_SVM_PAGE_IN for all the
pages of the SVM before calling H_SVM_INIT_DONE. This causes a huge
delay in tranistioning the VM to SVM. The Ultravisor is only interested
in the pages that contain the kernel, initrd and other important data
structures. The rest contain throw-away content.

However if not all pages are requested by the Ultravisor, the Hypervisor
continues to consider the GFNs corresponding to the non-requested pages
as normal GFNs. This can lead to data-corruption and undefined behavior.

In H_SVM_INIT_DONE handler, move all the PFNs associated with the SVM's
GFNs to secure-PFNs. Skip the GFNs that are already Paged-in or Shared
or Paged-in followed by a Paged-out.

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 Documentation/powerpc/ultravisor.rst        |   2 +
 arch/powerpc/include/asm/kvm_book3s_uvmem.h |   2 +
 arch/powerpc/kvm/book3s_hv_uvmem.c          | 135 +++++++++++++++++++++++++---
 3 files changed, 125 insertions(+), 14 deletions(-)

diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
index a1c8c37..ba6b1bf 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -934,6 +934,8 @@ Return values
 	* H_UNSUPPORTED		if called from the wrong context (e.g.
 				from an SVM or before an H_SVM_INIT_START
 				hypercall).
+	* H_STATE		if the hypervisor could not successfully
+                                transition the VM to Secure VM.
 
 Description
 ~~~~~~~~~~~
diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
index 9cb7d8b..f229ab5 100644
--- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h
+++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
@@ -23,6 +23,8 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
 unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm);
 void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 			     struct kvm *kvm, bool skip_page_out);
+int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+			const struct kvm_memory_slot *memslot);
 #else
 static inline int kvmppc_uvmem_init(void)
 {
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index df2e272..3274663 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -93,6 +93,7 @@
 #include <asm/ultravisor.h>
 #include <asm/mman.h>
 #include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s_uvmem.h>
 
 static struct dev_pagemap kvmppc_uvmem_pgmap;
 static unsigned long *kvmppc_uvmem_bitmap;
@@ -348,8 +349,45 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 	return false;
 }
 
+/*
+ * starting from *gfn search for the next available GFN that is not yet
+ * transitioned to a secure GFN.  return the value of that GFN in *gfn.  If a
+ * GFN is found, return true, else return false
+ */
+static bool kvmppc_next_nontransitioned_gfn(const struct kvm_memory_slot *memslot,
+		struct kvm *kvm, unsigned long *gfn)
+{
+	struct kvmppc_uvmem_slot *p;
+	bool ret = false;
+	unsigned long i;
+
+	mutex_lock(&kvm->arch.uvmem_lock);
+
+	list_for_each_entry(p, &kvm->arch.uvmem_pfns, list)
+		if (*gfn >= p->base_pfn && *gfn < p->base_pfn + p->nr_pfns)
+			break;
+	if (!p)
+		goto out;
+	/*
+	 * The code below assumes, one to one correspondence between
+	 * kvmppc_uvmem_slot and memslot.
+	 */
+	for (i = *gfn; i < p->base_pfn + p->nr_pfns; i++) {
+		unsigned long index = i - p->base_pfn;
+
+		if (!(p->pfns[index] & KVMPPC_GFN_FLAG_MASK)) {
+			*gfn = i;
+			ret = true;
+			break;
+		}
+	}
+out:
+	mutex_unlock(&kvm->arch.uvmem_lock);
+	return ret;
+}
+
 static int kvmppc_memslot_page_merge(struct kvm *kvm,
-		struct kvm_memory_slot *memslot, bool merge)
+		const struct kvm_memory_slot *memslot, bool merge)
 {
 	unsigned long gfn = memslot->base_gfn;
 	unsigned long end, start = gfn_to_hva(kvm, gfn);
@@ -461,12 +499,31 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 
 unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
 {
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int srcu_idx;
+	long ret = H_SUCCESS;
+
 	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
 		return H_UNSUPPORTED;
 
+	/* migrate any unmoved normal pfn to device pfns*/
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots) {
+		ret = kvmppc_uv_migrate_mem_slot(kvm, memslot);
+		if (ret) {
+			ret = H_STATE;
+			goto out;
+		}
+	}
+
 	kvm->arch.secure_guest |= KVMPPC_SECURE_INIT_DONE;
 	pr_info("LPID %d went secure\n", kvm->arch.lpid);
-	return H_SUCCESS;
+
+out:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return ret;
 }
 
 /*
@@ -587,12 +644,14 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
 }
 
 /*
- * Alloc a PFN from private device memory pool and copy page from normal
- * memory to secure memory using UV_PAGE_IN uvcall.
+ * Alloc a PFN from private device memory pool. If @pagein is true,
+ * copy page from normal memory to secure memory using UV_PAGE_IN uvcall.
  */
-static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
-		   unsigned long end, unsigned long gpa, struct kvm *kvm,
-		   unsigned long page_shift)
+static int kvmppc_svm_migrate_page(struct vm_area_struct *vma,
+		unsigned long start,
+		unsigned long end, unsigned long gpa, struct kvm *kvm,
+		unsigned long page_shift,
+		bool pagein)
 {
 	unsigned long src_pfn, dst_pfn = 0;
 	struct migrate_vma mig;
@@ -623,11 +682,16 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 		goto out_finalize;
 	}
 
-	pfn = *mig.src >> MIGRATE_PFN_SHIFT;
-	spage = migrate_pfn_to_page(*mig.src);
-	if (spage)
-		uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0,
-			   page_shift);
+	if (pagein) {
+		pfn = *mig.src >> MIGRATE_PFN_SHIFT;
+		spage = migrate_pfn_to_page(*mig.src);
+		if (spage) {
+			ret = uv_page_in(kvm->arch.lpid, pfn << page_shift,
+					gpa, 0, page_shift);
+			if (ret)
+				goto out_finalize;
+		}
+	}
 
 	*mig.dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
 	migrate_vma_pages(&mig);
@@ -636,6 +700,46 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 	return ret;
 }
 
+int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot)
+{
+	unsigned long gfn = memslot->base_gfn;
+	struct vm_area_struct *vma;
+	unsigned long start, end;
+	int ret = 0;
+
+	while (kvmppc_next_nontransitioned_gfn(memslot, kvm, &gfn)) {
+
+		mmap_read_lock(kvm->mm);
+		start = gfn_to_hva(kvm, gfn);
+		if (kvm_is_error_hva(start)) {
+			ret = H_STATE;
+			goto next;
+		}
+
+		end = start + (1UL << PAGE_SHIFT);
+		vma = find_vma_intersection(kvm->mm, start, end);
+		if (!vma || vma->vm_start > start || vma->vm_end < end) {
+			ret = H_STATE;
+			goto next;
+		}
+
+		mutex_lock(&kvm->arch.uvmem_lock);
+		ret = kvmppc_svm_migrate_page(vma, start, end,
+				(gfn << PAGE_SHIFT), kvm, PAGE_SHIFT, false);
+		mutex_unlock(&kvm->arch.uvmem_lock);
+		if (ret)
+			ret = H_STATE;
+
+next:
+		mmap_read_unlock(kvm->mm);
+
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
 /*
  * Shares the page with HV, thus making it a normal page.
  *
@@ -740,8 +844,11 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	if (!vma || vma->vm_start > start || vma->vm_end < end)
 		goto out_unlock;
 
-	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift))
-		ret = H_SUCCESS;
+	if (kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, page_shift,
+				true))
+		goto out_unlock;
+
+	ret = H_SUCCESS;
 
 out_unlock:
 	mutex_unlock(&kvm->arch.uvmem_lock);
-- 
1.8.3.1


^ permalink raw reply related

* [v4 1/5] KVM: PPC: Book3S HV: Disable page merging in H_SVM_INIT_START
From: Ram Pai @ 2020-07-17  8:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1594972827-13928-1-git-send-email-linuxram@us.ibm.com>

Page-merging of pages in memory-slots associated with a Secure VM,
is disabled in H_SVM_PAGE_IN handler.

This operation should have been done much earlier; the moment the VM
is initiated for secure-transition. Delaying this operation, increases
the probability for those pages to acquire new references , making it
impossible to migrate those pages.

Disable page-migration in H_SVM_INIT_START handling.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 Documentation/powerpc/ultravisor.rst |  1 +
 arch/powerpc/kvm/book3s_hv_uvmem.c   | 98 +++++++++++++++++++++++++++---------
 2 files changed, 76 insertions(+), 23 deletions(-)

diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
index df136c8..a1c8c37 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -895,6 +895,7 @@ Return values
     One of the following values:
 
 	* H_SUCCESS	 on success.
+        * H_STATE        if the VM is not in a position to switch to secure.
 
 Description
 ~~~~~~~~~~~
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index e6f76bc..0baa293 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -211,6 +211,65 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 	return false;
 }
 
+static int kvmppc_memslot_page_merge(struct kvm *kvm,
+		struct kvm_memory_slot *memslot, bool merge)
+{
+	unsigned long gfn = memslot->base_gfn;
+	unsigned long end, start = gfn_to_hva(kvm, gfn);
+	int ret = 0;
+	struct vm_area_struct *vma;
+	int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE;
+
+	if (kvm_is_error_hva(start))
+		return H_STATE;
+
+	end = start + (memslot->npages << PAGE_SHIFT);
+
+	mmap_write_lock(kvm->mm);
+	do {
+		vma = find_vma_intersection(kvm->mm, start, end);
+		if (!vma) {
+			ret = H_STATE;
+			break;
+		}
+		ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+			  merge_flag, &vma->vm_flags);
+		if (ret) {
+			ret = H_STATE;
+			break;
+		}
+		start = vma->vm_end + 1;
+	} while (end > vma->vm_end);
+
+	mmap_write_unlock(kvm->mm);
+	return ret;
+}
+
+static int __kvmppc_page_merge(struct kvm *kvm, bool merge)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int ret = 0;
+
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots) {
+		ret = kvmppc_memslot_page_merge(kvm, memslot, merge);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+static inline int kvmppc_disable_page_merge(struct kvm *kvm)
+{
+	return __kvmppc_page_merge(kvm, false);
+}
+
+static inline int kvmppc_enable_page_merge(struct kvm *kvm)
+{
+	return __kvmppc_page_merge(kvm, true);
+}
+
 unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 {
 	struct kvm_memslots *slots;
@@ -232,11 +291,18 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 		return H_AUTHORITY;
 
 	srcu_idx = srcu_read_lock(&kvm->srcu);
+
+	/* disable page-merging for all memslot */
+	ret = kvmppc_disable_page_merge(kvm);
+	if (ret)
+		goto out;
+
+	/* register the memslot */
 	slots = kvm_memslots(kvm);
 	kvm_for_each_memslot(memslot, slots) {
 		if (kvmppc_uvmem_slot_init(kvm, memslot)) {
 			ret = H_PARAMETER;
-			goto out;
+			break;
 		}
 		ret = uv_register_mem_slot(kvm->arch.lpid,
 					   memslot->base_gfn << PAGE_SHIFT,
@@ -245,9 +311,12 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 		if (ret < 0) {
 			kvmppc_uvmem_slot_free(kvm, memslot);
 			ret = H_PARAMETER;
-			goto out;
+			break;
 		}
 	}
+
+	if (ret)
+		kvmppc_enable_page_merge(kvm);
 out:
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	return ret;
@@ -384,7 +453,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
  */
 static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 		   unsigned long end, unsigned long gpa, struct kvm *kvm,
-		   unsigned long page_shift, bool *downgrade)
+		   unsigned long page_shift)
 {
 	unsigned long src_pfn, dst_pfn = 0;
 	struct migrate_vma mig;
@@ -400,18 +469,6 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 	mig.src = &src_pfn;
 	mig.dst = &dst_pfn;
 
-	/*
-	 * We come here with mmap_lock write lock held just for
-	 * ksm_madvise(), otherwise we only need read mmap_lock.
-	 * Hence downgrade to read lock once ksm_madvise() is done.
-	 */
-	ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
-			  MADV_UNMERGEABLE, &vma->vm_flags);
-	mmap_write_downgrade(kvm->mm);
-	*downgrade = true;
-	if (ret)
-		return ret;
-
 	ret = migrate_vma_setup(&mig);
 	if (ret)
 		return ret;
@@ -503,7 +560,6 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 		unsigned long flags,
 		unsigned long page_shift)
 {
-	bool downgrade = false;
 	unsigned long start, end;
 	struct vm_area_struct *vma;
 	int srcu_idx;
@@ -524,7 +580,7 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 
 	ret = H_PARAMETER;
 	srcu_idx = srcu_read_lock(&kvm->srcu);
-	mmap_write_lock(kvm->mm);
+	mmap_read_lock(kvm->mm);
 
 	start = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(start))
@@ -540,16 +596,12 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	if (!vma || vma->vm_start > start || vma->vm_end < end)
 		goto out_unlock;
 
-	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift,
-				&downgrade))
+	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift))
 		ret = H_SUCCESS;
 out_unlock:
 	mutex_unlock(&kvm->arch.uvmem_lock);
 out:
-	if (downgrade)
-		mmap_read_unlock(kvm->mm);
-	else
-		mmap_write_unlock(kvm->mm);
+	mmap_read_unlock(kvm->mm);
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	return ret;
 }
-- 
1.8.3.1


^ permalink raw reply related

* [v4 2/5] KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs
From: Ram Pai @ 2020-07-17  8:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1594972827-13928-1-git-send-email-linuxram@us.ibm.com>

During the life of SVM, its GFNs transition through normal, secure and
shared states. Since the kernel does not track GFNs that are shared, it
is not possible to disambiguate a shared GFN from a GFN whose PFN has
not yet been migrated to a secure-PFN. Also it is not possible to
disambiguate a secure-GFN from a GFN whose GFN has been pagedout from
the ultravisor.

The ability to identify the state of a GFN is needed to skip migration
of its PFN to secure-PFN during ESM transition.

The code is re-organized to track the states of a GFN as explained
below.

************************************************************************
 1. States of a GFN
    ---------------
 The GFN can be in one of the following states.

 (a) Secure - The GFN is secure. The GFN is associated with
 	a Secure VM, the contents of the GFN is not accessible
 	to the Hypervisor.  This GFN can be backed by a secure-PFN,
 	or can be backed by a normal-PFN with contents encrypted.
 	The former is true when the GFN is paged-in into the
 	ultravisor. The latter is true when the GFN is paged-out
 	of the ultravisor.

 (b) Shared - The GFN is shared. The GFN is associated with a
 	a secure VM. The contents of the GFN is accessible to
 	Hypervisor. This GFN is backed by a normal-PFN and its
 	content is un-encrypted.

 (c) Normal - The GFN is a normal. The GFN is associated with
 	a normal VM. The contents of the GFN is accesible to
 	the Hypervisor. Its content is never encrypted.

 2. States of a VM.
    ---------------

 (a) Normal VM:  A VM whose contents are always accessible to
 	the hypervisor.  All its GFNs are normal-GFNs.

 (b) Secure VM: A VM whose contents are not accessible to the
 	hypervisor without the VM's consent.  Its GFNs are
 	either Shared-GFN or Secure-GFNs.

 (c) Transient VM: A Normal VM that is transitioning to secure VM.
 	The transition starts on successful return of
 	H_SVM_INIT_START, and ends on successful return
 	of H_SVM_INIT_DONE. This transient VM, can have GFNs
 	in any of the three states; i.e Secure-GFN, Shared-GFN,
 	and Normal-GFN.	The VM never executes in this state
 	in supervisor-mode.

 3. Memory slot State.
    ------------------
  	The state of a memory slot mirrors the state of the
  	VM the memory slot is associated with.

 4. VM State transition.
    --------------------

  A VM always starts in Normal Mode.

  H_SVM_INIT_START moves the VM into transient state. During this
  time the Ultravisor may request some of its GFNs to be shared or
  secured. So its GFNs can be in one of the three GFN states.

  H_SVM_INIT_DONE moves the VM entirely from transient state to
  secure-state. At this point any left-over normal-GFNs are
  transitioned to Secure-GFN.

  H_SVM_INIT_ABORT moves the transient VM back to normal VM.
  All its GFNs are moved to Normal-GFNs.

  UV_TERMINATE transitions the secure-VM back to normal-VM. All
  the secure-GFN and shared-GFNs are tranistioned to normal-GFN
  Note: The contents of the normal-GFN is undefined at this point.

 5. GFN state implementation:
    -------------------------

 Secure GFN is associated with a secure-PFN; also called uvmem_pfn,
 when the GFN is paged-in. Its pfn[] has KVMPPC_GFN_UVMEM_PFN flag
 set, and contains the value of the secure-PFN.
 It is associated with a normal-PFN; also called mem_pfn, when
 the GFN is pagedout. Its pfn[] has KVMPPC_GFN_MEM_PFN flag set.
 The value of the normal-PFN is not tracked.

 Shared GFN is associated with a normal-PFN. Its pfn[] has
 KVMPPC_UVMEM_SHARED_PFN flag set. The value of the normal-PFN
 is not tracked.

 Normal GFN is associated with normal-PFN. Its pfn[] has
 no flag set. The value of the normal-PFN is not tracked.

 6. Life cycle of a GFN
    --------------------
 --------------------------------------------------------------
 |        |     Share  |  Unshare | SVM       |H_SVM_INIT_DONE|
 |        |operation   |operation | abort/    |               |
 |        |            |          | terminate |               |
 -------------------------------------------------------------
 |        |            |          |           |               |
 | Secure |     Shared | Secure   |Normal     |Secure         |
 |        |            |          |           |               |
 | Shared |     Shared | Secure   |Normal     |Shared         |
 |        |            |          |           |               |
 | Normal |     Shared | Secure   |Normal     |Secure         |
 --------------------------------------------------------------

 7. Life cycle of a VM
    --------------------
 --------------------------------------------------------------------
 |         |  start    |  H_SVM_  |H_SVM_   |H_SVM_     |UV_SVM_    |
 |         |  VM       |INIT_START|INIT_DONE|INIT_ABORT |TERMINATE  |
 |         |           |          |         |           |           |
 --------- ----------------------------------------------------------
 |         |           |          |         |           |           |
 | Normal  | Normal    | Transient|Error    |Error      |Normal     |
 |         |           |          |         |           |           |
 | Secure  |   Error   | Error    |Error    |Error      |Normal     |
 |         |           |          |         |           |           |
 |Transient|   N/A     | Error    |Secure   |Normal     |Normal     |
 --------------------------------------------------------------------

************************************************************************

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 187 +++++++++++++++++++++++++++++++++----
 1 file changed, 168 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 0baa293..df2e272 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -98,7 +98,127 @@
 static unsigned long *kvmppc_uvmem_bitmap;
 static DEFINE_SPINLOCK(kvmppc_uvmem_bitmap_lock);
 
-#define KVMPPC_UVMEM_PFN	(1UL << 63)
+/*
+ * States of a GFN
+ * ---------------
+ * The GFN can be in one of the following states.
+ *
+ * (a) Secure - The GFN is secure. The GFN is associated with
+ *	a Secure VM, the contents of the GFN is not accessible
+ *	to the Hypervisor.  This GFN can be backed by a secure-PFN,
+ *	or can be backed by a normal-PFN with contents encrypted.
+ *	The former is true when the GFN is paged-in into the
+ *	ultravisor. The latter is true when the GFN is paged-out
+ *	of the ultravisor.
+ *
+ * (b) Shared - The GFN is shared. The GFN is associated with a
+ *	a secure VM. The contents of the GFN is accessible to
+ *	Hypervisor. This GFN is backed by a normal-PFN and its
+ *	content is un-encrypted.
+ *
+ * (c) Normal - The GFN is a normal. The GFN is associated with
+ *	a normal VM. The contents of the GFN is accesible to
+ *	the Hypervisor. Its content is never encrypted.
+ *
+ * States of a VM.
+ * ---------------
+ *
+ * Normal VM:  A VM whose contents are always accessible to
+ *	the hypervisor.  All its GFNs are normal-GFNs.
+ *
+ * Secure VM: A VM whose contents are not accessible to the
+ *	hypervisor without the VM's consent.  Its GFNs are
+ *	either Shared-GFN or Secure-GFNs.
+ *
+ * Transient VM: A Normal VM that is transitioning to secure VM.
+ *	The transition starts on successful return of
+ *	H_SVM_INIT_START, and ends on successful return
+ *	of H_SVM_INIT_DONE. This transient VM, can have GFNs
+ *	in any of the three states; i.e Secure-GFN, Shared-GFN,
+ *	and Normal-GFN.	The VM never executes in this state
+ *	in supervisor-mode.
+ *
+ * Memory slot State.
+ * -----------------------------
+ *	The state of a memory slot mirrors the state of the
+ *	VM the memory slot is associated with.
+ *
+ * VM State transition.
+ * --------------------
+ *
+ *  A VM always starts in Normal Mode.
+ *
+ *  H_SVM_INIT_START moves the VM into transient state. During this
+ *  time the Ultravisor may request some of its GFNs to be shared or
+ *  secured. So its GFNs can be in one of the three GFN states.
+ *
+ *  H_SVM_INIT_DONE moves the VM entirely from transient state to
+ *  secure-state. At this point any left-over normal-GFNs are
+ *  transitioned to Secure-GFN.
+ *
+ *  H_SVM_INIT_ABORT moves the transient VM back to normal VM.
+ *  All its GFNs are moved to Normal-GFNs.
+ *
+ *  UV_TERMINATE transitions the secure-VM back to normal-VM. All
+ *  the secure-GFN and shared-GFNs are tranistioned to normal-GFN
+ *  Note: The contents of the normal-GFN is undefined at this point.
+ *
+ * GFN state implementation:
+ * -------------------------
+ *
+ * Secure GFN is associated with a secure-PFN; also called uvmem_pfn,
+ * when the GFN is paged-in. Its pfn[] has KVMPPC_GFN_UVMEM_PFN flag
+ * set, and contains the value of the secure-PFN.
+ * It is associated with a normal-PFN; also called mem_pfn, when
+ * the GFN is pagedout. Its pfn[] has KVMPPC_GFN_MEM_PFN flag set.
+ * The value of the normal-PFN is not tracked.
+ *
+ * Shared GFN is associated with a normal-PFN. Its pfn[] has
+ * KVMPPC_UVMEM_SHARED_PFN flag set. The value of the normal-PFN
+ * is not tracked.
+ *
+ * Normal GFN is associated with normal-PFN. Its pfn[] has
+ * no flag set. The value of the normal-PFN is not tracked.
+ *
+ * Life cycle of a GFN
+ * --------------------
+ *
+ * --------------------------------------------------------------
+ * |        |     Share  |  Unshare | SVM       |H_SVM_INIT_DONE|
+ * |        |operation   |operation | abort/    |               |
+ * |        |            |          | terminate |               |
+ * -------------------------------------------------------------
+ * |        |            |          |           |               |
+ * | Secure |     Shared | Secure   |Normal     |Secure         |
+ * |        |            |          |           |               |
+ * | Shared |     Shared | Secure   |Normal     |Shared         |
+ * |        |            |          |           |               |
+ * | Normal |     Shared | Secure   |Normal     |Secure         |
+ * --------------------------------------------------------------
+ *
+ * Life cycle of a VM
+ * --------------------
+ *
+ * --------------------------------------------------------------------
+ * |         |  start    |  H_SVM_  |H_SVM_   |H_SVM_     |UV_SVM_    |
+ * |         |  VM       |INIT_START|INIT_DONE|INIT_ABORT |TERMINATE  |
+ * |         |           |          |         |           |           |
+ * --------- ----------------------------------------------------------
+ * |         |           |          |         |           |           |
+ * | Normal  | Normal    | Transient|Error    |Error      |Normal     |
+ * |         |           |          |         |           |           |
+ * | Secure  |   Error   | Error    |Error    |Error      |Normal     |
+ * |         |           |          |         |           |           |
+ * |Transient|   N/A     | Error    |Secure   |Normal     |Normal     |
+ * --------------------------------------------------------------------
+ */
+
+#define KVMPPC_GFN_UVMEM_PFN	(1UL << 63)
+#define KVMPPC_GFN_MEM_PFN	(1UL << 62)
+#define KVMPPC_GFN_SHARED	(1UL << 61)
+#define KVMPPC_GFN_SECURE	(KVMPPC_GFN_UVMEM_PFN | KVMPPC_GFN_MEM_PFN)
+#define KVMPPC_GFN_FLAG_MASK	(KVMPPC_GFN_SECURE | KVMPPC_GFN_SHARED)
+#define KVMPPC_GFN_PFN_MASK	(~KVMPPC_GFN_FLAG_MASK)
 
 struct kvmppc_uvmem_slot {
 	struct list_head list;
@@ -106,11 +226,11 @@ struct kvmppc_uvmem_slot {
 	unsigned long base_pfn;
 	unsigned long *pfns;
 };
-
 struct kvmppc_uvmem_page_pvt {
 	struct kvm *kvm;
 	unsigned long gpa;
 	bool skip_page_out;
+	bool remove_gfn;
 };
 
 bool kvmppc_uvmem_available(void)
@@ -163,8 +283,8 @@ void kvmppc_uvmem_slot_free(struct kvm *kvm, const struct kvm_memory_slot *slot)
 	mutex_unlock(&kvm->arch.uvmem_lock);
 }
 
-static void kvmppc_uvmem_pfn_insert(unsigned long gfn, unsigned long uvmem_pfn,
-				    struct kvm *kvm)
+static void kvmppc_mark_gfn(unsigned long gfn, struct kvm *kvm,
+			unsigned long flag, unsigned long uvmem_pfn)
 {
 	struct kvmppc_uvmem_slot *p;
 
@@ -172,24 +292,41 @@ static void kvmppc_uvmem_pfn_insert(unsigned long gfn, unsigned long uvmem_pfn,
 		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
 			unsigned long index = gfn - p->base_pfn;
 
-			p->pfns[index] = uvmem_pfn | KVMPPC_UVMEM_PFN;
+			if (flag == KVMPPC_GFN_UVMEM_PFN)
+				p->pfns[index] = uvmem_pfn | flag;
+			else
+				p->pfns[index] = flag;
 			return;
 		}
 	}
 }
 
-static void kvmppc_uvmem_pfn_remove(unsigned long gfn, struct kvm *kvm)
+/* mark the GFN as secure-GFN associated with @uvmem pfn device-PFN. */
+static void kvmppc_gfn_secure_uvmem_pfn(unsigned long gfn,
+			unsigned long uvmem_pfn, struct kvm *kvm)
 {
-	struct kvmppc_uvmem_slot *p;
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_UVMEM_PFN, uvmem_pfn);
+}
 
-	list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) {
-		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
-			p->pfns[gfn - p->base_pfn] = 0;
-			return;
-		}
-	}
+/* mark the GFN as secure-GFN associated with a memory-PFN. */
+static void kvmppc_gfn_secure_mem_pfn(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_MEM_PFN, 0);
+}
+
+/* mark the GFN as a shared GFN. */
+static void kvmppc_gfn_shared(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_SHARED, 0);
+}
+
+/* mark the GFN as a non-existent GFN. */
+static void kvmppc_gfn_remove(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, 0, 0);
 }
 
+/* return true, if the GFN is a secure-GFN backed by a secure-PFN */
 static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 				    unsigned long *uvmem_pfn)
 {
@@ -199,10 +336,10 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
 			unsigned long index = gfn - p->base_pfn;
 
-			if (p->pfns[index] & KVMPPC_UVMEM_PFN) {
+			if (p->pfns[index] & KVMPPC_GFN_UVMEM_PFN) {
 				if (uvmem_pfn)
 					*uvmem_pfn = p->pfns[index] &
-						     ~KVMPPC_UVMEM_PFN;
+						     KVMPPC_GFN_PFN_MASK;
 				return true;
 			} else
 				return false;
@@ -353,6 +490,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 
 		mutex_lock(&kvm->arch.uvmem_lock);
 		if (!kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
+			kvmppc_gfn_remove(gfn, kvm);
 			mutex_unlock(&kvm->arch.uvmem_lock);
 			continue;
 		}
@@ -360,6 +498,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 		uvmem_page = pfn_to_page(uvmem_pfn);
 		pvt = uvmem_page->zone_device_data;
 		pvt->skip_page_out = skip_page_out;
+		pvt->remove_gfn = true;
 		mutex_unlock(&kvm->arch.uvmem_lock);
 
 		pfn = gfn_to_pfn(kvm, gfn);
@@ -429,7 +568,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
 		goto out_clear;
 
 	uvmem_pfn = bit + pfn_first;
-	kvmppc_uvmem_pfn_insert(gpa >> PAGE_SHIFT, uvmem_pfn, kvm);
+	kvmppc_gfn_secure_uvmem_pfn(gpa >> PAGE_SHIFT, uvmem_pfn, kvm);
 
 	pvt->gpa = gpa;
 	pvt->kvm = kvm;
@@ -524,6 +663,7 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
 		uvmem_page = pfn_to_page(uvmem_pfn);
 		pvt = uvmem_page->zone_device_data;
 		pvt->skip_page_out = true;
+		pvt->remove_gfn = false;
 	}
 
 retry:
@@ -537,12 +677,16 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
 		uvmem_page = pfn_to_page(uvmem_pfn);
 		pvt = uvmem_page->zone_device_data;
 		pvt->skip_page_out = true;
+		pvt->remove_gfn = false;
 		kvm_release_pfn_clean(pfn);
 		goto retry;
 	}
 
-	if (!uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0, page_shift))
+	if (!uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0,
+				page_shift)) {
+		kvmppc_gfn_shared(gfn, kvm);
 		ret = H_SUCCESS;
+	}
 	kvm_release_pfn_clean(pfn);
 	mutex_unlock(&kvm->arch.uvmem_lock);
 out:
@@ -598,6 +742,7 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 
 	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift))
 		ret = H_SUCCESS;
+
 out_unlock:
 	mutex_unlock(&kvm->arch.uvmem_lock);
 out:
@@ -706,7 +851,8 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
 /*
  * Release the device PFN back to the pool
  *
- * Gets called when secure page becomes a normal page during H_SVM_PAGE_OUT.
+ * Gets called when secure GFN tranistions from a secure-PFN
+ * to a normal PFN during H_SVM_PAGE_OUT.
  * Gets called with kvm->arch.uvmem_lock held.
  */
 static void kvmppc_uvmem_page_free(struct page *page)
@@ -721,7 +867,10 @@ static void kvmppc_uvmem_page_free(struct page *page)
 
 	pvt = page->zone_device_data;
 	page->zone_device_data = NULL;
-	kvmppc_uvmem_pfn_remove(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
+	if (pvt->remove_gfn)
+		kvmppc_gfn_remove(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
+	else
+		kvmppc_gfn_secure_mem_pfn(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
 	kfree(pvt);
 }
 
-- 
1.8.3.1


^ permalink raw reply related

* [v4 0/5] Migrate non-migrated pages of a SVM.
From: Ram Pai @ 2020-07-17  8:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david

The time to switch a VM to Secure-VM, increases by the size of the VM.
A 100GB VM takes about 7minutes. This is unacceptable.  This linear
increase is caused by a suboptimal behavior by the Ultravisor and the
Hypervisor.  The Ultravisor unnecessarily migrates all the GFN of the
VM from normal-memory to secure-memory. It has to just migrate the
necessary and sufficient GFNs.

However when the optimization is incorporated in the Ultravisor, the
Hypervisor starts misbehaving. The Hypervisor has a inbuilt assumption
that the Ultravisor will explicitly request to migrate, each and every
GFN of the VM. If only necessary and sufficient GFNs are requested for
migration, the Hypervisor continues to manage the remaining GFNs as
normal GFNs. This leads to memory corruption; manifested
consistently when the SVM reboots.

The same is true, when a memory slot is hotplugged into a SVM. The
Hypervisor expects the ultravisor to request migration of all GFNs to
secure-GFN.  But the hypervisor cannot handle any H_SVM_PAGE_IN
requests from the Ultravisor, done in the context of
UV_REGISTER_MEM_SLOT ucall.  This problem manifests as random errors
in the SVM, when a memory-slot is hotplugged.

This patch series automatically migrates the non-migrated pages of a
SVM, and thus solves the problem.

Testing: Passed rigorous testing using various sized SVMs.

Changelog:

v4:  .  Incorported Bharata's comments:
	- Optimization -- replace write mmap semaphore with read mmap semphore.
	- disable page-merge during memory hotplug.
	- rearranged the patches. consolidated the page-migration-retry logic
		in a single patch.

v3: . Optimized the page-migration retry-logic. 
    . Relax and relinquish the cpu regularly while bulk migrating
    	the non-migrated pages. This issue was causing soft-lockups.
	Fixed it.
    . Added a new patch, to retry page-migration a couple of times
    	before returning H_BUSY in H_SVM_PAGE_IN. This issue was
	seen a few times in a 24hour continuous reboot test of the SVMs.

v2: . fixed a bug observed by Laurent. The state of the GFN's associated
	with Secure-VMs were not reset during memslot flush.
    . Re-organized the code, for easier review.
    . Better description of the patch series.

v1: fixed a bug observed by Bharata. Pages that where paged-in and later
paged-out must also be skipped from migration during H_SVM_INIT_DONE.


Laurent Dufour (1):
  KVM: PPC: Book3S HV: migrate hot plugged memory

Ram Pai (4):
  KVM: PPC: Book3S HV: Disable page merging in H_SVM_INIT_START
  KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs
  KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs
    to secure-GFNs.
  KVM: PPC: Book3S HV: retry page migration before erroring-out

 Documentation/powerpc/ultravisor.rst        |   4 +
 arch/powerpc/include/asm/kvm_book3s_uvmem.h |  12 +
 arch/powerpc/kvm/book3s_hv.c                |  10 +-
 arch/powerpc/kvm/book3s_hv_uvmem.c          | 508 ++++++++++++++++++++++++----
 4 files changed, 457 insertions(+), 77 deletions(-)

-- 
1.8.3.1


^ permalink raw reply

* Re: [PATCH 06/11] powerpc/smp: Generalize 2nd sched domain
From: Gautham R Shenoy @ 2020-07-17  6:37 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-7-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:19AM +0530, Srikar Dronamraju wrote:
> Currently "CACHE" domain happens to be the 2nd sched domain as per
> powerpc_topology. This domain will collapse if cpumask of l2-cache is
> same as SMT domain. However we could generalize this domain such that it
> could mean either be a "CACHE" domain or a "BIGCORE" domain.
> 
> While setting up the "CACHE" domain, check if shared_cache is already
> set.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/kernel/smp.c | 48 +++++++++++++++++++++++++++------------
>  1 file changed, 34 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 875f57e41355..f8faf75135af 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -85,6 +85,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
>  EXPORT_PER_CPU_SYMBOL(cpu_core_map);
>  EXPORT_SYMBOL_GPL(has_big_cores);
> 
> +enum {
> +#ifdef CONFIG_SCHED_SMT
> +	smt_idx,
> +#endif
> +	bigcore_idx,
> +	die_idx,
> +};
> +
>  #define MAX_THREAD_LIST_SIZE	8
>  #define THREAD_GROUP_SHARE_L1   1
>  struct thread_groups {
> @@ -851,13 +859,7 @@ static int powerpc_shared_cache_flags(void)
>   */
>  static const struct cpumask *shared_cache_mask(int cpu)
>  {
> -	if (shared_caches)
> -		return cpu_l2_cache_mask(cpu);
> -
> -	if (has_big_cores)
> -		return cpu_smallcore_mask(cpu);
> -
> -	return cpu_smt_mask(cpu);
> +	return per_cpu(cpu_l2_cache_map, cpu);
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> @@ -867,11 +869,16 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> +static const struct cpumask *cpu_bigcore_mask(int cpu)
> +{
> +	return cpu_core_mask(cpu);

It should be cpu_smt_mask() if we want the redundant big-core to be
degenerated in favour of the SMT level on P8, no? Because
cpu_core_mask refers to all the CPUs that are in the same chip.


> +}
> +
>  static struct sched_domain_topology_level powerpc_topology[] = {
>  #ifdef CONFIG_SCHED_SMT
>  	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
> -	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
> +	{ cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
>  	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
>  	{ NULL, },
>  };
> @@ -895,7 +902,7 @@ static int init_big_cores(void)
> 
>  #ifdef CONFIG_SCHED_SMT
>  	pr_info("Big cores detected. Using small core scheduling\n");
> -	powerpc_topology[0].mask = smallcore_smt_mask;
> +	powerpc_topology[smt_idx].mask = smallcore_smt_mask;
>  #endif
> 
>  	return 0;
> @@ -1319,7 +1326,6 @@ static void add_cpu_to_masks(int cpu)
>  void start_secondary(void *unused)
>  {
>  	unsigned int cpu = smp_processor_id();
> -	struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
> 
>  	mmgrab(&init_mm);
>  	current->active_mm = &init_mm;
> @@ -1345,14 +1351,20 @@ void start_secondary(void *unused)
>  	/* Update topology CPU masks */
>  	add_cpu_to_masks(cpu);
> 
> -	if (has_big_cores)
> -		sibling_mask = cpu_smallcore_mask;
>  	/*
>  	 * Check for any shared caches. Note that this must be done on a
>  	 * per-core basis because one core in the pair might be disabled.
>  	 */
> -	if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu)))
> -		shared_caches = true;
> +	if (!shared_caches) {
> +		struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
> +		struct cpumask *mask = cpu_l2_cache_mask(cpu);
> +
> +		if (has_big_cores)
> +			sibling_mask = cpu_smallcore_mask;
> +
> +		if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
> +			shared_caches = true;

Shouldn't we use cpumask_subset() here ?

  			
> +	}
> 
>  	set_numa_node(numa_cpu_lookup_table[cpu]);
>  	set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
> @@ -1390,6 +1402,14 @@ void __init smp_cpus_done(unsigned int max_cpus)
>  		smp_ops->bringup_done();
> 
>  	dump_numa_cpu_topology();
> +	if (shared_caches) {
> +		pr_info("Using shared cache scheduler topology\n");
> +		powerpc_topology[bigcore_idx].mask = shared_cache_mask;
> +#ifdef CONFIG_SCHED_DEBUG
> +		powerpc_topology[bigcore_idx].name = "CACHE";
> +#endif
> +		powerpc_topology[bigcore_idx].sd_flags = powerpc_shared_cache_flags;
> +	}


I would much rather that we have all the topology-fixups done in one
function.

fixup_topology(void) {
     if (has_big_core)
        powerpc_topology[smt_idx].mask = smallcore_smt_mask;

    if (shared_caches) {
       const char *name = "CACHE";
       powerpc_topology[bigcore_idx].mask = shared_cache_mask;
       strlcpy(powerpc_topology[bigcore_idx].name, name,
       				strlen(name));
       powerpc_topology[bigcore_idx].sd_flags = powerpc_shared_cache_flags;
    }

    /* Any other changes to the topology structure here */

And also as an optimization, get rid of degenerate structures here
itself so that we don't pay additional penalty while building the
sched-domains each time.

}

> 
>  	set_sched_topology(powerpc_topology);
>  }
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: ASMedia USB 3.x host controllers triggering EEH on POWER9
From: Oliver O'Halloran @ 2020-07-17  6:10 UTC (permalink / raw)
  To: Forest Crossman; +Cc: linux-pci, linux-usb, linuxppc-dev
In-Reply-To: <CAO3ALPy8_pxVyFROZUWNafEH1vUCP6LVpNmBBuMDSewGitzdLw@mail.gmail.com>

On Fri, Jul 17, 2020 at 2:14 PM Forest Crossman <cyrozap@gmail.com> wrote:
>
> Hi, all,
>
> I have several ASMedia USB 3.x host controllers (ASM2142 and ASM3142,
> both share the same Vendor ID/Device ID pair) that I'd like to use
> with a POWER9 system (a Raptor Computing Systems Talos II).
> Unfortunately, while the kernel recognizes the controllers just fine,
> as soon as I plug in a device, an EEH error occurs and the host
> controller gets repeatedly reset until it eventually gets disabled. An
> example of one of these errors can be seen here:
> https://paste.debian.net/hidden/e39698eb
>
> Based on the "PHB4 Diag-data" reported by the kernel, it seems that
> LEM_WOF_R bit 35, PHB_FESR bit 20, and RXE_ARB_FESR bit 28 have been
> set. According to the PHB4 specification
> (https://ibm.ent.box.com/s/jftnfhceul07qjh9jtn91xwjmclabc71), they
> respectively mean the following:
>  - ARB: IODA TVT Errors - "TCE Validation Table error occurred. The
> entry is invalid, or the PCI Address was out of range as defined by
> the TTA bounds in the TVE entry."
>  - RXE_ARB OR Error Status - "RXE_ARB error bits, ... OR of all error
> status bits."
>  - IODA TVT Address Range Error - "IODA Error: The PCI Address was out
> of range as defined by the TTA bounds in the TVE entry."

Welcome to my world!

In the future you can use this script to automate some of the tedium
of parsing the eeh dumps:
https://patchwork.ozlabs.org/project/skiboot/patch/20200717044243.1195833-1-oohall@gmail.com/

Anyway, for background the way PHB3 and PHB4 handle incoming DMAs goes
as follows:

1. Map the 16bit <bus><devfn> number to an IOMMU context, we call
those PEs. PE means "partitionable endpoint", but for the purpose of
processing DMAs you can ignore that and just treat it as an IOMMU
context ID.
2. Use the PE number and some of the upper bits of the DMA address to
form the index into the Translation Validation Table.
3. Use the table entry to validate the DMA address is within bounds
and whether it should be translated by the IOMMU or used as-is.

If the table entry says the DMA needs to be translated by the IOMMU we'll also:
4. Walk the IOMMU table to get the relevant IOMMU table entry.
5. Validate the device has permission to read/write to that address.

The "TVT Address Range Error" you're seeing means that the bounds
checks done in 3) is failing. OPAL configures the PHB so there's two
TVT entries (TVEs for short) assigned to each PE. Bit 59 of the DMA
address is used to select which TVE to use. We typically configure
TVE#0 to map 0x0...0x8000_0000 so there's a 2GB 32bit DMA window.
TVE#1 is configured for no-translate (bypass) mode so you can convert
from a system physical address to a DMA address by ORing in bit 59.

From word 2 of the PEST entry the faulting DMA address is:
0x0000203974c00000. That address is interesting since it looks a lot
like a memory address on the 2nd chip, but it doesn't have bit 59 set
so TVE#0 is used to validate it. Obviously that address is above 2GB
so we get the error.

What's probably happening is that the ASmedia controller doesn't
actually implement all 64 address bits and truncates the upper bits of
the DMA address. Doing that is a blatant violation of the PCIe (and
probably the xHCI) spec, but it's also pretty common since "it works
on x86." Something to try would be booting with the iommu=nobypass in
the kernel command line. That'll disable TVE#1 and force all DMAs to
go through TVE#0.

> In other words, the ASMedia USB controllers seem to be trying to write
> to addresses they're not supposed to, and thankfully the PHB4 is
> catching these bad writes before they can cause any corruption of my
> system's memory. Of course, this has the unfortunate side-effect that
> these devices are completely unable to operate with my computer, and
> since it seems to be possible to use these controllers on x86 systems
> (presumably because of the less-strict/disabled-by-default IOMMU), I
> wonder if maybe it would be possible to work around these errors in
> either the kernel or the OPAL firmware?

It's not easy to work around at the platform level since we have no
way to know what an arbitrary device can and can't do. There's various
tricks we can pull like putting the TVE#0 into bypass mode, but that
causes problems if there are any devices in the PE that are actually
limited to 32bit DMA. Some GPUs have a secondary audio function which
only supports 32bit DMA so it's not a completely academic concern
either.

> Now, I'm a novice at kernel hacking, so I don't really know what I'm
> doing, but just for fun I did try to paper over the issue by adding an
> EEH handler to the xhci driver
> (https://paste.debian.net/hidden/16081515), but as you might expect,
> that didn't do anything but prevent further communication with the
> device.

Assuming the nobypass trick above works, what you really need to do is
have the driver report that it can't address all 64bits by setting its
DMA mask accordingly. For the xhci driver it looks like this is done
in xhci_gen_setup(), there might be a quirks-style interface for
working around bugs in specific controllers that you can use. Have a
poke around and see what you can find :)

Oliver

^ permalink raw reply

* Re: [PATCH 05/11] powerpc/smp: Dont assume l2-cache to be superset of sibling
From: Gautham R Shenoy @ 2020-07-17  6:00 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-6-srikar@linux.vnet.ibm.com>

Hi Srikar,

On Tue, Jul 14, 2020 at 10:06:18AM +0530, Srikar Dronamraju wrote:
> Current code assumes that cpumask of cpus sharing a l2-cache mask will
> always be a superset of cpu_sibling_mask.
> 
> Lets stop that assumption.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/kernel/smp.c | 28 +++++++++++++++-------------
>  1 file changed, 15 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 7d430fc536cc..875f57e41355 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1198,6 +1198,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
>  	struct device_node *l2_cache, *np;
>  	int i;
> 
> +	cpumask_set_cpu(cpu, mask_fn(cpu));

It would be good to comment why do we need to do set the CPU in the
l2-mask if we don't have a l2cache domain.

>  	l2_cache = cpu_to_l2cache(cpu);
>  	if (!l2_cache)
>  		return false;
> @@ -1284,29 +1285,30 @@ static void add_cpu_to_masks(int cpu)
>  	 * add it to it's own thread sibling mask.
>  	 */
>  	cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
> +	cpumask_set_cpu(cpu, cpu_core_mask(cpu));
> 
>  	for (i = first_thread; i < first_thread + threads_per_core; i++)
>  		if (cpu_online(i))
>  			set_cpus_related(i, cpu, cpu_sibling_mask);
> 
>  	add_cpu_to_smallcore_masks(cpu);
> -	/*
> -	 * Copy the thread sibling mask into the cache sibling mask
> -	 * and mark any CPUs that share an L2 with this CPU.
> -	 */
> -	for_each_cpu(i, cpu_sibling_mask(cpu))
> -		set_cpus_related(cpu, i, cpu_l2_cache_mask);
>  	update_mask_by_l2(cpu, cpu_l2_cache_mask);
> 
> -	/*
> -	 * Copy the cache sibling mask into core sibling mask and mark
> -	 * any CPUs on the same chip as this CPU.
> -	 */
> -	for_each_cpu(i, cpu_l2_cache_mask(cpu))
> -		set_cpus_related(cpu, i, cpu_core_mask);
> +	if (pkg_id == -1) {
> +		struct cpumask *(*mask)(int) = cpu_sibling_mask;
> +
> +		/*
> +		 * Copy the sibling mask into core sibling mask and
> +		 * mark any CPUs on the same chip as this CPU.
> +		 */
> +		if (shared_caches)
> +			mask = cpu_l2_cache_mask;
> +


Now that we decoupling the containment relationship between
sibling_mask and l2-cache mask, should we set all the CPUs that are
both in cpu_sibling_mask(cpu) as well as cpu_l2_mask(cpu) in
cpu_core_mask ? 

> +		for_each_cpu(i, mask(cpu))
> +			set_cpus_related(cpu, i, cpu_core_mask);
> 
> -	if (pkg_id == -1)
>  		return;
> +	}
> 
>  	for_each_cpu(i, cpu_online_mask)
>  		if (get_physical_package_id(i) == pkg_id)
> -- 
> 2.17.1
> 
--
Thanks and Regards
gautham.

^ permalink raw reply

* Re: [PATCH v3] powerpc/pseries: detect secure and trusted boot state of the system.
From: Daniel Axtens @ 2020-07-17  5:58 UTC (permalink / raw)
  To: Michal Suchánek, Nayna Jain; +Cc: linuxppc-dev, linux-kernel, Mimi Zohar
In-Reply-To: <20200716081337.GB32107@kitsune.suse.cz>

Michal Suchánek <msuchanek@suse.de> writes:

> On Wed, Jul 15, 2020 at 07:52:01AM -0400, Nayna Jain wrote:
>> The device-tree property to check secure and trusted boot state is
>> different for guests(pseries) compared to baremetal(powernv).
>> 
>> This patch updates the existing is_ppc_secureboot_enabled() and
>> is_ppc_trustedboot_enabled() functions to add support for pseries.
>> 
>> The secureboot and trustedboot state are exposed via device-tree property:
>> /proc/device-tree/ibm,secure-boot and /proc/device-tree/ibm,trusted-boot
>> 
>> The values of ibm,secure-boot under pseries are interpreted as:
>                                       ^^^
>> 
>> 0 - Disabled
>> 1 - Enabled in Log-only mode. This patch interprets this value as
>> disabled, since audit mode is currently not supported for Linux.
>> 2 - Enabled and enforced.
>> 3-9 - Enabled and enforcing; requirements are at the discretion of the
>> operating system.
>> 
>> The values of ibm,trusted-boot under pseries are interpreted as:
>                                        ^^^
> These two should be different I suppose?

I'm not quite sure what you mean? They'll be documented in a future
revision of the PAPR, once I get my act together and submit the
relevant internal paperwork.

Daniel
>
> Thanks
>
> Michal
>> 0 - Disabled
>> 1 - Enabled
>> 
>> Signed-off-by: Nayna Jain <nayna@linux.ibm.com>
>> Reviewed-by: Daniel Axtens <dja@axtens.net>
>> ---
>> v3:
>> * fixed double check. Thanks Daniel for noticing it.
>> * updated patch description.
>> 
>> v2:
>> * included Michael Ellerman's feedback.
>> * added Daniel Axtens's Reviewed-by.
>> 
>>  arch/powerpc/kernel/secure_boot.c | 19 +++++++++++++++++--
>>  1 file changed, 17 insertions(+), 2 deletions(-)
>> 
>> diff --git a/arch/powerpc/kernel/secure_boot.c b/arch/powerpc/kernel/secure_boot.c
>> index 4b982324d368..118bcb5f79c4 100644
>> --- a/arch/powerpc/kernel/secure_boot.c
>> +++ b/arch/powerpc/kernel/secure_boot.c
>> @@ -6,6 +6,7 @@
>>  #include <linux/types.h>
>>  #include <linux/of.h>
>>  #include <asm/secure_boot.h>
>> +#include <asm/machdep.h>
>>  
>>  static struct device_node *get_ppc_fw_sb_node(void)
>>  {
>> @@ -23,12 +24,19 @@ bool is_ppc_secureboot_enabled(void)
>>  {
>>  	struct device_node *node;
>>  	bool enabled = false;
>> +	u32 secureboot;
>>  
>>  	node = get_ppc_fw_sb_node();
>>  	enabled = of_property_read_bool(node, "os-secureboot-enforcing");
>> -
>>  	of_node_put(node);
>>  
>> +	if (enabled)
>> +		goto out;
>> +
>> +	if (!of_property_read_u32(of_root, "ibm,secure-boot", &secureboot))
>> +		enabled = (secureboot > 1);
>> +
>> +out:
>>  	pr_info("Secure boot mode %s\n", enabled ? "enabled" : "disabled");
>>  
>>  	return enabled;
>> @@ -38,12 +46,19 @@ bool is_ppc_trustedboot_enabled(void)
>>  {
>>  	struct device_node *node;
>>  	bool enabled = false;
>> +	u32 trustedboot;
>>  
>>  	node = get_ppc_fw_sb_node();
>>  	enabled = of_property_read_bool(node, "trusted-enabled");
>> -
>>  	of_node_put(node);
>>  
>> +	if (enabled)
>> +		goto out;
>> +
>> +	if (!of_property_read_u32(of_root, "ibm,trusted-boot", &trustedboot))
>> +		enabled = (trustedboot > 0);
>> +
>> +out:
>>  	pr_info("Trusted boot mode %s\n", enabled ? "enabled" : "disabled");
>>  
>>  	return enabled;
>> -- 
>> 2.26.2
>> 

^ permalink raw reply

* Re: [PATCH 04/11] powerpc/smp: Enable small core scheduling sooner
From: Gautham R Shenoy @ 2020-07-17  5:48 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-5-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:17AM +0530, Srikar Dronamraju wrote:
> Enable small core scheduling as soon as we detect that we are in a
> system that supports thread group. Doing so would avoid a redundant
> check.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

I don't see a problem with this.

However, since we are now going to be maintaining a single topology
structure, wouldn't it be better to collate all the changes being made
to the mask_functions/flags/names of this structure within a single
function so that it becomes easier to keep track of what all changes
are going into the topology and why are we doing it?


> ---
>  arch/powerpc/kernel/smp.c | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 24529f6134aa..7d430fc536cc 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -892,6 +892,12 @@ static int init_big_cores(void)
>  	}
> 
>  	has_big_cores = true;
> +
> +#ifdef CONFIG_SCHED_SMT
> +	pr_info("Big cores detected. Using small core scheduling\n");
> +	powerpc_topology[0].mask = smallcore_smt_mask;
> +#endif
> +
>  	return 0;
>  }
> 
> @@ -1383,12 +1389,6 @@ void __init smp_cpus_done(unsigned int max_cpus)
> 
>  	dump_numa_cpu_topology();
> 
> -#ifdef CONFIG_SCHED_SMT
> -	if (has_big_cores) {
> -		pr_info("Big cores detected but using small core scheduling\n");
> -		powerpc_topology[0].mask = smallcore_smt_mask;
> -	}
> -#endif
>  	set_sched_topology(powerpc_topology);
>  }
> 
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH 03/11] powerpc/smp: Move powerpc_topology above
From: Gautham R Shenoy @ 2020-07-17  5:45 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-4-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:16AM +0530, Srikar Dronamraju wrote:
> Just moving the powerpc_topology description above.
> This will help in using functions in this file and avoid declarations.
> 
> No other functional changes
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>

> ---
>  arch/powerpc/kernel/smp.c | 116 +++++++++++++++++++-------------------
>  1 file changed, 58 insertions(+), 58 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 069ea4b21c6d..24529f6134aa 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -818,6 +818,64 @@ static int init_cpu_l1_cache_map(int cpu)
>  	return err;
>  }
> 
> +static bool shared_caches;
> +
> +#ifdef CONFIG_SCHED_SMT
> +/* cpumask of CPUs with asymmetric SMT dependency */
> +static int powerpc_smt_flags(void)
> +{
> +	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> +
> +	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
> +		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
> +		flags |= SD_ASYM_PACKING;
> +	}
> +	return flags;
> +}
> +#endif
> +
> +/*
> + * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
> + * This topology makes it *much* cheaper to migrate tasks between adjacent cores
> + * since the migrated task remains cache hot. We want to take advantage of this
> + * at the scheduler level so an extra topology level is required.
> + */
> +static int powerpc_shared_cache_flags(void)
> +{
> +	return SD_SHARE_PKG_RESOURCES;
> +}
> +
> +/*
> + * We can't just pass cpu_l2_cache_mask() directly because
> + * returns a non-const pointer and the compiler barfs on that.
> + */
> +static const struct cpumask *shared_cache_mask(int cpu)
> +{
> +	if (shared_caches)
> +		return cpu_l2_cache_mask(cpu);
> +
> +	if (has_big_cores)
> +		return cpu_smallcore_mask(cpu);
> +
> +	return cpu_smt_mask(cpu);
> +}
> +
> +#ifdef CONFIG_SCHED_SMT
> +static const struct cpumask *smallcore_smt_mask(int cpu)
> +{
> +	return cpu_smallcore_mask(cpu);
> +}
> +#endif
> +
> +static struct sched_domain_topology_level powerpc_topology[] = {
> +#ifdef CONFIG_SCHED_SMT
> +	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
> +#endif
> +	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
> +	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
> +	{ NULL, },
> +};
> +
>  static int init_big_cores(void)
>  {
>  	int cpu;
> @@ -1249,8 +1307,6 @@ static void add_cpu_to_masks(int cpu)
>  			set_cpus_related(cpu, i, cpu_core_mask);
>  }
> 
> -static bool shared_caches;
> -
>  /* Activate a secondary processor. */
>  void start_secondary(void *unused)
>  {
> @@ -1314,62 +1370,6 @@ int setup_profiling_timer(unsigned int multiplier)
>  	return 0;
>  }
> 
> -#ifdef CONFIG_SCHED_SMT
> -/* cpumask of CPUs with asymmetric SMT dependency */
> -static int powerpc_smt_flags(void)
> -{
> -	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> -
> -	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
> -		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
> -		flags |= SD_ASYM_PACKING;
> -	}
> -	return flags;
> -}
> -#endif
> -
> -/*
> - * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
> - * This topology makes it *much* cheaper to migrate tasks between adjacent cores
> - * since the migrated task remains cache hot. We want to take advantage of this
> - * at the scheduler level so an extra topology level is required.
> - */
> -static int powerpc_shared_cache_flags(void)
> -{
> -	return SD_SHARE_PKG_RESOURCES;
> -}
> -
> -/*
> - * We can't just pass cpu_l2_cache_mask() directly because
> - * returns a non-const pointer and the compiler barfs on that.
> - */
> -static const struct cpumask *shared_cache_mask(int cpu)
> -{
> -	if (shared_caches)
> -		return cpu_l2_cache_mask(cpu);
> -
> -	if (has_big_cores)
> -		return cpu_smallcore_mask(cpu);
> -
> -	return cpu_smt_mask(cpu);
> -}
> -
> -#ifdef CONFIG_SCHED_SMT
> -static const struct cpumask *smallcore_smt_mask(int cpu)
> -{
> -	return cpu_smallcore_mask(cpu);
> -}
> -#endif
> -
> -static struct sched_domain_topology_level powerpc_topology[] = {
> -#ifdef CONFIG_SCHED_SMT
> -	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
> -#endif
> -	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
> -	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
> -	{ NULL, },
> -};
> -
>  void __init smp_cpus_done(unsigned int max_cpus)
>  {
>  	/*
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH v4 05/10] powerpc/dt_cpu_ftrs: Add feature for 2nd DAWR
From: Jordan Niethe @ 2020-07-17  5:44 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: Christophe Leroy, apopple, mikey, miltonm, peterz, oleg,
	Nicholas Piggin, linux-kernel, Paul Mackerras, jolsa, fweisbec,
	pedromfc, naveen.n.rao, linuxppc-dev, mingo
In-Reply-To: <20200717040958.70561-6-ravi.bangoria@linux.ibm.com>

On Fri, Jul 17, 2020 at 2:10 PM Ravi Bangoria
<ravi.bangoria@linux.ibm.com> wrote:
>
> Add new device-tree feature for 2nd DAWR. If this feature is present,
> 2nd DAWR is supported, otherwise not.
>
> Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/cputable.h | 7 +++++--
>  arch/powerpc/kernel/dt_cpu_ftrs.c   | 7 +++++++
>  2 files changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
> index e506d429b1af..3445c86e1f6f 100644
> --- a/arch/powerpc/include/asm/cputable.h
> +++ b/arch/powerpc/include/asm/cputable.h
> @@ -214,6 +214,7 @@ static inline void cpu_feature_keys_init(void) { }
>  #define CPU_FTR_P9_TLBIE_ERAT_BUG      LONG_ASM_CONST(0x0001000000000000)
>  #define CPU_FTR_P9_RADIX_PREFETCH_BUG  LONG_ASM_CONST(0x0002000000000000)
>  #define CPU_FTR_ARCH_31                        LONG_ASM_CONST(0x0004000000000000)
> +#define CPU_FTR_DAWR1                  LONG_ASM_CONST(0x0008000000000000)
>
>  #ifndef __ASSEMBLY__
>
> @@ -497,14 +498,16 @@ static inline void cpu_feature_keys_init(void) { }
>  #define CPU_FTRS_POSSIBLE      \
>             (CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | CPU_FTRS_POWER8 | \
>              CPU_FTR_ALTIVEC_COMP | CPU_FTR_VSX_COMP | CPU_FTRS_POWER9 | \
> -            CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10)
> +            CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10 | \
> +            CPU_FTR_DAWR1)
>  #else
>  #define CPU_FTRS_POSSIBLE      \
>             (CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
>              CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
>              CPU_FTRS_POWER8 | CPU_FTRS_CELL | CPU_FTRS_PA6T | \
>              CPU_FTR_VSX_COMP | CPU_FTR_ALTIVEC_COMP | CPU_FTRS_POWER9 | \
> -            CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10)
> +            CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10 | \
> +            CPU_FTR_DAWR1)
Instead of putting CPU_FTR_DAWR1 into CPU_FTRS_POSSIBLE should it go
into CPU_FTRS_POWER10?
Then it will be picked up by CPU_FTRS_POSSIBLE.
>  #endif /* CONFIG_CPU_LITTLE_ENDIAN */
>  #endif
>  #else
> diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
> index ac650c233cd9..c78cd3596ec4 100644
> --- a/arch/powerpc/kernel/dt_cpu_ftrs.c
> +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
> @@ -574,6 +574,12 @@ static int __init feat_enable_mma(struct dt_cpu_feature *f)
>         return 1;
>  }
>
> +static int __init feat_enable_debug_facilities_v31(struct dt_cpu_feature *f)
> +{
> +       cur_cpu_spec->cpu_features |= CPU_FTR_DAWR1;
> +       return 1;
> +}
> +
>  struct dt_cpu_feature_match {
>         const char *name;
>         int (*enable)(struct dt_cpu_feature *f);
> @@ -649,6 +655,7 @@ static struct dt_cpu_feature_match __initdata
>         {"wait-v3", feat_enable, 0},
>         {"prefix-instructions", feat_enable, 0},
>         {"matrix-multiply-assist", feat_enable_mma, 0},
> +       {"debug-facilities-v31", feat_enable_debug_facilities_v31, 0},
Since all feat_enable_debug_facilities_v31() does is set
CPU_FTR_DAWR1, if you just have:
{"debug-facilities-v31", feat_enable, CPU_FTR_DAWR1},
I think cpufeatures_process_feature() should set it in for you at this point:
            if (m->enable(f)) {
                cur_cpu_spec->cpu_features |= m->cpu_ftr_bit_mask;
                break;
            }

>  };
>
>  static bool __initdata using_dt_cpu_ftrs;
> --
> 2.26.2
>

^ permalink raw reply

* Re: [PATCH 02/11] powerpc/smp: Merge Power9 topology with Power topology
From: Gautham R Shenoy @ 2020-07-17  5:44 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-3-srikar@linux.vnet.ibm.com>

Hi Srikar,

On Tue, Jul 14, 2020 at 10:06:15AM +0530, Srikar Dronamraju wrote:
> A new sched_domain_topology_level was added just for Power9. However the
> same can be achieved by merging powerpc_topology with power9_topology
> and makes the code more simpler especially when adding a new sched
> domain.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/kernel/smp.c | 33 ++++++++++-----------------------
>  1 file changed, 10 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 680c0edcc59d..069ea4b21c6d 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1315,7 +1315,7 @@ int setup_profiling_timer(unsigned int multiplier)
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> -/* cpumask of CPUs with asymetric SMT dependancy */
> +/* cpumask of CPUs with asymmetric SMT dependency */
>  static int powerpc_smt_flags(void)
>  {
>  	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> @@ -1328,14 +1328,6 @@ static int powerpc_smt_flags(void)
>  }
>  #endif
> 
> -static struct sched_domain_topology_level powerpc_topology[] = {
> -#ifdef CONFIG_SCHED_SMT
> -	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
> -#endif
> -	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
> -	{ NULL, },
> -};
> -
>  /*
>   * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
>   * This topology makes it *much* cheaper to migrate tasks between adjacent cores
> @@ -1353,7 +1345,13 @@ static int powerpc_shared_cache_flags(void)
>   */
>  static const struct cpumask *shared_cache_mask(int cpu)
>  {
> -	return cpu_l2_cache_mask(cpu);
> +	if (shared_caches)
> +		return cpu_l2_cache_mask(cpu);
> +
> +	if (has_big_cores)
> +		return cpu_smallcore_mask(cpu);
> +
> +	return cpu_smt_mask(cpu);
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> @@ -1363,7 +1361,7 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> -static struct sched_domain_topology_level power9_topology[] = {
> +static struct sched_domain_topology_level powerpc_topology[] = {


>  #ifdef CONFIG_SCHED_SMT
>  	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
> @@ -1388,21 +1386,10 @@ void __init smp_cpus_done(unsigned int max_cpus)
>  #ifdef CONFIG_SCHED_SMT
>  	if (has_big_cores) {
>  		pr_info("Big cores detected but using small core scheduling\n");
I> -		power9_topology[0].mask = smallcore_smt_mask;
>  		powerpc_topology[0].mask = smallcore_smt_mask;
>  	}
>  #endif
> -	/*
> -	 * If any CPU detects that it's sharing a cache with another CPU then
> -	 * use the deeper topology that is aware of this sharing.
> -	 */
> -	if (shared_caches) {
> -		pr_info("Using shared cache scheduler topology\n");
> -		set_sched_topology(power9_topology);
> -	} else {
> -		pr_info("Using standard scheduler topology\n");
> -		set_sched_topology(powerpc_topology);


Ok, so we will go with the three level topology by default (SMT,
CACHE, DIE) and will rely on the sched-domain creation code to
degenerate CACHE domain in case SMT and CACHE have the same set of
CPUs (POWER8 for eg).

From a cleanup perspective this is better, since we won't have to
worry about defining multiple topology structures, but from a
performance point of view, wouldn't we now pay an extra penalty of
degenerating the CACHE domains on POWER8 kind of systems, each time
when a CPU comes online ?

Do we know how bad it is ? If the degeneration takes a few extra
microseconds, that should be ok I suppose.

--
Thanks and Regards
gautham.

^ permalink raw reply

* Re: [PATCH 01/11] powerpc/smp: Cache node for reuse
From: Gautham R Shenoy @ 2020-07-17  4:51 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-2-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:14AM +0530, Srikar Dronamraju wrote:
> While cpu_to_node is inline function with access to per_cpu variable.
> However when using repeatedly, it may be cleaner to cache it in a local
> variable.
> 
> Also fix a build error in a some weird config.
> "error: _numa_cpu_lookup_table_ undeclared"
> 
> No functional change
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>


LGTM.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>

> ---
>  arch/powerpc/kernel/smp.c | 18 +++++++++++-------
>  1 file changed, 11 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 73199470c265..680c0edcc59d 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -843,7 +843,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
> 
>  	DBG("smp_prepare_cpus\n");
> 
> -	/* 
> +	/*
>  	 * setup_cpu may need to be called on the boot cpu. We havent
>  	 * spun any cpus up but lets be paranoid.
>  	 */
> @@ -854,20 +854,24 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>  	cpu_callin_map[boot_cpuid] = 1;
> 
>  	for_each_possible_cpu(cpu) {
> +		int node = cpu_to_node(cpu);
> +
>  		zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu),
> -					GFP_KERNEL, cpu_to_node(cpu));
> +					GFP_KERNEL, node);
>  		zalloc_cpumask_var_node(&per_cpu(cpu_l2_cache_map, cpu),
> -					GFP_KERNEL, cpu_to_node(cpu));
> +					GFP_KERNEL, node);
>  		zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
> -					GFP_KERNEL, cpu_to_node(cpu));
> +					GFP_KERNEL, node);
> +#ifdef CONFIG_NEED_MULTIPLE_NODES
>  		/*
>  		 * numa_node_id() works after this.
>  		 */
>  		if (cpu_present(cpu)) {
> -			set_cpu_numa_node(cpu, numa_cpu_lookup_table[cpu]);
> -			set_cpu_numa_mem(cpu,
> -				local_memory_node(numa_cpu_lookup_table[cpu]));
> +			node = numa_cpu_lookup_table[cpu];
> +			set_cpu_numa_node(cpu, node);
> +			set_cpu_numa_mem(cpu, local_memory_node(node));
>  		}
> +#endif
>  	}
> 
>  	/* Init the cpumasks so the boot CPU is related to itself */
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH v3 02/12] powerpc/kexec_file: mark PPC64 specific code
From: Hari Bathini @ 2020-07-17  4:46 UTC (permalink / raw)
  To: Thiago Jung Bauermann
  Cc: Pingfan Liu, Petr Tesarik, Nayna Jain, Kexec-ml,
	Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
	Andrew Morton, Dave Young, Vivek Goyal, Eric Biederman
In-Reply-To: <87v9io8c13.fsf@morokweng.localdomain>



On 16/07/20 7:19 am, Thiago Jung Bauermann wrote:
> 
> I didn't forget about this patch. I just wanted to see more of the
> changes before comenting on it.
> 
> Hari Bathini <hbathini@linux.ibm.com> writes:
> 
>> Some of the kexec_file_load code isn't PPC64 specific. Move PPC64
>> specific code from kexec/file_load.c to kexec/file_load_64.c. Also,
>> rename purgatory/trampoline.S to purgatory/trampoline_64.S in the
>> same spirit.
> 
> There's only a 64 bit implementation of kexec_file_load() so this is a
> somewhat theoretical exercise, but there's no harm in getting the code
> organized, so:
> 
> Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
> 
> I have just one question below.

<snip>

>> +/**
>> + * setup_new_fdt_ppc64 - Update the flattend device-tree of the kernel
>> + *                       being loaded.
>> + * @image:               kexec image being loaded.
>> + * @fdt:                 Flattened device tree for the next kernel.
>> + * @initrd_load_addr:    Address where the next initrd will be loaded.
>> + * @initrd_len:          Size of the next initrd, or 0 if there will be none.
>> + * @cmdline:             Command line for the next kernel, or NULL if there will
>> + *                       be none.
>> + *
>> + * Returns 0 on success, negative errno on error.
>> + */
>> +int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
>> +			unsigned long initrd_load_addr,
>> +			unsigned long initrd_len, const char *cmdline)
>> +{
>> +	int chosen_node, ret;
>> +
>> +	/* Remove memory reservation for the current device tree. */
>> +	ret = delete_fdt_mem_rsv(fdt, __pa(initial_boot_params),
>> +				 fdt_totalsize(initial_boot_params));
>> +	if (ret == 0)
>> +		pr_debug("Removed old device tree reservation.\n");
>> +	else if (ret != -ENOENT) {
>> +		pr_err("Failed to remove old device-tree reservation.\n");
>> +		return ret;
>> +	}
>> +
>> +	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len,
>> +			    cmdline, &chosen_node);
>> +	if (ret)
>> +		return ret;
>> +
>> +	ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
>> +	if (ret)
>> +		pr_err("Failed to update device-tree with linux,booted-from-kexec\n");
>> +
>> +	return ret;
>> +}
> 
> For setup_purgatory_ppc64() you start with an empty function and build
> from there, but for setup_new_fdt_ppc64() you moved some code here. Is
> the code above 64 bit specific?

Actually, I was not quiet sure if fdt updates like in patch 6 & patch 9 can be
done after setup_ima_buffer() call. If you can confirm, I will move them back
to setup_purgatory()

Thanks
Hari

^ permalink raw reply

* Re: [PATCH -next] cpuidle/pseries: Make symbol 'pseries_idle_driver' static
From: Daniel Lezcano @ 2020-07-17  4:41 UTC (permalink / raw)
  To: Michael Ellerman, Hulk Robot, Wei Yongjun, Rafael J. Wysocki,
	Michael Ellerman
  Cc: linuxppc-dev, linux-pm
In-Reply-To: <159490401706.3805857.7133480973769495238.b4-ty@ellerman.id.au>

On 16/07/2020 14:56, Michael Ellerman wrote:
> On Tue, 14 Jul 2020 22:24:24 +0800, Wei Yongjun wrote:
>> The sparse tool complains as follows:
>>
>> drivers/cpuidle/cpuidle-pseries.c:25:23: warning:
>>  symbol 'pseries_idle_driver' was not declared. Should it be static?
>>
>> 'pseries_idle_driver' is not used outside of this file, so marks
>> it static.
> 
> Applied to powerpc/next.
> 
> [1/1] cpuidle/pseries: Make symbol 'pseries_idle_driver' static
>       https://git.kernel.org/powerpc/c/92fe8483b1660feaa602d8be6ca7efe95ae4789b

Rafael already picked the patch.


-- 
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog

^ permalink raw reply

* Re: [PATCH v3 03/12] powerpc/kexec_file: add helper functions for getting memory ranges
From: Hari Bathini @ 2020-07-17  4:32 UTC (permalink / raw)
  To: Thiago Jung Bauermann
  Cc: Pingfan Liu, Petr Tesarik, Nayna Jain, Kexec-ml,
	Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
	Andrew Morton, Dave Young, Vivek Goyal, Eric Biederman
In-Reply-To: <874kq98xo4.fsf@morokweng.localdomain>



On 15/07/20 5:19 am, Thiago Jung Bauermann wrote:
> 
> Hello Hari,
> 
> Hari Bathini <hbathini@linux.ibm.com> writes:
> 
>> In kexec case, the kernel to be loaded uses the same memory layout as
>> the running kernel. So, passing on the DT of the running kernel would
>> be good enough.
>>
>> But in case of kdump, different memory ranges are needed to manage
>> loading the kdump kernel, booting into it and exporting the elfcore
>> of the crashing kernel. The ranges are exlude memory ranges, usable
> 
> s/exlude/exclude/
> 
>> memory ranges, reserved memory ranges and crash memory ranges.
>>
>> Exclude memory ranges specify the list of memory ranges to avoid while
>> loading kdump segments. Usable memory ranges list the memory ranges
>> that could be used for booting kdump kernel. Reserved memory ranges
>> list the memory regions for the loading kernel's reserve map. Crash
>> memory ranges list the memory ranges to be exported as the crashing
>> kernel's elfcore.
>>
>> Add helper functions for setting up the above mentioned memory ranges.
>> This helpers facilitate in understanding the subsequent changes better
>> and make it easy to setup the different memory ranges listed above, as
>> and when appropriate.
>>
>> Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
>> Tested-by: Pingfan Liu <piliu@redhat.com>
> 

<snip>

>> +/**
>> + * add_reserved_ranges - Adds "/reserved-ranges" regions exported by f/w
>> + *                       to the given memory ranges list.
>> + * @mem_ranges:          Range list to add the memory ranges to.
>> + *
>> + * Returns 0 on success, negative errno on error.
>> + */
>> +int add_reserved_ranges(struct crash_mem **mem_ranges)
>> +{
>> +	int i, len, ret = 0;
>> +	const __be32 *prop;
>> +
>> +	prop = of_get_property(of_root, "reserved-ranges", &len);
>> +	if (!prop)
>> +		return 0;
>> +
>> +	/*
>> +	 * Each reserved range is an (address,size) pair, 2 cells each,
>> +	 * totalling 4 cells per range.
> 
> Can you assume that, or do you need to check the #address-cells and
> #size-cells properties of the root node?

Taken from early_reserve_mem_dt() which did not seem to care.
Should we be doing any different here?

Thanks
Hari

^ permalink raw reply

* Re: [PATCH v3 0/3] Off-load TLB invalidations to host for !GTSE
From: Bharata B Rao @ 2020-07-17  4:28 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: sfr, linux-kernel, linux-next, Qian Cai, aneesh.kumar,
	linuxppc-dev
In-Reply-To: <1594953143.b8px5ir35m.astroid@bobo.none>

On Fri, Jul 17, 2020 at 12:44:00PM +1000, Nicholas Piggin wrote:
> Excerpts from Nicholas Piggin's message of July 17, 2020 12:08 pm:
> > Excerpts from Qian Cai's message of July 17, 2020 3:27 am:
> >> On Fri, Jul 03, 2020 at 11:06:05AM +0530, Bharata B Rao wrote:
> >>> Hypervisor may choose not to enable Guest Translation Shootdown Enable
> >>> (GTSE) option for the guest. When GTSE isn't ON, the guest OS isn't
> >>> permitted to use instructions like tblie and tlbsync directly, but is
> >>> expected to make hypervisor calls to get the TLB flushed.
> >>> 
> >>> This series enables the TLB flush routines in the radix code to
> >>> off-load TLB flushing to hypervisor via the newly proposed hcall
> >>> H_RPT_INVALIDATE. 
> >>> 
> >>> To easily check the availability of GTSE, it is made an MMU feature.
> >>> The OV5 handling and H_REGISTER_PROC_TBL hcall are changed to
> >>> handle GTSE as an optionally available feature and to not assume GTSE
> >>> when radix support is available.
> >>> 
> >>> The actual hcall implementation for KVM isn't included in this
> >>> patchset and will be posted separately.
> >>> 
> >>> Changes in v3
> >>> =============
> >>> - Fixed a bug in the hcall wrapper code where we were missing setting
> >>>   H_RPTI_TYPE_NESTED while retrying the failed flush request with
> >>>   a full flush for the nested case.
> >>> - s/psize_to_h_rpti/psize_to_rpti_pgsize
> >>> 
> >>> v2: https://lore.kernel.org/linuxppc-dev/20200626131000.5207-1-bharata@linux.ibm.com/T/#t
> >>> 
> >>> Bharata B Rao (2):
> >>>   powerpc/mm: Enable radix GTSE only if supported.
> >>>   powerpc/pseries: H_REGISTER_PROC_TBL should ask for GTSE only if
> >>>     enabled
> >>> 
> >>> Nicholas Piggin (1):
> >>>   powerpc/mm/book3s64/radix: Off-load TLB invalidations to host when
> >>>     !GTSE
> >> 
> >> Reverting the whole series fixed random memory corruptions during boot on
> >> POWER9 PowerNV systems below.
> > 
> > If I s/mmu_has_feature(MMU_FTR_GTSE)/(1)/g in radix_tlb.c, then the .o
> > disasm is the same as reverting my patch.
> > 
> > Feature bits not being set right? PowerNV should be pretty simple, seems
> > to do the same as FTR_TYPE_RADIX.
> 
> Might need this fix
> 
> ---
> 
> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
> index 9cc49f265c86..54c9bcea9d4e 100644
> --- a/arch/powerpc/kernel/prom.c
> +++ b/arch/powerpc/kernel/prom.c
> @@ -163,7 +163,7 @@ static struct ibm_pa_feature {
>  	{ .pabyte = 0,  .pabit = 6, .cpu_features  = CPU_FTR_NOEXECUTE },
>  	{ .pabyte = 1,  .pabit = 2, .mmu_features  = MMU_FTR_CI_LARGE_PAGE },
>  #ifdef CONFIG_PPC_RADIX_MMU
> -	{ .pabyte = 40, .pabit = 0, .mmu_features  = MMU_FTR_TYPE_RADIX },
> +	{ .pabyte = 40, .pabit = 0, .mmu_features  = (MMU_FTR_TYPE_RADIX | MMU_FTR_GTSE) },
>  #endif
>  	{ .pabyte = 1,  .pabit = 1, .invert = 1, .cpu_features = CPU_FTR_NODSISRALIGN },
>  	{ .pabyte = 5,  .pabit = 0, .cpu_features  = CPU_FTR_REAL_LE,

Michael - Let me know if this should be folded into 1/3 and the complete
series resent.

Regards,
Bharata.

^ permalink raw reply

* Re: [PATCH v4 04/10] powerpc/watchpoint: Enable watchpoint functionality on power10 guest
From: Jordan Niethe @ 2020-07-17  4:23 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: Christophe Leroy, apopple, mikey, miltonm, peterz, oleg,
	Nicholas Piggin, linux-kernel, Paul Mackerras, jolsa, fweisbec,
	pedromfc, naveen.n.rao, linuxppc-dev, mingo
In-Reply-To: <20200717040958.70561-5-ravi.bangoria@linux.ibm.com>

On Fri, Jul 17, 2020 at 2:10 PM Ravi Bangoria
<ravi.bangoria@linux.ibm.com> wrote:
>
> CPU_FTR_DAWR is by default enabled for host via CPU_FTRS_DT_CPU_BASE
> (controlled by CONFIG_PPC_DT_CPU_FTRS). But cpu-features device-tree
> node is not PAPR compatible and thus not yet used by kvm or pHyp
> guests. Enable watchpoint functionality on power10 guest (both kvm
> and powervm) by adding CPU_FTR_DAWR to CPU_FTRS_POWER10. Note that
> this change does not enable 2nd DAWR support.
>
> Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
I ran the ptrace-hwbreak selftest successfully within a power10 kvm guest.
Tested-by: Jordan Niethe <jniethe5@gmail.com>
> ---
>  arch/powerpc/include/asm/cputable.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
> index bac2252c839e..e506d429b1af 100644
> --- a/arch/powerpc/include/asm/cputable.h
> +++ b/arch/powerpc/include/asm/cputable.h
> @@ -478,7 +478,7 @@ static inline void cpu_feature_keys_init(void) { }
>             CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
>             CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \
>             CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \
> -           CPU_FTR_ARCH_31)
> +           CPU_FTR_ARCH_31 | CPU_FTR_DAWR)
>  #define CPU_FTRS_CELL  (CPU_FTR_LWSYNC | \
>             CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
>             CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
> --
> 2.26.2
>

^ permalink raw reply

* Re: [PATCH v3 06/12] ppc64/kexec_file: restrict memory usage of kdump kernel
From: Hari Bathini @ 2020-07-17  4:17 UTC (permalink / raw)
  To: Thiago Jung Bauermann
  Cc: Pingfan Liu, Petr Tesarik, Nayna Jain, Kexec-ml,
	Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
	Andrew Morton, Dave Young, Vivek Goyal, Eric Biederman
In-Reply-To: <875zance3n.fsf@morokweng.localdomain>



On 17/07/20 3:33 am, Thiago Jung Bauermann wrote:
> 
> Hari Bathini <hbathini@linux.ibm.com> writes:
> 
>> On 16/07/20 4:22 am, Thiago Jung Bauermann wrote:
>>>
>>> Hari Bathini <hbathini@linux.ibm.com> writes:
>>>

<snip>
 
>>>> +	 * each representing a memory range.
>>>> +	 */
>>>> +	ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
>>>> +
>>>> +	for (i = 0; i < ranges; i++) {
>>>> +		base = of_read_number(prop, n_mem_addr_cells);
>>>> +		prop += n_mem_addr_cells;
>>>> +		end = base + of_read_number(prop, n_mem_size_cells) - 1;
>>
>> prop is not used after the above.
>>
>>> You need to `prop += n_mem_size_cells` here.
>>
>> But yeah, adding it would make it look complete in some sense..
> 
> Isn't it used in the next iteration of the loop?

Memory@XXX/reg typically has only one range. I was looking at it
from that perspective which is not right. Will update.

Thanks
Hari

^ permalink raw reply

* Re: [powerpc:next-test 125/127] arch/powerpc/mm/book3s64/pkeys.c:392:7: error: implicit declaration of function 'is_pkey_enabled'; did you mean
From: Aneesh Kumar K.V @ 2020-07-17  4:16 UTC (permalink / raw)
  To: kernel test robot; +Cc: linuxppc-dev, kbuild-all
In-Reply-To: <202007170943.BGjGSyKg%lkp@intel.com>

On 7/17/20 7:29 AM, kernel test robot wrote:
> tree:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next-test
> head:   0fbd1eb4df96e1cbd039e0b95fdf62cf65a7faf9
> commit: ed411c66eea2ccf93a634ae661a1f79c2bc63d88 [125/127] powerpc/book3s64/pkeys: Remove is_pkey_enabled()
> config: powerpc-allmodconfig (attached as .config)
> compiler: powerpc64-linux-gcc (GCC) 9.3.0
> reproduce (this is a W=1 build):
>          wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>          chmod +x ~/bin/make.cross
>          git checkout ed411c66eea2ccf93a634ae661a1f79c2bc63d88
>          # save the attached .config to linux build tree
>          COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc
> 
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
> 
> All errors (new ones prefixed by >>):
> 
>     arch/powerpc/mm/book3s64/pkeys.c: In function 'pkey_access_permitted':
>>> arch/powerpc/mm/book3s64/pkeys.c:392:7: error: implicit declaration of function 'is_pkey_enabled'; did you mean 'arch_pkeys_enabled'? [-Werror=implicit-function-declaration]
>       392 |  if (!is_pkey_enabled(pkey))
>           |       ^~~~~~~~~~~~~~~
>           |       arch_pkeys_enabled
>     cc1: some warnings being treated as errors
> 
> vim +392 arch/powerpc/mm/book3s64/pkeys.c
> 


We removed that upstream in

19ab500edb5d6020010caba48ce3b4ce4182ab63     powerpc/mm/pkeys: Make pkey 
access check work on execute_only_key


next-test need to be rebased?

-aneesh

^ permalink raw reply

* ASMedia USB 3.x host controllers triggering EEH on POWER9
From: Forest Crossman @ 2020-07-17  4:13 UTC (permalink / raw)
  To: linuxppc-dev, linux-pci, linux-usb

Hi, all,

I have several ASMedia USB 3.x host controllers (ASM2142 and ASM3142,
both share the same Vendor ID/Device ID pair) that I'd like to use
with a POWER9 system (a Raptor Computing Systems Talos II).
Unfortunately, while the kernel recognizes the controllers just fine,
as soon as I plug in a device, an EEH error occurs and the host
controller gets repeatedly reset until it eventually gets disabled. An
example of one of these errors can be seen here:
https://paste.debian.net/hidden/e39698eb

Based on the "PHB4 Diag-data" reported by the kernel, it seems that
LEM_WOF_R bit 35, PHB_FESR bit 20, and RXE_ARB_FESR bit 28 have been
set. According to the PHB4 specification
(https://ibm.ent.box.com/s/jftnfhceul07qjh9jtn91xwjmclabc71), they
respectively mean the following:
 - ARB: IODA TVT Errors - "TCE Validation Table error occurred. The
entry is invalid, or the PCI Address was out of range as defined by
the TTA bounds in the TVE entry."
 - RXE_ARB OR Error Status - "RXE_ARB error bits, ... OR of all error
status bits."
 - IODA TVT Address Range Error - "IODA Error: The PCI Address was out
of range as defined by the TTA bounds in the TVE entry."

In other words, the ASMedia USB controllers seem to be trying to write
to addresses they're not supposed to, and thankfully the PHB4 is
catching these bad writes before they can cause any corruption of my
system's memory. Of course, this has the unfortunate side-effect that
these devices are completely unable to operate with my computer, and
since it seems to be possible to use these controllers on x86 systems
(presumably because of the less-strict/disabled-by-default IOMMU), I
wonder if maybe it would be possible to work around these errors in
either the kernel or the OPAL firmware? My thinking is that instead of
disconnecting the misbehaving devices, maybe the errors could be
"forgiven" (but still blocked) and the device permitted to continue
operating, possibly with some USB data loss from "writes to nowhere"
or retries that may reduce performance. Or maybe if the issue is
caused by some high address bits being set to random values, those
bits could be masked-off so as to not trigger the errors and even
avoid data loss.

So, my question is, is any of this possible? I know the simple
solution for me is to just RMA the cards and avoid purchasing
ASMedia-based USB host controllers in the future, but the fact that
they still seem to work "mostly ok" on x86 systems (with the
occasional kernel panics and BSODs reported by users) piques my
curiosity and makes me wonder if maybe there's a way for me to have my
cheap, buggy hardware cake and eat it, too.

Now, I'm a novice at kernel hacking, so I don't really know what I'm
doing, but just for fun I did try to paper over the issue by adding an
EEH handler to the xhci driver
(https://paste.debian.net/hidden/16081515), but as you might expect,
that didn't do anything but prevent further communication with the
device. I also read a bunch of the PHB4 and IODA2 specs to see if
maybe there'd be a way to implement that bit-masking thing I
mentioned, but both of those documents are, uh, rather dry reading, so
I haven't read them in their entirety, and I don't know enough about
how this all works to try to search the text for what I need.

All that said, if anyone has any suggestions or comments, I'd be
really interested to hear them, even if it's just to question why I'd
go to such ridiculous lengths to try to get software to account for
buggy hardware.


All the best,

Forest

^ permalink raw reply

* [PATCH v4 10/10] powerpc/watchpoint: Remove 512 byte boundary
From: Ravi Bangoria @ 2020-07-17  4:09 UTC (permalink / raw)
  To: mpe, mikey
  Cc: christophe.leroy, ravi.bangoria, apopple, peterz, fweisbec,
	miltonm, oleg, npiggin, linux-kernel, paulus, jolsa, jniethe5,
	pedromfc, naveen.n.rao, linuxppc-dev, mingo
In-Reply-To: <20200717040958.70561-1-ravi.bangoria@linux.ibm.com>

Power10 has removed 512 bytes boundary from match criteria. i.e. The watch
range can cross 512 bytes boundary.

Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
 arch/powerpc/kernel/hw_breakpoint.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index c55e67bab271..1f4a1efa0074 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -418,8 +418,9 @@ static int hw_breakpoint_validate_len(struct arch_hw_breakpoint *hw)
 
 	if (dawr_enabled()) {
 		max_len = DAWR_MAX_LEN;
-		/* DAWR region can't cross 512 bytes boundary */
-		if (ALIGN_DOWN(start_addr, SZ_512) != ALIGN_DOWN(end_addr - 1, SZ_512))
+		/* DAWR region can't cross 512 bytes boundary on p10 predecessors */
+		if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
+		    (ALIGN_DOWN(start_addr, SZ_512) != ALIGN_DOWN(end_addr - 1, SZ_512)))
 			return -EINVAL;
 	} else if (IS_ENABLED(CONFIG_PPC_8xx)) {
 		/* 8xx can setup a range without limitation */
-- 
2.26.2


^ permalink raw reply related

* [PATCH v4 09/10] powerpc/watchpoint: Return available watchpoints dynamically
From: Ravi Bangoria @ 2020-07-17  4:09 UTC (permalink / raw)
  To: mpe, mikey
  Cc: christophe.leroy, ravi.bangoria, apopple, peterz, fweisbec,
	miltonm, oleg, npiggin, linux-kernel, paulus, jolsa, jniethe5,
	pedromfc, naveen.n.rao, linuxppc-dev, mingo
In-Reply-To: <20200717040958.70561-1-ravi.bangoria@linux.ibm.com>

So far Book3S Powerpc supported only one watchpoint. Power10 is
introducing 2nd DAWR. Enable 2nd DAWR support for Power10.
Availability of 2nd DAWR will depend on CPU_FTR_DAWR1.

Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
 arch/powerpc/include/asm/cputable.h      | 4 +++-
 arch/powerpc/include/asm/hw_breakpoint.h | 5 +++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 3445c86e1f6f..36a0851a7a9b 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -633,7 +633,9 @@ enum {
  * Maximum number of hw breakpoint supported on powerpc. Number of
  * breakpoints supported by actual hw might be less than this.
  */
-#define HBP_NUM_MAX	1
+#define HBP_NUM_MAX	2
+#define HBP_NUM_ONE	1
+#define HBP_NUM_TWO	2
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h
index cb424799da0d..d4eab1694bcd 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -5,10 +5,11 @@
  * Copyright 2010, IBM Corporation.
  * Author: K.Prasad <prasad@linux.vnet.ibm.com>
  */
-
 #ifndef _PPC_BOOK3S_64_HW_BREAKPOINT_H
 #define _PPC_BOOK3S_64_HW_BREAKPOINT_H
 
+#include <asm/cpu_has_feature.h>
+
 #ifdef	__KERNEL__
 struct arch_hw_breakpoint {
 	unsigned long	address;
@@ -46,7 +47,7 @@ struct arch_hw_breakpoint {
 
 static inline int nr_wp_slots(void)
 {
-	return HBP_NUM_MAX;
+	return cpu_has_feature(CPU_FTR_DAWR1) ? HBP_NUM_TWO : HBP_NUM_ONE;
 }
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-- 
2.26.2


^ permalink raw reply related

* [PATCH v4 08/10] powerpc/watchpoint: Guest support for 2nd DAWR hcall
From: Ravi Bangoria @ 2020-07-17  4:09 UTC (permalink / raw)
  To: mpe, mikey
  Cc: christophe.leroy, ravi.bangoria, apopple, peterz, fweisbec,
	miltonm, oleg, npiggin, linux-kernel, paulus, jolsa, jniethe5,
	pedromfc, naveen.n.rao, linuxppc-dev, mingo
In-Reply-To: <20200717040958.70561-1-ravi.bangoria@linux.ibm.com>

2nd DAWR can be set/unset using H_SET_MODE hcall with resource value 5.
Enable powervm guest support with that. This has no effect on kvm guest
because kvm will return error if guest does hcall with resource value 5.

Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
 arch/powerpc/include/asm/hvcall.h         | 1 +
 arch/powerpc/include/asm/machdep.h        | 2 +-
 arch/powerpc/include/asm/plpar_wrappers.h | 5 +++++
 arch/powerpc/kernel/dawr.c                | 2 +-
 arch/powerpc/platforms/pseries/setup.c    | 7 +++++--
 5 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index b785e9f0071c..33793444144c 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -358,6 +358,7 @@
 #define H_SET_MODE_RESOURCE_SET_DAWR0		2
 #define H_SET_MODE_RESOURCE_ADDR_TRANS_MODE	3
 #define H_SET_MODE_RESOURCE_LE			4
+#define H_SET_MODE_RESOURCE_SET_DAWR1		5
 
 /* Values for argument to H_SIGNAL_SYS_RESET */
 #define H_SIGNAL_SYS_RESET_ALL			-1
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 7bcb64444a39..a90b892f0bfe 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -131,7 +131,7 @@ struct machdep_calls {
 				    unsigned long dabrx);
 
 	/* Set DAWR for this platform, leave empty for default implementation */
-	int		(*set_dawr)(unsigned long dawr,
+	int		(*set_dawr)(int nr, unsigned long dawr,
 				    unsigned long dawrx);
 
 #ifdef CONFIG_PPC32	/* XXX for now */
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h
index d12c3680d946..ece84a430701 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -315,6 +315,11 @@ static inline long plpar_set_watchpoint0(unsigned long dawr0, unsigned long dawr
 	return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR0, dawr0, dawrx0);
 }
 
+static inline long plpar_set_watchpoint1(unsigned long dawr1, unsigned long dawrx1)
+{
+	return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR1, dawr1, dawrx1);
+}
+
 static inline long plpar_signal_sys_reset(long cpu)
 {
 	return plpar_hcall_norets(H_SIGNAL_SYS_RESET, cpu);
diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c
index 500f52fa4711..cdc2dccb987d 100644
--- a/arch/powerpc/kernel/dawr.c
+++ b/arch/powerpc/kernel/dawr.c
@@ -37,7 +37,7 @@ int set_dawr(int nr, struct arch_hw_breakpoint *brk)
 	dawrx |= (mrd & 0x3f) << (63 - 53);
 
 	if (ppc_md.set_dawr)
-		return ppc_md.set_dawr(dawr, dawrx);
+		return ppc_md.set_dawr(nr, dawr, dawrx);
 
 	if (nr == 0) {
 		mtspr(SPRN_DAWR0, dawr);
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 2db8469e475f..d516ee8eb7fc 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -831,12 +831,15 @@ static int pseries_set_xdabr(unsigned long dabr, unsigned long dabrx)
 	return plpar_hcall_norets(H_SET_XDABR, dabr, dabrx);
 }
 
-static int pseries_set_dawr(unsigned long dawr, unsigned long dawrx)
+static int pseries_set_dawr(int nr, unsigned long dawr, unsigned long dawrx)
 {
 	/* PAPR says we can't set HYP */
 	dawrx &= ~DAWRX_HYP;
 
-	return  plpar_set_watchpoint0(dawr, dawrx);
+	if (nr == 0)
+		return plpar_set_watchpoint0(dawr, dawrx);
+	else
+		return plpar_set_watchpoint1(dawr, dawrx);
 }
 
 #define CMO_CHARACTERISTICS_TOKEN 44
-- 
2.26.2


^ permalink raw reply related

* [PATCH v4 07/10] powerpc/watchpoint: Rename current H_SET_MODE DAWR macro
From: Ravi Bangoria @ 2020-07-17  4:09 UTC (permalink / raw)
  To: mpe, mikey
  Cc: christophe.leroy, ravi.bangoria, apopple, peterz, fweisbec,
	miltonm, oleg, npiggin, linux-kernel, paulus, jolsa, jniethe5,
	pedromfc, naveen.n.rao, linuxppc-dev, mingo
In-Reply-To: <20200717040958.70561-1-ravi.bangoria@linux.ibm.com>

Current H_SET_MODE hcall macro name for setting/resetting DAWR0 is
H_SET_MODE_RESOURCE_SET_DAWR. Add suffix 0 to macro name as well.

Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
 arch/powerpc/include/asm/hvcall.h         | 2 +-
 arch/powerpc/include/asm/plpar_wrappers.h | 2 +-
 arch/powerpc/kvm/book3s_hv.c              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 43486e773bd6..b785e9f0071c 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -355,7 +355,7 @@
 
 /* Values for 2nd argument to H_SET_MODE */
 #define H_SET_MODE_RESOURCE_SET_CIABR		1
-#define H_SET_MODE_RESOURCE_SET_DAWR		2
+#define H_SET_MODE_RESOURCE_SET_DAWR0		2
 #define H_SET_MODE_RESOURCE_ADDR_TRANS_MODE	3
 #define H_SET_MODE_RESOURCE_LE			4
 
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h
index 4293c5d2ddf4..d12c3680d946 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -312,7 +312,7 @@ static inline long plpar_set_ciabr(unsigned long ciabr)
 
 static inline long plpar_set_watchpoint0(unsigned long dawr0, unsigned long dawrx0)
 {
-	return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR, dawr0, dawrx0);
+	return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR0, dawr0, dawrx0);
 }
 
 static inline long plpar_signal_sys_reset(long cpu)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6bf66649ab92..7ad692c2d7c7 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -764,7 +764,7 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
 			return H_P3;
 		vcpu->arch.ciabr  = value1;
 		return H_SUCCESS;
-	case H_SET_MODE_RESOURCE_SET_DAWR:
+	case H_SET_MODE_RESOURCE_SET_DAWR0:
 		if (!kvmppc_power8_compatible(vcpu))
 			return H_P2;
 		if (!ppc_breakpoint_available())
-- 
2.26.2


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox