[RFC PATCH 19/32] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization

All of lore.kernel.org
 help / color / mirror / Atom feed

* [RFC PATCH 19/32] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization
@ 2018-09-21 10:01 Paul Mackerras
  2018-09-26  5:19 ` David Gibson
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Paul Mackerras @ 2018-09-21 10:01 UTC (permalink / raw)
  To: kvm-ppc

This starts the process of adding the code to support nested HV-style
virtualization.  It defines a new H_SET_PARTITION_TABLE hypercall which
a nested hypervisor can use to set the base address and size of a
partition table in its memory (analogous to the PTCR register).
On the host (level 0 hypervisor) side, the H_SET_PARTITION_TABLE
hypercall from the guest is handled by code that saves the virtual
PTCR value for the guest.

This also adds code for creating and destroying nested guests and for
reading the partition table entry for a nested guest from L1 memory.
Each nested guest has its own shadow LPID value, different in general
from the LPID value used by the nested hypervisor to refer to it.  The
shadow LPID value is allocated at nested guest creation time.

Nested hypervisor functionality is only available for a radix guest,
which therefore means a radix host on a POWER9 (or later) processor.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/hvcall.h         |   4 +
 arch/powerpc/include/asm/kvm_book3s.h     |  10 +-
 arch/powerpc/include/asm/kvm_book3s_64.h  |  17 ++
 arch/powerpc/include/asm/kvm_book3s_asm.h |   3 +
 arch/powerpc/include/asm/kvm_host.h       |   5 +
 arch/powerpc/kvm/Makefile                 |   3 +-
 arch/powerpc/kvm/book3s_hv.c              |  23 ++-
 arch/powerpc/kvm/book3s_hv_nested.c       | 286 ++++++++++++++++++++++++++++++
 8 files changed, 344 insertions(+), 7 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_hv_nested.c

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index cc9fe87..9afaa82 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -322,6 +322,10 @@
 #define H_GET_24X7_DATA		0xF07C
 #define H_GET_PERF_COUNTER_INFO	0xF080
 
+/* Platform-specific hcalls used for nested HV KVM */
+#define H_SET_PARTITION_TABLE	0xF800
+#define H_ENTER_NESTED		0xF804
+
 /* Values for 2nd argument to H_SET_MODE */
 #define H_SET_MODE_RESOURCE_SET_CIABR		1
 #define H_SET_MODE_RESOURCE_SET_DAWR		2
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 91c9779..7719ca5 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -274,6 +274,13 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {}
 static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
 #endif
 
+bool kvmhv_nested_init(void);
+void kvmhv_nested_exit(void);
+void kvmhv_vm_nested_init(struct kvm *kvm);
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
+void kvmhv_release_all_nested(struct kvm *kvm);
+
 void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
 
 extern int kvm_irq_bypass;
@@ -387,9 +394,6 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
 /* TO = 31 for unconditional trap */
 #define INS_TW				0x7fe00008
 
-/* LPIDs we support with this build -- runtime limit may be lower */
-#define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
-
 #define SPLIT_HACK_MASK			0xff000000
 #define SPLIT_HACK_OFFS			0xfb000000
 
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 5c0e2d9..0c90d56 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -24,6 +24,23 @@
 #include <asm/bitops.h>
 #include <asm/book3s/64/mmu-hash.h>
 
+/* Structure for a nested guest */
+struct kvm_nested_guest {
+	struct kvm *parent;		/* L1 VM that owns this nested guest */
+	int l1_lpid;			/* lpid L1 guest thinks this guest is */
+	int shadow_lpid;		/* real lpid of this nested guest */
+	pgd_t *shadow_pgtable;		/* our page table for this guest */
+	u64 l1_gr_to_hr;		/* L1's addr of part'n-scoped table */
+	u64 process_table;		/* process table entry for this guest */
+	long refcnt;			/* number of pointers to this struct */
+	struct mutex tlb_lock;		/* serialize page faults and tlbies */
+	struct kvm_nested_guest *next;
+};
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int lpid,
+					  bool create);
+void kvmhv_put_nested(struct kvm_nested_guest *gp);
+
 /* Power architecture requires HPT is at least 256kiB, at most 64TiB */
 #define PPC_MIN_HPT_ORDER	18
 #define PPC_MAX_HPT_ORDER	46
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index d978fdf..eb3ba63 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -25,6 +25,9 @@
 #define XICS_MFRR		0xc
 #define XICS_IPI		2	/* interrupt source # for IPIs */
 
+/* LPIDs we support with this build -- runtime limit may be lower */
+#define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
+
 /* Maximum number of threads per physical core */
 #define MAX_SMT_THREADS		8
 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index c9cc42f..c35d4f2 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -46,6 +46,7 @@
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #include <asm/kvm_book3s_asm.h>		/* for MAX_SMT_THREADS */
 #define KVM_MAX_VCPU_ID		(MAX_SMT_THREADS * KVM_MAX_VCORES)
+#define KVM_MAX_NESTED_GUESTS	KVMPPC_NR_LPIDS
 
 #else
 #define KVM_MAX_VCPU_ID		KVM_MAX_VCPUS
@@ -287,6 +288,7 @@ struct kvm_arch {
 	u8 radix;
 	u8 fwnmi_enabled;
 	bool threads_indep;
+	bool nested_enable;
 	pgd_t *pgtable;
 	u64 process_table;
 	struct dentry *debugfs_dir;
@@ -312,6 +314,9 @@ struct kvm_arch {
 #endif
 	struct kvmppc_ops *kvm_ops;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	u64 l1_ptcr;
+	int max_nested_lpid;
+	struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
 	/* This array can grow quite large, keep it at the end */
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 #endif
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index f872c04..e814f40 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -75,7 +75,8 @@ kvm-hv-y += \
 	book3s_hv.o \
 	book3s_hv_interrupts.o \
 	book3s_64_mmu_hv.o \
-	book3s_64_mmu_radix.o
+	book3s_64_mmu_radix.o \
+	book3s_hv_nested.o
 
 kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
 	book3s_hv_tm.o
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 82d6668..82c9a1e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -934,6 +934,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 		if (ret == H_TOO_HARD)
 			return RESUME_HOST;
 		break;
+
+	case H_SET_PARTITION_TABLE:
+		ret = H_FUNCTION;
+		if (vcpu->kvm->arch.nested_enable)
+			ret = kvmhv_set_partition_table(vcpu);
+		break;
+	case H_ENTER_NESTED:
+		ret = H_FUNCTION;
+		break;
+
 	default:
 		return RESUME_HOST;
 	}
@@ -4147,8 +4157,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm)
 			__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
 		dw1 = PATB_GR | kvm->arch.process_table;
 	}
-
-	mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
+	kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
 }
 
 /*
@@ -4364,6 +4373,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 
 	kvmppc_alloc_host_rm_ops();
 
+	kvmhv_vm_nested_init(kvm);
+
 	/*
 	 * Since we don't flush the TLB when tearing down a VM,
 	 * and this lpid might have previously been used,
@@ -4507,8 +4518,10 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 
 	/* Perform global invalidation and return lpid to the pool */
 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		if (kvm->arch.nested_enable)
+			kvmhv_release_all_nested(kvm);
 		kvm->arch.process_table = 0;
-		kvmppc_setup_partition_table(kvm);
+		kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
 	}
 	kvmppc_free_lpid(kvm->arch.lpid);
 
@@ -4979,6 +4992,9 @@ static int kvmppc_book3s_init_hv(void)
 	if (r < 0)
 		return -ENODEV;
 
+	if (!cpu_has_feature(CPU_FTR_HVMODE) && !kvmhv_nested_init())
+		return -ENODEV;
+
 	r = kvm_init_subcore_bitmap();
 	if (r)
 		return r;
@@ -5037,6 +5053,7 @@ static void kvmppc_book3s_exit_hv(void)
 	if (kvmppc_radix_possible())
 		kvmppc_radix_exit();
 	kvmppc_hv_ops = NULL;
+	kvmhv_nested_exit();
 }
 
 module_init(kvmppc_book3s_init_hv);
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
new file mode 100644
index 0000000..5fe3ea4
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2018
+ * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
+ *	   Paul Mackerras <paulus@ozlabs.org>
+ *
+ * Description: KVM functions specific to running nested KVM-HV guests
+ * on Book3S processors (specifically POWER9 and later).
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+
+static struct patb_entry *pseries_partition_tb;
+
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
+
+/* Only called when we're not in hypervisor mode */
+bool kvmhv_nested_init(void)
+{
+	long int ptb_order;
+	unsigned long ptcr;
+	long rc;
+
+	if (!radix_enabled())
+		return false;
+
+	/* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
+	ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
+	if (ptb_order < 8)
+		ptb_order = 8;
+	pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
+				       GFP_KERNEL);
+	if (!pseries_partition_tb) {
+		pr_err("kvm-hv: failed to allocated nested partition table\n");
+		return false;
+	}
+
+	ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
+	rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
+	if (rc != H_SUCCESS) {
+		pr_err("kvm-hv: hypervisor does not support nesting (rc=%ld)\n",
+		       rc);
+		kfree(pseries_partition_tb);
+		pseries_partition_tb = NULL;
+		return false;
+	}
+
+	return true;
+}
+
+void kvmhv_nested_exit(void)
+{
+	if (pseries_partition_tb) {
+		plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
+		kfree(pseries_partition_tb);
+		pseries_partition_tb = NULL;
+	}
+}
+
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
+{
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		mmu_partition_table_set_entry(lpid, dw0, dw1);
+	} else {
+		pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+		pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+		/* this will be emulated, L0 will do the necessary barriers */
+		asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
+			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+	}
+}
+
+static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
+{
+	unsigned long dw0;
+
+	dw0 = PATB_HR | radix__get_tree_size() |
+		__pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
+	kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
+}
+
+void kvmhv_vm_nested_init(struct kvm *kvm)
+{
+	kvm->arch.max_nested_lpid = -1;
+}
+
+/*
+ * Handle the H_SET_PARTITION_TABLE hcall.
+ * r4 = guest real address of partition table + log_2(size) - 12
+ * (formatted as for the PTCR).
+ */
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
+
+	kvm->arch.l1_ptcr = ptcr;
+	return H_SUCCESS;
+}
+
+/*
+ * Reload the partition table entry for a guest.
+ * Caller must hold gp->tlb_lock.
+ */
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
+{
+	int ret;
+	struct patb_entry ptbl_entry;
+	unsigned long ptbl_addr;
+	struct kvm *kvm = gp->parent;
+
+	ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
+	ret = kvm_read_guest(kvm, ptbl_addr,
+			     &ptbl_entry, sizeof(ptbl_entry));
+	if (ret) {
+		gp->l1_gr_to_hr = 0;
+		gp->process_table = 0;
+	} else {
+		gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
+		gp->process_table = be64_to_cpu(ptbl_entry.patb1);
+	}
+	kvmhv_set_nested_ptbl(gp);
+}
+
+struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
+{
+	struct kvm_nested_guest *gp;
+	long shadow_lpid;
+
+	gp = kzalloc(sizeof(*gp), GFP_KERNEL);
+	if (!gp)
+		return NULL;
+	gp->parent = kvm;
+	gp->l1_lpid = lpid;
+	mutex_init(&gp->tlb_lock);
+	gp->shadow_pgtable = pgd_alloc(kvm->mm);
+	if (!gp->shadow_pgtable)
+		goto out_free;
+	shadow_lpid = kvmppc_alloc_lpid();
+	if (shadow_lpid < 0)
+		goto out_free2;
+	gp->shadow_lpid = shadow_lpid;
+
+	return gp;
+
+ out_free2:
+	pgd_free(kvm->mm, gp->shadow_pgtable);
+ out_free:
+	kfree(gp);
+	return NULL;
+}
+
+/*
+ * Free up any resources allocated for a nested guest.
+ */
+static void kvmhv_release_nested(struct kvm_nested_guest *gp)
+{
+	kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
+	kvmppc_free_lpid(gp->shadow_lpid);
+	if (gp->shadow_pgtable)
+		pgd_free(gp->parent->mm, gp->shadow_pgtable);
+	kfree(gp);
+}
+
+static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = gp->parent;
+	int lpid = gp->l1_lpid;
+	long ref;
+
+	spin_lock(&kvm->mmu_lock);
+	if (gp == kvm->arch.nested_guests[lpid]) {
+		kvm->arch.nested_guests[lpid] = NULL;
+		while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
+			;
+		kvm->arch.max_nested_lpid = lpid;
+		--gp->refcnt;
+	}
+	ref = gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+	if (ref == 0)
+		kvmhv_release_nested(gp);
+}
+
+/*
+ * Free up all nested resources allocated for this guest.
+ */
+void kvmhv_release_all_nested(struct kvm *kvm)
+{
+	int i;
+	struct kvm_nested_guest *gp;
+	struct kvm_nested_guest *freelist = NULL;
+
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
+		gp = kvm->arch.nested_guests[i];
+		if (!gp)
+			continue;
+		kvm->arch.nested_guests[i] = NULL;
+		if (--gp->refcnt == 0) {
+			gp->next = freelist;
+			freelist = gp;
+		}
+	}
+	kvm->arch.max_nested_lpid = -1;
+	spin_unlock(&kvm->mmu_lock);
+	while ((gp = freelist) != NULL) {
+		freelist = gp->next;
+		kvmhv_release_nested(gp);
+	}
+}
+
+/* caller must hold gp->tlb_lock */
+void kvmhv_flush_nested(struct kvm_nested_guest *gp)
+{
+	kvmhv_update_ptbl_cache(gp);
+	if (gp->l1_gr_to_hr == 0)
+		kvmhv_remove_nested(gp);
+}
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int lpid,
+					  bool create)
+{
+	struct kvm_nested_guest *gp, *newgp;
+
+	if (lpid >= KVM_MAX_NESTED_GUESTS ||
+	    lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
+		return NULL;
+
+	spin_lock(&kvm->mmu_lock);
+	gp = kvm->arch.nested_guests[lpid];
+	if (gp)
+		++gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+
+	if (gp || !create)
+		return gp;
+
+	newgp = kvmhv_alloc_nested(kvm, lpid);
+	if (!newgp)
+		return NULL;
+	spin_lock(&kvm->mmu_lock);
+	if (kvm->arch.nested_guests[lpid]) {
+		/* someone else beat us to it */
+		gp = kvm->arch.nested_guests[lpid];
+	} else {
+		kvm->arch.nested_guests[lpid] = newgp;
+		++newgp->refcnt;
+		gp = newgp;
+		newgp = NULL;
+		if (lpid > kvm->arch.max_nested_lpid)
+			kvm->arch.max_nested_lpid = lpid;
+	}
+	++gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+
+	if (newgp)
+		kvmhv_release_nested(newgp);
+
+	return gp;
+}
+
+void kvmhv_put_nested(struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = gp->parent;
+	long ref;
+
+	spin_lock(&kvm->mmu_lock);
+	ref = --gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+	if (ref == 0)
+		kvmhv_release_nested(gp);
+}
+
+struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
+{
+	if (lpid > kvm->arch.max_nested_lpid)
+		return NULL;
+	return kvm->arch.nested_guests[lpid];
+}
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH 19/32] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization
  2018-09-21 10:01 [RFC PATCH 19/32] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization Paul Mackerras
@ 2018-09-26  5:19 ` David Gibson
  2018-09-26 11:16 ` Paul Mackerras
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: David Gibson @ 2018-09-26  5:19 UTC (permalink / raw)
  To: kvm-ppc

[-- Attachment #1: Type: text/plain, Size: 19011 bytes --]

On Fri, Sep 21, 2018 at 08:01:50PM +1000, Paul Mackerras wrote:
> This starts the process of adding the code to support nested HV-style
> virtualization.  It defines a new H_SET_PARTITION_TABLE hypercall which
> a nested hypervisor can use to set the base address and size of a
> partition table in its memory (analogous to the PTCR register).
> On the host (level 0 hypervisor) side, the H_SET_PARTITION_TABLE
> hypercall from the guest is handled by code that saves the virtual
> PTCR value for the guest.
> 
> This also adds code for creating and destroying nested guests and for
> reading the partition table entry for a nested guest from L1 memory.
> Each nested guest has its own shadow LPID value, different in general
> from the LPID value used by the nested hypervisor to refer to it.  The
> shadow LPID value is allocated at nested guest creation time.
> 
> Nested hypervisor functionality is only available for a radix guest,
> which therefore means a radix host on a POWER9 (or later) processor.
> 
> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
> ---
>  arch/powerpc/include/asm/hvcall.h         |   4 +
>  arch/powerpc/include/asm/kvm_book3s.h     |  10 +-
>  arch/powerpc/include/asm/kvm_book3s_64.h  |  17 ++
>  arch/powerpc/include/asm/kvm_book3s_asm.h |   3 +
>  arch/powerpc/include/asm/kvm_host.h       |   5 +
>  arch/powerpc/kvm/Makefile                 |   3 +-
>  arch/powerpc/kvm/book3s_hv.c              |  23 ++-
>  arch/powerpc/kvm/book3s_hv_nested.c       | 286 ++++++++++++++++++++++++++++++
>  8 files changed, 344 insertions(+), 7 deletions(-)
>  create mode 100644 arch/powerpc/kvm/book3s_hv_nested.c
> 
> diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
> index cc9fe87..9afaa82 100644
> --- a/arch/powerpc/include/asm/hvcall.h
> +++ b/arch/powerpc/include/asm/hvcall.h
> @@ -322,6 +322,10 @@
>  #define H_GET_24X7_DATA		0xF07C
>  #define H_GET_PERF_COUNTER_INFO	0xF080
>  
> +/* Platform-specific hcalls used for nested HV KVM */
> +#define H_SET_PARTITION_TABLE	0xF800
> +#define H_ENTER_NESTED		0xF804

So, these are in the platform specific hypercall range.  Do we expect
these to ever be PAPR standardized, or will they always be a
"vendor-specific" extension?

If the latter it might be more sensible to put them next to the
existing KVM/qemu defined hypercalls (e.g. H_RTAS) rather than closer
to the vendor-specific-but-implemented-by-phyp ones.

>  /* Values for 2nd argument to H_SET_MODE */
>  #define H_SET_MODE_RESOURCE_SET_CIABR		1
>  #define H_SET_MODE_RESOURCE_SET_DAWR		2
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 91c9779..7719ca5 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -274,6 +274,13 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {}
>  static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
>  #endif
>  
> +bool kvmhv_nested_init(void);
> +void kvmhv_nested_exit(void);
> +void kvmhv_vm_nested_init(struct kvm *kvm);
> +long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
> +void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
> +void kvmhv_release_all_nested(struct kvm *kvm);
> +
>  void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
>  
>  extern int kvm_irq_bypass;
> @@ -387,9 +394,6 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
>  /* TO = 31 for unconditional trap */
>  #define INS_TW				0x7fe00008
>  
> -/* LPIDs we support with this build -- runtime limit may be lower */
> -#define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
> -
>  #define SPLIT_HACK_MASK			0xff000000
>  #define SPLIT_HACK_OFFS			0xfb000000
>  
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 5c0e2d9..0c90d56 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -24,6 +24,23 @@
>  #include <asm/bitops.h>
>  #include <asm/book3s/64/mmu-hash.h>
>  
> +/* Structure for a nested guest */

Might make it easier to read that this represents a nested guest from
the PoV of the L0 hypervisor, rather than the L1 hypervisor.

Also, do these exist only in the true L0 host, or in any level of host
which has guest more than one level below itself?

> +struct kvm_nested_guest {
> +	struct kvm *parent;		/* L1 VM that owns this nested guest */

"parent" might not be the best name.  That suggests it represents the
hypervisor immediately above this nested guest.  But AFAICT, if this
is a multiply nested guest, then this will be the immediate guest of
*this* host which indirectly owns the nested guest.  Maybe "l1_host" ?

> +	int l1_lpid;			/* lpid L1 guest thinks this guest is */
> +	int shadow_lpid;		/* real lpid of this nested guest */
> +	pgd_t *shadow_pgtable;		/* our page table for this guest */
> +	u64 l1_gr_to_hr;		/* L1's addr of part'n-scoped table */
> +	u64 process_table;		/* process table entry for this guest */
> +	long refcnt;			/* number of pointers to this struct */
> +	struct mutex tlb_lock;		/* serialize page faults and tlbies */
> +	struct kvm_nested_guest *next;
> +};
> +
> +struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int lpid,

Might be good to rename the 'lpid' parameter to make it clearer if
this takes the L1 or L0 value of the lpid.

> +					  bool create);
> +void kvmhv_put_nested(struct kvm_nested_guest *gp);
> +
>  /* Power architecture requires HPT is at least 256kiB, at most 64TiB */
>  #define PPC_MIN_HPT_ORDER	18
>  #define PPC_MAX_HPT_ORDER	46
> diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
> index d978fdf..eb3ba63 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> @@ -25,6 +25,9 @@
>  #define XICS_MFRR		0xc
>  #define XICS_IPI		2	/* interrupt source # for IPIs */
>  
> +/* LPIDs we support with this build -- runtime limit may be lower */
> +#define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
> +
>  /* Maximum number of threads per physical core */
>  #define MAX_SMT_THREADS		8
>  
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index c9cc42f..c35d4f2 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -46,6 +46,7 @@
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>  #include <asm/kvm_book3s_asm.h>		/* for MAX_SMT_THREADS */
>  #define KVM_MAX_VCPU_ID		(MAX_SMT_THREADS * KVM_MAX_VCORES)
> +#define KVM_MAX_NESTED_GUESTS	KVMPPC_NR_LPIDS
>  
>  #else
>  #define KVM_MAX_VCPU_ID		KVM_MAX_VCPUS
> @@ -287,6 +288,7 @@ struct kvm_arch {
>  	u8 radix;
>  	u8 fwnmi_enabled;
>  	bool threads_indep;
> +	bool nested_enable;
>  	pgd_t *pgtable;
>  	u64 process_table;
>  	struct dentry *debugfs_dir;
> @@ -312,6 +314,9 @@ struct kvm_arch {
>  #endif
>  	struct kvmppc_ops *kvm_ops;
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	u64 l1_ptcr;
> +	int max_nested_lpid;
> +	struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];

kvm_nested_guest includes a next pointer.  Is this intended to be an
array, a linked list, or something more complex?

>  	/* This array can grow quite large, keep it at the end */
>  	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
>  #endif
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index f872c04..e814f40 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -75,7 +75,8 @@ kvm-hv-y += \
>  	book3s_hv.o \
>  	book3s_hv_interrupts.o \
>  	book3s_64_mmu_hv.o \
> -	book3s_64_mmu_radix.o
> +	book3s_64_mmu_radix.o \
> +	book3s_hv_nested.o
>  
>  kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
>  	book3s_hv_tm.o
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 82d6668..82c9a1e 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -934,6 +934,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>  		if (ret == H_TOO_HARD)
>  			return RESUME_HOST;
>  		break;
> +
> +	case H_SET_PARTITION_TABLE:
> +		ret = H_FUNCTION;
> +		if (vcpu->kvm->arch.nested_enable)
> +			ret = kvmhv_set_partition_table(vcpu);
> +		break;
> +	case H_ENTER_NESTED:
> +		ret = H_FUNCTION;
> +		break;
> +
>  	default:
>  		return RESUME_HOST;
>  	}
> @@ -4147,8 +4157,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm)
>  			__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
>  		dw1 = PATB_GR | kvm->arch.process_table;
>  	}
> -
> -	mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
> +	kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
>  }
>  
>  /*
> @@ -4364,6 +4373,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
>  
>  	kvmppc_alloc_host_rm_ops();
>  
> +	kvmhv_vm_nested_init(kvm);
> +
>  	/*
>  	 * Since we don't flush the TLB when tearing down a VM,
>  	 * and this lpid might have previously been used,
> @@ -4507,8 +4518,10 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
>  
>  	/* Perform global invalidation and return lpid to the pool */
>  	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> +		if (kvm->arch.nested_enable)
> +			kvmhv_release_all_nested(kvm);
>  		kvm->arch.process_table = 0;
> -		kvmppc_setup_partition_table(kvm);
> +		kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
>  	}
>  	kvmppc_free_lpid(kvm->arch.lpid);
>  
> @@ -4979,6 +4992,9 @@ static int kvmppc_book3s_init_hv(void)
>  	if (r < 0)
>  		return -ENODEV;
>  
> +	if (!cpu_has_feature(CPU_FTR_HVMODE) && !kvmhv_nested_init())
> +		return -ENODEV;
> +
>  	r = kvm_init_subcore_bitmap();
>  	if (r)
>  		return r;
> @@ -5037,6 +5053,7 @@ static void kvmppc_book3s_exit_hv(void)
>  	if (kvmppc_radix_possible())
>  		kvmppc_radix_exit();
>  	kvmppc_hv_ops = NULL;
> +	kvmhv_nested_exit();
>  }
>  
>  module_init(kvmppc_book3s_init_hv);
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> new file mode 100644
> index 0000000..5fe3ea4
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -0,0 +1,286 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright IBM Corporation, 2018
> + * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> + *	   Paul Mackerras <paulus@ozlabs.org>
> + *
> + * Description: KVM functions specific to running nested KVM-HV guests
> + * on Book3S processors (specifically POWER9 and later).
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/kvm_host.h>
> +
> +#include <asm/kvm_ppc.h>
> +#include <asm/mmu.h>
> +#include <asm/pgtable.h>
> +#include <asm/pgalloc.h>
> +
> +static struct patb_entry *pseries_partition_tb;
> +
> +static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
> +
> +/* Only called when we're not in hypervisor mode */
> +bool kvmhv_nested_init(void)
> +{
> +	long int ptb_order;
> +	unsigned long ptcr;
> +	long rc;
> +
> +	if (!radix_enabled())
> +		return false;
> +
> +	/* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
> +	ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
> +	if (ptb_order < 8)
> +		ptb_order = 8;
> +	pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
> +				       GFP_KERNEL);
> +	if (!pseries_partition_tb) {
> +		pr_err("kvm-hv: failed to allocated nested partition table\n");
> +		return false;
> +	}

Would it make sense to have a knob allowing the L0 to limit how many
nested guests each L1 can have (rather than just "0" or "some")?  If
so, would it then also make sense to advertise that to the L1 and have
it allocate its partition table accordingly?

> +
> +	ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
> +	rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
> +	if (rc != H_SUCCESS) {
> +		pr_err("kvm-hv: hypervisor does not support nesting (rc=%ld)\n",
> +		       rc);

Might want to make it clearer that this is about the hypervisor
*above* this kernel not supporting nesting, rather than the hypervisor
*in* this kernel not supporting nesting.

> +		kfree(pseries_partition_tb);
> +		pseries_partition_tb = NULL;
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +void kvmhv_nested_exit(void)
> +{
> +	if (pseries_partition_tb) {
> +		plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
> +		kfree(pseries_partition_tb);
> +		pseries_partition_tb = NULL;
> +	}
> +}
> +
> +void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
> +{
> +	if (cpu_has_feature(CPU_FTR_HVMODE)) {
> +		mmu_partition_table_set_entry(lpid, dw0, dw1);
> +	} else {
> +		pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
> +		pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
> +		/* this will be emulated, L0 will do the necessary barriers */
> +		asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
> +			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
> +	}
> +}
> +
> +static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
> +{
> +	unsigned long dw0;
> +
> +	dw0 = PATB_HR | radix__get_tree_size() |
> +		__pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
> +	kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
> +}
> +
> +void kvmhv_vm_nested_init(struct kvm *kvm)
> +{
> +	kvm->arch.max_nested_lpid = -1;
> +}
> +
> +/*
> + * Handle the H_SET_PARTITION_TABLE hcall.
> + * r4 = guest real address of partition table + log_2(size) - 12
> + * (formatted as for the PTCR).
> + */
> +long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
> +
> +	kvm->arch.l1_ptcr = ptcr;
> +	return H_SUCCESS;
> +}
> +
> +/*
> + * Reload the partition table entry for a guest.
> + * Caller must hold gp->tlb_lock.
> + */
> +static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
> +{
> +	int ret;
> +	struct patb_entry ptbl_entry;
> +	unsigned long ptbl_addr;
> +	struct kvm *kvm = gp->parent;
> +
> +	ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
> +	ret = kvm_read_guest(kvm, ptbl_addr,
> +			     &ptbl_entry, sizeof(ptbl_entry));
> +	if (ret) {
> +		gp->l1_gr_to_hr = 0;
> +		gp->process_table = 0;
> +	} else {
> +		gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
> +		gp->process_table = be64_to_cpu(ptbl_entry.patb1);
> +	}
> +	kvmhv_set_nested_ptbl(gp);
> +}
> +
> +struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
> +{
> +	struct kvm_nested_guest *gp;
> +	long shadow_lpid;
> +
> +	gp = kzalloc(sizeof(*gp), GFP_KERNEL);
> +	if (!gp)
> +		return NULL;
> +	gp->parent = kvm;
> +	gp->l1_lpid = lpid;
> +	mutex_init(&gp->tlb_lock);
> +	gp->shadow_pgtable = pgd_alloc(kvm->mm);
> +	if (!gp->shadow_pgtable)
> +		goto out_free;
> +	shadow_lpid = kvmppc_alloc_lpid();
> +	if (shadow_lpid < 0)
> +		goto out_free2;
> +	gp->shadow_lpid = shadow_lpid;
> +
> +	return gp;
> +
> + out_free2:
> +	pgd_free(kvm->mm, gp->shadow_pgtable);
> + out_free:
> +	kfree(gp);
> +	return NULL;
> +}
> +
> +/*
> + * Free up any resources allocated for a nested guest.
> + */
> +static void kvmhv_release_nested(struct kvm_nested_guest *gp)
> +{
> +	kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
> +	kvmppc_free_lpid(gp->shadow_lpid);
> +	if (gp->shadow_pgtable)
> +		pgd_free(gp->parent->mm, gp->shadow_pgtable);
> +	kfree(gp);
> +}
> +
> +static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
> +{
> +	struct kvm *kvm = gp->parent;
> +	int lpid = gp->l1_lpid;
> +	long ref;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	if (gp == kvm->arch.nested_guests[lpid]) {
> +		kvm->arch.nested_guests[lpid] = NULL;
> +		while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
> +			;

This only looks correct if gp is the guest of KVM with the highest
l1_lpid.  It's not obvious why that would have to be the case.

> +		kvm->arch.max_nested_lpid = lpid;
> +		--gp->refcnt;
> +	}
> +	ref = gp->refcnt;
> +	spin_unlock(&kvm->mmu_lock);
> +	if (ref == 0)
> +		kvmhv_release_nested(gp);
> +}
> +
> +/*
> + * Free up all nested resources allocated for this guest.
> + */
> +void kvmhv_release_all_nested(struct kvm *kvm)
> +{
> +	int i;
> +	struct kvm_nested_guest *gp;
> +	struct kvm_nested_guest *freelist = NULL;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
> +		gp = kvm->arch.nested_guests[i];
> +		if (!gp)
> +			continue;
> +		kvm->arch.nested_guests[i] = NULL;
> +		if (--gp->refcnt == 0) {
> +			gp->next = freelist;
> +			freelist = gp;
> +		}
> +	}
> +	kvm->arch.max_nested_lpid = -1;
> +	spin_unlock(&kvm->mmu_lock);
> +	while ((gp = freelist) != NULL) {
> +		freelist = gp->next;
> +		kvmhv_release_nested(gp);
> +	}
> +}
> +
> +/* caller must hold gp->tlb_lock */
> +void kvmhv_flush_nested(struct kvm_nested_guest *gp)
> +{
> +	kvmhv_update_ptbl_cache(gp);
> +	if (gp->l1_gr_to_hr == 0)
> +		kvmhv_remove_nested(gp);
> +}
> +
> +struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int lpid,
> +					  bool create)
> +{
> +	struct kvm_nested_guest *gp, *newgp;
> +
> +	if (lpid >= KVM_MAX_NESTED_GUESTS ||
> +	    lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
> +		return NULL;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	gp = kvm->arch.nested_guests[lpid];
> +	if (gp)
> +		++gp->refcnt;
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	if (gp || !create)
> +		return gp;
> +
> +	newgp = kvmhv_alloc_nested(kvm, lpid);
> +	if (!newgp)
> +		return NULL;
> +	spin_lock(&kvm->mmu_lock);
> +	if (kvm->arch.nested_guests[lpid]) {
> +		/* someone else beat us to it */
> +		gp = kvm->arch.nested_guests[lpid];
> +	} else {
> +		kvm->arch.nested_guests[lpid] = newgp;
> +		++newgp->refcnt;
> +		gp = newgp;
> +		newgp = NULL;
> +		if (lpid > kvm->arch.max_nested_lpid)
> +			kvm->arch.max_nested_lpid = lpid;
> +	}
> +	++gp->refcnt;
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	if (newgp)
> +		kvmhv_release_nested(newgp);
> +
> +	return gp;
> +}
> +
> +void kvmhv_put_nested(struct kvm_nested_guest *gp)
> +{
> +	struct kvm *kvm = gp->parent;
> +	long ref;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	ref = --gp->refcnt;
> +	spin_unlock(&kvm->mmu_lock);
> +	if (ref == 0)
> +		kvmhv_release_nested(gp);
> +}
> +
> +struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)

Why do you need this as well as get_nested()?  Plus this is a
non-static function that hasn't been added to any headers.

> +{
> +	if (lpid > kvm->arch.max_nested_lpid)
> +		return NULL;
> +	return kvm->arch.nested_guests[lpid];
> +}

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH 19/32] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization
  2018-09-21 10:01 [RFC PATCH 19/32] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization Paul Mackerras
  2018-09-26  5:19 ` David Gibson
@ 2018-09-26 11:16 ` Paul Mackerras
  2018-09-26 11:23 ` Paul Mackerras
  2018-09-27  0:47 ` David Gibson
  3 siblings, 0 replies; 5+ messages in thread
From: Paul Mackerras @ 2018-09-26 11:16 UTC (permalink / raw)
  To: kvm-ppc

On Wed, Sep 26, 2018 at 03:19:00PM +1000, David Gibson wrote:
> On Fri, Sep 21, 2018 at 08:01:50PM +1000, Paul Mackerras wrote:
> >  
> > +/* Platform-specific hcalls used for nested HV KVM */
> > +#define H_SET_PARTITION_TABLE	0xF800
> > +#define H_ENTER_NESTED		0xF804
> 
> So, these are in the platform specific hypercall range.  Do we expect
> these to ever be PAPR standardized, or will they always be a
> "vendor-specific" extension?
> 
> If the latter it might be more sensible to put them next to the
> existing KVM/qemu defined hypercalls (e.g. H_RTAS) rather than closer
> to the vendor-specific-but-implemented-by-phyp ones.

Interesting question.  It's possible they could be added to PAPR (and
even not-impossibly implemented by pHyp one day, maybe).  I should
ping the PAPR maintainer and see what he thinks.

> > @@ -24,6 +24,23 @@
> >  #include <asm/bitops.h>
> >  #include <asm/book3s/64/mmu-hash.h>
> >  
> > +/* Structure for a nested guest */
> 
> Might make it easier to read that this represents a nested guest from
> the PoV of the L0 hypervisor, rather than the L1 hypervisor.
> 
> Also, do these exist only in the true L0 host, or in any level of host
> which has guest more than one level below itself?

The latter.  With multiple nesting, each level of hypervisor has a
representation of every guest underneath it (which amounts to a struct
kvm_nested_guest and a shadow page table).

> > +struct kvm_nested_guest {
> > +	struct kvm *parent;		/* L1 VM that owns this nested guest */
> 
> "parent" might not be the best name.  That suggests it represents the
> hypervisor immediately above this nested guest.  But AFAICT, if this
> is a multiply nested guest, then this will be the immediate guest of
> *this* host which indirectly owns the nested guest.  Maybe "l1_host" ?

Well, since every Ln guest is a guest of Lm for all 0 <= m < n, that
means that L0 only sees L1 and L2.  If there is an L3 or L4 they look
identical to L2 from L0's point of view.  In other words, L0 does
regard L1 as owning all the Ln guests under it for n >= 2.  But we can
change the name if you think it's clearer.

> > +struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int lpid,
> 
> Might be good to rename the 'lpid' parameter to make it clearer if
> this takes the L1 or L0 value of the lpid.

OK.  It's the l1 lpid.

> > +	u64 l1_ptcr;
> > +	int max_nested_lpid;
> > +	struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
> 
> kvm_nested_guest includes a next pointer.  Is this intended to be an
> array, a linked list, or something more complex?

It's a temporary linked list used while freeing all guests to avoid
dropping and re-taking the mmu_lock.

> > +/* Only called when we're not in hypervisor mode */
> > +bool kvmhv_nested_init(void)
> > +{
> > +	long int ptb_order;
> > +	unsigned long ptcr;
> > +	long rc;
> > +
> > +	if (!radix_enabled())
> > +		return false;
> > +
> > +	/* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
> > +	ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
> > +	if (ptb_order < 8)
> > +		ptb_order = 8;
> > +	pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
> > +				       GFP_KERNEL);
> > +	if (!pseries_partition_tb) {
> > +		pr_err("kvm-hv: failed to allocated nested partition table\n");
> > +		return false;
> > +	}
> 
> Would it make sense to have a knob allowing the L0 to limit how many
> nested guests each L1 can have (rather than just "0" or "some")?  If
> so, would it then also make sense to advertise that to the L1 and have
> it allocate its partition table accordingly?

Maybe? Probably? Though then we get into having to have that as a
capability with a minimum required value across a migration domain so
that an L1 guest running on one host doesn't suddenly die, or have
some of its nested guests die, when it gets migrated.

We do already have a limit introduced in this patch of
KVMPPC_NR_LPIDS, which is currently 1024.  If we ignore POWER7 then we
can support up to 4095 guests at L0.

Let me know if you think we want a user-controllable limit on nested
guests per L1 guest, and if so how that would work in a migration
domain.

> > +
> > +	ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
> > +	rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
> > +	if (rc != H_SUCCESS) {
> > +		pr_err("kvm-hv: hypervisor does not support nesting (rc=%ld)\n",
> > +		       rc);
> 
> Might want to make it clearer that this is about the hypervisor
> *above* this kernel not supporting nesting, rather than the hypervisor
> *in* this kernel not supporting nesting.

OK.  "our hypervisor..." perhaps? or what?

> > +void kvmhv_put_nested(struct kvm_nested_guest *gp)
> > +{
> > +	struct kvm *kvm = gp->parent;
> > +	long ref;
> > +
> > +	spin_lock(&kvm->mmu_lock);
> > +	ref = --gp->refcnt;
> > +	spin_unlock(&kvm->mmu_lock);
> > +	if (ref == 0)
> > +		kvmhv_release_nested(gp);
> > +}
> > +
> > +struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
> 
> Why do you need this as well as get_nested()?  Plus this is a
> non-static function that hasn't been added to any headers.

It's for use in the TLB invalidation functions that get added later,
and we add "static" at that point.  I could just add the function with
its first use.

Paul.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH 19/32] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization
  2018-09-21 10:01 [RFC PATCH 19/32] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization Paul Mackerras
  2018-09-26  5:19 ` David Gibson
  2018-09-26 11:16 ` Paul Mackerras
@ 2018-09-26 11:23 ` Paul Mackerras
  2018-09-27  0:47 ` David Gibson
  3 siblings, 0 replies; 5+ messages in thread
From: Paul Mackerras @ 2018-09-26 11:23 UTC (permalink / raw)
  To: kvm-ppc

On Wed, Sep 26, 2018 at 03:19:00PM +1000, David Gibson wrote:
> On Fri, Sep 21, 2018 at 08:01:50PM +1000, Paul Mackerras wrote:

> > +static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
> > +{
> > +	struct kvm *kvm = gp->parent;
> > +	int lpid = gp->l1_lpid;
> > +	long ref;
> > +
> > +	spin_lock(&kvm->mmu_lock);
> > +	if (gp == kvm->arch.nested_guests[lpid]) {
> > +		kvm->arch.nested_guests[lpid] = NULL;
> > +		while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
> > +			;
> 
> > This only looks correct if gp is the guest of KVM with the highest
> l1_lpid.  It's not obvious why that would have to be the case.

Not obvious because not true.  Good catch. :)

Paul.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH 19/32] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization
  2018-09-21 10:01 [RFC PATCH 19/32] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization Paul Mackerras
                   ` (2 preceding siblings ...)
  2018-09-26 11:23 ` Paul Mackerras
@ 2018-09-27  0:47 ` David Gibson
  3 siblings, 0 replies; 5+ messages in thread
From: David Gibson @ 2018-09-27  0:47 UTC (permalink / raw)
  To: kvm-ppc

[-- Attachment #1: Type: text/plain, Size: 6505 bytes --]

On Wed, Sep 26, 2018 at 09:16:09PM +1000, Paul Mackerras wrote:
> On Wed, Sep 26, 2018 at 03:19:00PM +1000, David Gibson wrote:
> > On Fri, Sep 21, 2018 at 08:01:50PM +1000, Paul Mackerras wrote:
> > >  
> > > +/* Platform-specific hcalls used for nested HV KVM */
> > > +#define H_SET_PARTITION_TABLE	0xF800
> > > +#define H_ENTER_NESTED		0xF804
> > 
> > So, these are in the platform specific hypercall range.  Do we expect
> > these to ever be PAPR standardized, or will they always be a
> > "vendor-specific" extension?
> > 
> > If the latter it might be more sensible to put them next to the
> > existing KVM/qemu defined hypercalls (e.g. H_RTAS) rather than closer
> > to the vendor-specific-but-implemented-by-phyp ones.
> 
> Interesting question.  It's possible they could be added to PAPR (and
> even not-impossibly implemented by pHyp one day, maybe).  I should
> ping the PAPR maintainer and see what he thinks.

Based on our discussion on the call today, I think where they are is
fine on reflection.

> 
> > > @@ -24,6 +24,23 @@
> > >  #include <asm/bitops.h>
> > >  #include <asm/book3s/64/mmu-hash.h>
> > >  
> > > +/* Structure for a nested guest */
> > 
> > Might make it easier to read that this represents a nested guest from
> > the PoV of the L0 hypervisor, rather than the L1 hypervisor.
> > 
> > Also, do these exist only in the true L0 host, or in any level of host
> > which has guest more than one level below itself?
> 
> The latter.  With multiple nesting, each level of hypervisor has a
> representation of every guest underneath it (which amounts to a struct
> kvm_nested_guest and a shadow page table).

> > > +struct kvm_nested_guest {
> > > +	struct kvm *parent;		/* L1 VM that owns this nested guest */
> > 
> > "parent" might not be the best name.  That suggests it represents the
> > hypervisor immediately above this nested guest.  But AFAICT, if this
> > is a multiply nested guest, then this will be the immediate guest of
> > *this* host which indirectly owns the nested guest.  Maybe "l1_host" ?
> 
> Well, since every Ln guest is a guest of Lm for all 0 <= m < n, that
> means that L0 only sees L1 and L2.  If there is an L3 or L4 they look
> identical to L2 from L0's point of view.  In other words, L0 does
> regard L1 as owning all the Ln guests under it for n >= 2.  But we can
> change the name if you think it's clearer.

Ok.  I feel like we need better terminology for "one level below us",
"one level above us" etc. that distinguish from absolute levels like
L0, L1 etc.  Good options don't immediately occur to me.

> > > +struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int lpid,
> > 
> > Might be good to rename the 'lpid' parameter to make it clearer if
> > this takes the L1 or L0 value of the lpid.
> 
> OK.  It's the l1 lpid.
> 
> > > +	u64 l1_ptcr;
> > > +	int max_nested_lpid;
> > > +	struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
> > 
> > kvm_nested_guest includes a next pointer.  Is this intended to be an
> > array, a linked list, or something more complex?
> 
> It's a temporary linked list used while freeing all guests to avoid
> dropping and re-taking the mmu_lock.

Ok, maybe a comment on the next pointer saying it's just used for the
free list.

> > > +/* Only called when we're not in hypervisor mode */
> > > +bool kvmhv_nested_init(void)
> > > +{
> > > +	long int ptb_order;
> > > +	unsigned long ptcr;
> > > +	long rc;
> > > +
> > > +	if (!radix_enabled())
> > > +		return false;
> > > +
> > > +	/* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
> > > +	ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
> > > +	if (ptb_order < 8)
> > > +		ptb_order = 8;
> > > +	pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
> > > +				       GFP_KERNEL);
> > > +	if (!pseries_partition_tb) {
> > > +		pr_err("kvm-hv: failed to allocated nested partition table\n");
> > > +		return false;
> > > +	}
> > 
> > Would it make sense to have a knob allowing the L0 to limit how many
> > nested guests each L1 can have (rather than just "0" or "some")?  If
> > so, would it then also make sense to advertise that to the L1 and have
> > it allocate its partition table accordingly?
> 
> Maybe? Probably? Though then we get into having to have that as a
> capability with a minimum required value across a migration domain so
> that an L1 guest running on one host doesn't suddenly die, or have
> some of its nested guests die, when it gets migrated.
> 
> We do already have a limit introduced in this patch of
> KVMPPC_NR_LPIDS, which is currently 1024.  If we ignore POWER7 then we
> can support up to 4095 guests at L0.
> 
> Let me know if you think we want a user-controllable limit on nested
> guests per L1 guest, and if so how that would work in a migration
> domain.

As discussed on the call, we can look at this as a later refinement.

> > > +
> > > +	ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
> > > +	rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
> > > +	if (rc != H_SUCCESS) {
> > > +		pr_err("kvm-hv: hypervisor does not support nesting (rc=%ld)\n",
> > > +		       rc);
> > 
> > Might want to make it clearer that this is about the hypervisor
> > *above* this kernel not supporting nesting, rather than the hypervisor
> > *in* this kernel not supporting nesting.
> 
> OK.  "our hypervisor..." perhaps? or what?

Yeah, now I come to think of it, making this clearer is kind of hard.

"Parent hypervisor..." maybe?

> > > +void kvmhv_put_nested(struct kvm_nested_guest *gp)
> > > +{
> > > +	struct kvm *kvm = gp->parent;
> > > +	long ref;
> > > +
> > > +	spin_lock(&kvm->mmu_lock);
> > > +	ref = --gp->refcnt;
> > > +	spin_unlock(&kvm->mmu_lock);
> > > +	if (ref == 0)
> > > +		kvmhv_release_nested(gp);
> > > +}
> > > +
> > > +struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
> > 
> > Why do you need this as well as get_nested()?  Plus this is a
> > non-static function that hasn't been added to any headers.
> 
> It's for use in the TLB invalidation functions that get added later,
> and we add "static" at that point.  I could just add the function with
> its first use.

Sounds like an idea.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-09-27  0:47 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-09-21 10:01 [RFC PATCH 19/32] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization Paul Mackerras
2018-09-26  5:19 ` David Gibson
2018-09-26 11:16 ` Paul Mackerras
2018-09-26 11:23 ` Paul Mackerras
2018-09-27  0:47 ` David Gibson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.