Kernel KVM virtualization development
 help / color / mirror / Atom feed
* [PATCH kvmtool 0/3] arm64: Nested virtualization support
@ 2025-06-20 10:44 Andre Przywara
  2025-06-20 10:44 ` [PATCH kvmtool 1/3] Sync kernel UAPI headers with v6.16-rc1 Andre Przywara
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: Andre Przywara @ 2025-06-20 10:44 UTC (permalink / raw)
  To: Will Deacon, Julien Thierry, Marc Zyngier; +Cc: kvm, kvmarm

Thanks to the imperturbable efforts from Marc, arm64 support for nested
virtualization has now reached the mainline kernel, which means the
respective kvmtool support should now be ready as well.

Patch 1 updates the kernel headers, to get the new EL2 capability, and
the VGIC device control to set up the maintenance IRQ.
Patch 2 introduces the new "--nested" command line option, to let the
VCPUs start in EL2. To allow KVM guests running in such a guest, we also
need VGIC support, which patch 3 allows by setting the maintenance IRQ.

Tested on the FVP (with a good deal of patience), and some commercial
(non-fruity) hardware, down to a guest's guest's guest.

Cheers,
Andre

P.S.: Marc: I saw the other patches in your kernel.org repo, do we need any
of them - HYP timer IRQ, E2H0, counter offset? I guess E2H0 for fruity
hardware, what about the others?

Andre Przywara (3):
  Sync kernel UAPI headers with v6.16-rc1
  arm64: Initial nested virt support
  arm64: nested: add support for setting maintenance IRQ

 arm64/arm-cpu.c                     |  3 +-
 arm64/fdt.c                         |  5 +-
 arm64/gic.c                         | 21 +++++++-
 arm64/include/asm/kvm.h             | 23 +++++++--
 arm64/include/kvm/gic.h             |  2 +-
 arm64/include/kvm/kvm-config-arch.h |  5 +-
 arm64/kvm-cpu.c                     | 12 ++++-
 include/linux/kvm.h                 |  5 ++
 include/linux/virtio_net.h          | 13 +++++
 include/linux/virtio_pci.h          |  1 +
 riscv/include/asm/kvm.h             |  2 +
 x86/include/asm/kvm.h               | 75 +++++++++++++++++++++++++++++
 12 files changed, 157 insertions(+), 10 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH kvmtool 1/3] Sync kernel UAPI headers with v6.16-rc1
  2025-06-20 10:44 [PATCH kvmtool 0/3] arm64: Nested virtualization support Andre Przywara
@ 2025-06-20 10:44 ` Andre Przywara
  2025-06-20 10:44 ` [PATCH kvmtool 2/3] arm64: Initial nested virt support Andre Przywara
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 8+ messages in thread
From: Andre Przywara @ 2025-06-20 10:44 UTC (permalink / raw)
  To: Will Deacon, Julien Thierry, Marc Zyngier; +Cc: kvm, kvmarm

Needed for ARM nested virt support.
Generated using util/update_headers.sh.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
---
 arm64/include/asm/kvm.h    | 23 ++++++++++--
 include/linux/kvm.h        |  5 +++
 include/linux/virtio_net.h | 13 +++++++
 include/linux/virtio_pci.h |  1 +
 riscv/include/asm/kvm.h    |  2 +
 x86/include/asm/kvm.h      | 75 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 115 insertions(+), 4 deletions(-)

diff --git a/arm64/include/asm/kvm.h b/arm64/include/asm/kvm.h
index 568bf858f..ed5f38926 100644
--- a/arm64/include/asm/kvm.h
+++ b/arm64/include/asm/kvm.h
@@ -105,6 +105,7 @@ struct kvm_regs {
 #define KVM_ARM_VCPU_PTRAUTH_ADDRESS	5 /* VCPU uses address authentication */
 #define KVM_ARM_VCPU_PTRAUTH_GENERIC	6 /* VCPU uses generic authentication */
 #define KVM_ARM_VCPU_HAS_EL2		7 /* Support nested virtualization */
+#define KVM_ARM_VCPU_HAS_EL2_E2H0	8 /* Limit NV support to E2H RES0 */
 
 struct kvm_vcpu_init {
 	__u32 target;
@@ -371,6 +372,7 @@ enum {
 #endif
 };
 
+/* Vendor hyper call function numbers 0-63 */
 #define KVM_REG_ARM_VENDOR_HYP_BMAP		KVM_REG_ARM_FW_FEAT_BMAP_REG(2)
 
 enum {
@@ -381,6 +383,17 @@ enum {
 #endif
 };
 
+/* Vendor hyper call function numbers 64-127 */
+#define KVM_REG_ARM_VENDOR_HYP_BMAP_2		KVM_REG_ARM_FW_FEAT_BMAP_REG(3)
+
+enum {
+	KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_VER	= 0,
+	KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_CPUS	= 1,
+#ifdef __KERNEL__
+	KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_COUNT,
+#endif
+};
+
 /* Device Control API on vm fd */
 #define KVM_ARM_VM_SMCCC_CTRL		0
 #define   KVM_ARM_VM_SMCCC_FILTER	0
@@ -403,6 +416,7 @@ enum {
 #define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
 #define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO  7
 #define KVM_DEV_ARM_VGIC_GRP_ITS_REGS 8
+#define KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ  9
 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT	10
 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
 			(0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
@@ -417,10 +431,11 @@ enum {
 
 /* Device Control API on vcpu fd */
 #define KVM_ARM_VCPU_PMU_V3_CTRL	0
-#define   KVM_ARM_VCPU_PMU_V3_IRQ	0
-#define   KVM_ARM_VCPU_PMU_V3_INIT	1
-#define   KVM_ARM_VCPU_PMU_V3_FILTER	2
-#define   KVM_ARM_VCPU_PMU_V3_SET_PMU	3
+#define   KVM_ARM_VCPU_PMU_V3_IRQ		0
+#define   KVM_ARM_VCPU_PMU_V3_INIT		1
+#define   KVM_ARM_VCPU_PMU_V3_FILTER		2
+#define   KVM_ARM_VCPU_PMU_V3_SET_PMU		3
+#define   KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS	4
 #define KVM_ARM_VCPU_TIMER_CTRL		1
 #define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER		0
 #define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER		1
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 45e6d8fca..d00b85cb1 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -375,6 +375,7 @@ struct kvm_run {
 #define KVM_SYSTEM_EVENT_WAKEUP         4
 #define KVM_SYSTEM_EVENT_SUSPEND        5
 #define KVM_SYSTEM_EVENT_SEV_TERM       6
+#define KVM_SYSTEM_EVENT_TDX_FATAL      7
 			__u32 type;
 			__u32 ndata;
 			union {
@@ -929,6 +930,10 @@ struct kvm_enable_cap {
 #define KVM_CAP_PRE_FAULT_MEMORY 236
 #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237
 #define KVM_CAP_X86_GUEST_MODE 238
+#define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239
+#define KVM_CAP_ARM_EL2 240
+#define KVM_CAP_ARM_EL2_E2H0 241
+#define KVM_CAP_RISCV_MP_STATE_RESET 242
 
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index ac9174717..963540dea 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -327,6 +327,19 @@ struct virtio_net_rss_config {
 	__u8 hash_key_data[/* hash_key_length */];
 };
 
+struct virtio_net_rss_config_hdr {
+	__le32 hash_types;
+	__le16 indirection_table_mask;
+	__le16 unclassified_queue;
+	__le16 indirection_table[/* 1 + indirection_table_mask */];
+};
+
+struct virtio_net_rss_config_trailer {
+	__le16 max_tx_vq;
+	__u8 hash_key_length;
+	__u8 hash_key_data[/* hash_key_length */];
+};
+
  #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG          1
 
 /*
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
index 8549d4571..c691ac210 100644
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -246,6 +246,7 @@ struct virtio_pci_cfg_cap {
 #define VIRTIO_ADMIN_CMD_LIST_USE	0x1
 
 /* Admin command group type. */
+#define VIRTIO_ADMIN_GROUP_TYPE_SELF	0x0
 #define VIRTIO_ADMIN_GROUP_TYPE_SRIOV	0x1
 
 /* Transitional device admin command. */
diff --git a/riscv/include/asm/kvm.h b/riscv/include/asm/kvm.h
index f06bc5efc..5f59fd226 100644
--- a/riscv/include/asm/kvm.h
+++ b/riscv/include/asm/kvm.h
@@ -182,6 +182,8 @@ enum KVM_RISCV_ISA_EXT_ID {
 	KVM_RISCV_ISA_EXT_SVVPTC,
 	KVM_RISCV_ISA_EXT_ZABHA,
 	KVM_RISCV_ISA_EXT_ZICCRSE,
+	KVM_RISCV_ISA_EXT_ZAAMO,
+	KVM_RISCV_ISA_EXT_ZALRSC,
 	KVM_RISCV_ISA_EXT_MAX,
 };
 
diff --git a/x86/include/asm/kvm.h b/x86/include/asm/kvm.h
index 9e75da97b..6f3499507 100644
--- a/x86/include/asm/kvm.h
+++ b/x86/include/asm/kvm.h
@@ -441,6 +441,7 @@ struct kvm_sync_regs {
 #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS	(1 << 6)
 #define KVM_X86_QUIRK_SLOT_ZAP_ALL		(1 << 7)
 #define KVM_X86_QUIRK_STUFF_FEATURE_MSRS	(1 << 8)
+#define KVM_X86_QUIRK_IGNORE_GUEST_PAT		(1 << 9)
 
 #define KVM_STATE_NESTED_FORMAT_VMX	0
 #define KVM_STATE_NESTED_FORMAT_SVM	1
@@ -559,6 +560,9 @@ struct kvm_x86_mce {
 #define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE	(1 << 7)
 #define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA	(1 << 8)
 
+#define KVM_XEN_MSR_MIN_INDEX			0x40000000u
+#define KVM_XEN_MSR_MAX_INDEX			0x4fffffffu
+
 struct kvm_xen_hvm_config {
 	__u32 flags;
 	__u32 msr;
@@ -841,6 +845,7 @@ struct kvm_sev_snp_launch_start {
 };
 
 /* Kept in sync with firmware values for simplicity. */
+#define KVM_SEV_PAGE_TYPE_INVALID		0x0
 #define KVM_SEV_SNP_PAGE_TYPE_NORMAL		0x1
 #define KVM_SEV_SNP_PAGE_TYPE_ZERO		0x3
 #define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED	0x4
@@ -927,4 +932,74 @@ struct kvm_hyperv_eventfd {
 #define KVM_X86_SNP_VM		4
 #define KVM_X86_TDX_VM		5
 
+/* Trust Domain eXtension sub-ioctl() commands. */
+enum kvm_tdx_cmd_id {
+	KVM_TDX_CAPABILITIES = 0,
+	KVM_TDX_INIT_VM,
+	KVM_TDX_INIT_VCPU,
+	KVM_TDX_INIT_MEM_REGION,
+	KVM_TDX_FINALIZE_VM,
+	KVM_TDX_GET_CPUID,
+
+	KVM_TDX_CMD_NR_MAX,
+};
+
+struct kvm_tdx_cmd {
+	/* enum kvm_tdx_cmd_id */
+	__u32 id;
+	/* flags for sub-commend. If sub-command doesn't use this, set zero. */
+	__u32 flags;
+	/*
+	 * data for each sub-command. An immediate or a pointer to the actual
+	 * data in process virtual address.  If sub-command doesn't use it,
+	 * set zero.
+	 */
+	__u64 data;
+	/*
+	 * Auxiliary error code.  The sub-command may return TDX SEAMCALL
+	 * status code in addition to -Exxx.
+	 */
+	__u64 hw_error;
+};
+
+struct kvm_tdx_capabilities {
+	__u64 supported_attrs;
+	__u64 supported_xfam;
+	__u64 reserved[254];
+
+	/* Configurable CPUID bits for userspace */
+	struct kvm_cpuid2 cpuid;
+};
+
+struct kvm_tdx_init_vm {
+	__u64 attributes;
+	__u64 xfam;
+	__u64 mrconfigid[6];	/* sha384 digest */
+	__u64 mrowner[6];	/* sha384 digest */
+	__u64 mrownerconfig[6];	/* sha384 digest */
+
+	/* The total space for TD_PARAMS before the CPUIDs is 256 bytes */
+	__u64 reserved[12];
+
+	/*
+	 * Call KVM_TDX_INIT_VM before vcpu creation, thus before
+	 * KVM_SET_CPUID2.
+	 * This configuration supersedes KVM_SET_CPUID2s for VCPUs because the
+	 * TDX module directly virtualizes those CPUIDs without VMM.  The user
+	 * space VMM, e.g. qemu, should make KVM_SET_CPUID2 consistent with
+	 * those values.  If it doesn't, KVM may have wrong idea of vCPUIDs of
+	 * the guest, and KVM may wrongly emulate CPUIDs or MSRs that the TDX
+	 * module doesn't virtualize.
+	 */
+	struct kvm_cpuid2 cpuid;
+};
+
+#define KVM_TDX_MEASURE_MEMORY_REGION   _BITULL(0)
+
+struct kvm_tdx_init_mem_region {
+	__u64 source_addr;
+	__u64 gpa;
+	__u64 nr_pages;
+};
+
 #endif /* _ASM_X86_KVM_H */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH kvmtool 2/3] arm64: Initial nested virt support
  2025-06-20 10:44 [PATCH kvmtool 0/3] arm64: Nested virtualization support Andre Przywara
  2025-06-20 10:44 ` [PATCH kvmtool 1/3] Sync kernel UAPI headers with v6.16-rc1 Andre Przywara
@ 2025-06-20 10:44 ` Andre Przywara
  2025-06-20 11:09   ` Alexandru Elisei
  2025-06-20 10:44 ` [PATCH kvmtool 3/3] arm64: nested: add support for setting maintenance IRQ Andre Przywara
  2025-06-20 11:13 ` [PATCH kvmtool 0/3] arm64: Nested virtualization support Marc Zyngier
  3 siblings, 1 reply; 8+ messages in thread
From: Andre Przywara @ 2025-06-20 10:44 UTC (permalink / raw)
  To: Will Deacon, Julien Thierry, Marc Zyngier; +Cc: kvm, kvmarm

The ARMv8.3 architecture update includes support for nested
virtualization. Allow the user to specify "--nested" to start a guest in
(virtual) EL2 instead of EL1.
This will also change the PSCI conduit from HVC to SMC in the device
tree.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
---
 arm64/fdt.c                         |  5 ++++-
 arm64/include/kvm/kvm-config-arch.h |  5 ++++-
 arm64/kvm-cpu.c                     | 12 +++++++++++-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/arm64/fdt.c b/arm64/fdt.c
index df7775876..98f1dd9d4 100644
--- a/arm64/fdt.c
+++ b/arm64/fdt.c
@@ -205,7 +205,10 @@ static int setup_fdt(struct kvm *kvm)
 		_FDT(fdt_property_string(fdt, "compatible", "arm,psci"));
 		fns = &psci_0_1_fns;
 	}
-	_FDT(fdt_property_string(fdt, "method", "hvc"));
+	if (kvm->cfg.arch.nested_virt)
+		_FDT(fdt_property_string(fdt, "method", "smc"));
+	else
+		_FDT(fdt_property_string(fdt, "method", "hvc"));
 	_FDT(fdt_property_cell(fdt, "cpu_suspend", fns->cpu_suspend));
 	_FDT(fdt_property_cell(fdt, "cpu_off", fns->cpu_off));
 	_FDT(fdt_property_cell(fdt, "cpu_on", fns->cpu_on));
diff --git a/arm64/include/kvm/kvm-config-arch.h b/arm64/include/kvm/kvm-config-arch.h
index ee031f010..a1dac28e6 100644
--- a/arm64/include/kvm/kvm-config-arch.h
+++ b/arm64/include/kvm/kvm-config-arch.h
@@ -10,6 +10,7 @@ struct kvm_config_arch {
 	bool		aarch32_guest;
 	bool		has_pmuv3;
 	bool		mte_disabled;
+	bool		nested_virt;
 	u64		kaslr_seed;
 	enum irqchip_type irqchip;
 	u64		fw_addr;
@@ -57,6 +58,8 @@ int sve_vl_parser(const struct option *opt, const char *arg, int unset);
 		     "Type of interrupt controller to emulate in the guest",	\
 		     irqchip_parser, NULL),					\
 	OPT_U64('\0', "firmware-address", &(cfg)->fw_addr,			\
-		"Address where firmware should be loaded"),
+		"Address where firmware should be loaded"),			\
+	OPT_BOOLEAN('\0', "nested", &(cfg)->nested_virt,			\
+		    "Start VCPUs in EL2 (for nested virt)"),
 
 #endif /* ARM_COMMON__KVM_CONFIG_ARCH_H */
diff --git a/arm64/kvm-cpu.c b/arm64/kvm-cpu.c
index 94c08a4d7..42dc11dad 100644
--- a/arm64/kvm-cpu.c
+++ b/arm64/kvm-cpu.c
@@ -71,6 +71,12 @@ static void kvm_cpu__select_features(struct kvm *kvm, struct kvm_vcpu_init *init
 	/* Enable SVE if available */
 	if (kvm__supports_extension(kvm, KVM_CAP_ARM_SVE))
 		init->features[0] |= 1UL << KVM_ARM_VCPU_SVE;
+
+	if (kvm->cfg.arch.nested_virt) {
+		if (!kvm__supports_extension(kvm, KVM_CAP_ARM_EL2))
+			die("EL2 (nested virt) is not supported");
+		init->features[0] |= 1UL << KVM_ARM_VCPU_HAS_EL2;
+	}
 }
 
 static int vcpu_configure_sve(struct kvm_cpu *vcpu)
@@ -313,7 +319,11 @@ static void reset_vcpu_aarch64(struct kvm_cpu *vcpu)
 	reg.addr = (u64)&data;
 
 	/* pstate = all interrupts masked */
-	data	= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL1h;
+	data	= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT;
+	if (vcpu->kvm->cfg.arch.nested_virt)
+		data |= PSR_MODE_EL2h;
+	else
+		data |= PSR_MODE_EL1h;
 	reg.id	= ARM64_CORE_REG(regs.pstate);
 	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
 		die_perror("KVM_SET_ONE_REG failed (spsr[EL1])");
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH kvmtool 3/3] arm64: nested: add support for setting maintenance IRQ
  2025-06-20 10:44 [PATCH kvmtool 0/3] arm64: Nested virtualization support Andre Przywara
  2025-06-20 10:44 ` [PATCH kvmtool 1/3] Sync kernel UAPI headers with v6.16-rc1 Andre Przywara
  2025-06-20 10:44 ` [PATCH kvmtool 2/3] arm64: Initial nested virt support Andre Przywara
@ 2025-06-20 10:44 ` Andre Przywara
  2025-06-20 11:13 ` [PATCH kvmtool 0/3] arm64: Nested virtualization support Marc Zyngier
  3 siblings, 0 replies; 8+ messages in thread
From: Andre Przywara @ 2025-06-20 10:44 UTC (permalink / raw)
  To: Will Deacon, Julien Thierry, Marc Zyngier; +Cc: kvm, kvmarm

Uses the new VGIC KVM device attribute to set the maintenance IRQ.
This is fixed to use PPI 9, as a platform decision made by kvmtool,
matching the SBSA recommendation.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
---
 arm64/arm-cpu.c         |  3 ++-
 arm64/gic.c             | 21 ++++++++++++++++++++-
 arm64/include/kvm/gic.h |  2 +-
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/arm64/arm-cpu.c b/arm64/arm-cpu.c
index 69bb2cb2c..1e456f2c6 100644
--- a/arm64/arm-cpu.c
+++ b/arm64/arm-cpu.c
@@ -14,7 +14,8 @@ static void generate_fdt_nodes(void *fdt, struct kvm *kvm)
 {
 	int timer_interrupts[4] = {13, 14, 11, 10};
 
-	gic__generate_fdt_nodes(fdt, kvm->cfg.arch.irqchip);
+	gic__generate_fdt_nodes(fdt, kvm->cfg.arch.irqchip,
+				kvm->cfg.arch.nested_virt);
 	timer__generate_fdt_nodes(fdt, kvm, timer_interrupts);
 	pmu__generate_fdt_nodes(fdt, kvm);
 }
diff --git a/arm64/gic.c b/arm64/gic.c
index b0d3a1abb..7461b0f3f 100644
--- a/arm64/gic.c
+++ b/arm64/gic.c
@@ -11,6 +11,8 @@
 
 #define IRQCHIP_GIC 0
 
+#define GIC_MAINT_IRQ	9
+
 static int gic_fd = -1;
 static u64 gic_redists_base;
 static u64 gic_redists_size;
@@ -302,10 +304,15 @@ static int gic__init_gic(struct kvm *kvm)
 
 	int lines = irq__get_nr_allocated_lines();
 	u32 nr_irqs = ALIGN(lines, 32) + GIC_SPI_IRQ_BASE;
+	u32 maint_irq = GIC_MAINT_IRQ + 16;			/* PPI */
 	struct kvm_device_attr nr_irqs_attr = {
 		.group	= KVM_DEV_ARM_VGIC_GRP_NR_IRQS,
 		.addr	= (u64)(unsigned long)&nr_irqs,
 	};
+	struct kvm_device_attr maint_irq_attr = {
+		.group	= KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ,
+		.addr	= (u64)(unsigned long)&maint_irq,
+	};
 	struct kvm_device_attr vgic_init_attr = {
 		.group	= KVM_DEV_ARM_VGIC_GRP_CTRL,
 		.attr	= KVM_DEV_ARM_VGIC_CTRL_INIT,
@@ -325,6 +332,13 @@ static int gic__init_gic(struct kvm *kvm)
 			return ret;
 	}
 
+	if (kvm->cfg.arch.nested_virt &&
+	    !ioctl(gic_fd, KVM_HAS_DEVICE_ATTR, &maint_irq_attr)) {
+		ret = ioctl(gic_fd, KVM_SET_DEVICE_ATTR, &maint_irq_attr);
+		if (ret)
+			return ret;
+	}
+
 	irq__routing_init(kvm);
 
 	if (!ioctl(gic_fd, KVM_HAS_DEVICE_ATTR, &vgic_init_attr)) {
@@ -342,7 +356,7 @@ static int gic__init_gic(struct kvm *kvm)
 }
 late_init(gic__init_gic)
 
-void gic__generate_fdt_nodes(void *fdt, enum irqchip_type type)
+void gic__generate_fdt_nodes(void *fdt, enum irqchip_type type, bool nested)
 {
 	const char *compatible, *msi_compatible = NULL;
 	u64 msi_prop[2];
@@ -350,6 +364,8 @@ void gic__generate_fdt_nodes(void *fdt, enum irqchip_type type)
 		cpu_to_fdt64(ARM_GIC_DIST_BASE), cpu_to_fdt64(ARM_GIC_DIST_SIZE),
 		0, 0,				/* to be filled */
 	};
+	u32 maint_irq[3] = {cpu_to_fdt32(1), cpu_to_fdt32(GIC_MAINT_IRQ),
+			    cpu_to_fdt32(0xff04)};
 
 	switch (type) {
 	case IRQCHIP_GICV2M:
@@ -377,6 +393,9 @@ void gic__generate_fdt_nodes(void *fdt, enum irqchip_type type)
 	_FDT(fdt_property_cell(fdt, "#interrupt-cells", GIC_FDT_IRQ_NUM_CELLS));
 	_FDT(fdt_property(fdt, "interrupt-controller", NULL, 0));
 	_FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop)));
+	if (nested)
+		_FDT(fdt_property(fdt, "interrupts", maint_irq,
+				  sizeof(maint_irq)));
 	_FDT(fdt_property_cell(fdt, "phandle", PHANDLE_GIC));
 	_FDT(fdt_property_cell(fdt, "#address-cells", 2));
 	_FDT(fdt_property_cell(fdt, "#size-cells", 2));
diff --git a/arm64/include/kvm/gic.h b/arm64/include/kvm/gic.h
index ad8bcbf21..1541a5824 100644
--- a/arm64/include/kvm/gic.h
+++ b/arm64/include/kvm/gic.h
@@ -36,7 +36,7 @@ struct kvm;
 int gic__alloc_irqnum(void);
 int gic__create(struct kvm *kvm, enum irqchip_type type);
 int gic__create_gicv2m_frame(struct kvm *kvm, u64 msi_frame_addr);
-void gic__generate_fdt_nodes(void *fdt, enum irqchip_type type);
+void gic__generate_fdt_nodes(void *fdt, enum irqchip_type type, bool nested);
 u32 gic__get_fdt_irq_cpumask(struct kvm *kvm);
 
 int gic__add_irqfd(struct kvm *kvm, unsigned int gsi, int trigger_fd,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH kvmtool 2/3] arm64: Initial nested virt support
  2025-06-20 10:44 ` [PATCH kvmtool 2/3] arm64: Initial nested virt support Andre Przywara
@ 2025-06-20 11:09   ` Alexandru Elisei
  2025-06-20 11:52     ` Marc Zyngier
  0 siblings, 1 reply; 8+ messages in thread
From: Alexandru Elisei @ 2025-06-20 11:09 UTC (permalink / raw)
  To: Andre Przywara; +Cc: Will Deacon, Julien Thierry, Marc Zyngier, kvm, kvmarm

Hi Andre,

Thanks for doing this, it was needed. Haven't given this a proper look (I'm
planning to do that though!), but something jumped at me, below.

On Fri, Jun 20, 2025 at 11:44:53AM +0100, Andre Przywara wrote:
> The ARMv8.3 architecture update includes support for nested
> virtualization. Allow the user to specify "--nested" to start a guest in

'./vm help run' shows:

--pmu             Create PMUv3 device
--disable-mte     Disable Memory Tagging Extension
--no-pvtime       Disable stolen time

Where:

--pmu checks for KVM_CAP_ARM_PMU_V3.
--disable-mte is there because MTE is enabled automatically for a guest when
KVM_CAP_ARM_MTE is present.
--no-pvtime is there because pvtime is enabled automatically; no capability
check is needed, but the control group for pvtime is called
KVM_ARM_VCPU_PVTIME_CTRL.

What I'm trying to get at is that the name for the kvmtool command line option
matches KVM's name for the capability. What do you think about naming the
parameter --el2 to match KVM_CAP_ARM_EL2 instead of --nested?

 Also, I seem to remember that the command line option for enabling
 KVM_CAP_ARM_EL2_E2H0 in Marc's repo is --e2h0, so having --el2 instead of
 --nested looks somewhat more consistent to me.

 Thoughts?

 Thanks,
 Alex

> (virtual) EL2 instead of EL1.
> This will also change the PSCI conduit from HVC to SMC in the device
> tree.
> 
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> ---
>  arm64/fdt.c                         |  5 ++++-
>  arm64/include/kvm/kvm-config-arch.h |  5 ++++-
>  arm64/kvm-cpu.c                     | 12 +++++++++++-
>  3 files changed, 19 insertions(+), 3 deletions(-)
> 
> diff --git a/arm64/fdt.c b/arm64/fdt.c
> index df7775876..98f1dd9d4 100644
> --- a/arm64/fdt.c
> +++ b/arm64/fdt.c
> @@ -205,7 +205,10 @@ static int setup_fdt(struct kvm *kvm)
>  		_FDT(fdt_property_string(fdt, "compatible", "arm,psci"));
>  		fns = &psci_0_1_fns;
>  	}
> -	_FDT(fdt_property_string(fdt, "method", "hvc"));
> +	if (kvm->cfg.arch.nested_virt)
> +		_FDT(fdt_property_string(fdt, "method", "smc"));
> +	else
> +		_FDT(fdt_property_string(fdt, "method", "hvc"));
>  	_FDT(fdt_property_cell(fdt, "cpu_suspend", fns->cpu_suspend));
>  	_FDT(fdt_property_cell(fdt, "cpu_off", fns->cpu_off));
>  	_FDT(fdt_property_cell(fdt, "cpu_on", fns->cpu_on));
> diff --git a/arm64/include/kvm/kvm-config-arch.h b/arm64/include/kvm/kvm-config-arch.h
> index ee031f010..a1dac28e6 100644
> --- a/arm64/include/kvm/kvm-config-arch.h
> +++ b/arm64/include/kvm/kvm-config-arch.h
> @@ -10,6 +10,7 @@ struct kvm_config_arch {
>  	bool		aarch32_guest;
>  	bool		has_pmuv3;
>  	bool		mte_disabled;
> +	bool		nested_virt;
>  	u64		kaslr_seed;
>  	enum irqchip_type irqchip;
>  	u64		fw_addr;
> @@ -57,6 +58,8 @@ int sve_vl_parser(const struct option *opt, const char *arg, int unset);
>  		     "Type of interrupt controller to emulate in the guest",	\
>  		     irqchip_parser, NULL),					\
>  	OPT_U64('\0', "firmware-address", &(cfg)->fw_addr,			\
> -		"Address where firmware should be loaded"),
> +		"Address where firmware should be loaded"),			\
> +	OPT_BOOLEAN('\0', "nested", &(cfg)->nested_virt,			\
> +		    "Start VCPUs in EL2 (for nested virt)"),
>  
>  #endif /* ARM_COMMON__KVM_CONFIG_ARCH_H */
> diff --git a/arm64/kvm-cpu.c b/arm64/kvm-cpu.c
> index 94c08a4d7..42dc11dad 100644
> --- a/arm64/kvm-cpu.c
> +++ b/arm64/kvm-cpu.c
> @@ -71,6 +71,12 @@ static void kvm_cpu__select_features(struct kvm *kvm, struct kvm_vcpu_init *init
>  	/* Enable SVE if available */
>  	if (kvm__supports_extension(kvm, KVM_CAP_ARM_SVE))
>  		init->features[0] |= 1UL << KVM_ARM_VCPU_SVE;
> +
> +	if (kvm->cfg.arch.nested_virt) {
> +		if (!kvm__supports_extension(kvm, KVM_CAP_ARM_EL2))
> +			die("EL2 (nested virt) is not supported");
> +		init->features[0] |= 1UL << KVM_ARM_VCPU_HAS_EL2;
> +	}
>  }
>  
>  static int vcpu_configure_sve(struct kvm_cpu *vcpu)
> @@ -313,7 +319,11 @@ static void reset_vcpu_aarch64(struct kvm_cpu *vcpu)
>  	reg.addr = (u64)&data;
>  
>  	/* pstate = all interrupts masked */
> -	data	= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL1h;
> +	data	= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT;
> +	if (vcpu->kvm->cfg.arch.nested_virt)
> +		data |= PSR_MODE_EL2h;
> +	else
> +		data |= PSR_MODE_EL1h;
>  	reg.id	= ARM64_CORE_REG(regs.pstate);
>  	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
>  		die_perror("KVM_SET_ONE_REG failed (spsr[EL1])");
> -- 
> 2.25.1
> 
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH kvmtool 0/3] arm64: Nested virtualization support
  2025-06-20 10:44 [PATCH kvmtool 0/3] arm64: Nested virtualization support Andre Przywara
                   ` (2 preceding siblings ...)
  2025-06-20 10:44 ` [PATCH kvmtool 3/3] arm64: nested: add support for setting maintenance IRQ Andre Przywara
@ 2025-06-20 11:13 ` Marc Zyngier
  3 siblings, 0 replies; 8+ messages in thread
From: Marc Zyngier @ 2025-06-20 11:13 UTC (permalink / raw)
  To: Andre Przywara; +Cc: Will Deacon, Julien Thierry, kvm, kvmarm

Hi Andre,

On Fri, 20 Jun 2025 11:44:51 +0100,
Andre Przywara <andre.przywara@arm.com> wrote:
> 
> Thanks to the imperturbable efforts from Marc, arm64 support for nested
> virtualization has now reached the mainline kernel, which means the
> respective kvmtool support should now be ready as well.

Thanks for pushing this stuff out.

> 
> Patch 1 updates the kernel headers, to get the new EL2 capability, and
> the VGIC device control to setup the maintenance IRQ.
> Patch 2 introduces the new "--nested" command line option, to let the
> VCPUs start in EL2. To allow KVM guests running in such a guest, we also
> need VGIC support, which patch 3 allows by setting the maintenance IRQ.
> 
> Tested on the FVP (with some good deal of patience), and some commercial
> (non-fruity) hardware, down to a guest's guest's guest.
> 
> Cheers,
> Andre
> 
> P.S.: Marc: I saw the other patches in your kernel.org repo, do we need any
> of them - HYP timer IRQ, E2H0, counter offset?

Yes, please. They are very much necessary, and should serve as a
template for other VMMs (exposing all the interrupts is required, the
counter offset is necessary to test things resembling live migration,
and the e2h0 selection is needed to run nVHE).

You can probably ignore the virtio patch for now, as this needs to be
properly debugged.

> I guess E2H0 for fruity hardware, what about the others?

The other way around. The '--e2h0' option forces the use of
HCR_EL2.NV1. Rotten fruits can't use NV1 (they actually can, but the
EL2 S1 PTW is fscked, so we hide it from KVM). However, other
implementations do have proper NV1 support, and that option is
extremely useful to boot a nVHE hypervisor.

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH kvmtool 2/3] arm64: Initial nested virt support
  2025-06-20 11:09   ` Alexandru Elisei
@ 2025-06-20 11:52     ` Marc Zyngier
  2025-06-20 13:43       ` Alexandru Elisei
  0 siblings, 1 reply; 8+ messages in thread
From: Marc Zyngier @ 2025-06-20 11:52 UTC (permalink / raw)
  To: Alexandru Elisei; +Cc: Andre Przywara, Will Deacon, Julien Thierry, kvm, kvmarm

On Fri, 20 Jun 2025 12:09:38 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi Andre,
> 
> Thanks for doing this, it was needed. Haven't given this a proper look (I'm
> planning to do that though!), but something jumped at me, below.
> 
> On Fri, Jun 20, 2025 at 11:44:53AM +0100, Andre Przywara wrote:
> > The ARMv8.3 architecture update includes support for nested
> > virtualization. Allow the user to specify "--nested" to start a guest in
> 
> './vm help run' shows:
> 
> --pmu             Create PMUv3 device
> --disable-mte     Disable Memory Tagging Extension
> --no-pvtime       Disable stolen time
> 
> Where:
> 
> --pmu checks for KVM_CAP_ARM_PMU_V3.
> --disable-mte is there because MTE is enabled automatically for a guest when
> KVM_CAP_ARM_MTE is present.
> --no-pvtime is there because pvtime is enabled automatically; no capability
> check is needed, but the control group for pvtime is called
> KVM_ARM_VCPU_PVTIME_CTRL.
> 
> What I'm trying to get at is that the name for the kvmtool command line option
> matches KVM's name for the capability. What do you think about naming the
> parameter --el2 to match KVM_CAP_ARM_EL2 instead of --nested?
> 
>  Also, I seem to remember that the command line option for enabling
>  KVM_CAP_ARM_EL2_E2H0 in Marc's repo is --e2h0, so having --el2 instead of
>  --nested looks somewhat more consistent to me.
> 
>  Thoughts?

I think --el2 describes the wrong thing. We don't only expose EL2 to a
guest, but we also expose FEAT_NV2 by default. So "nested" is IMO
closer to the effects of the capability. If anything, it is
KVM_CAP_ARM_EL2 that is badly named (yes, there is some history here,
but I'm not going to entertain changing the #define after 8 years).

Similarly, QEMU has "virtualization=on" as an indication that it
should engage NV, and not "el2=on".

If you wanted a pure --el2 flag, then it should engage NV just like
--nested does, but disable FEAT_NV2 in the idregs. This would give you
EL2 without recursive NV and HCR_EL2.E2H RES1.

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH kvmtool 2/3] arm64: Initial nested virt support
  2025-06-20 11:52     ` Marc Zyngier
@ 2025-06-20 13:43       ` Alexandru Elisei
  0 siblings, 0 replies; 8+ messages in thread
From: Alexandru Elisei @ 2025-06-20 13:43 UTC (permalink / raw)
  To: Marc Zyngier; +Cc: Andre Przywara, Will Deacon, Julien Thierry, kvm, kvmarm

Hi Marc,

On Fri, Jun 20, 2025 at 12:52:08PM +0100, Marc Zyngier wrote:
> On Fri, 20 Jun 2025 12:09:38 +0100,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Hi Andre,
> > 
> > Thanks for doing this, it was needed. Haven't given this a proper look (I'm
> > planning to do that though!), but something jumped at me, below.
> > 
> > On Fri, Jun 20, 2025 at 11:44:53AM +0100, Andre Przywara wrote:
> > > The ARMv8.3 architecture update includes support for nested
> > > virtualization. Allow the user to specify "--nested" to start a guest in
> > 
> > './vm help run' shows:
> > 
> > --pmu             Create PMUv3 device
> > --disable-mte     Disable Memory Tagging Extension
> > --no-pvtime       Disable stolen time
> > 
> > Where:
> > 
> > --pmu checks for KVM_CAP_ARM_PMU_V3.
> > --disable-mte is there because MTE is enabled automatically for a guest when
> > KVM_CAP_ARM_MTE is present.
> > --no-pvtime is there because pvtime is enabled automatically; no capability
> > check is needed, but the control group for pvtime is called
> > KVM_ARM_VCPU_PVTIME_CTRL.
> > 
> > What I'm trying to get at is that the name for the kvmtool command line option
> > matches KVM's name for the capability. What do you think about naming the
> > parameter --el2 to match KVM_CAP_ARM_EL2 instead of --nested?
> > 
> >  Also, I seem to remember that the command line option for enabling
> >  KVM_CAP_ARM_EL2_E2H0 in Marc's repo is --e2h0, so having --el2 instead of
> >  --nested looks somewhat more consistent to me.
> > 
> >  Thoughts?
> 
> I think --el2 describes the wrong thing. We don't only expose EL2 to a
> guest, but we also expose FEAT_NV2 by default. So "nested" is IMO
> closer to the effects of the capability. If anything, it is
> KVM_CAP_ARM_EL2 that is badly named (yes, there is some history here,
> but I'm not going to entertain changing the #define after 8 years).
> 
> Similarly, QEMU has "virtualization=on" as an indication that it
> should engage NV, and not "el2=on".
> 
> If you wanted a pure --el2 flag, then it should engage NV just like
                                                         ^^
							 EL2?
> --nested does, but disable FEAT_NV2 in the idregs. This would give you
> EL2 without recursive NV and HCR_EL2.E2H RES1.

That's a very interesting perspective. My comment was from the point of view of
what kvmtool does when the option is present - it sets the *_EL2 VCPU flag, not
what effect the flag has on a virtual machine.

I can see what you're saying, --nested looks fine.

Thanks,
Alex

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2025-06-20 13:43 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2025-06-20 10:44 [PATCH kvmtool 0/3] arm64: Nested virtualization support Andre Przywara
2025-06-20 10:44 ` [PATCH kvmtool 1/3] Sync kernel UAPI headers with v6.16-rc1 Andre Przywara
2025-06-20 10:44 ` [PATCH kvmtool 2/3] arm64: Initial nested virt support Andre Przywara
2025-06-20 11:09   ` Alexandru Elisei
2025-06-20 11:52     ` Marc Zyngier
2025-06-20 13:43       ` Alexandru Elisei
2025-06-20 10:44 ` [PATCH kvmtool 3/3] arm64: nested: add support for setting maintenance IRQ Andre Przywara
2025-06-20 11:13 ` [PATCH kvmtool 0/3] arm64: Nested virtualization support Marc Zyngier

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox