public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [kvm-unit-tests PATCH v1 0/3] arm64: Add Stage-2 MMU and Nested Guest Framework
@ 2026-03-16 22:43 Jing Zhang
  2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 1/3] lib: arm64: Add stage2 page table management library Jing Zhang
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Jing Zhang @ 2026-03-16 22:43 UTC (permalink / raw)
  To: KVM, KVMARM
  Cc: Marc Zyngier, Joey Gouly, Andrew Jones, Alexandru Elisei,
	Oliver Upton, Jing Zhang

This patch series introduces a lightweight infrastructure for managing ARM64
Stage-2 translation tables and executing nested guests. These components are
essential for testing advanced virtualization features such as nested
virtualization (NV) and GICv4 direct interrupt injection.

The series provides a generic Stage-2 MMU library supporting multiple
translation granules (4K, 16K, 64K) and dynamic page table management.
Building on this, it adds a guest execution framework that handles guest
lifecycle management, context switching and guest exit routing. A new test
case for Stage-2 MMU demand paging is added to verify fault handling.

Please note that this is a very preliminary implementation intended as a
startup baseline for future work in virtualization testing. Users should be
aware that, because this is an early-stage baseline, some portions of the code
may only happen to work in their current state. There might be critical
architectural elements or edge-case handling missing that will need to be
addressed as the framework matures.

---

Jing Zhang (3):
  lib: arm64: Add stage2 page table management library
  lib: arm64: Add bare-metal guest execution framework
  arm64: Add Stage-2 MMU demand paging test

 arm/Makefile.arm64         |   4 +
 arm/stage2-mmu-test.c      | 100 +++++++++
 lib/arm64/asm/guest.h      | 156 ++++++++++++++
 lib/arm64/asm/stage2_mmu.h |  74 +++++++
 lib/arm64/guest.c          | 197 ++++++++++++++++++
 lib/arm64/guest_arch.S     | 263 ++++++++++++++++++++++++
 lib/arm64/stage2_mmu.c     | 402 +++++++++++++++++++++++++++++++++++++
 7 files changed, 1196 insertions(+)
 create mode 100644 arm/stage2-mmu-test.c
 create mode 100644 lib/arm64/asm/guest.h
 create mode 100644 lib/arm64/asm/stage2_mmu.h
 create mode 100644 lib/arm64/guest.c
 create mode 100644 lib/arm64/guest_arch.S
 create mode 100644 lib/arm64/stage2_mmu.c


base-commit: 86e53277ac80dabb04f4fa5fa6a6cc7649392bdc
-- 
2.53.0.851.ga537e3e6e9-goog


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [kvm-unit-tests PATCH v1 1/3] lib: arm64: Add stage2 page table management library
  2026-03-16 22:43 [kvm-unit-tests PATCH v1 0/3] arm64: Add Stage-2 MMU and Nested Guest Framework Jing Zhang
@ 2026-03-16 22:43 ` Jing Zhang
  2026-03-24 15:12   ` Wei-Lin Chang
  2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 2/3] lib: arm64: Add bare-metal guest execution framework Jing Zhang
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 10+ messages in thread
From: Jing Zhang @ 2026-03-16 22:43 UTC (permalink / raw)
  To: KVM, KVMARM
  Cc: Marc Zyngier, Joey Gouly, Andrew Jones, Alexandru Elisei,
	Oliver Upton, Jing Zhang

Tests running at EL2 (hypervisor level) often require the ability to
manage Stage 2 translation tables to control Guest Physical Address (IPA)
to Host Physical Address (PA) translation.

Add a generic Stage 2 MMU library that provides software management of
ARM64 Stage 2 translation tables.

The library features include:
- Support for 4K, 16K, and 64K translation granules.
- Dynamic page table allocation using the allocator.
- Support for 2M block mappings where applicable.
- APIs for mapping, unmapping, enabling, and disabling the Stage 2 MMU.
- Basic fault info reporting (ESR, FAR, HPFAR).

This infrastructure is necessary for upcoming virtualization and
hypervisor-mode tests.

Signed-off-by: Jing Zhang <jingzhangos@google.com>
---
 arm/Makefile.arm64         |   1 +
 lib/arm64/asm/stage2_mmu.h |  74 +++++++
 lib/arm64/stage2_mmu.c     | 402 +++++++++++++++++++++++++++++++++++++
 3 files changed, 477 insertions(+)
 create mode 100644 lib/arm64/asm/stage2_mmu.h
 create mode 100644 lib/arm64/stage2_mmu.c

diff --git a/arm/Makefile.arm64 b/arm/Makefile.arm64
index a40c830d..5e50f5ba 100644
--- a/arm/Makefile.arm64
+++ b/arm/Makefile.arm64
@@ -40,6 +40,7 @@ cflatobjs += lib/arm64/stack.o
 cflatobjs += lib/arm64/processor.o
 cflatobjs += lib/arm64/spinlock.o
 cflatobjs += lib/arm64/gic-v3-its.o lib/arm64/gic-v3-its-cmd.o
+cflatobjs += lib/arm64/stage2_mmu.o
 
 ifeq ($(CONFIG_EFI),y)
 cflatobjs += lib/acpi.o
diff --git a/lib/arm64/asm/stage2_mmu.h b/lib/arm64/asm/stage2_mmu.h
new file mode 100644
index 00000000..c9e931a8
--- /dev/null
+++ b/lib/arm64/asm/stage2_mmu.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2026, Google LLC.
+ * Author: Jing Zhang <jingzhangos@google.com>
+ *
+ * SPDX-License-Identifier: LGPL-2.0-or-later
+ */
+#ifndef _ASMARM64_STAGE2_MMU_H_
+#define _ASMARM64_STAGE2_MMU_H_
+
+#include <libcflat.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#define pte_is_table(pte)	(pte_val(pte) & PTE_TABLE_BIT)
+
+/* Stage-2 Memory Attributes (MemAttr[3:0]) */
+#define S2_MEMATTR_NORMAL	(0xFUL << 2) /* Normal Memory, Outer/Inner Write-Back */
+#define S2_MEMATTR_DEVICE	(0x0UL << 2) /* Device-nGnRnE */
+
+#define ESR_ELx_EC_SHIFT	(26)
+#define ESR_ELx_EC_HVC64	UL(0x16)
+#define ESR_ELx_EC_DABT_LOW	UL(0x24)
+
+/* Stage-2 Access Permissions (S2AP[1:0]) */
+#define S2AP_NONE	(0UL << 6)
+#define S2AP_RO		(1UL << 6) /* Read-only */
+#define S2AP_WO		(2UL << 6) /* Write-only */
+#define S2AP_RW		(3UL << 6) /* Read-Write */
+
+/* Flags for mapping */
+#define S2_MAP_RW	(S2AP_RW | S2_MEMATTR_NORMAL | PTE_AF | PTE_SHARED)
+#define S2_MAP_DEVICE	(S2AP_RW | S2_MEMATTR_DEVICE | PTE_AF)
+
+enum s2_granule {
+	S2_PAGE_4K,
+	S2_PAGE_16K,
+	S2_PAGE_64K,
+};
+
+/* Main Stage-2 MMU Structure */
+struct s2_mmu {
+	pgd_t *pgd;
+	int vmid;
+
+	/* Configuration */
+	enum s2_granule granule;
+	bool allow_block_mappings;
+
+	/* Internal helpers calculated from granule & VA_BITS */
+	unsigned int page_shift;
+	unsigned int level_shift;
+	int root_level; /* 0, 1, or 2 */
+	unsigned long page_size;
+	unsigned long block_size;
+};
+
+/* API */
+/* Initialize an s2_mmu struct with specific settings */
+struct s2_mmu *s2mmu_init(int vmid, enum s2_granule granule, bool allow_block_mappings);
+
+/* Management */
+void s2mmu_destroy(struct s2_mmu *mmu);
+void s2mmu_map(struct s2_mmu *mmu, unsigned long ipa, unsigned long pa,
+	       unsigned long size, unsigned long flags);
+void s2mmu_unmap(struct s2_mmu *mmu, unsigned long ipa, unsigned long size);
+
+/* Activation */
+void s2mmu_enable(struct s2_mmu *mmu);
+void s2mmu_disable(struct s2_mmu *mmu);
+
+/* Debug */
+void s2mmu_print_fault_info(void);
+
+#endif /* _ASMARM64_STAGE2_MMU_H_ */
diff --git a/lib/arm64/stage2_mmu.c b/lib/arm64/stage2_mmu.c
new file mode 100644
index 00000000..bfe87eac
--- /dev/null
+++ b/lib/arm64/stage2_mmu.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright (C) 2026, Google LLC.
+ * Author: Jing Zhang <jingzhangos@google.com>
+ *
+ * SPDX-License-Identifier: LGPL-2.0-or-later
+ */
+#include <libcflat.h>
+#include <alloc.h>
+#include <asm/stage2_mmu.h>
+#include <asm/sysreg.h>
+#include <asm/io.h>
+#include <asm/barrier.h>
+#include <alloc_page.h>
+
+/* VTCR_EL2 Definitions */
+#define VTCR_SH0_INNER		(3UL << 12)
+#define VTCR_ORGN0_WBWA		(1UL << 10)
+#define VTCR_IRGN0_WBWA		(1UL << 8)
+
+/* TG0 Encodings */
+#define VTCR_TG0_4K		(0UL << 14)
+#define VTCR_TG0_64K		(1UL << 14)
+#define VTCR_TG0_16K		(2UL << 14)
+
+/* Physical Address Size (PS) - Derive from VA_BITS for simplicity or max */
+#if VA_BITS > 40
+#define VTCR_PS_VAL		(5UL << 16) /* 48-bit PA */
+#else
+#define VTCR_PS_VAL		(2UL << 16) /* 40-bit PA */
+#endif
+
+struct s2_mmu *s2mmu_init(int vmid, enum s2_granule granule, bool allow_block_mappings)
+{
+	struct s2_mmu *mmu = calloc(1, sizeof(struct s2_mmu));
+	int order = 0;
+
+	mmu->vmid = vmid;
+	mmu->granule = granule;
+	mmu->allow_block_mappings = allow_block_mappings;
+
+	/* Configure shifts based on granule */
+	switch (granule) {
+	case S2_PAGE_4K:
+		mmu->page_shift = 12;
+		mmu->level_shift = 9;
+		/*
+		 * Determine Root Level for 4K:
+		 * VA_BITS > 39 (e.g. 48) -> Start L0
+		 * VA_BITS <= 39 (e.g. 32, 36) -> Start L1 
+		 */
+		mmu->root_level = (VA_BITS > 39) ? 0 : 1;
+		break;
+	case S2_PAGE_16K:
+		mmu->page_shift = 14;
+		mmu->level_shift = 11;
+		/*
+		 * 16K: L1 covers 47 bits. L0 not valid for 16K 
+		 * Start L1 for 47 bits. Start L2 for 36 bits.
+		 */
+		mmu->root_level = (VA_BITS > 36) ? 1 : 2;
+		break;
+	case S2_PAGE_64K:
+		mmu->page_shift = 16;
+		mmu->level_shift = 13;
+		/* 64K: L1 covers 52 bits. L2 covers 42 bits. */
+		mmu->root_level = (VA_BITS > 42) ? 1 : 2;
+		break;
+	}
+
+	mmu->page_size = 1UL << mmu->page_shift;
+	mmu->block_size = 1UL << (mmu->page_shift + mmu->level_shift);
+
+	/* Alloc PGD. Use order for allocation size */
+	if (mmu->page_size > PAGE_SIZE) {
+		order = __builtin_ctz(mmu->page_size / PAGE_SIZE);
+	}
+	mmu->pgd = (pgd_t *)alloc_pages(order);
+	if (mmu->pgd) {
+		memset(mmu->pgd, 0, mmu->page_size);
+	} else {
+		free(mmu);
+		return NULL;
+	}
+
+	return mmu;
+}
+
+static unsigned long s2mmu_get_addr_mask(struct s2_mmu *mmu)
+{
+	switch (mmu->granule) {
+	case S2_PAGE_16K:
+		return GENMASK_ULL(47, 14);
+	case S2_PAGE_64K:
+		return GENMASK_ULL(47, 16);
+	default:
+		return GENMASK_ULL(47, 12); /* 4K */
+	}
+}
+
+static void s2mmu_free_tables(struct s2_mmu *mmu, pte_t *table, int level)
+{
+	unsigned long entries = 1UL << mmu->level_shift;
+	unsigned long mask = s2mmu_get_addr_mask(mmu);
+	unsigned long i;
+
+	/*
+	 * Recurse if not leaf level
+	 * Level 3 is always leaf page. Levels 0-2 can be Table or Block.
+	 */
+	if (level < 3) {
+		for (i = 0; i < entries; i++) {
+			pte_t entry = table[i];
+			if ((pte_valid(entry) && pte_is_table(entry))) {
+				pte_t *next = (pte_t *)phys_to_virt(pte_val(entry) & mask);
+				s2mmu_free_tables(mmu, next, level + 1);
+			}
+		}
+	}
+
+	free_pages(table);
+}
+
+void s2mmu_destroy(struct s2_mmu *mmu)
+{
+	if (mmu->pgd)
+		s2mmu_free_tables(mmu, (pte_t *)mmu->pgd, mmu->root_level);
+	free(mmu);
+}
+
+void s2mmu_enable(struct s2_mmu *mmu)
+{
+	unsigned long vtcr = VTCR_PS_VAL | VTCR_SH0_INNER |
+			     VTCR_ORGN0_WBWA | VTCR_IRGN0_WBWA;
+	unsigned long t0sz = 64 - VA_BITS;
+	unsigned long vttbr;
+
+	switch (mmu->granule) {
+	case S2_PAGE_4K:
+		vtcr |= VTCR_TG0_4K;
+		/* SL0 Encodings for 4K: 0=L2, 1=L1, 2=L0 */
+		if (mmu->root_level == 0)
+			vtcr |= (2UL << 6); /* Start L0 */
+		else if (mmu->root_level == 1)
+			vtcr |= (1UL << 6); /* Start L1 */
+		else
+			vtcr |= (0UL << 6); /* Start L2 */
+		break;
+	case S2_PAGE_16K:
+		vtcr |= VTCR_TG0_16K;
+		/* SL0 Encodings for 16K: 0=L3(Res), 1=L2, 2=L1, 3=L0(Res) */
+		if (mmu->root_level == 1)
+			vtcr |= (2UL << 6); /* Start L1 */
+		else
+			vtcr |= (1UL << 6); /* Start L2 */
+		break;
+	case S2_PAGE_64K:
+		vtcr |= VTCR_TG0_64K;
+		/* SL0 Encodings for 64K: 0=L3(Res), 1=L2, 2=L1, 3=L0(Res) */
+		if (mmu->root_level == 1)
+			vtcr |= (2UL << 6); /* Start L1 */
+		else
+			vtcr |= (1UL << 6); /* Start L2 */
+		break;
+	}
+
+	vtcr |= t0sz;
+
+	write_sysreg(vtcr, vtcr_el2);
+	isb();
+
+	/* Setup VTTBR */
+	vttbr = virt_to_phys(mmu->pgd);
+	vttbr |= ((unsigned long)mmu->vmid << 48);
+	write_sysreg(vttbr, vttbr_el2);
+	isb();
+
+	asm volatile("tlbi vmalls12e1is");
+	dsb(ish);
+	isb();
+}
+
+void s2mmu_disable(struct s2_mmu *mmu)
+{
+	write_sysreg(0, vttbr_el2);
+	isb();
+}
+
+static pte_t *get_pte(struct s2_mmu *mmu, pte_t *table, unsigned long idx, bool alloc)
+{
+	unsigned long mask = s2mmu_get_addr_mask(mmu);
+	pte_t entry = table[idx];
+	pte_t *next_table;
+	int order = 0;
+
+	if (pte_valid(entry)) {
+		if (pte_is_table(entry))
+			return (pte_t *)phys_to_virt(pte_val(entry) & mask);
+		/* Block Entry */
+		return NULL;
+	}
+
+	if (!alloc)
+		return NULL;
+
+	/* Allocate table memory covering the Stage-2 Granule size */
+	if (mmu->page_size > PAGE_SIZE)
+		order = __builtin_ctz(mmu->page_size / PAGE_SIZE);
+
+	next_table = (pte_t *)alloc_pages(order);
+	if (next_table)
+		memset(next_table, 0, mmu->page_size);
+
+	pte_val(entry) = virt_to_phys(next_table) | PTE_TABLE_BIT | PTE_VALID;
+	WRITE_ONCE(table[idx], entry);
+
+	return next_table;
+}
+
+void s2mmu_map(struct s2_mmu *mmu, unsigned long ipa, unsigned long pa,
+	       unsigned long size, unsigned long flags)
+{
+	unsigned long level_mask, level_shift, level_size, level;
+	unsigned long start_ipa, end_ipa, idx;
+	pte_t entry, *table, *next_table;
+	bool is_block_level;
+
+	start_ipa = ipa;
+	end_ipa = ipa + size;
+	level_mask = (1UL << mmu->level_shift) - 1;
+
+	while (start_ipa < end_ipa) {
+		table = (pte_t *)mmu->pgd;
+
+		/* Walk from Root to Leaf */
+		for (level = mmu->root_level; level < 3; level++) {
+			level_shift = mmu->page_shift + (3 - level) * mmu->level_shift;
+			idx = (start_ipa >> level_shift) & level_mask;
+			level_size = 1UL << level_shift;
+
+			/*
+			 * Check for Block Mapping
+			 * Valid Block Levels:
+			 * 4K:  L1 (1G), L2 (2MB)
+			 * 16K: L2 (32MB)
+			 * 64K: L2 (512MB) 
+			 */
+			is_block_level = (level == 2) ||
+				(mmu->granule == S2_PAGE_4K && level == 1);
+
+			if (mmu->allow_block_mappings && is_block_level) {
+				if ((start_ipa & (level_size - 1)) == 0 &&
+				    (pa & (level_size - 1)) == 0 &&
+				    (start_ipa + level_size) <= end_ipa) {
+					/* Map Block */
+					pte_val(entry) = (pa & ~(level_size - 1)) |
+							 flags | PTE_VALID;
+					WRITE_ONCE(table[idx], entry);
+					start_ipa += level_size;
+					pa += level_size;
+					goto next_chunk; /* Continue outer loop */
+				}
+			}
+
+			/* Move to next level */
+			next_table = get_pte(mmu, table, idx, true);
+			if (!next_table) {
+				printf("Error allocating or existing block conflict.\n");
+				return;
+			}
+			table = next_table;
+		}
+
+		/* Leaf Level (Level 3 PTE) */
+		if (level == 3) {
+			idx = (start_ipa >> mmu->page_shift) & level_mask;
+			pte_val(entry) = (pa & ~(mmu->page_size - 1)) | flags | PTE_TYPE_PAGE;
+			WRITE_ONCE(table[idx], entry);
+			start_ipa += mmu->page_size;
+			pa += mmu->page_size;
+		}
+
+next_chunk:
+		continue;
+	}
+
+	asm volatile("tlbi vmalls12e1is");
+	dsb(ish);
+	isb();
+}
+
+/*
+ * Recursive helper to unmap a range within a specific table.
+ * Returns true if the table at this level is now completely empty
+ * and should be freed by the caller.
+ */
+static bool s2mmu_unmap_level(struct s2_mmu *mmu, pte_t *table,
+			      unsigned long current_ipa, int level,
+			      unsigned long start_ipa, unsigned long end_ipa,
+			      unsigned long mask)
+{
+	unsigned long level_size, entry_ipa, entry_end;
+	bool child_empty, table_empty = true;
+	pte_t entry, *next_table;
+	unsigned int level_shift;
+	unsigned long i;
+
+	/* Calculate shift and size for this level */
+	if (level == 3) {
+		level_shift = mmu->page_shift;
+	} else {
+		level_shift = mmu->page_shift + (3 - level) * mmu->level_shift;
+	}
+	level_size = 1UL << level_shift;
+
+	/* Iterate over all entries in this table */
+	for (i = 0; i < (1UL << mmu->level_shift); i++) {
+		entry = table[i];
+		entry_ipa = current_ipa + (i * level_size);
+		entry_end = entry_ipa + level_size;
+
+		/* Skip entries completely outside our target range */
+		if (entry_end <= start_ipa || entry_ipa >= end_ipa) {
+			if (pte_valid(entry))
+				table_empty = false;
+			continue;
+		}
+
+		/*
+		 * If the entry is fully covered by the unmap range,
+		 * we can clear it (leaf) or recurse and free (table).
+		 */
+		if (entry_ipa >= start_ipa && entry_end <= end_ipa) {
+			if (pte_valid(entry)) {
+				if (pte_is_table(entry) && level < 3) {
+					/* Recurse to free children first */
+					next_table = (pte_t *)phys_to_virt(pte_val(entry) & mask);
+					s2mmu_free_tables(mmu, next_table, level + 1);
+				}
+				/* Invalidate the entry */
+				WRITE_ONCE(table[i], __pte(0));
+			}
+			continue;
+		}
+
+		/*
+		 * Partial overlap: This must be a table (split required).
+		 * If it's a Block, we can't split easily in this context
+		 * without complex logic, so we generally skip or fail.
+		 * Assuming standard breakdown: recurse into the table.
+		 */
+		if (pte_valid(entry) && pte_is_table(entry) && level < 3) {
+			next_table = (pte_t *)phys_to_virt(pte_val(entry) & mask);
+			child_empty = s2mmu_unmap_level(mmu, next_table, entry_ipa, level + 1,
+							start_ipa, end_ipa, mask);
+
+			if (child_empty) {
+				free_pages(next_table);
+				WRITE_ONCE(table[i], __pte(0));
+			} else {
+				table_empty = false;
+			}
+		} else if (pte_valid(entry)) {
+			/*
+			 * Overlap on a leaf/block entry that extends
+			 * beyond the unmap range. We cannot simply clear it.
+			 */
+			table_empty = false;
+		}
+	}
+
+	return table_empty;
+}
+
+void s2mmu_unmap(struct s2_mmu *mmu, unsigned long ipa, unsigned long size)
+{
+	unsigned long end_ipa = ipa + size;
+	unsigned long mask = s2mmu_get_addr_mask(mmu);
+
+	if (!mmu->pgd)
+		return;
+
+	/*
+	 * Start recursion from the root level.
+	 * We rarely free the PGD itself unless destroying the MMU, 
+	 * so we ignore the return value here.
+	 */
+	s2mmu_unmap_level(mmu, (pte_t *)mmu->pgd, 0, mmu->root_level,
+			  ipa, end_ipa, mask);
+
+	/* Ensure TLB invalidation occurs after page table updates */
+	asm volatile("tlbi vmalls12e1is");
+	dsb(ish);
+	isb();
+}
+
+void s2mmu_print_fault_info(void)
+{
+	unsigned long esr = read_sysreg(esr_el2);
+	unsigned long far = read_sysreg(far_el2);
+	unsigned long hpfar = read_sysreg(hpfar_el2);
+	printf("Stage-2 Fault Info: ESR=0x%lx FAR=0x%lx HPFAR=0x%lx\n", esr, far, hpfar);
+}
-- 
2.53.0.851.ga537e3e6e9-goog


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [kvm-unit-tests PATCH v1 2/3] lib: arm64: Add bare-metal guest execution framework
  2026-03-16 22:43 [kvm-unit-tests PATCH v1 0/3] arm64: Add Stage-2 MMU and Nested Guest Framework Jing Zhang
  2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 1/3] lib: arm64: Add stage2 page table management library Jing Zhang
@ 2026-03-16 22:43 ` Jing Zhang
  2026-03-17  1:46   ` Yao Yuan
                     ` (3 more replies)
  2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 3/3] arm64: Add Stage-2 MMU demand paging test Jing Zhang
  2026-03-24 11:43 ` [kvm-unit-tests PATCH v1 0/3] arm64: Add Stage-2 MMU and Nested Guest Framework Joey Gouly
  3 siblings, 4 replies; 10+ messages in thread
From: Jing Zhang @ 2026-03-16 22:43 UTC (permalink / raw)
  To: KVM, KVMARM
  Cc: Marc Zyngier, Joey Gouly, Andrew Jones, Alexandru Elisei,
	Oliver Upton, Jing Zhang

To test advanced KVM features such as nested virtualization (NV) and
GICv4 direct interrupt injection, kvm-unit-tests needs the ability to
act as an L1 hypervisor running at EL2 and manage its own L2 guests.

Introduce a lightweight guest management library that provides the
infrastructure to create, configure, and execute nested guests.

This framework includes:
- Guest lifecycle management: `guest_create()` and `guest_destroy()`
  APIs to allocate guest context and setup Stage-2 identity mappings
  for code and stack using the s2mmu library.
- Context switching: The `guest_run()` assembly routine handles
  saving the host (L1) callee-saved registers and loading the guest
  (L2) GPRs and EL1 system registers.
- VM-Exit handling: Installs an EL2 trap handler (`guest_hyp_vectors`)
  to intercept guest exits and route them to `guest_c_exception_handler`
  to determine whether to return to the host test logic or resume.
- Guest-internal exceptions: Provides `guest_el1_vectors` to catch
  Sync, IRQ, FIQ, and SError exceptions occurring entirely within the
  guest (EL1) without trapping to the host.

Signed-off-by: Jing Zhang <jingzhangos@google.com>
---
 arm/Makefile.arm64     |   2 +
 lib/arm64/asm/guest.h  | 156 ++++++++++++++++++++++++
 lib/arm64/guest.c      | 197 ++++++++++++++++++++++++++++++
 lib/arm64/guest_arch.S | 263 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 618 insertions(+)
 create mode 100644 lib/arm64/asm/guest.h
 create mode 100644 lib/arm64/guest.c
 create mode 100644 lib/arm64/guest_arch.S

diff --git a/arm/Makefile.arm64 b/arm/Makefile.arm64
index 5e50f5ba..9026fd71 100644
--- a/arm/Makefile.arm64
+++ b/arm/Makefile.arm64
@@ -41,6 +41,8 @@ cflatobjs += lib/arm64/processor.o
 cflatobjs += lib/arm64/spinlock.o
 cflatobjs += lib/arm64/gic-v3-its.o lib/arm64/gic-v3-its-cmd.o
 cflatobjs += lib/arm64/stage2_mmu.o
+cflatobjs += lib/arm64/guest.o
+cflatobjs += lib/arm64/guest_arch.o
 
 ifeq ($(CONFIG_EFI),y)
 cflatobjs += lib/acpi.o
diff --git a/lib/arm64/asm/guest.h b/lib/arm64/asm/guest.h
new file mode 100644
index 00000000..1d70873d
--- /dev/null
+++ b/lib/arm64/asm/guest.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2026, Google LLC.
+ * Author: Jing Zhang <jingzhangos@google.com>
+ *
+ * SPDX-License-Identifier: LGPL-2.0-or-later
+ */
+#ifndef _ASMARM64_GUEST_H_
+#define _ASMARM64_GUEST_H_
+
+/* Offsets for assembly (Must match struct guest) */
+#define GUEST_X_OFFSET			0
+#define GUEST_ELR_OFFSET		248
+#define GUEST_SPSR_OFFSET		256
+#define GUEST_HCR_OFFSET		264
+#define GUEST_VTTBR_OFFSET		272
+#define GUEST_SCTLR_OFFSET		280
+#define GUEST_VBAR_OFFSET		288
+#define GUEST_SP_EL1_OFFSET		296
+#define GUEST_ESR_OFFSET		304
+#define GUEST_FAR_OFFSET		312
+#define GUEST_HPFAR_OFFSET		320
+#define GUEST_EXIT_CODE_OFFSET		328
+#define GUEST_TPIDR_EL1_OFFSET		336
+#define GUEST_ICH_VMCR_EL2_OFFSET	344
+
+#ifndef __ASSEMBLY__
+
+#include <libcflat.h>
+#include <asm/stage2_mmu.h>
+
+/* HCR_EL2 Definitions */
+#define HCR_VM		(1UL << 0)	/* Virtualization Enable */
+#define HCR_FMO		(1UL << 3)	/* Physical FIQ Routing */
+#define HCR_IMO		(1UL << 4)	/* Physical IRQ Routing */
+#define HCR_AMO		(1UL << 5)	/* Physical SError Interrupt Routing */
+#define HCR_RW		(1UL << 31)	/* Execution State: AArch64 */
+#define HCR_DC		(1UL << 12)	/* Default Cacheable */
+#define HCR_E2H		(1UL << 34)	/* EL2 Host */
+
+#define HCR_GUEST_FLAGS (HCR_VM | HCR_FMO | HCR_IMO | HCR_AMO | HCR_RW | \
+			 HCR_DC | HCR_E2H)
+
+/* ICH_VMCR_EL2 bit definition */
+#define ICH_VMCR_PMR_SHIFT	24
+#define ICH_VMCR_PMR_MASK	(0xffUL << ICH_VMCR_PMR_SHIFT)
+#define ICH_VMCR_ENG0_SHIFT	0
+#define ICH_VMCR_ENG0_MASK	(1 << ICH_VMCR_ENG0_SHIFT)
+#define ICH_VMCR_ENG1_SHIFT	1
+#define ICH_VMCR_ENG1_MASK	(1 << ICH_VMCR_ENG1_SHIFT)
+
+/* Guest stack size */
+#define GUEST_STACK_SIZE		SZ_64K
+
+/*
+ * Result from Handler:
+ * RESUME: Keep guest running (ERET immediately)
+ * EXIT:   Return to Host C caller
+ */
+enum guest_handler_result {
+	GUEST_ACTION_RESUME,
+	GUEST_ACTION_EXIT
+};
+
+struct guest;
+typedef enum guest_handler_result (*guest_handler_t)(struct guest *guest);
+
+/* EL1 (Guest-internal) Exception Vector */
+enum guest_el1_vector {
+	GUEST_EL1_SYNC,
+	GUEST_EL1_IRQ,
+	GUEST_EL1_FIQ,
+	GUEST_EL1_SERROR,
+	GUEST_EL1_MAX
+};
+
+/*
+ * Guest EL1 Exception Frame (pushed to guest stack by asm stub)
+ * We use a simplified frame: x0-x30, elr, spsr. size = 33*8
+ */
+struct guest_el1_regs {
+	unsigned long regs[31];
+	unsigned long elr;
+	unsigned long spsr;
+};
+
+typedef void (*guest_el1_handler_t)(struct guest_el1_regs *regs, unsigned int esr);
+
+/* Exceptions from the Guest (Lower EL using AArch64) */
+enum guest_vector {
+	GUEST_VECTOR_SYNC,
+	GUEST_VECTOR_IRQ,
+	GUEST_VECTOR_FIQ,
+	GUEST_VECTOR_SERROR,
+	GUEST_VECTOR_MAX
+};
+
+/*
+ * Guest Context Structure
+ * This will be pointed to by TPIDR_EL1 while the guest is running.
+ */
+struct guest_context {
+	guest_el1_handler_t handlers[GUEST_EL1_MAX];
+};
+
+struct guest {
+	/* 0x000: General Purpose Registers */
+	unsigned long x[31]; /* x0..x30 */
+
+	/* 0x0F8: Execution State */
+	unsigned long elr_el2;
+	unsigned long spsr_el2;
+
+	/* 0x108: Control Registers */
+	unsigned long hcr_el2;
+	unsigned long vttbr_el2;
+	unsigned long sctlr_el1;
+	unsigned long vbar_el1;
+	unsigned long sp_el1;
+
+	/* 0x130: Exit Information */
+	unsigned long esr_el2;
+	unsigned long far_el2;
+	unsigned long hpfar_el2;
+	unsigned long exit_code; /* enum guest_vector */
+	unsigned long tpidr_el1;
+
+	/* 0x158: GIC Registers */
+	unsigned long ich_vmcr_el2;
+
+	/* 0x160: Exception Handlers */
+	guest_handler_t handlers[GUEST_VECTOR_MAX];
+	struct guest_context *guest_context;
+
+	struct s2_mmu *s2mmu;
+};
+
+/* API */
+struct guest *guest_create(int vmid, void (*guest_func)(void), enum s2_granule granule);
+void guest_destroy(struct guest *guest);
+
+/* Configuration */
+void guest_set_vector(struct guest *guest, void *vector_table);
+void guest_set_stack(struct guest *guest, void *stack_top);
+void guest_install_handler(struct guest *guest, enum guest_vector v, guest_handler_t handler);
+
+/* Install handler for exceptions INSIDE EL1 */
+void guest_install_el1_handler(struct guest *guest, enum guest_el1_vector v, guest_el1_handler_t handler);
+
+unsigned long guest_c_exception_handler(struct guest *guest, unsigned long vector_offset);
+void guest_el1_c_handler(struct guest_el1_regs *regs, unsigned int vector);
+
+/* Core Run Loop */
+void guest_run(struct guest *guest);
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASMARM64_GUEST_H_ */
diff --git a/lib/arm64/guest.c b/lib/arm64/guest.c
new file mode 100644
index 00000000..6c256c11
--- /dev/null
+++ b/lib/arm64/guest.c
@@ -0,0 +1,197 @@
+/*
+ * Copyright (C) 2026, Google LLC.
+ * Author: Jing Zhang <jingzhangos@google.com>
+ *
+ * SPDX-License-Identifier: LGPL-2.0-or-later
+ */
+#include <libcflat.h>
+#include <asm/guest.h>
+#include <asm/io.h>
+#include <asm/sysreg.h>
+#include <asm/barrier.h>
+#include <alloc_page.h>
+#include <alloc.h>
+
+/* Compile-time checks to ensure Assembly macros match C Struct */
+_Static_assert(offsetof(struct guest, x) == GUEST_X_OFFSET,
+	       "GUEST_X_OFFSET mismatch");
+_Static_assert(offsetof(struct guest, elr_el2) == GUEST_ELR_OFFSET,
+	       "GUEST_ELR_OFFSET mismatch");
+_Static_assert(offsetof(struct guest, spsr_el2) == GUEST_SPSR_OFFSET,
+	       "GUEST_SPSR_OFFSET mismatch");
+_Static_assert(offsetof(struct guest, hcr_el2) == GUEST_HCR_OFFSET,
+	       "GUEST_HCR_OFFSET mismatch");
+_Static_assert(offsetof(struct guest, vttbr_el2) == GUEST_VTTBR_OFFSET,
+	       "GUEST_VTTBR_OFFSET mismatch");
+_Static_assert(offsetof(struct guest, sctlr_el1) == GUEST_SCTLR_OFFSET,
+	       "GUEST_SCTLR_OFFSET mismatch");
+_Static_assert(offsetof(struct guest, vbar_el1) == GUEST_VBAR_OFFSET,
+	       "GUEST_VBAR_OFFSET mismatch");
+_Static_assert(offsetof(struct guest, sp_el1) == GUEST_SP_EL1_OFFSET,
+	       "GUEST_SP_EL1_OFFSET mismatch");
+_Static_assert(offsetof(struct guest, esr_el2) == GUEST_ESR_OFFSET,
+	       "GUEST_ESR_OFFSET mismatch");
+_Static_assert(offsetof(struct guest, far_el2) == GUEST_FAR_OFFSET,
+	       "GUEST_FAR_OFFSET mismatch");
+_Static_assert(offsetof(struct guest, hpfar_el2) == GUEST_HPFAR_OFFSET,
+	       "GUEST_HPFAR_OFFSET mismatch");
+_Static_assert(offsetof(struct guest, exit_code) == GUEST_EXIT_CODE_OFFSET,
+	       "GUEST_EXIT_CODE_OFFSET mismatch");
+_Static_assert(offsetof(struct guest, tpidr_el1) == GUEST_TPIDR_EL1_OFFSET,
+		"GUEST_TPIDR_EL1_OFFSET mismatch");
+_Static_assert(offsetof(struct guest, ich_vmcr_el2) == GUEST_ICH_VMCR_EL2_OFFSET,
+		"GUEST_ICH_VMCR_EL2_OFFSET mismatch");
+
+/*
+ * C-Entry for Exception Handling
+ * Returns 0 to Resume Guest, 1 to Exit to Host Caller
+ */
+unsigned long guest_c_exception_handler(struct guest *guest, unsigned long vector_offset)
+{
+	enum guest_vector vector = (enum guest_vector)guest->exit_code;
+
+	/* Save Trap Info */
+	guest->esr_el2 = read_sysreg(esr_el2);
+	guest->far_el2 = read_sysreg(far_el2);
+	guest->hpfar_el2 = read_sysreg(hpfar_el2);
+
+	/* Invoke Handler if registered */
+	if (guest->handlers[vector]) {
+		if (guest->handlers[vector](guest) == GUEST_ACTION_RESUME) {
+			return 0; /* ASM stub will restore and ERET */
+		}
+	}
+
+	/* Default: Exit to caller */
+	return 1;
+}
+
+/* --- EL1 (Guest-Internal) Vector Handling --- */
+
+void guest_install_el1_handler(struct guest *guest, enum guest_el1_vector v, guest_el1_handler_t handler)
+{
+	if (guest && guest->guest_context && v < GUEST_EL1_MAX)
+		guest->guest_context->handlers[v] = handler;
+}
+
+void guest_el1_c_handler(struct guest_el1_regs *regs, unsigned int vector)
+{
+	struct guest_context *ctx = (struct guest_context *)read_sysreg(tpidr_el1);
+	unsigned int esr = read_sysreg(esr_el1);
+
+	if (ctx && vector < GUEST_EL1_MAX && ctx->handlers[vector]) {
+		ctx->handlers[vector](regs, esr);
+	} else {
+		printf("Guest: Unhandled Exception Vector %d, ESR=0x%x\n", vector, esr);
+		asm volatile("hvc #0xFFFF");
+	}
+}
+
+extern void guest_el1_vectors(void);
+
+static struct guest *__guest_create(struct s2_mmu *s2_ctx, void *entry_point)
+{
+	struct guest *guest = calloc(1, sizeof(struct guest));
+	struct guest_context *guest_ctx;
+	unsigned long guest_ctx_pa;
+
+	/* Allocate the internal context table */
+	guest_ctx = (void *)alloc_page();
+	memset(guest_ctx, 0, PAGE_SIZE);
+	guest->guest_context = guest_ctx;
+
+	guest_ctx_pa = virt_to_phys(guest_ctx);
+	if (s2_ctx)
+		s2mmu_map(s2_ctx, guest_ctx_pa, guest_ctx_pa, PAGE_SIZE, S2_MAP_RW);
+
+	guest->tpidr_el1 = guest_ctx_pa;
+
+	guest->elr_el2 = (unsigned long)entry_point;
+	guest->spsr_el2 = 0x3C5; /* M=EL1h, DAIF=Masked */
+	guest->hcr_el2 = HCR_GUEST_FLAGS;
+
+	if (s2_ctx) {
+		guest->vttbr_el2 = virt_to_phys(s2_ctx->pgd);
+		guest->vttbr_el2 |= ((unsigned long)s2_ctx->vmid << 48);
+	}
+
+	guest->sctlr_el1 = read_sysreg(sctlr_el1);
+	guest->sctlr_el1 |= SCTLR_EL1_C | SCTLR_EL1_I | SCTLR_EL1_M;
+
+	guest->ich_vmcr_el2 = read_sysreg(ich_vmcr_el2);
+	guest->ich_vmcr_el2 |= (0xFFUL << ICH_VMCR_PMR_SHIFT) | (1UL << ICH_VMCR_ENG1_SHIFT);
+
+	guest->vbar_el1 = (unsigned long)guest_el1_vectors;
+	guest->s2mmu = s2_ctx;
+
+	return guest;
+}
+
+struct guest *guest_create(int vmid, void (*guest_func)(void), enum s2_granule granule)
+{
+	unsigned long guest_pa, code_base, stack_pa;
+	unsigned long *stack_page;
+	struct guest *guest;
+	struct s2_mmu *ctx;
+
+	ctx = s2mmu_init(vmid, granule, true);
+	/*
+	 * Map the Host's code segment Identity Mapped (IPA=PA).
+	 * To be safe, we map a large chunk (e.g., 2MB) around the function
+	 * to capture any helper functions the compiler might generate calls to.
+	 */
+	guest_pa = virt_to_phys((void *)guest_func);
+	code_base = guest_pa & ~(SZ_2M - 1);
+	s2mmu_map(ctx, code_base, code_base, SZ_2M, S2_MAP_RW);
+
+	/*
+	 * Map Stack
+	 * Allocate 16 pages (64K) in Host, get its PA, and map it for Guest.
+	 */
+	stack_page = alloc_pages(get_order(GUEST_STACK_SIZE >> PAGE_SHIFT));
+	stack_pa = virt_to_phys(stack_page);
+	/* Identity Map it (IPA = PA) */
+	s2mmu_map(ctx, stack_pa, stack_pa, GUEST_STACK_SIZE, S2_MAP_RW);
+
+	s2mmu_enable(ctx);
+
+	/* Create Guest */
+	/* Entry point is the PA of the function (Identity Mapped) */
+	guest = __guest_create(ctx, (void *)guest_pa);
+
+	/*
+	 * Setup Guest Stack Pointer
+	 * Must match where we mapped the stack + Offset
+	 */
+	guest_set_stack(guest, (void *)(stack_pa + GUEST_STACK_SIZE));
+
+	/* Map UART identity mapped, printf() available to guest */
+	s2mmu_map(ctx, 0x09000000, 0x09000000, PAGE_SIZE, S2_MAP_DEVICE);
+
+	return guest;
+}
+
+void guest_destroy(struct guest *guest)
+{
+	s2mmu_disable(guest->s2mmu);
+	s2mmu_destroy(guest->s2mmu);
+	if (guest->guest_context)
+		free_page(guest->guest_context);
+	free(guest);
+}
+
+void guest_set_vector(struct guest *guest, void *vector_table)
+{
+	guest->vbar_el1 = (unsigned long)vector_table;
+}
+
+void guest_set_stack(struct guest *guest, void *stack_top)
+{
+	guest->sp_el1 = (unsigned long)stack_top;
+}
+
+void guest_install_handler(struct guest *guest, enum guest_vector v, guest_handler_t handler)
+{
+	if (v < GUEST_VECTOR_MAX)
+		guest->handlers[v] = handler;
+}
diff --git a/lib/arm64/guest_arch.S b/lib/arm64/guest_arch.S
new file mode 100644
index 00000000..cb7074d7
--- /dev/null
+++ b/lib/arm64/guest_arch.S
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2026, Google LLC.
+ * Author: Jing Zhang <jingzhangos@google.com>
+ *
+ * SPDX-License-Identifier: LGPL-2.0-or-later
+ */
+#define __ASSEMBLY__
+#include <asm/guest.h>
+
+.global guest_run
+guest_run:
+	/* x0 = struct guest pointer */
+
+	/* Save Host Callee-Saved Regs */
+	stp	x29, x30, [sp, #-16]!
+	stp	x27, x28, [sp, #-16]!
+	stp	x25, x26, [sp, #-16]!
+	stp	x23, x24, [sp, #-16]!
+	stp	x21, x22, [sp, #-16]!
+	stp	x19, x20, [sp, #-16]!
+
+	/* Cache Guest Pointer in TPIDR_EL2 */
+	msr	tpidr_el2, x0
+
+	/* Configure ICC_SRE_EL2 to allow EL1 access to SysRegs */
+	/* Bit 3 (Enable) = 1, Bit 0 (SRE) = 1 */
+	mrs	x1, icc_sre_el2
+	orr	x1, x1, #1
+	orr	x1, x1, #(1 << 3)
+	msr	icc_sre_el2, x1
+	isb
+
+	/* Enable virtual CPU interface */
+	mrs	x1, ich_hcr_el2
+	orr	x1, x1, #1
+	msr	ich_hcr_el2, x1
+
+	/* Load Guest System Registers */
+	ldr	x1, [x0, #GUEST_ELR_OFFSET]
+	msr	elr_el2, x1
+	ldr	x1, [x0, #GUEST_SPSR_OFFSET]
+	msr	spsr_el2, x1
+	ldr	x1, [x0, #GUEST_HCR_OFFSET]
+	msr	hcr_el2, x1
+	ldr	x1, [x0, #GUEST_VTTBR_OFFSET]
+	msr	vttbr_el2, x1
+	ldr	x1, [x0, #GUEST_SCTLR_OFFSET]
+	msr	S3_5_c1_c0_0, x1
+	ldr	x1, [x0, #GUEST_VBAR_OFFSET]
+	msr	S3_5_c12_c0_0, x1
+	ldr	x1, [x0, #GUEST_SP_EL1_OFFSET]
+	msr	sp_el1, x1
+	ldr	x1, [x0, #GUEST_TPIDR_EL1_OFFSET]
+	msr	tpidr_el1, x1
+	ldr	x1, [x0, #GUEST_ICH_VMCR_EL2_OFFSET]
+	msr	ich_vmcr_el2, x1
+
+	/* Load Guest GPRs */
+	ldp	x1, x2, [x0, #8]
+	ldp	x3, x4, [x0, #24]
+	ldp	x5, x6, [x0, #40]
+	ldp	x7, x8, [x0, #56]
+	ldp	x9, x10, [x0, #72]
+	ldp	x11, x12, [x0, #88]
+	ldp	x13, x14, [x0, #104]
+	ldp	x15, x16, [x0, #120]
+	ldp	x17, x18, [x0, #136]
+	ldp	x19, x20, [x0, #152]
+	ldp	x21, x22, [x0, #168]
+	ldp	x23, x24, [x0, #184]
+	ldp	x25, x26, [x0, #200]
+	ldp	x27, x28, [x0, #216]
+	ldp	x29, x30, [x0, #232]
+	ldr	x0, [x0, #0]
+
+	/* Install Trap Handler */
+	adrp	x29, guest_hyp_vectors
+	add	x29, x29, :lo12:guest_hyp_vectors
+	msr	vbar_el2, x29
+
+	/* Restore x29 from struct (via tpidr_el2) */
+	mrs	x29, tpidr_el2
+	ldr	x29, [x29, #232]
+
+	isb
+	eret
+
+	.align 11
+guest_hyp_vectors:
+	.skip 0x400
+
+guest_exit_sync:
+	stp	x0, x1, [sp, #-16]!
+	mrs	x0, tpidr_el2
+	mov	x1, #0
+	str	x1, [x0, #GUEST_EXIT_CODE_OFFSET]
+	b	guest_common_exit
+
+	.balign 0x80
+
+guest_exit_irq:
+	stp	x0, x1, [sp, #-16]!
+	mrs	x0, tpidr_el2
+	mov	x1, #1
+	str	x1, [x0, #GUEST_EXIT_CODE_OFFSET]
+	b	guest_common_exit
+
+guest_common_exit:
+	stp	x2, x3, [x0, #16]
+	stp	x4, x5, [x0, #32]
+	stp	x6, x7, [x0, #48]
+	stp	x8, x9, [x0, #64]
+	stp	x10, x11, [x0, #80]
+	stp	x12, x13, [x0, #96]
+	stp	x14, x15, [x0, #112]
+	stp	x16, x17, [x0, #128]
+	stp	x18, x19, [x0, #144]
+	stp	x20, x21, [x0, #160]
+	stp	x22, x23, [x0, #176]
+	stp	x24, x25, [x0, #192]
+	stp	x26, x27, [x0, #208]
+	stp	x28, x29, [x0, #224]
+	str	x30, [x0, #240]
+
+	ldp	x2, x3, [sp], #16
+	stp	x2, x3, [x0, #0]
+
+	mrs	x1, elr_el2
+	str	x1, [x0, #GUEST_ELR_OFFSET]
+	mrs	x1, spsr_el2
+	str	x1, [x0, #GUEST_SPSR_OFFSET]
+	mrs	x1, esr_el2
+	str	x1, [x0, #GUEST_ESR_OFFSET]
+	mrs	x1, far_el2
+	str	x1, [x0, #GUEST_FAR_OFFSET]
+	mrs	x1, hpfar_el2
+	str	x1, [x0, #GUEST_HPFAR_OFFSET]
+	mrs	x1, sp_el1
+	str	x1, [x0, #GUEST_SP_EL1_OFFSET]
+	mrs	x1, ich_vmcr_el2
+	str	x1, [x0, #GUEST_ICH_VMCR_EL2_OFFSET]
+
+	/* NOTE(review): x29 holds the guest's x29 here, not a vector offset - the vector stubs never set it; the C handler relies on exit_code instead */
+	mov	x1, x29
+	bl	guest_c_exception_handler
+	cbz	x0, guest_resume_guest
+
+	/* EXIT */
+	/* Restore Host Callee-Saved Regs */
+	ldp	x19, x20, [sp], #16
+	ldp	x21, x22, [sp], #16
+	ldp	x23, x24, [sp], #16
+	ldp	x25, x26, [sp], #16
+	ldp	x27, x28, [sp], #16
+	ldp	x29, x30, [sp], #16
+	ret
+
+	/* RESUME */
+guest_resume_guest:
+	mrs	x0, tpidr_el2
+	ldr	x1, [x0, #GUEST_ELR_OFFSET]
+	msr	elr_el2, x1
+	ldr	x1, [x0, #GUEST_SPSR_OFFSET]
+	msr	spsr_el2, x1
+	ldr	x1, [x0, #GUEST_SP_EL1_OFFSET]
+	msr	sp_el1, x1
+
+	ldp	x1, x2, [x0, #8]
+	ldp	x3, x4, [x0, #24]
+	ldp	x5, x6, [x0, #40]
+	ldp	x7, x8, [x0, #56]
+	ldp	x9, x10, [x0, #72]
+	ldp	x11, x12, [x0, #88]
+	ldp	x13, x14, [x0, #104]
+	ldp	x15, x16, [x0, #120]
+	ldp	x17, x18, [x0, #136]
+	ldp	x19, x20, [x0, #152]
+	ldp	x21, x22, [x0, #168]
+	ldp	x23, x24, [x0, #184]
+	ldp	x25, x26, [x0, #200]
+	ldp	x27, x28, [x0, #216]
+	ldp	x29, x30, [x0, #232]
+	ldr	x0, [x0, #0]
+	eret
+
+/* EL1 Vector Table */
+.align 11
+.global guest_el1_vectors
+guest_el1_vectors:
+	/* Current EL with SP0 (0x000 - 0x1FF): unused */
+	.skip 0x200
+	/* Current EL with SPx: Sync (0x200) */
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, #0
+	b	guest_el1_common
+	.skip 0x80 - 12
+	/* IRQ (0x280) */
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, #1
+	b	guest_el1_common
+	.skip 0x80 - 12
+	/* FIQ (0x300) */
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, #2
+	b	guest_el1_common
+	.skip 0x80 - 12
+	/* SError (0x380) */
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, #3
+	b	guest_el1_common
+	.skip 0x400
+
+guest_el1_common:
+	sub	sp, sp, #264
+	stp	x0, x1, [sp, #0]
+	stp	x2, x3, [sp, #16]
+	stp	x4, x5, [sp, #32]
+	stp	x6, x7, [sp, #48]
+	stp	x8, x9, [sp, #64]
+	stp	x10, x11, [sp, #80]
+	stp	x12, x13, [sp, #96]
+	stp	x14, x15, [sp, #112]
+	stp	x16, x17, [sp, #128]
+	stp	x18, x19, [sp, #144]
+	stp	x20, x21, [sp, #160]
+	stp	x22, x23, [sp, #176]
+	stp	x24, x25, [sp, #192]
+	stp	x26, x27, [sp, #208]
+	stp	x28, x30, [sp, #224]
+
+	mrs	x0, elr_el1
+	str	x0, [sp, #248]
+	mrs	x0, spsr_el1
+	str	x0, [sp, #256]
+
+	mov	x0, sp
+	mov	x1, x29
+	bl	guest_el1_c_handler
+
+	ldr	x0, [sp, #248]
+	msr	elr_el1, x0
+	ldr	x0, [sp, #256]
+	msr	spsr_el1, x0
+
+	ldp	x0, x1, [sp, #0]
+	ldp	x2, x3, [sp, #16]
+	ldp	x4, x5, [sp, #32]
+	ldp	x6, x7, [sp, #48]
+	ldp	x8, x9, [sp, #64]
+	ldp	x10, x11, [sp, #80]
+	ldp	x12, x13, [sp, #96]
+	ldp	x14, x15, [sp, #112]
+	ldp	x16, x17, [sp, #128]
+	ldp	x18, x19, [sp, #144]
+	ldp	x20, x21, [sp, #160]
+	ldp	x22, x23, [sp, #176]
+	ldp	x24, x25, [sp, #192]
+	ldp	x26, x27, [sp, #208]
+	ldp	x28, x30, [sp, #224]
+
+	add	sp, sp, #264
+	ldp	x29, x30, [sp], #16
+	eret
-- 
2.53.0.851.ga537e3e6e9-goog


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [kvm-unit-tests PATCH v1 3/3] arm64: Add Stage-2 MMU demand paging test
  2026-03-16 22:43 [kvm-unit-tests PATCH v1 0/3] arm64: Add Stage-2 MMU and Nested Guest Framework Jing Zhang
  2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 1/3] lib: arm64: Add stage2 page table management library Jing Zhang
  2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 2/3] lib: arm64: Add bare-metal guest execution framework Jing Zhang
@ 2026-03-16 22:43 ` Jing Zhang
  2026-03-24 11:43 ` [kvm-unit-tests PATCH v1 0/3] arm64: Add Stage-2 MMU and Nested Guest Framework Joey Gouly
  3 siblings, 0 replies; 10+ messages in thread
From: Jing Zhang @ 2026-03-16 22:43 UTC (permalink / raw)
  To: KVM, KVMARM
  Cc: Marc Zyngier, Joey Gouly, Andrew Jones, Alexandru Elisei,
	Oliver Upton, Jing Zhang

Introduce a new test case to validate Stage-2 MMU fault handling. The
test verifies that the hypervisor correctly identifies and handles
Stage-2 data aborts triggered by a guest accessing unmapped memory.

The test performs the following:
- Sets up a guest with Stage-1 disabled, using identity-mapped host
   code and shared data in the Stage-2 page tables.
- Triggers a Stage-2 data abort by accessing a specific unmapped IPA.
- Catches the exception in the host, verifies the fault address,
   and dynamically maps a new page to resolve the fault.
- Resumes the guest to confirm the memory access completes successfully
   and the fault handler functioned as expected.

Signed-off-by: Jing Zhang <jingzhangos@google.com>
---
 arm/Makefile.arm64    |   1 +
 arm/stage2-mmu-test.c | 100 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 arm/stage2-mmu-test.c

diff --git a/arm/Makefile.arm64 b/arm/Makefile.arm64
index 9026fd71..e547f92d 100644
--- a/arm/Makefile.arm64
+++ b/arm/Makefile.arm64
@@ -67,6 +67,7 @@ tests += $(TEST_DIR)/cache.$(exe)
 tests += $(TEST_DIR)/debug.$(exe)
 tests += $(TEST_DIR)/fpu.$(exe)
 tests += $(TEST_DIR)/mte.$(exe)
+tests += $(TEST_DIR)/stage2-mmu-test.$(exe)
 
 include $(SRCDIR)/$(TEST_DIR)/Makefile.common
 
diff --git a/arm/stage2-mmu-test.c b/arm/stage2-mmu-test.c
new file mode 100644
index 00000000..391c28f0
--- /dev/null
+++ b/arm/stage2-mmu-test.c
@@ -0,0 +1,100 @@
+/*
+ * ARM64 Stage-2 MMU Demand Paging Test
+ *
+ * This test validates stage-2 data abort handling by purposefully
+ * accessing unmapped memory in the guest and verifying that the
+ * host correctly handles the fault by mapping the page.
+ *
+ * Copyright (C) 2026 Google LLC.
+ * Author: Jing Zhang <jingzhangos@google.com>
+ *
+ * SPDX-License-Identifier: LGPL-2.0-or-later
+ */
+#include <libcflat.h>
+#include <alloc_page.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/guest.h>
+#include <asm/stage2_mmu.h>
+
+#define TEST_PAGE_IPA		0x40000000UL
+#define FAULT_ADDR_IPA		0x50000000UL
+#define TEST_DATA		0xBEEFCAFEUL
+
+static volatile bool handled = false;
+
+static void guest_code(void)
+{
+	volatile unsigned long *test_va = (void *)TEST_PAGE_IPA;
+	volatile unsigned long *fault_va = (void *)FAULT_ADDR_IPA;
+
+	*fault_va = *test_va;
+
+	if (*fault_va == *test_va)
+		handled = true;
+
+	asm("hvc #0");
+}
+
+int main(int argc, char **argv)
+{
+	struct guest *guest;
+	unsigned long *test_page, *fixup_page;
+	unsigned long code_va_base, code_pa_base, data_base, far, ec;
+
+	report_prefix_push("stage2-mmu");
+
+	guest = guest_create(smp_processor_id(), guest_code, S2_PAGE_4K);
+
+	/* Map host code: IPA(VA) -> PA */
+	/* We use the host VA as the Guest IPA because guest stage 1 is disabled. */
+	code_va_base = (unsigned long)guest_code;
+	code_pa_base = virt_to_phys((void *)guest_code);
+
+	/* Align to 2MB to use block descriptors where possible */
+	code_va_base = code_va_base & ~(SZ_2M - 1);
+	code_pa_base = code_pa_base & ~(SZ_2M - 1);
+	s2mmu_map(guest->s2mmu, code_va_base, code_pa_base, SZ_2M, S2_MAP_RW);
+
+	/* Identity map the shared variable */
+	data_base = virt_to_phys((void *)&handled) & PAGE_MASK;
+	s2mmu_map(guest->s2mmu, data_base, data_base, PAGE_SIZE, S2_MAP_RW);
+
+	/* Map test data page */
+	test_page = alloc_page();
+	*test_page = TEST_DATA;
+	s2mmu_map(guest->s2mmu, TEST_PAGE_IPA, virt_to_phys(test_page), PAGE_SIZE, S2_MAP_RW);
+
+	report_info("CPU%d: entering guest...", smp_processor_id());
+
+	while (1) {
+		guest_run(guest);
+
+		if (guest->exit_code == GUEST_VECTOR_SYNC) {
+			ec = guest->esr_el2 >> ESR_ELx_EC_SHIFT;
+			if (ec == ESR_ELx_EC_HVC64) {
+				report_info("CPU%d: Guest exited via HVC.", smp_processor_id());
+				break;
+			} else if (ec == ESR_ELx_EC_DABT_LOW) {
+				far = guest->far_el2;
+				if (far == FAULT_ADDR_IPA) {
+					fixup_page = alloc_page();
+					s2mmu_map(guest->s2mmu, FAULT_ADDR_IPA,
+						  virt_to_phys(fixup_page), PAGE_SIZE, S2_MAP_RW);
+					report(true, "Caught stage-2 fault at 0x%lx", far);
+				} else {
+					report(false, "Unexpected fault address: 0x%lx", far);
+					break;
+				}
+			} else {
+				report(false, "Unexpected exception class: 0x%lx", ec);
+				break;
+			}
+		}
+	}
+
+	report(handled, "Stage-2 fault handling test completed");
+	guest_destroy(guest);
+
+	return report_summary();
+}
-- 
2.53.0.851.ga537e3e6e9-goog


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [kvm-unit-tests PATCH v1 2/3] lib: arm64: Add bare-metal guest execution framework
  2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 2/3] lib: arm64: Add bare-metal guest execution framework Jing Zhang
@ 2026-03-17  1:46   ` Yao Yuan
  2026-03-17  8:09   ` Marc Zyngier
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 10+ messages in thread
From: Yao Yuan @ 2026-03-17  1:46 UTC (permalink / raw)
  To: Jing Zhang
  Cc: KVM, KVMARM, Marc Zyngier, Joey Gouly, Andrew Jones,
	Alexandru Elisei, Oliver Upton

On Mon, Mar 16, 2026 at 03:43:48PM +0800, Jing Zhang wrote:
> To test advanced KVM features such as nested virtualization (NV) and
> GICv4 direct interrupt injection, kvm-unit-tests needs the ability to
> act as an L1 hypervisor running at EL2 and manage its own L2 guests.
>
> Introduce a lightweight guest management library that provides the
> infrastructure to create, configure, and execute nested guests.
>
> This framework includes:
> - Guest lifecycle management: `guest_create()` and `guest_destroy()`
>   APIs to allocate guest context and setup Stage-2 identity mappings
>   for code and stack using the s2mmu library.
> - Context switching: The `guest_run()` assembly routine handles
>   saving the host (L1) callee-saved registers and loading the guest
>   (L2) GPRs and EL1 system registers.
> - VM-Exit handling: Installs an EL2 trap handler (`guest_hyp_vectors`)
>   to intercept guest exits and route them to `guest_c_exception_handler`
>   to determine whether to return to the host test logic or resume.
> - Guest-internal exceptions: Provides `guest_el1_vectors` to catch
>   Sync, IRQ, FIQ, and SError exceptions occurring entirely within the
>   guest (EL1) without trapping to the host.
>
> Signed-off-by: Jing Zhang <jingzhangos@google.com>
> ---
>  arm/Makefile.arm64     |   2 +
>  lib/arm64/asm/guest.h  | 156 ++++++++++++++++++++++++
>  lib/arm64/guest.c      | 197 ++++++++++++++++++++++++++++++
>  lib/arm64/guest_arch.S | 263 +++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 618 insertions(+)
>  create mode 100644 lib/arm64/asm/guest.h
>  create mode 100644 lib/arm64/guest.c
>  create mode 100644 lib/arm64/guest_arch.S
>
> diff --git a/arm/Makefile.arm64 b/arm/Makefile.arm64
> index 5e50f5ba..9026fd71 100644
> --- a/arm/Makefile.arm64
> +++ b/arm/Makefile.arm64
> @@ -41,6 +41,8 @@ cflatobjs += lib/arm64/processor.o
>  cflatobjs += lib/arm64/spinlock.o
>  cflatobjs += lib/arm64/gic-v3-its.o lib/arm64/gic-v3-its-cmd.o
>  cflatobjs += lib/arm64/stage2_mmu.o
> +cflatobjs += lib/arm64/guest.o
> +cflatobjs += lib/arm64/guest_arch.o
>
>  ifeq ($(CONFIG_EFI),y)
>  cflatobjs += lib/acpi.o
> diff --git a/lib/arm64/asm/guest.h b/lib/arm64/asm/guest.h
> new file mode 100644
> index 00000000..1d70873d
> --- /dev/null
> +++ b/lib/arm64/asm/guest.h
> @@ -0,0 +1,156 @@
> +/*
> + * Copyright (C) 2026, Google LLC.
> + * Author: Jing Zhang <jingzhangos@google.com>
> + *
> + * SPDX-License-Identifier: LGPL-2.0-or-later
> + */
> +#ifndef _ASMARM64_GUEST_H_
> +#define _ASMARM64_GUEST_H_
> +
> +/* Offsets for assembly (Must match struct guest) */
> +#define GUEST_X_OFFSET			0
> +#define GUEST_ELR_OFFSET		248
> +#define GUEST_SPSR_OFFSET		256
> +#define GUEST_HCR_OFFSET		264
> +#define GUEST_VTTBR_OFFSET		272
> +#define GUEST_SCTLR_OFFSET		280
> +#define GUEST_VBAR_OFFSET		288
> +#define GUEST_SP_EL1_OFFSET		296
> +#define GUEST_ESR_OFFSET		304
> +#define GUEST_FAR_OFFSET		312
> +#define GUEST_HPFAR_OFFSET		320
> +#define GUEST_EXIT_CODE_OFFSET		328
> +#define GUEST_TPIDR_EL1_OFFSET		336
> +#define GUEST_ICH_VMCR_EL2_OFFSET	344
> +
> +#ifndef __ASSEMBLY__
> +
> +#include <libcflat.h>
> +#include <asm/stage2_mmu.h>
> +
> +/* HCR_EL2 Definitions */
> +#define HCR_VM		(1UL << 0)	/* Virtualization Enable */
> +#define HCR_FMO		(1UL << 3)	/* Physical FIQ Routing */
> +#define HCR_IMO		(1UL << 4)	/* Physical IRQ Routing */
> +#define HCR_AMO		(1UL << 5)	/* Physical SError Interrupt Routing */
> +#define HCR_RW		(1UL << 31)	/* Execution State: AArch64 */
> +#define HCR_DC		(1UL << 12)	/* Default Cacheable */
> +#define HCR_E2H		(1UL << 34)	/* EL2 Host */
> +
> +#define HCR_GUEST_FLAGS (HCR_VM | HCR_FMO | HCR_IMO | HCR_AMO | HCR_RW | \
> +			 HCR_DC | HCR_E2H)

Setting HCR_DC works for the testing in patch 03, but for
further testing in L2 I guess the L2 guest's own paging is
necessary. This can be done by mapping L1 VA to L1 PA in
L2's page table (as the kselftests do), then doing IPA->PA
with identity mapping in L1's Stage 2 tables.

> +
> +/* ICH_VMCR_EL2 bit definition */
> +#define ICH_VMCR_PMR_SHIFT	24
> +#define ICH_VMCR_PMR_MASK	(0xffUL << ICH_VMCR_PMR_SHIFT)
> +#define ICH_VMCR_ENG0_SHIFT	0
> +#define ICH_VMCR_ENG0_MASK	(1 << ICH_VMCR_ENG0_SHIFT)
> +#define ICH_VMCR_ENG1_SHIFT	1
> +#define ICH_VMCR_ENG1_MASK	(1 << ICH_VMCR_ENG1_SHIFT)
> +
> +/* Guest stack size */
> +#define GUEST_STACK_SIZE		SZ_64K
> +
> +/*
> + * Result from Handler:
> + * RESUME: Keep guest running (ERET immediately)
> + * EXIT:   Return to Host C caller
> + */
> +enum guest_handler_result {
> +	GUEST_ACTION_RESUME,
> +	GUEST_ACTION_EXIT
> +};
> +
> +struct guest;
> +typedef enum guest_handler_result (*guest_handler_t)(struct guest *guest);
> +
> +/* EL1 (Guest-internal) Exception Vector */
> +enum guest_el1_vector {
> +	GUEST_EL1_SYNC,
> +	GUEST_EL1_IRQ,
> +	GUEST_EL1_FIQ,
> +	GUEST_EL1_SERROR,
> +	GUEST_EL1_MAX
> +};
> +
> +/*
> + * Guest EL1 Exception Frame (pushed to guest stack by asm stub)
> + * We use a simplified frame: x0-x30, elr, spsr. size = 33*8
> + */
> +struct guest_el1_regs {
> +	unsigned long regs[31];
> +	unsigned long elr;
> +	unsigned long spsr;
> +};
> +
> +typedef void (*guest_el1_handler_t)(struct guest_el1_regs *regs, unsigned int esr);
> +
> +/* Exceptions from the Guest (Lower EL using AArch64) */
> +enum guest_vector {
> +	GUEST_VECTOR_SYNC,
> +	GUEST_VECTOR_IRQ,
> +	GUEST_VECTOR_FIQ,
> +	GUEST_VECTOR_SERROR,
> +	GUEST_VECTOR_MAX
> +};
> +
> +/*
> + * Guest Context Structure
> + * This will be pointed to by TPIDR_EL1 while the guest is running.
> + */
> +struct guest_context {
> +	guest_el1_handler_t handlers[GUEST_EL1_MAX];
> +};
> +
> +struct guest {
> +	/* 0x000: General Purpose Registers */
> +	unsigned long x[31]; /* x0..x30 */
> +
> +	/* 0x0F8: Execution State */
> +	unsigned long elr_el2;
> +	unsigned long spsr_el2;
> +
> +	/* 0x108: Control Registers */
> +	unsigned long hcr_el2;
> +	unsigned long vttbr_el2;
> +	unsigned long sctlr_el1;
> +	unsigned long vbar_el1;
> +	unsigned long sp_el1;
> +
> +	/* 0x130: Exit Information */
> +	unsigned long esr_el2;
> +	unsigned long far_el2;
> +	unsigned long hpfar_el2;
> +	unsigned long exit_code; /* enum guest_vector */
> +	unsigned long tpidr_el1;
> +
> +	/* 0x158: GIC Registers */
> +	unsigned long ich_vmcr_el2;
> +
> +	/* 0x160: Exception Handlers */
> +	guest_handler_t handlers[GUEST_VECTOR_MAX];
> +	struct guest_context *guest_context;
> +
> +	struct s2_mmu *s2mmu;
> +};
> +
> +/* API */
> +struct guest *guest_create(int vmid, void (*guest_func)(void), enum s2_granule granule);
> +void guest_destroy(struct guest *guest);
> +
> +/* Configuration */
> +void guest_set_vector(struct guest *guest, void *vector_table);
> +void guest_set_stack(struct guest *guest, void *stack_top);
> +void guest_install_handler(struct guest *guest, enum guest_vector v, guest_handler_t handler);
> +
> +/* Install handler for exceptions INSIDE EL1 */
> +void guest_install_el1_handler(struct guest *guest, enum guest_el1_vector v, guest_el1_handler_t handler);
> +
> +unsigned long guest_c_exception_handler(struct guest *guest, unsigned long vector_offset);
> +void guest_el1_c_handler(struct guest_el1_regs *regs, unsigned int vector);
> +
> +/* Core Run Loop */
> +void guest_run(struct guest *guest);
> +
> +#endif /* __ASSEMBLY__ */
> +#endif /* _ASMARM64_GUEST_H_ */
> diff --git a/lib/arm64/guest.c b/lib/arm64/guest.c
> new file mode 100644
> index 00000000..6c256c11
> --- /dev/null
> +++ b/lib/arm64/guest.c
> @@ -0,0 +1,197 @@
> +/*
> + * Copyright (C) 2026, Google LLC.
> + * Author: Jing Zhang <jingzhangos@google.com>
> + *
> + * SPDX-License-Identifier: LGPL-2.0-or-later
> + */
> +#include <libcflat.h>
> +#include <asm/guest.h>
> +#include <asm/io.h>
> +#include <asm/sysreg.h>
> +#include <asm/barrier.h>
> +#include <alloc_page.h>
> +#include <alloc.h>
> +
> +/* Compile-time checks to ensure Assembly macros match C Struct */
> +_Static_assert(offsetof(struct guest, x) == GUEST_X_OFFSET,
> +	       "GUEST_X_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, elr_el2) == GUEST_ELR_OFFSET,
> +	       "GUEST_ELR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, spsr_el2) == GUEST_SPSR_OFFSET,
> +	       "GUEST_SPSR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, hcr_el2) == GUEST_HCR_OFFSET,
> +	       "GUEST_HCR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, vttbr_el2) == GUEST_VTTBR_OFFSET,
> +	       "GUEST_VTTBR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, sctlr_el1) == GUEST_SCTLR_OFFSET,
> +	       "GUEST_SCTLR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, vbar_el1) == GUEST_VBAR_OFFSET,
> +	       "GUEST_VBAR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, sp_el1) == GUEST_SP_EL1_OFFSET,
> +	       "GUEST_SP_EL1_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, esr_el2) == GUEST_ESR_OFFSET,
> +	       "GUEST_ESR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, far_el2) == GUEST_FAR_OFFSET,
> +	       "GUEST_FAR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, hpfar_el2) == GUEST_HPFAR_OFFSET,
> +	       "GUEST_HPFAR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, exit_code) == GUEST_EXIT_CODE_OFFSET,
> +	       "GUEST_EXIT_CODE_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, tpidr_el1) == GUEST_TPIDR_EL1_OFFSET,
> +		"GUEST_TPIDR_EL1_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, ich_vmcr_el2) == GUEST_ICH_VMCR_EL2_OFFSET,
> +		"GUEST_ICH_VMCR_EL2_OFFSET mismatch");
> +
> +/*
> + * C-Entry for Exception Handling
> + * Returns 0 to Resume Guest, 1 to Exit to Host Caller
> + */
> +unsigned long guest_c_exception_handler(struct guest *guest, unsigned long vector_offset)
> +{
> +	enum guest_vector vector = (enum guest_vector)guest->exit_code;
> +
> +	/* Save Trap Info */
> +	guest->esr_el2 = read_sysreg(esr_el2);
> +	guest->far_el2 = read_sysreg(far_el2);
> +	guest->hpfar_el2 = read_sysreg(hpfar_el2);
> +
> +	/* Invoke Handler if registered */
> +	if (guest->handlers[vector]) {
> +		if (guest->handlers[vector](guest) == GUEST_ACTION_RESUME) {
> +			return 0; /* ASM stub will restore and ERET */
> +		}
> +	}
> +
> +	/* Default: Exit to caller */
> +	return 1;
> +}
> +
> +/* --- EL1 (Guest-Internal) Vector Handling --- */
> +
> +void guest_install_el1_handler(struct guest *guest, enum guest_el1_vector v, guest_el1_handler_t handler)
> +{
> +	if (guest && guest->guest_context && v < GUEST_EL1_MAX)
> +		guest->guest_context->handlers[v] = handler;
> +}
> +
> +void guest_el1_c_handler(struct guest_el1_regs *regs, unsigned int vector)
> +{
> +	struct guest_context *ctx = (struct guest_context *)read_sysreg(tpidr_el1);
> +	unsigned int esr = read_sysreg(esr_el1);
> +
> +	if (ctx && vector < GUEST_EL1_MAX && ctx->handlers[vector]) {
> +		ctx->handlers[vector](regs, esr);
> +	} else {
> +		printf("Guest: Unhandled Exception Vector %d, ESR=0x%x\n", vector, esr);
> +		asm volatile("hvc #0xFFFF");
> +	}
> +}
> +
> +extern void guest_el1_vectors(void);
> +
> +static struct guest *__guest_create(struct s2_mmu *s2_ctx, void *entry_point)
> +{
> +	struct guest *guest = calloc(1, sizeof(struct guest));
> +	struct guest_context *guest_ctx;
> +	unsigned long guest_ctx_pa;
> +
> +	/* Allocate the internal context table */
> +	guest_ctx = (void *)alloc_page();
> +	memset(guest_ctx, 0, PAGE_SIZE);
> +	guest->guest_context = guest_ctx;
> +
> +	guest_ctx_pa = virt_to_phys(guest_ctx);
> +	if (s2_ctx)
> +		s2mmu_map(s2_ctx, guest_ctx_pa, guest_ctx_pa, PAGE_SIZE, S2_MAP_RW);
> +
> +	guest->tpidr_el1 = guest_ctx_pa;;
> +
> +	guest->elr_el2 = (unsigned long)entry_point;
> +	guest->spsr_el2 = 0x3C5; /* M=EL1h, DAIF=Masked */
> +	guest->hcr_el2 = HCR_GUEST_FLAGS;
> +
> +	if (s2_ctx) {
> +		guest->vttbr_el2 = virt_to_phys(s2_ctx->pgd);
> +		guest->vttbr_el2 |= ((unsigned long)s2_ctx->vmid << 48);
> +	}
> +
> +	guest->sctlr_el1 = read_sysreg(sctlr_el1);
> +	guest->sctlr_el1 |= SCTLR_EL1_C | SCTLR_EL1_I | SCTLR_EL1_M;
> +
> +	guest->ich_vmcr_el2 = read_sysreg(ich_vmcr_el2);
> +	guest->ich_vmcr_el2 |= (0xFFUL << ICH_VMCR_PMR_SHIFT) | (1UL << ICH_VMCR_ENG1_SHIFT);
> +
> +	guest->vbar_el1 = (unsigned long)guest_el1_vectors;
> +	guest->s2mmu = s2_ctx;
> +
> +	return guest;
> +}
> +
> +struct guest *guest_create(int vmid, void (*guest_func)(void), enum s2_granule granule)
> +{
> +	unsigned long guest_pa, code_base, stack_pa;
> +	unsigned long *stack_page;
> +	struct guest *guest;
> +	struct s2_mmu *ctx;
> +
> +	ctx = s2mmu_init(vmid, granule, true);
> +	/*
> +	 * Map the Host's code segment Identity Mapped (IPA=PA).
> +	 * To be safe, we map a large chunk (e.g., 2MB) around the function
> +	 * to capture any helper functions the compiler might generate calls to.
> +	 */
> +	guest_pa = virt_to_phys((void *)guest_func);
> +	code_base = guest_pa & ~(SZ_2M - 1);
> +	s2mmu_map(ctx, code_base, code_base, SZ_2M, S2_MAP_RW);
> +
> +	/*
> +	 * Map Stack
> +	 * Allocate 16 pages (64K) in Host, get its PA, and map it for Guest.
> +	 */
> +	stack_page = alloc_pages(get_order(GUEST_STACK_SIZE >> PAGE_SHIFT));
> +	stack_pa = virt_to_phys(stack_page);
> +	/* Identity Map it (IPA = PA) */
> +	s2mmu_map(ctx, stack_pa, stack_pa, GUEST_STACK_SIZE, S2_MAP_RW);
> +
> +	s2mmu_enable(ctx);
> +
> +	/* Create Guest */
> +	/* Entry point is the PA of the function (Identity Mapped) */
> +	guest = __guest_create(ctx, (void *)guest_pa);
> +
> +	/*
> +	 * Setup Guest Stack Pointer
> +	 * Must match where we mapped the stack + Offset
> +	 */
> +	guest_set_stack(guest, (void *)(stack_pa + GUEST_STACK_SIZE));
> +
> +	/* Map UART identity mapped, printf() available to guest */
> +	s2mmu_map(ctx, 0x09000000, 0x09000000, PAGE_SIZE, S2_MAP_DEVICE);
> +
> +	return guest;
> +}
> +
> +void guest_destroy(struct guest *guest)
> +{
> +	s2mmu_disable(guest->s2mmu);
> +	s2mmu_destroy(guest->s2mmu);
> +	if (guest->guest_context)
> +		free_page(guest->guest_context);
> +	free(guest);
> +}
> +
> +void guest_set_vector(struct guest *guest, void *vector_table)
> +{
> +	guest->vbar_el1 = (unsigned long)vector_table;
> +}
> +
> +void guest_set_stack(struct guest *guest, void *stack_top)
> +{
> +	guest->sp_el1 = (unsigned long)stack_top;
> +}
> +
> +void guest_install_handler(struct guest *guest, enum guest_vector v, guest_handler_t handler)
> +{
> +	if (v < GUEST_VECTOR_MAX)
> +		guest->handlers[v] = handler;
> +}
> diff --git a/lib/arm64/guest_arch.S b/lib/arm64/guest_arch.S
> new file mode 100644
> index 00000000..cb7074d7
> --- /dev/null
> +++ b/lib/arm64/guest_arch.S
> @@ -0,0 +1,263 @@
> +/*
> + * Copyright (C) 2026, Google LLC.
> + * Author: Jing Zhang <jingzhangos@google.com>
> + *
> + * SPDX-License-Identifier: LGPL-2.0-or-later
> + */
> +#define __ASSEMBLY__
> +#include <asm/guest.h>
> +
> +.global guest_run
> +guest_run:
> +	/* x0 = struct guest pointer */
> +
> +	/* Save Host Callee-Saved Regs */
> +	stp	x29, x30, [sp, #-16]!
> +	stp	x27, x28, [sp, #-16]!
> +	stp	x25, x26, [sp, #-16]!
> +	stp	x23, x24, [sp, #-16]!
> +	stp	x21, x22, [sp, #-16]!
> +	stp	x19, x20, [sp, #-16]!
> +
> +	/* Cache Guest Pointer in TPIDR_EL2 */
> +	msr	tpidr_el2, x0
> +
> +	/* Configure ICC_SRE_EL2 to allow EL1 access to SysRegs */
> +	/* Bit 3 (Enable) = 1, Bit 0 (SRE) = 1 */
> +	mrs	x1, icc_sre_el2
> +	orr	x1, x1, #1
> +	orr	x1, x1, #(1 << 3)
> +	msr	icc_sre_el2, x1
> +	isb
> +
> +	/* Enable virtual CPU interface */
> +	mrs	x1, ich_hcr_el2
> +	orr	x1, x1, #1
> +	msr	ich_hcr_el2, x1
> +
> +	/* Load Guest System Registers */
> +	ldr	x1, [x0, #GUEST_ELR_OFFSET]
> +	msr	elr_el2, x1
> +	ldr	x1, [x0, #GUEST_SPSR_OFFSET]
> +	msr	spsr_el2, x1
> +	ldr	x1, [x0, #GUEST_HCR_OFFSET]
> +	msr	hcr_el2, x1
> +	ldr	x1, [x0, #GUEST_VTTBR_OFFSET]
> +	msr	vttbr_el2, x1
> +	ldr	x1, [x0, #GUEST_SCTLR_OFFSET]
> +	msr	S3_5_c1_c0_0, x1
> +	ldr	x1, [x0, #GUEST_VBAR_OFFSET]
> +	msr	S3_5_c12_c0_0, x1
> +	ldr	x1, [x0, #GUEST_SP_EL1_OFFSET]
> +	msr	sp_el1, x1
> +	ldr	x1, [x0, #GUEST_TPIDR_EL1_OFFSET]
> +	msr	tpidr_el1, x1
> +	ldr	x1, [x0, #GUEST_ICH_VMCR_EL2_OFFSET]
> +	msr	ich_vmcr_el2, x1
> +
> +	/* Load Guest GPRs */
> +	ldp	x1, x2, [x0, #8]
> +	ldp	x3, x4, [x0, #24]
> +	ldp	x5, x6, [x0, #40]
> +	ldp	x7, x8, [x0, #56]
> +	ldp	x9, x10, [x0, #72]
> +	ldp	x11, x12, [x0, #88]
> +	ldp	x13, x14, [x0, #104]
> +	ldp	x15, x16, [x0, #120]
> +	ldp	x17, x18, [x0, #136]
> +	ldp	x19, x20, [x0, #152]
> +	ldp	x21, x22, [x0, #168]
> +	ldp	x23, x24, [x0, #184]
> +	ldp	x25, x26, [x0, #200]
> +	ldp	x27, x28, [x0, #216]
> +	ldp	x29, x30, [x0, #232]
> +	ldr	x0, [x0, #0]
> +
> +	/* Install Trap Handler */
> +	adrp	x29, guest_hyp_vectors
> +	add	x29, x29, :lo12:guest_hyp_vectors
> +	msr	vbar_el2, x29
> +
> +	/* Restore x29 from struct (via tpidr_el2) */
> +	mrs	x29, tpidr_el2
> +	ldr	x29, [x29, #232]
> +
> +	isb
> +	eret
> +
> +	.align 11
> +guest_hyp_vectors:
> +	.skip 0x400
> +
> +guest_exit_sync:
> +	stp	x0, x1, [sp, #-16]!
> +	mrs	x0, tpidr_el2
> +	mov	x1, #0
> +	str	x1, [x0, #GUEST_EXIT_CODE_OFFSET]
> +	b	guest_common_exit
> +
> +	.balign 0x80
> +
> +guest_exit_irq:
> +	stp	x0, x1, [sp, #-16]!
> +	mrs	x0, tpidr_el2
> +	mov	x1, #1
> +	str	x1, [x0, #GUEST_EXIT_CODE_OFFSET]
> +	b	guest_common_exit
> +
> +guest_common_exit:
> +	stp	x2, x3, [x0, #16]
> +	stp	x4, x5, [x0, #32]
> +	stp	x6, x7, [x0, #48]
> +	stp	x8, x9, [x0, #64]
> +	stp	x10, x11, [x0, #80]
> +	stp	x12, x13, [x0, #96]
> +	stp	x14, x15, [x0, #112]
> +	stp	x16, x17, [x0, #128]
> +	stp	x18, x19, [x0, #144]
> +	stp	x20, x21, [x0, #160]
> +	stp	x22, x23, [x0, #176]
> +	stp	x24, x25, [x0, #192]
> +	stp	x26, x27, [x0, #208]
> +	stp	x28, x29, [x0, #224]
> +	str	x30, [x0, #240]
> +
> +	ldp	x2, x3, [sp], #16
> +	stp	x2, x3, [x0, #0]
> +
> +	mrs	x1, elr_el2
> +	str	x1, [x0, #GUEST_ELR_OFFSET]
> +	mrs	x1, spsr_el2
> +	str	x1, [x0, #GUEST_SPSR_OFFSET]
> +	mrs	x1, esr_el2
> +	str	x1, [x0, #GUEST_ESR_OFFSET]
> +	mrs	x1, far_el2
> +	str	x1, [x0, #GUEST_FAR_OFFSET]
> +	mrs	x1, hpfar_el2
> +	str	x1, [x0, #GUEST_HPFAR_OFFSET]
> +	mrs	x1, sp_el1
> +	str	x1, [x0, #GUEST_SP_EL1_OFFSET]
> +	mrs	x1, ich_vmcr_el2
> +	str	x1, [x0, #GUEST_ICH_VMCR_EL2_OFFSET]
> +
> +	/* NOTE(review): x29 holds the guest's x29 here, not a vector offset - the vector stubs never set it; the C handler relies on exit_code instead */
> +	mov	x1, x29
> +	bl	guest_c_exception_handler
> +	cbz	x0, guest_resume_guest
> +
> +	/* EXIT */
> +	/* Restore Host Callee-Saved Regs */
> +	ldp	x19, x20, [sp], #16
> +	ldp	x21, x22, [sp], #16
> +	ldp	x23, x24, [sp], #16
> +	ldp	x25, x26, [sp], #16
> +	ldp	x27, x28, [sp], #16
> +	ldp	x29, x30, [sp], #16
> +	ret
> +
> +	/* RESUME */
> +guest_resume_guest:
> +	mrs	x0, tpidr_el2
> +	ldr	x1, [x0, #GUEST_ELR_OFFSET]
> +	msr	elr_el2, x1
> +	ldr	x1, [x0, #GUEST_SPSR_OFFSET]
> +	msr	spsr_el2, x1
> +	ldr	x1, [x0, #GUEST_SP_EL1_OFFSET]
> +	msr	sp_el1, x1
> +
> +	ldp	x1, x2, [x0, #8]
> +	ldp	x3, x4, [x0, #24]
> +	ldp	x5, x6, [x0, #40]
> +	ldp	x7, x8, [x0, #56]
> +	ldp	x9, x10, [x0, #72]
> +	ldp	x11, x12, [x0, #88]
> +	ldp	x13, x14, [x0, #104]
> +	ldp	x15, x16, [x0, #120]
> +	ldp	x17, x18, [x0, #136]
> +	ldp	x19, x20, [x0, #152]
> +	ldp	x21, x22, [x0, #168]
> +	ldp	x23, x24, [x0, #184]
> +	ldp	x25, x26, [x0, #200]
> +	ldp	x27, x28, [x0, #216]
> +	ldp	x29, x30, [x0, #232]
> +	ldr	x0, [x0, #0]
> +	eret
> +
> +/* EL1 Vector Table */
> +.align 11
> +.global guest_el1_vectors
> +guest_el1_vectors:
> +	/* Current EL with SP0 (0x000 - 0x1FF): unused */
> +	.skip 0x200
> +	/* Current EL with SPx: Sync (0x200) */
> +	stp	x29, x30, [sp, #-16]!
> +	mov	x29, #0
> +	b	guest_el1_common
> +	.skip 0x80 - 12
> +	/* IRQ (0x280) */
> +	stp	x29, x30, [sp, #-16]!
> +	mov	x29, #1
> +	b	guest_el1_common
> +	.skip 0x80 - 12
> +	/* FIQ (0x300) */
> +	stp	x29, x30, [sp, #-16]!
> +	mov	x29, #2
> +	b	guest_el1_common
> +	.skip 0x80 - 12
> +	/* SError (0x380) */
> +	stp	x29, x30, [sp, #-16]!
> +	mov	x29, #3
> +	b	guest_el1_common
> +	.skip 0x400
> +
> +guest_el1_common:
> +	sub	sp, sp, #264
> +	stp	x0, x1, [sp, #0]
> +	stp	x2, x3, [sp, #16]
> +	stp	x4, x5, [sp, #32]
> +	stp	x6, x7, [sp, #48]
> +	stp	x8, x9, [sp, #64]
> +	stp	x10, x11, [sp, #80]
> +	stp	x12, x13, [sp, #96]
> +	stp	x14, x15, [sp, #112]
> +	stp	x16, x17, [sp, #128]
> +	stp	x18, x19, [sp, #144]
> +	stp	x20, x21, [sp, #160]
> +	stp	x22, x23, [sp, #176]
> +	stp	x24, x25, [sp, #192]
> +	stp	x26, x27, [sp, #208]
> +	stp	x28, x30, [sp, #224]
> +
> +	mrs	x0, elr_el1
> +	str	x0, [sp, #248]
> +	mrs	x0, spsr_el1
> +	str	x0, [sp, #256]
> +
> +	mov	x0, sp
> +	mov	x1, x29
> +	bl	guest_el1_c_handler
> +
> +	ldr	x0, [sp, #248]
> +	msr	elr_el1, x0
> +	ldr	x0, [sp, #256]
> +	msr	spsr_el1, x0
> +
> +	ldp	x0, x1, [sp, #0]
> +	ldp	x2, x3, [sp, #16]
> +	ldp	x4, x5, [sp, #32]
> +	ldp	x6, x7, [sp, #48]
> +	ldp	x8, x9, [sp, #64]
> +	ldp	x10, x11, [sp, #80]
> +	ldp	x12, x13, [sp, #96]
> +	ldp	x14, x15, [sp, #112]
> +	ldp	x16, x17, [sp, #128]
> +	ldp	x18, x19, [sp, #144]
> +	ldp	x20, x21, [sp, #160]
> +	ldp	x22, x23, [sp, #176]
> +	ldp	x24, x25, [sp, #192]
> +	ldp	x26, x27, [sp, #208]
> +	ldp	x28, x30, [sp, #224]
> +
> +	add	sp, sp, #264
> +	ldp	x29, x30, [sp], #16
> +	eret
> --
> 2.53.0.851.ga537e3e6e9-goog
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [kvm-unit-tests PATCH v1 2/3] lib: arm64: Add bare-metal guest execution framework
  2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 2/3] lib: arm64: Add bare-metal guest execution framework Jing Zhang
  2026-03-17  1:46   ` Yao Yuan
@ 2026-03-17  8:09   ` Marc Zyngier
  2026-03-24 15:04   ` Joey Gouly
  2026-03-24 15:44   ` Wei-Lin Chang
  3 siblings, 0 replies; 10+ messages in thread
From: Marc Zyngier @ 2026-03-17  8:09 UTC (permalink / raw)
  To: Jing Zhang
  Cc: KVM, KVMARM, Joey Gouly, Andrew Jones, Alexandru Elisei,
	Oliver Upton

On Mon, 16 Mar 2026 22:43:48 +0000,
Jing Zhang <jingzhangos@google.com> wrote:
> 
> To test advanced KVM features such as nested virtualization (NV) and
> GICv4 direct interrupt injection, kvm-unit-tests needs the ability to
> act as an L1 hypervisor running at EL2 and manage its own L2 guests.
> 
> Introduce a lightweight guest management library that provides the
> infrastructure to create, configure, and execute nested guests.
> 
> This framework includes:
> - Guest lifecycle management: `guest_create()` and `guest_destroy()`
>   APIs to allocate guest context and setup Stage-2 identity mappings
>   for code and stack using the s2mmu library.
> - Context switching: The `guest_run()` assembly routine handles
>   saving the host (L1) callee-saved registers and loading the guest
>   (L2) GPRs and EL1 system registers.
> - VM-Exit handling: Installs an EL2 trap handler (`guest_hyp_vectors`)
>   to intercept guest exits and route them to `guest_c_exception_handler`
>   to determine whether to return to the host test logic or resume.
> - Guest-internal exceptions: Provides `guest_el1_vectors` to catch
>   Sync, IRQ, FIQ, and SError exceptions occurring entirely within the
>   guest (EL1) without trapping to the host.
> 
> Signed-off-by: Jing Zhang <jingzhangos@google.com>
> ---
>  arm/Makefile.arm64     |   2 +
>  lib/arm64/asm/guest.h  | 156 ++++++++++++++++++++++++
>  lib/arm64/guest.c      | 197 ++++++++++++++++++++++++++++++
>  lib/arm64/guest_arch.S | 263 +++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 618 insertions(+)
>  create mode 100644 lib/arm64/asm/guest.h
>  create mode 100644 lib/arm64/guest.c
>  create mode 100644 lib/arm64/guest_arch.S
> 
> diff --git a/arm/Makefile.arm64 b/arm/Makefile.arm64
> index 5e50f5ba..9026fd71 100644
> --- a/arm/Makefile.arm64
> +++ b/arm/Makefile.arm64
> @@ -41,6 +41,8 @@ cflatobjs += lib/arm64/processor.o
>  cflatobjs += lib/arm64/spinlock.o
>  cflatobjs += lib/arm64/gic-v3-its.o lib/arm64/gic-v3-its-cmd.o
>  cflatobjs += lib/arm64/stage2_mmu.o
> +cflatobjs += lib/arm64/guest.o
> +cflatobjs += lib/arm64/guest_arch.o
>  
>  ifeq ($(CONFIG_EFI),y)
>  cflatobjs += lib/acpi.o
> diff --git a/lib/arm64/asm/guest.h b/lib/arm64/asm/guest.h
> new file mode 100644
> index 00000000..1d70873d
> --- /dev/null
> +++ b/lib/arm64/asm/guest.h
> @@ -0,0 +1,156 @@
> +/*
> + * Copyright (C) 2026, Google LLC.
> + * Author: Jing Zhang <jingzhangos@google.com>
> + *
> + * SPDX-License-Identifier: LGPL-2.0-or-later
> + */
> +#ifndef _ASMARM64_GUEST_H_
> +#define _ASMARM64_GUEST_H_
> +
> +/* Offsets for assembly (Must match struct guest) */
> +#define GUEST_X_OFFSET			0
> +#define GUEST_ELR_OFFSET		248
> +#define GUEST_SPSR_OFFSET		256
> +#define GUEST_HCR_OFFSET		264
> +#define GUEST_VTTBR_OFFSET		272
> +#define GUEST_SCTLR_OFFSET		280
> +#define GUEST_VBAR_OFFSET		288
> +#define GUEST_SP_EL1_OFFSET		296
> +#define GUEST_ESR_OFFSET		304
> +#define GUEST_FAR_OFFSET		312
> +#define GUEST_HPFAR_OFFSET		320
> +#define GUEST_EXIT_CODE_OFFSET		328
> +#define GUEST_TPIDR_EL1_OFFSET		336
> +#define GUEST_ICH_VMCR_EL2_OFFSET	344

Don't hardcode offsets. Generate them.

> +
> +#ifndef __ASSEMBLY__
> +
> +#include <libcflat.h>
> +#include <asm/stage2_mmu.h>
> +
> +/* HCR_EL2 Definitions */
> +#define HCR_VM		(1UL << 0)	/* Virtualization Enable */
> +#define HCR_FMO		(1UL << 3)	/* Physical FIQ Routing */
> +#define HCR_IMO		(1UL << 4)	/* Physical IRQ Routing */
> +#define HCR_AMO		(1UL << 5)	/* Physical SError Interrupt Routing */
> +#define HCR_RW		(1UL << 31)	/* Execution State: AArch64 */
> +#define HCR_DC		(1UL << 12)	/* Default Cacheable */
> +#define HCR_E2H		(1UL << 34)	/* EL2 Host */

Please consider importing the kernel's sysreg definition, or generate
them from an official source (the architecture JSON file, for
example).

> +
> +#define HCR_GUEST_FLAGS (HCR_VM | HCR_FMO | HCR_IMO | HCR_AMO | HCR_RW | \
> +			 HCR_DC | HCR_E2H)

Just to set expectations: HCR_EL2.DC is not supported by KVM, and
likely never will. I'm hopeful that this bit (and a few others) will
eventually be deprecated because it serves no purpose. If you need a
1:1 S1 mapping, create it using (surprise!) page tables.

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [kvm-unit-tests PATCH v1 0/3] arm64: Add Stage-2 MMU and Nested Guest Framework
  2026-03-16 22:43 [kvm-unit-tests PATCH v1 0/3] arm64: Add Stage-2 MMU and Nested Guest Framework Jing Zhang
                   ` (2 preceding siblings ...)
  2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 3/3] arm64: Add Stage-2 MMU demand paging test Jing Zhang
@ 2026-03-24 11:43 ` Joey Gouly
  3 siblings, 0 replies; 10+ messages in thread
From: Joey Gouly @ 2026-03-24 11:43 UTC (permalink / raw)
  To: Jing Zhang
  Cc: KVM, KVMARM, Marc Zyngier, Andrew Jones, Alexandru Elisei,
	Oliver Upton

On Mon, Mar 16, 2026 at 03:43:46PM -0700, Jing Zhang wrote:
> This patch series introduces a lightweight infrastructure for managing ARM64
> Stage-2 translation tables and executing nested guests. These components are
> essential for testing advanced virtualization features such as nested
> virtualization (NV) and GICv4 direct interrupt injection.
> 
> The series provides a generic Stage-2 MMU library supporting multiple
> translation granules (4K, 16K, 64K) and dynamic page table management.
> Building on this, it adds a guest execution framework that handles guest
> lifecycle management, context switching and guest exit routing. A new test
> case for Stage-2 MMU demand paging to verify fault handling.
> 
> Please note that this is a very preliminary implementation intended as a
> startup baseline for future work in virtualization testing. Users should be
> aware that because this is an early-stage baseline, some portions of the code
> may just happen to work in its current state. There might be critical
> architectural elements or edge-case handling missing that will need to be
> addressed as the framework matures.
> 

Hi,

I'm interested in this. I had a much more barebones (aka no stage2) guest
framework thing that I sent out [1], but it seems more useful to try to get
this series merged.

I will try to get my tests working with this new framework; one thing missing
for me is support for executing at EL0.

Thanks,
Joey

[1] https://lore.kernel.org/kvmarm/20260306142656.2775185-1-joey.gouly@arm.com/#t

> ---
> 
> Jing Zhang (3):
>   lib: arm64: Add stage2 page table management library
>   lib: arm64: Add bare-metal guest execution framework
>   arm64: Add Stage-2 MMU demand paging test
> 
>  arm/Makefile.arm64         |   4 +
>  arm/stage2-mmu-test.c      | 100 +++++++++
>  lib/arm64/asm/guest.h      | 156 ++++++++++++++
>  lib/arm64/asm/stage2_mmu.h |  74 +++++++
>  lib/arm64/guest.c          | 197 ++++++++++++++++++
>  lib/arm64/guest_arch.S     | 263 ++++++++++++++++++++++++
>  lib/arm64/stage2_mmu.c     | 402 +++++++++++++++++++++++++++++++++++++
>  7 files changed, 1196 insertions(+)
>  create mode 100644 arm/stage2-mmu-test.c
>  create mode 100644 lib/arm64/asm/guest.h
>  create mode 100644 lib/arm64/asm/stage2_mmu.h
>  create mode 100644 lib/arm64/guest.c
>  create mode 100644 lib/arm64/guest_arch.S
>  create mode 100644 lib/arm64/stage2_mmu.c
> 
> 
> base-commit: 86e53277ac80dabb04f4fa5fa6a6cc7649392bdc
> -- 
> 2.53.0.851.ga537e3e6e9-goog
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [kvm-unit-tests PATCH v1 2/3] lib: arm64: Add bare-metal guest execution framework
  2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 2/3] lib: arm64: Add bare-metal guest execution framework Jing Zhang
  2026-03-17  1:46   ` Yao Yuan
  2026-03-17  8:09   ` Marc Zyngier
@ 2026-03-24 15:04   ` Joey Gouly
  2026-03-24 15:44   ` Wei-Lin Chang
  3 siblings, 0 replies; 10+ messages in thread
From: Joey Gouly @ 2026-03-24 15:04 UTC (permalink / raw)
  To: Jing Zhang
  Cc: KVM, KVMARM, Marc Zyngier, Andrew Jones, Alexandru Elisei,
	Oliver Upton

Hi,

Just some preliminary comments.

On Mon, Mar 16, 2026 at 03:43:48PM -0700, Jing Zhang wrote:
> To test advanced KVM features such as nested virtualization (NV) and
> GICv4 direct interrupt injection, kvm-unit-tests needs the ability to
> act as an L1 hypervisor running at EL2 and manage its own L2 guests.
> 
> Introduce a lightweight guest management library that provides the
> infrastructure to create, configure, and execute nested guests.
> 
> This framework includes:
> - Guest lifecycle management: `guest_create()` and `guest_destroy()`
>   APIs to allocate guest context and setup Stage-2 identity mappings
>   for code and stack using the s2mmu library.
> - Context switching: The `guest_run()` assembly routine handles
>   saving the host (L1) callee-saved registers and loading the guest
>   (L2) GPRs and EL1 system registers.
> - VM-Exit handling: Installs an EL2 trap handler (`guest_hyp_vectors`)
>   to intercept guest exits and route them to `guest_c_exception_handler`
>   to determine whether to return to the host test logic or resume.
> - Guest-internal exceptions: Provides `guest_el1_vectors` to catch
>   Sync, IRQ, FIQ, and SError exceptions occurring entirely within the
>   guest (EL1) without trapping to the host.

Might be helpful if this patch could be split into 2-3 commits.

> 
> Signed-off-by: Jing Zhang <jingzhangos@google.com>
> ---
>  arm/Makefile.arm64     |   2 +
>  lib/arm64/asm/guest.h  | 156 ++++++++++++++++++++++++
>  lib/arm64/guest.c      | 197 ++++++++++++++++++++++++++++++
>  lib/arm64/guest_arch.S | 263 +++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 618 insertions(+)
>  create mode 100644 lib/arm64/asm/guest.h
>  create mode 100644 lib/arm64/guest.c
>  create mode 100644 lib/arm64/guest_arch.S
> 
> diff --git a/arm/Makefile.arm64 b/arm/Makefile.arm64
> index 5e50f5ba..9026fd71 100644
> --- a/arm/Makefile.arm64
> +++ b/arm/Makefile.arm64
> @@ -41,6 +41,8 @@ cflatobjs += lib/arm64/processor.o
>  cflatobjs += lib/arm64/spinlock.o
>  cflatobjs += lib/arm64/gic-v3-its.o lib/arm64/gic-v3-its-cmd.o
>  cflatobjs += lib/arm64/stage2_mmu.o
> +cflatobjs += lib/arm64/guest.o
> +cflatobjs += lib/arm64/guest_arch.o
>  
>  ifeq ($(CONFIG_EFI),y)
>  cflatobjs += lib/acpi.o
> diff --git a/lib/arm64/asm/guest.h b/lib/arm64/asm/guest.h
> new file mode 100644
> index 00000000..1d70873d
> --- /dev/null
> +++ b/lib/arm64/asm/guest.h
> @@ -0,0 +1,156 @@
> +/*
> + * Copyright (C) 2026, Google LLC.
> + * Author: Jing Zhang <jingzhangos@google.com>
> + *
> + * SPDX-License-Identifier: LGPL-2.0-or-later
> + */
> +#ifndef _ASMARM64_GUEST_H_
> +#define _ASMARM64_GUEST_H_
> +
> +/* Offsets for assembly (Must match struct guest) */
> +#define GUEST_X_OFFSET			0
> +#define GUEST_ELR_OFFSET		248
> +#define GUEST_SPSR_OFFSET		256
> +#define GUEST_HCR_OFFSET		264
> +#define GUEST_VTTBR_OFFSET		272
> +#define GUEST_SCTLR_OFFSET		280
> +#define GUEST_VBAR_OFFSET		288
> +#define GUEST_SP_EL1_OFFSET		296
> +#define GUEST_ESR_OFFSET		304
> +#define GUEST_FAR_OFFSET		312
> +#define GUEST_HPFAR_OFFSET		320
> +#define GUEST_EXIT_CODE_OFFSET		328
> +#define GUEST_TPIDR_EL1_OFFSET		336
> +#define GUEST_ICH_VMCR_EL2_OFFSET	344

Look at lib/arm64/asm-offsets.c for how to generate these.

> +
> +#ifndef __ASSEMBLY__
> +
> +#include <libcflat.h>
> +#include <asm/stage2_mmu.h>
> +
> +/* HCR_EL2 Definitions */
> +#define HCR_VM		(1UL << 0)	/* Virtualization Enable */
> +#define HCR_FMO		(1UL << 3)	/* Physical FIQ Routing */
> +#define HCR_IMO		(1UL << 4)	/* Physical IRQ Routing */
> +#define HCR_AMO		(1UL << 5)	/* Physical SError Interrupt Routing */
> +#define HCR_RW		(1UL << 31)	/* Execution State: AArch64 */
> +#define HCR_DC		(1UL << 12)	/* Default Cacheable */
> +#define HCR_E2H		(1UL << 34)	/* EL2 Host */
> +

Should be in lib/arm64/asm/sysreg.h

> +#define HCR_GUEST_FLAGS (HCR_VM | HCR_FMO | HCR_IMO | HCR_AMO | HCR_RW | \
> +			 HCR_DC | HCR_E2H)
> +
> +/* ICH_VMCR_EL2 bit definition */
> +#define ICH_VMCR_PMR_SHIFT	24
> +#define ICH_VMCR_PMR_MASK	(0xffUL << ICH_VMCR_PMR_SHIFT)
> +#define ICH_VMCR_ENG0_SHIFT	0
> +#define ICH_VMCR_ENG0_MASK	(1 << ICH_VMCR_ENG0_SHIFT)
> +#define ICH_VMCR_ENG1_SHIFT	1
> +#define ICH_VMCR_ENG1_MASK	(1 << ICH_VMCR_ENG1_SHIFT)
> +
> +/* Guest stack size */
> +#define GUEST_STACK_SIZE		SZ_64K
> +
> +/*
> + * Result from Handler:
> + * RESUME: Keep guest running (ERET immediately)
> + * EXIT:   Return to Host C caller
> + */
> +enum guest_handler_result {
> +	GUEST_ACTION_RESUME,
> +	GUEST_ACTION_EXIT
> +};
> +
> +struct guest;
> +typedef enum guest_handler_result (*guest_handler_t)(struct guest *guest);
> +
> +/* EL1 (Guest-internal) Exception Vector */
> +enum guest_el1_vector {
> +	GUEST_EL1_SYNC,
> +	GUEST_EL1_IRQ,
> +	GUEST_EL1_FIQ,
> +	GUEST_EL1_SERROR,
> +	GUEST_EL1_MAX
> +};
> +

There's a similar vector enum in lib/arm64/asm/processor.h, is there a specific
need to have a separate guest_el1 version?

> +/*
> + * Guest EL1 Exception Frame (pushed to guest stack by asm stub)
> + * We use a simplified frame: x0-x30, elr, spsr. size = 33*8
> + */
> +struct guest_el1_regs {
> +	unsigned long regs[31];
> +	unsigned long elr;
> +	unsigned long spsr;
> +};

What about SP?

> +
> +typedef void (*guest_el1_handler_t)(struct guest_el1_regs *regs, unsigned int esr);
> +
> +/* Exceptions from the Guest (Lower EL using AArch64) */
> +enum guest_vector {
> +	GUEST_VECTOR_SYNC,
> +	GUEST_VECTOR_IRQ,
> +	GUEST_VECTOR_FIQ,
> +	GUEST_VECTOR_SERROR,
> +	GUEST_VECTOR_MAX
> +};

Same comment about vector.

> +
> +/*
> + * Guest Context Structure
> + * This will be pointed to by TPIDR_EL1 while the guest is running.
> + */
> +struct guest_context {
> +	guest_el1_handler_t handlers[GUEST_EL1_MAX];
> +};
> +
> +struct guest {
> +	/* 0x000: General Purpose Registers */
> +	unsigned long x[31]; /* x0..x30 */
> +
> +	/* 0x0F8: Execution State */
> +	unsigned long elr_el2;
> +	unsigned long spsr_el2;
> +
> +	/* 0x108: Control Registers */
> +	unsigned long hcr_el2;
> +	unsigned long vttbr_el2;
> +	unsigned long sctlr_el1;
> +	unsigned long vbar_el1;
> +	unsigned long sp_el1;
> +
> +	/* 0x130: Exit Information */
> +	unsigned long esr_el2;
> +	unsigned long far_el2;
> +	unsigned long hpfar_el2;
> +	unsigned long exit_code; /* enum guest_vector */
> +	unsigned long tpidr_el1;
> +
> +	/* 0x158: GIC Registers */
> +	unsigned long ich_vmcr_el2;
> +
> +	/* 0x160: Exception Handlers */
> +	guest_handler_t handlers[GUEST_VECTOR_MAX];
> +	struct guest_context *guest_context;
> +
> +	struct s2_mmu *s2mmu;
> +};
> +
> +/* API */
> +struct guest *guest_create(int vmid, void (*guest_func)(void), enum s2_granule granule);
> +void guest_destroy(struct guest *guest);
> +
> +/* Configuration */
> +void guest_set_vector(struct guest *guest, void *vector_table);
> +void guest_set_stack(struct guest *guest, void *stack_top);
> +void guest_install_handler(struct guest *guest, enum guest_vector v, guest_handler_t handler);
> +
> +/* Install handler for exceptions INSIDE EL1 */
> +void guest_install_el1_handler(struct guest *guest, enum guest_el1_vector v, guest_el1_handler_t handler);
> +
> +unsigned long guest_c_exception_handler(struct guest *guest, unsigned long vector_offset);
> +void guest_el1_c_handler(struct guest_el1_regs *regs, unsigned int vector);
> +
> +/* Core Run Loop */
> +void guest_run(struct guest *guest);
> +
> +#endif /* __ASSEMBLY__ */
> +#endif /* _ASMARM64_GUEST_H_ */
> diff --git a/lib/arm64/guest.c b/lib/arm64/guest.c
> new file mode 100644
> index 00000000..6c256c11
> --- /dev/null
> +++ b/lib/arm64/guest.c
> @@ -0,0 +1,197 @@
> +/*
> + * Copyright (C) 2026, Google LLC.
> + * Author: Jing Zhang <jingzhangos@google.com>
> + *
> + * SPDX-License-Identifier: LGPL-2.0-or-later
> + */
> +#include <libcflat.h>
> +#include <asm/guest.h>
> +#include <asm/io.h>
> +#include <asm/sysreg.h>
> +#include <asm/barrier.h>
> +#include <alloc_page.h>
> +#include <alloc.h>
> +
> +/* Compile-time checks to ensure Assembly macros match C Struct */
> +_Static_assert(offsetof(struct guest, x) == GUEST_X_OFFSET,
> +	       "GUEST_X_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, elr_el2) == GUEST_ELR_OFFSET,
> +	       "GUEST_ELR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, spsr_el2) == GUEST_SPSR_OFFSET,
> +	       "GUEST_SPSR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, hcr_el2) == GUEST_HCR_OFFSET,
> +	       "GUEST_HCR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, vttbr_el2) == GUEST_VTTBR_OFFSET,
> +	       "GUEST_VTTBR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, sctlr_el1) == GUEST_SCTLR_OFFSET,
> +	       "GUEST_SCTLR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, vbar_el1) == GUEST_VBAR_OFFSET,
> +	       "GUEST_VBAR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, sp_el1) == GUEST_SP_EL1_OFFSET,
> +	       "GUEST_SP_EL1_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, esr_el2) == GUEST_ESR_OFFSET,
> +	       "GUEST_ESR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, far_el2) == GUEST_FAR_OFFSET,
> +	       "GUEST_FAR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, hpfar_el2) == GUEST_HPFAR_OFFSET,
> +	       "GUEST_HPFAR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, exit_code) == GUEST_EXIT_CODE_OFFSET,
> +	       "GUEST_EXIT_CODE_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, tpidr_el1) == GUEST_TPIDR_EL1_OFFSET,
> +		"GUEST_TPIDR_EL1_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, ich_vmcr_el2) == GUEST_ICH_VMCR_EL2_OFFSET,
> +		"GUEST_ICH_VMCR_EL2_OFFSET mismatch");
> +
> +/*
> + * C-Entry for Exception Handling
> + * Returns 0 to Resume Guest, 1 to Exit to Host Caller
> + */
> +unsigned long guest_c_exception_handler(struct guest *guest, unsigned long vector_offset)
> +{
> +	enum guest_vector vector = (enum guest_vector)guest->exit_code;
> +
> +	/* Save Trap Info */
> +	guest->esr_el2 = read_sysreg(esr_el2);
> +	guest->far_el2 = read_sysreg(far_el2);
> +	guest->hpfar_el2 = read_sysreg(hpfar_el2);
> +
> +	/* Invoke Handler if registered */
> +	if (guest->handlers[vector]) {
> +		if (guest->handlers[vector](guest) == GUEST_ACTION_RESUME) {
> +			return 0; /* ASM stub will restore and ERET */
> +		}
> +	}
> +
> +	/* Default: Exit to caller */
> +	return 1;
> +}
> +
> +/* --- EL1 (Guest-Internal) Vector Handling --- */
> +
> +void guest_install_el1_handler(struct guest *guest, enum guest_el1_vector v, guest_el1_handler_t handler)
> +{
> +	if (guest && guest->guest_context && v < GUEST_EL1_MAX)
> +		guest->guest_context->handlers[v] = handler;
> +}
> +
> +void guest_el1_c_handler(struct guest_el1_regs *regs, unsigned int vector)
> +{
> +	struct guest_context *ctx = (struct guest_context *)read_sysreg(tpidr_el1);
> +	unsigned int esr = read_sysreg(esr_el1);
> +
> +	if (ctx && vector < GUEST_EL1_MAX && ctx->handlers[vector]) {
> +		ctx->handlers[vector](regs, esr);
> +	} else {
> +		printf("Guest: Unhandled Exception Vector %d, ESR=0x%x\n", vector, esr);
> +		asm volatile("hvc #0xFFFF");
> +	}
> +}
> +
> +extern void guest_el1_vectors(void);
> +
> +static struct guest *__guest_create(struct s2_mmu *s2_ctx, void *entry_point)
> +{
> +	struct guest *guest = calloc(1, sizeof(struct guest));
> +	struct guest_context *guest_ctx;
> +	unsigned long guest_ctx_pa;
> +
> +	/* Allocate the internal context table */
> +	guest_ctx = (void *)alloc_page();
> +	memset(guest_ctx, 0, PAGE_SIZE);
> +	guest->guest_context = guest_ctx;
> +
> +	guest_ctx_pa = virt_to_phys(guest_ctx);
> +	if (s2_ctx)
> +		s2mmu_map(s2_ctx, guest_ctx_pa, guest_ctx_pa, PAGE_SIZE, S2_MAP_RW);
> +
> +	guest->tpidr_el1 = guest_ctx_pa;;
> +
> +	guest->elr_el2 = (unsigned long)entry_point;
> +	guest->spsr_el2 = 0x3C5; /* M=EL1h, DAIF=Masked */
> +	guest->hcr_el2 = HCR_GUEST_FLAGS;
> +
> +	if (s2_ctx) {
> +		guest->vttbr_el2 = virt_to_phys(s2_ctx->pgd);
> +		guest->vttbr_el2 |= ((unsigned long)s2_ctx->vmid << 48);
> +	}
> +
> +	guest->sctlr_el1 = read_sysreg(sctlr_el1);
> +	guest->sctlr_el1 |= SCTLR_EL1_C | SCTLR_EL1_I | SCTLR_EL1_M;
> +
> +	guest->ich_vmcr_el2 = read_sysreg(ich_vmcr_el2);
> +	guest->ich_vmcr_el2 |= (0xFFUL << ICH_VMCR_PMR_SHIFT) | (1UL << ICH_VMCR_ENG1_SHIFT);
> +
> +	guest->vbar_el1 = (unsigned long)guest_el1_vectors;
> +	guest->s2mmu = s2_ctx;
> +
> +	return guest;
> +}
> +
> +struct guest *guest_create(int vmid, void (*guest_func)(void), enum s2_granule granule)
> +{
> +	unsigned long guest_pa, code_base, stack_pa;
> +	unsigned long *stack_page;
> +	struct guest *guest;
> +	struct s2_mmu *ctx;
> +
> +	ctx = s2mmu_init(vmid, granule, true);
> +	/*
> +	 * Map the Host's code segment Identity Mapped (IPA=PA).
> +	 * To be safe, we map a large chunk (e.g., 2MB) around the function
> +	 * to capture any helper functions the compiler might generate calls to.
> +	 */
> +	guest_pa = virt_to_phys((void *)guest_func);
> +	code_base = guest_pa & ~(SZ_2M - 1);
> +	s2mmu_map(ctx, code_base, code_base, SZ_2M, S2_MAP_RW);
> +
> +	/*
> +	 * Map Stack
> +	 * Allocate 16 pages (64K) in Host, get its PA, and map it for Guest.
> +	 */
> +	stack_page = alloc_pages(get_order(GUEST_STACK_SIZE >> PAGE_SHIFT));
> +	stack_pa = virt_to_phys(stack_page);
> +	/* Identity Map it (IPA = PA) */
> +	s2mmu_map(ctx, stack_pa, stack_pa, GUEST_STACK_SIZE, S2_MAP_RW);
> +
> +	s2mmu_enable(ctx);
> +
> +	/* Create Guest */
> +	/* Entry point is the PA of the function (Identity Mapped) */
> +	guest = __guest_create(ctx, (void *)guest_pa);
> +
> +	/*
> +	 * Setup Guest Stack Pointer
> +	 * Must match where we mapped the stack + Offset
> +	 */
> +	guest_set_stack(guest, (void *)(stack_pa + GUEST_STACK_SIZE));
> +
> +	/* Map UART identity mapped, printf() available to guest */
> +	s2mmu_map(ctx, 0x09000000, 0x09000000, PAGE_SIZE, S2_MAP_DEVICE);
> +
> +	return guest;
> +}
> +
> +void guest_destroy(struct guest *guest)
> +{
> +	s2mmu_disable(guest->s2mmu);
> +	s2mmu_destroy(guest->s2mmu);
> +	if (guest->guest_context)
> +		free_page(guest->guest_context);
> +	free(guest);
> +}
> +
> +void guest_set_vector(struct guest *guest, void *vector_table)
> +{
> +	guest->vbar_el1 = (unsigned long)vector_table;
> +}
> +
> +void guest_set_stack(struct guest *guest, void *stack_top)
> +{
> +	guest->sp_el1 = (unsigned long)stack_top;
> +}
> +
> +void guest_install_handler(struct guest *guest, enum guest_vector v, guest_handler_t handler)
> +{
> +	if (v < GUEST_VECTOR_MAX)
> +		guest->handlers[v] = handler;
> +}
> diff --git a/lib/arm64/guest_arch.S b/lib/arm64/guest_arch.S
> new file mode 100644
> index 00000000..cb7074d7
> --- /dev/null
> +++ b/lib/arm64/guest_arch.S
> @@ -0,0 +1,263 @@
> +/*
> + * Copyright (C) 2026, Google LLC.
> + * Author: Jing Zhang <jingzhangos@google.com>
> + *
> + * SPDX-License-Identifier: LGPL-2.0-or-later
> + */
> +#define __ASSEMBLY__
> +#include <asm/guest.h>
> +
> +.global guest_run
> +guest_run:
> +	/* x0 = struct guest pointer */
> +
> +	/* Save Host Callee-Saved Regs */
> +	stp	x29, x30, [sp, #-16]!
> +	stp	x27, x28, [sp, #-16]!
> +	stp	x25, x26, [sp, #-16]!
> +	stp	x23, x24, [sp, #-16]!
> +	stp	x21, x22, [sp, #-16]!
> +	stp	x19, x20, [sp, #-16]!
> +
> +	/* Cache Guest Pointer in TPIDR_EL2 */
> +	msr	tpidr_el2, x0
> +
> +	/* Configure ICC_SRE_EL2 to allow EL1 access to SysRegs */
> +	/* Bit 3 (Enable) = 1, Bit 0 (SRE) = 1 */
> +	mrs	x1, icc_sre_el2
> +	orr	x1, x1, #1
> +	orr	x1, x1, #(1 << 3)
> +	msr	icc_sre_el2, x1
> +	isb
> +
> +	/* Enable virtual CPU interface */
> +	mrs	x1, ich_hcr_el2
> +	orr	x1, x1, #1
> +	msr	ich_hcr_el2, x1
> +
> +	/* Load Guest System Registers */
> +	ldr	x1, [x0, #GUEST_ELR_OFFSET]
> +	msr	elr_el2, x1
> +	ldr	x1, [x0, #GUEST_SPSR_OFFSET]
> +	msr	spsr_el2, x1
> +	ldr	x1, [x0, #GUEST_HCR_OFFSET]
> +	msr	hcr_el2, x1
> +	ldr	x1, [x0, #GUEST_VTTBR_OFFSET]
> +	msr	vttbr_el2, x1
> +	ldr	x1, [x0, #GUEST_SCTLR_OFFSET]
> +	msr	S3_5_c1_c0_0, x1
> +	ldr	x1, [x0, #GUEST_VBAR_OFFSET]
> +	msr	S3_5_c12_c0_0, x1

Need to add these registers to the sysreg.h file, so we don't need the raw encoding.

> +	ldr	x1, [x0, #GUEST_SP_EL1_OFFSET]
> +	msr	sp_el1, x1
> +	ldr	x1, [x0, #GUEST_TPIDR_EL1_OFFSET]
> +	msr	tpidr_el1, x1
> +	ldr	x1, [x0, #GUEST_ICH_VMCR_EL2_OFFSET]
> +	msr	ich_vmcr_el2, x1
> +
> +	/* Load Guest GPRs */
> +	ldp	x1, x2, [x0, #8]
> +	ldp	x3, x4, [x0, #24]
> +	ldp	x5, x6, [x0, #40]
> +	ldp	x7, x8, [x0, #56]
> +	ldp	x9, x10, [x0, #72]
> +	ldp	x11, x12, [x0, #88]
> +	ldp	x13, x14, [x0, #104]
> +	ldp	x15, x16, [x0, #120]
> +	ldp	x17, x18, [x0, #136]
> +	ldp	x19, x20, [x0, #152]
> +	ldp	x21, x22, [x0, #168]
> +	ldp	x23, x24, [x0, #184]
> +	ldp	x25, x26, [x0, #200]
> +	ldp	x27, x28, [x0, #216]
> +	ldp	x29, x30, [x0, #232]
> +	ldr	x0, [x0, #0]
> +
> +	/* Install Trap Handler */
> +	adrp	x29, guest_hyp_vectors
> +	add	x29, x29, :lo12:guest_hyp_vectors
> +	msr	vbar_el2, x29
> +
> +	/* Restore x29 from struct (via tpidr_el2) */
> +	mrs	x29, tpidr_el2
> +	ldr	x29, [x29, #232]
> +
> +	isb
> +	eret
> +
> +	.align 11
> +guest_hyp_vectors:
> +	.skip 0x400
> +
> +guest_exit_sync:
> +	stp	x0, x1, [sp, #-16]!
> +	mrs	x0, tpidr_el2
> +	mov	x1, #0
> +	str	x1, [x0, #GUEST_EXIT_CODE_OFFSET]
> +	b	guest_common_exit
> +
> +	.balign 0x80
> +
> +guest_exit_irq:
> +	stp	x0, x1, [sp, #-16]!
> +	mrs	x0, tpidr_el2
> +	mov	x1, #1
> +	str	x1, [x0, #GUEST_EXIT_CODE_OFFSET]
> +	b	guest_common_exit
> +
> +guest_common_exit:
> +	stp	x2, x3, [x0, #16]
> +	stp	x4, x5, [x0, #32]
> +	stp	x6, x7, [x0, #48]
> +	stp	x8, x9, [x0, #64]
> +	stp	x10, x11, [x0, #80]
> +	stp	x12, x13, [x0, #96]
> +	stp	x14, x15, [x0, #112]
> +	stp	x16, x17, [x0, #128]
> +	stp	x18, x19, [x0, #144]
> +	stp	x20, x21, [x0, #160]
> +	stp	x22, x23, [x0, #176]
> +	stp	x24, x25, [x0, #192]
> +	stp	x26, x27, [x0, #208]
> +	stp	x28, x29, [x0, #224]
> +	str	x30, [x0, #240]
> +
> +	ldp	x2, x3, [sp], #16
> +	stp	x2, x3, [x0, #0]
> +
> +	mrs	x1, elr_el2
> +	str	x1, [x0, #GUEST_ELR_OFFSET]
> +	mrs	x1, spsr_el2
> +	str	x1, [x0, #GUEST_SPSR_OFFSET]
> +	mrs	x1, esr_el2
> +	str	x1, [x0, #GUEST_ESR_OFFSET]
> +	mrs	x1, far_el2
> +	str	x1, [x0, #GUEST_FAR_OFFSET]
> +	mrs	x1, hpfar_el2
> +	str	x1, [x0, #GUEST_HPFAR_OFFSET]
> +	mrs	x1, sp_el1
> +	str	x1, [x0, #GUEST_SP_EL1_OFFSET]

Missing VBAR_EL1?

> +	mrs	x1, ich_vmcr_el2
> +	str	x1, [x0, #GUEST_ICH_VMCR_EL2_OFFSET]
> +
> +	/* x29 contains vector offset from entry */
> +	mov	x1, x29
> +	bl	guest_c_exception_handler
> +	cbz	x0, guest_resume_guest
> +
> +	/* EXIT */
> +	/* Restore Host Callee-Saved Regs */
> +	ldp	x19, x20, [sp], #16
> +	ldp	x21, x22, [sp], #16
> +	ldp	x23, x24, [sp], #16
> +	ldp	x25, x26, [sp], #16
> +	ldp	x27, x28, [sp], #16
> +	ldp	x29, x30, [sp], #16
> +	ret
> +
> +	/* RESUME */
> +guest_resume_guest:
> +	mrs	x0, tpidr_el2
> +	ldr	x1, [x0, #GUEST_ELR_OFFSET]
> +	msr	elr_el2, x1
> +	ldr	x1, [x0, #GUEST_SPSR_OFFSET]
> +	msr	spsr_el2, x1
> +	ldr	x1, [x0, #GUEST_SP_EL1_OFFSET]
> +	msr	sp_el1, x1
> +
> +	ldp	x1, x2, [x0, #8]
> +	ldp	x3, x4, [x0, #24]
> +	ldp	x5, x6, [x0, #40]
> +	ldp	x7, x8, [x0, #56]
> +	ldp	x9, x10, [x0, #72]
> +	ldp	x11, x12, [x0, #88]
> +	ldp	x13, x14, [x0, #104]
> +	ldp	x15, x16, [x0, #120]
> +	ldp	x17, x18, [x0, #136]
> +	ldp	x19, x20, [x0, #152]
> +	ldp	x21, x22, [x0, #168]
> +	ldp	x23, x24, [x0, #184]
> +	ldp	x25, x26, [x0, #200]
> +	ldp	x27, x28, [x0, #216]
> +	ldp	x29, x30, [x0, #232]
> +	ldr	x0, [x0, #0]
> +	eret
> +
> +/* EL1 Vector Table */
> +.align 11
> +.global guest_el1_vectors
> +guest_el1_vectors:
> +	/* Sync (0x000) */
> +	.skip 0x200
> +	/* Sync (0x200) */
> +	stp	x29, x30, [sp, #-16]!
> +	mov	x29, #0
> +	b	guest_el1_common
> +	.skip 0x80 - 12
> +	/* IRQ (0x280) */
> +	stp	x29, x30, [sp, #-16]!
> +	mov	x29, #1
> +	b	guest_el1_common
> +	.skip 0x80 - 12
> +	/* FIQ (0x300) */
> +	stp	x29, x30, [sp, #-16]!
> +	mov	x29, #2
> +	b	guest_el1_common
> +	.skip 0x80 - 12
> +	/* SError (0x380) */
> +	stp	x29, x30, [sp, #-16]!
> +	mov	x29, #3
> +	b	guest_el1_common
> +	.skip 0x400
> +
> +guest_el1_common:
> +	sub	sp, sp, #264
> +	stp	x0, x1, [sp, #0]
> +	stp	x2, x3, [sp, #16]
> +	stp	x4, x5, [sp, #32]
> +	stp	x6, x7, [sp, #48]
> +	stp	x8, x9, [sp, #64]
> +	stp	x10, x11, [sp, #80]
> +	stp	x12, x13, [sp, #96]
> +	stp	x14, x15, [sp, #112]
> +	stp	x16, x17, [sp, #128]
> +	stp	x18, x19, [sp, #144]
> +	stp	x20, x21, [sp, #160]
> +	stp	x22, x23, [sp, #176]
> +	stp	x24, x25, [sp, #192]
> +	stp	x26, x27, [sp, #208]
> +	stp	x28, x30, [sp, #224]
> +
> +	mrs	x0, elr_el1
> +	str	x0, [sp, #248]
> +	mrs	x0, spsr_el1
> +	str	x0, [sp, #256]
> +
> +	mov	x0, sp
> +	mov	x1, x29
> +	bl	guest_el1_c_handler
> +
> +	ldr	x0, [sp, #248]
> +	msr	elr_el1, x0
> +	ldr	x0, [sp, #256]
> +	msr	spsr_el1, x0
> +
> +	ldp	x0, x1, [sp, #0]
> +	ldp	x2, x3, [sp, #16]
> +	ldp	x4, x5, [sp, #32]
> +	ldp	x6, x7, [sp, #48]
> +	ldp	x8, x9, [sp, #64]
> +	ldp	x10, x11, [sp, #80]
> +	ldp	x12, x13, [sp, #96]
> +	ldp	x14, x15, [sp, #112]
> +	ldp	x16, x17, [sp, #128]
> +	ldp	x18, x19, [sp, #144]
> +	ldp	x20, x21, [sp, #160]
> +	ldp	x22, x23, [sp, #176]
> +	ldp	x24, x25, [sp, #192]
> +	ldp	x26, x27, [sp, #208]
> +	ldp	x28, x30, [sp, #224]
> +
> +	add	sp, sp, #264
> +	ldp	x29, x30, [sp], #16
> +	eret
> -- 

Thanks,
Joey

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [kvm-unit-tests PATCH v1 1/3] lib: arm64: Add stage2 page table management library
  2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 1/3] lib: arm64: Add stage2 page table management library Jing Zhang
@ 2026-03-24 15:12   ` Wei-Lin Chang
  0 siblings, 0 replies; 10+ messages in thread
From: Wei-Lin Chang @ 2026-03-24 15:12 UTC (permalink / raw)
  To: Jing Zhang, KVM, KVMARM
  Cc: Marc Zyngier, Joey Gouly, Andrew Jones, Alexandru Elisei,
	Oliver Upton

Hi,

On Mon, Mar 16, 2026 at 03:43:47PM -0700, Jing Zhang wrote:
> Tests running at EL2 (hypervisor level) often require the ability to
> manage Stage 2 translation tables to control Guest Physical Address (IPA)
> to Host Physical Address (PA) translation.
> 
> Add a generic Stage 2 MMU library that provides software management of
> ARM64 Stage 2 translation tables.
> 
> The library features include:
> - Support for 4K, 16K, and 64K translation granules.
> - Dynamic page table allocation using the allocator.
> - Support for 2M block mappings where applicable.
> - APIs for mapping, unmapping, enabling, and disabling the Stage 2 MMU.
> - Basic fault info reporting (ESR, FAR, HPFAR).
> 
> This infrastructure is necessary for upcoming virtualization and
> hypervisor-mode tests.
> 
> Signed-off-by: Jing Zhang <jingzhangos@google.com>
> ---
>  arm/Makefile.arm64         |   1 +
>  lib/arm64/asm/stage2_mmu.h |  74 +++++++
>  lib/arm64/stage2_mmu.c     | 402 +++++++++++++++++++++++++++++++++++++
>  3 files changed, 477 insertions(+)
>  create mode 100644 lib/arm64/asm/stage2_mmu.h
>  create mode 100644 lib/arm64/stage2_mmu.c
> 
> diff --git a/arm/Makefile.arm64 b/arm/Makefile.arm64
> index a40c830d..5e50f5ba 100644
> --- a/arm/Makefile.arm64
> +++ b/arm/Makefile.arm64
> @@ -40,6 +40,7 @@ cflatobjs += lib/arm64/stack.o
>  cflatobjs += lib/arm64/processor.o
>  cflatobjs += lib/arm64/spinlock.o
>  cflatobjs += lib/arm64/gic-v3-its.o lib/arm64/gic-v3-its-cmd.o
> +cflatobjs += lib/arm64/stage2_mmu.o
>  
>  ifeq ($(CONFIG_EFI),y)
>  cflatobjs += lib/acpi.o
> diff --git a/lib/arm64/asm/stage2_mmu.h b/lib/arm64/asm/stage2_mmu.h
> new file mode 100644
> index 00000000..c9e931a8
> --- /dev/null
> +++ b/lib/arm64/asm/stage2_mmu.h
> @@ -0,0 +1,74 @@
> +/*
> + * Copyright (C) 2026, Google LLC.
> + * Author: Jing Zhang <jingzhangos@google.com>
> + *
> + * SPDX-License-Identifier: LGPL-2.0-or-later
> + */
> +#ifndef _ASMARM64_STAGE2_MMU_H_
> +#define _ASMARM64_STAGE2_MMU_H_
> +
> +#include <libcflat.h>
> +#include <asm/page.h>
> +#include <asm/pgtable.h>
> +
> +#define pte_is_table(pte)	(pte_val(pte) & PTE_TABLE_BIT)
> +
> +/* Stage-2 Memory Attributes (MemAttr[3:0]) */
> +#define S2_MEMATTR_NORMAL	(0xFUL << 2) /* Normal Memory, Outer/Inner Write-Back */
> +#define S2_MEMATTR_DEVICE	(0x0UL << 2) /* Device-nGnRnE */
> +
> +#define ESR_ELx_EC_SHIFT	(26)
> +#define ESR_ELx_EC_HVC64	UL(0x16)
> +#define ESR_ELx_EC_DABT_LOW	UL(0x24)

nit:
This looks out of place to me; would it be better moved to guest.h?

> +
> +/* Stage-2 Access Permissions (S2AP[1:0]) */
> +#define S2AP_NONE	(0UL << 6)
> +#define S2AP_RO		(1UL << 6) /* Read-only */
> +#define S2AP_WO		(2UL << 6) /* Write-only */
> +#define S2AP_RW		(3UL << 6) /* Read-Write */
> +
> +/* Flags for mapping */
> +#define S2_MAP_RW	(S2AP_RW | S2_MEMATTR_NORMAL | PTE_AF | PTE_SHARED)
> +#define S2_MAP_DEVICE	(S2AP_RW | S2_MEMATTR_DEVICE | PTE_AF)
> +
> +enum s2_granule {
> +	S2_PAGE_4K,
> +	S2_PAGE_16K,
> +	S2_PAGE_64K,
> +};
> +
> +/* Main Stage-2 MMU Structure */
> +struct s2_mmu {
> +	pgd_t *pgd;
> +	int vmid;
> +
> +	/* Configuration */
> +	enum s2_granule granule;
> +	bool allow_block_mappings;
> +
> +	/* Internal helpers calculated from granule & VA_BITS */
> +	unsigned int page_shift;
> +	unsigned int level_shift;
> +	int root_level; /* 0, 1, or 2 */
> +	unsigned long page_size;
> +	unsigned long block_size;
> +};
> +
> +/* API */
> +/* Initialize an s2_mmu struct with specific settings */
> +struct s2_mmu *s2mmu_init(int vmid, enum s2_granule granule, bool allow_block_mappings);
> +
> +/* Management */
> +void s2mmu_destroy(struct s2_mmu *mmu);
> +void s2mmu_map(struct s2_mmu *mmu, unsigned long ipa, unsigned long pa,
> +	       unsigned long size, unsigned long flags);
> +void s2mmu_unmap(struct s2_mmu *mmu, unsigned long ipa, unsigned long size);
> +
> +/* Activation */
> +void s2mmu_enable(struct s2_mmu *mmu);
> +void s2mmu_disable(struct s2_mmu *mmu);
> +
> +/* Debug */
> +void s2mmu_print_fault_info(void);
> +
> +#endif /* _ASMARM64_STAGE2_MMU_H_ */
> diff --git a/lib/arm64/stage2_mmu.c b/lib/arm64/stage2_mmu.c
> new file mode 100644
> index 00000000..bfe87eac
> --- /dev/null
> +++ b/lib/arm64/stage2_mmu.c
> @@ -0,0 +1,402 @@
> +/*
> + * Copyright (C) 2026, Google LLC.
> + * Author: Jing Zhang <jingzhangos@google.com>
> + *
> + * SPDX-License-Identifier: LGPL-2.0-or-later
> + */
> +#include <libcflat.h>
> +#include <alloc.h>
> +#include <asm/stage2_mmu.h>
> +#include <asm/sysreg.h>
> +#include <asm/io.h>
> +#include <asm/barrier.h>
> +#include <alloc_page.h>
> +
> +/* VTCR_EL2 Definitions */
> +#define VTCR_SH0_INNER		(3UL << 12)
> +#define VTCR_ORGN0_WBWA		(1UL << 10)
> +#define VTCR_IRGN0_WBWA		(1UL << 8)
> +
> +/* TG0 Encodings */
> +#define VTCR_TG0_4K		(0UL << 14)
> +#define VTCR_TG0_64K		(1UL << 14)
> +#define VTCR_TG0_16K		(2UL << 14)
> +
> +/* Physical Address Size (PS) - Derive from VA_BITS for simplicity or max */
> +#if VA_BITS > 40
> +#define VTCR_PS_VAL		(5UL << 16) /* 48-bit PA */
> +#else
> +#define VTCR_PS_VAL		(2UL << 16) /* 40-bit PA */
> +#endif
> +
> +/*
> + * Allocate and initialise a Stage-2 MMU context for @vmid using the
> + * given translation @granule.  The root table start level is derived
> + * from VA_BITS.  Returns NULL on allocation failure.
> + */
> +struct s2_mmu *s2mmu_init(int vmid, enum s2_granule granule, bool allow_block_mappings)
> +{
> +	struct s2_mmu *mmu = calloc(1, sizeof(struct s2_mmu));
> +	int order = 0;
> +
> +	if (!mmu)
> +		return NULL;
> +
> +	mmu->vmid = vmid;
> +	mmu->granule = granule;
> +	mmu->allow_block_mappings = allow_block_mappings;
> +
> +	/* Configure shifts based on granule */
> +	switch (granule) {
> +	case S2_PAGE_4K:
> +		mmu->page_shift = 12;
> +		mmu->level_shift = 9;
> +		/*
> +		 * Determine Root Level for 4K:
> +		 * VA_BITS > 39 (e.g. 48) -> Start L0
> +		 * VA_BITS <= 39 (e.g. 32, 36) -> Start L1
> +		 */
> +		mmu->root_level = (VA_BITS > 39) ? 0 : 1;
> +		break;
> +	case S2_PAGE_16K:
> +		mmu->page_shift = 14;
> +		mmu->level_shift = 11;
> +		/*
> +		 * 16K: L1 covers 47 bits. L0 not valid for 16K.
> +		 * Start L1 for 47 bits. Start L2 for 36 bits.
> +		 */
> +		mmu->root_level = (VA_BITS > 36) ? 1 : 2;
> +		break;
> +	case S2_PAGE_64K:
> +		mmu->page_shift = 16;
> +		mmu->level_shift = 13;
> +		/* 64K: L1 covers 52 bits. L2 covers 42 bits. */
> +		mmu->root_level = (VA_BITS > 42) ? 1 : 2;
> +		break;
> +	}
> +
> +	mmu->page_size = 1UL << mmu->page_shift;
> +	mmu->block_size = 1UL << (mmu->page_shift + mmu->level_shift);
> +
> +	/*
> +	 * The root table spans one Stage-2 granule, which may be larger
> +	 * than the host PAGE_SIZE; compute the allocation order.
> +	 */
> +	if (mmu->page_size > PAGE_SIZE)
> +		order = __builtin_ctz(mmu->page_size / PAGE_SIZE);
> +
> +	mmu->pgd = (pgd_t *)alloc_pages(order);
> +	if (!mmu->pgd) {
> +		free(mmu);
> +		return NULL;
> +	}
> +	memset(mmu->pgd, 0, mmu->page_size);
> +
> +	return mmu;
> +}
> +
> +/* Mask selecting the output-address bits of a Stage-2 descriptor. */
> +static unsigned long s2mmu_get_addr_mask(struct s2_mmu *mmu)
> +{
> +	unsigned int low;
> +
> +	switch (mmu->granule) {
> +	case S2_PAGE_16K:
> +		low = 14;
> +		break;
> +	case S2_PAGE_64K:
> +		low = 16;
> +		break;
> +	default:
> +		low = 12;	/* 4K */
> +		break;
> +	}
> +
> +	return GENMASK_ULL(47, low);
> +}
> +
> +/*
> + * Recursively free the table at @level and every child table it
> + * references.  Only table-type descriptors are followed; block and
> + * page entries are simply discarded with their containing table.
> + *
> + * NOTE(review): assumes free_pages() releases the whole multi-page
> + * allocation made for >PAGE_SIZE granules -- confirm against the
> + * page allocator's contract.
> + */
> +static void s2mmu_free_tables(struct s2_mmu *mmu, pte_t *table, int level)
> +{
> +	unsigned long entries = 1UL << mmu->level_shift;
> +	unsigned long mask = s2mmu_get_addr_mask(mmu);
> +	unsigned long i;
> +
> +	/*
> +	 * Recurse if not leaf level.
> +	 * Level 3 is always a leaf page. Levels 0-2 can be Table or Block.
> +	 */
> +	if (level < 3) {
> +		for (i = 0; i < entries; i++) {
> +			pte_t entry = table[i];
> +			/* Descend only into table descriptors */
> +			if ((pte_valid(entry) && pte_is_table(entry))) {
> +				pte_t *next = (pte_t *)phys_to_virt(pte_val(entry) & mask);
> +				s2mmu_free_tables(mmu, next, level + 1);
> +			}
> +		}
> +	}
> +
> +	/* Children are gone; release this table itself */
> +	free_pages(table);
> +}
> +
> +/* Tear down every translation table, then free the context itself. */
> +void s2mmu_destroy(struct s2_mmu *mmu)
> +{
> +	pte_t *root = (pte_t *)mmu->pgd;
> +
> +	if (root)
> +		s2mmu_free_tables(mmu, root, mmu->root_level);
> +	free(mmu);
> +}
> +
> +void s2mmu_enable(struct s2_mmu *mmu)
> +{
> +	unsigned long vtcr = VTCR_PS_VAL | VTCR_SH0_INNER |
> +			     VTCR_ORGN0_WBWA | VTCR_IRGN0_WBWA;
> +	unsigned long t0sz = 64 - VA_BITS;
> +	unsigned long vttbr;
> +
> +	switch (mmu->granule) {
> +	case S2_PAGE_4K:
> +		vtcr |= VTCR_TG0_4K;
> +		/* SL0 Encodings for 4K: 0=L2, 1=L1, 2=L0 */
> +		if (mmu->root_level == 0)
> +			vtcr |= (2UL << 6); /* Start L0 */
> +		else if (mmu->root_level == 1)
> +			vtcr |= (1UL << 6); /* Start L1 */
> +		else
> +			vtcr |= (0UL << 6); /* Start L2 */
> +		break;
> +	case S2_PAGE_16K:
> +		vtcr |= VTCR_TG0_16K;
> +		/* SL0 Encodings for 16K: 0=L3(Res), 1=L2, 2=L1, 3=L0(Res) */
> +		if (mmu->root_level == 1)
> +			vtcr |= (2UL << 6); /* Start L1 */
> +		else
> +			vtcr |= (1UL << 6); /* Start L2 */
> +		break;
> +	case S2_PAGE_64K:
> +		vtcr |= VTCR_TG0_64K;
> +		/* SL0 Encodings for 64K: 0=L3(Res), 1=L2, 2=L1, 3=L0(Res) */
> +		if (mmu->root_level == 1)
> +			vtcr |= (2UL << 6); /* Start L1 */
> +		else
> +			vtcr |= (1UL << 6); /* Start L2 */
> +		break;
> +	}

The pattern (xUL << 6) is repeated many times, perhaps turn it into a
macro, then the comments behind it can be omitted as well.

> +
> +	vtcr |= t0sz;
> +
> +	write_sysreg(vtcr, vtcr_el2);
> +	isb();
> +
> +	/* Setup VTTBR */
> +	vttbr = virt_to_phys(mmu->pgd);
> +	vttbr |= ((unsigned long)mmu->vmid << 48);
> +	write_sysreg(vttbr, vttbr_el2);
> +	isb();
> +
> +	asm volatile("tlbi vmalls12e1is");
> +	dsb(ish);
> +	isb();

I don't think you need the previous isb's, only this last one; the
effects of changing vtcr_el2 and vttbr_el2 need not be observed
immediately.

> +}
> +
> +/*
> + * "Disable" Stage-2 by zeroing VTTBR_EL2.
> + *
> + * NOTE(review): clearing VTTBR_EL2 does not by itself disable Stage-2
> + * translation -- that is controlled by HCR_EL2.VM (see HCR_GUEST_FLAGS
> + * in guest.h).  This presumably relies on the host's HCR_EL2 no longer
> + * having VM set once the guest context is unloaded; confirm, and
> + * consider clearing HCR_EL2.VM here explicitly.
> + */
> +void s2mmu_disable(struct s2_mmu *mmu)
> +{
> +	write_sysreg(0, vttbr_el2);
> +	isb();
> +}

I see you have HCR_DC further down, but writing 0s to vttbr_el2 does not mean
s2 translation gets disabled.

Thanks,
Wei-Lin Chang

> +
> +/*
> + * Return the next-level table referenced by table[idx], allocating and
> + * installing one when @alloc is true and the slot is empty.
> + *
> + * Returns NULL if the entry is a block mapping (cannot descend without
> + * splitting), if @alloc is false and the slot is empty, or if the
> + * allocation fails.
> + */
> +static pte_t *get_pte(struct s2_mmu *mmu, pte_t *table, unsigned long idx, bool alloc)
> +{
> +	unsigned long mask = s2mmu_get_addr_mask(mmu);
> +	pte_t entry = table[idx];
> +	pte_t *next_table;
> +	int order = 0;
> +
> +	if (pte_valid(entry)) {
> +		if (pte_is_table(entry))
> +			return (pte_t *)phys_to_virt(pte_val(entry) & mask);
> +		/* Block Entry */
> +		return NULL;
> +	}
> +
> +	if (!alloc)
> +		return NULL;
> +
> +	/* Allocate table memory covering the Stage-2 Granule size */
> +	if (mmu->page_size > PAGE_SIZE)
> +		order = __builtin_ctz(mmu->page_size / PAGE_SIZE);
> +
> +	next_table = (pte_t *)alloc_pages(order);
> +	if (!next_table)
> +		return NULL;	/* never install a descriptor for a failed allocation */
> +	memset(next_table, 0, mmu->page_size);
> +
> +	pte_val(entry) = virt_to_phys(next_table) | PTE_TABLE_BIT | PTE_VALID;
> +	WRITE_ONCE(table[idx], entry);
> +
> +	return next_table;
> +}
> +
> +/*
> + * Map [ipa, ipa + size) to [pa, pa + size) with the given descriptor
> + * @flags, using block mappings where allowed and aligned, and 4K/16K/
> + * 64K page mappings otherwise.  Finishes with a VMALLS12E1IS TLB
> + * invalidation.
> + */
> +void s2mmu_map(struct s2_mmu *mmu, unsigned long ipa, unsigned long pa,
> +	       unsigned long size, unsigned long flags)
> +{
> +	unsigned long idx_mask = (1UL << mmu->level_shift) - 1;
> +	unsigned long cur = ipa;
> +	unsigned long end = ipa + size;
> +	pte_t entry, *table, *next;
> +
> +	while (cur < end) {
> +		unsigned long shift, chunk, idx;
> +		bool mapped_block = false;
> +		unsigned long level;
> +
> +		table = (pte_t *)mmu->pgd;
> +
> +		/* Walk from the root towards level 3 */
> +		for (level = mmu->root_level; level < 3; level++) {
> +			bool block_ok;
> +
> +			shift = mmu->page_shift + (3 - level) * mmu->level_shift;
> +			idx = (cur >> shift) & idx_mask;
> +			chunk = 1UL << shift;
> +
> +			/*
> +			 * Block mappings are legal at L2 for every granule
> +			 * (2M/32M/512M) and additionally at L1 for 4K (1G).
> +			 */
> +			block_ok = (level == 2) ||
> +				   (mmu->granule == S2_PAGE_4K && level == 1);
> +
> +			if (mmu->allow_block_mappings && block_ok &&
> +			    !(cur & (chunk - 1)) && !(pa & (chunk - 1)) &&
> +			    cur + chunk <= end) {
> +				/* IPA, PA and remaining size all fit: map a block */
> +				pte_val(entry) = (pa & ~(chunk - 1)) |
> +						 flags | PTE_VALID;
> +				WRITE_ONCE(table[idx], entry);
> +				cur += chunk;
> +				pa += chunk;
> +				mapped_block = true;
> +				break;
> +			}
> +
> +			/* Descend, allocating the next level if needed */
> +			next = get_pte(mmu, table, idx, true);
> +			if (!next) {
> +				printf("Error allocating or existing block conflict.\n");
> +				return;
> +			}
> +			table = next;
> +		}
> +
> +		if (mapped_block)
> +			continue;
> +
> +		/* Level 3: map a single page */
> +		idx = (cur >> mmu->page_shift) & idx_mask;
> +		pte_val(entry) = (pa & ~(mmu->page_size - 1)) | flags | PTE_TYPE_PAGE;
> +		WRITE_ONCE(table[idx], entry);
> +		cur += mmu->page_size;
> +		pa += mmu->page_size;
> +	}
> +
> +	asm volatile("tlbi vmalls12e1is");
> +	dsb(ish);
> +	isb();
> +}
> +
> +/*
> + * Recursive helper to unmap [start_ipa, end_ipa) within one table.
> + * @current_ipa is the IPA covered by table[0]; @mask extracts the
> + * output address from a table descriptor.
> + *
> + * Returns true if the table at this level is now completely empty
> + * and should be freed by the caller.
> + */
> +static bool s2mmu_unmap_level(struct s2_mmu *mmu, pte_t *table,
> +			      unsigned long current_ipa, int level,
> +			      unsigned long start_ipa, unsigned long end_ipa,
> +			      unsigned long mask)
> +{
> +	unsigned long level_size, entry_ipa, entry_end;
> +	bool child_empty, table_empty = true;
> +	pte_t entry, *next_table;
> +	unsigned int level_shift;
> +	unsigned long i;
> +
> +	/* Calculate shift and size for this level */
> +	if (level == 3) {
> +		level_shift = mmu->page_shift;
> +	} else {
> +		level_shift = mmu->page_shift + (3 - level) * mmu->level_shift;
> +	}
> +	level_size = 1UL << level_shift;
> +
> +	/* Iterate over all entries in this table */
> +	for (i = 0; i < (1UL << mmu->level_shift); i++) {
> +		entry = table[i];
> +		entry_ipa = current_ipa + (i * level_size);
> +		entry_end = entry_ipa + level_size;
> +
> +		/* Skip entries completely outside our target range */
> +		if (entry_end <= start_ipa || entry_ipa >= end_ipa) {
> +			if (pte_valid(entry))
> +				table_empty = false;
> +			continue;
> +		}
> +
> +		/*
> +		 * If the entry is fully covered by the unmap range,
> +		 * we can clear it (leaf) or recurse and free (table).
> +		 */
> +		if (entry_ipa >= start_ipa && entry_end <= end_ipa) {
> +			if (pte_valid(entry)) {
> +				if (pte_is_table(entry) && level < 3) {
> +					/* Recurse to free children first */
> +					next_table = (pte_t *)phys_to_virt(pte_val(entry) & mask);
> +					s2mmu_free_tables(mmu, next_table, level + 1);
> +				}
> +				/* Invalidate the entry */
> +				WRITE_ONCE(table[i], __pte(0));
> +			}
> +			continue;
> +		}
> +
> +		/*
> +		 * Partial overlap: This must be a table (split required).
> +		 * If it's a Block, we can't split easily in this context
> +		 * without complex logic, so we generally skip or fail.
> +		 * Assuming standard breakdown: recurse into the table.
> +		 */
> +		if (pte_valid(entry) && pte_is_table(entry) && level < 3) {
> +			next_table = (pte_t *)phys_to_virt(pte_val(entry) & mask);
> +			child_empty = s2mmu_unmap_level(mmu, next_table, entry_ipa, level + 1,
> +							start_ipa, end_ipa, mask);
> +
> +			/* Free the child table only once it holds no valid entries */
> +			if (child_empty) {
> +				free_pages(next_table);
> +				WRITE_ONCE(table[i], __pte(0));
> +			} else {
> +				table_empty = false;
> +			}
> +		} else if (pte_valid(entry)) {
> +			/*
> +			 * Overlap on a leaf/block entry that extends
> +			 * beyond the unmap range. We cannot simply clear it,
> +			 * so the block stays mapped and this table stays
> +			 * non-empty.
> +			 */
> +			table_empty = false;
> +		}
> +	}
> +
> +	return table_empty;
> +}
> +
> +/*
> + * Unmap [ipa, ipa + size) from the Stage-2 tables.
> + *
> + * Intermediate tables that become empty are freed by the recursion;
> + * the root table itself is only released by s2mmu_destroy(), so the
> + * return value of the top-level walk is deliberately ignored.
> + */
> +void s2mmu_unmap(struct s2_mmu *mmu, unsigned long ipa, unsigned long size)
> +{
> +	unsigned long mask = s2mmu_get_addr_mask(mmu);
> +
> +	if (!mmu->pgd)
> +		return;
> +
> +	s2mmu_unmap_level(mmu, (pte_t *)mmu->pgd, 0, mmu->root_level,
> +			  ipa, ipa + size, mask);
> +
> +	/* Invalidate cached translations for the removed mappings */
> +	asm volatile("tlbi vmalls12e1is");
> +	dsb(ish);
> +	isb();
> +}
> +
> +/* Dump the EL2 fault syndrome registers for debugging Stage-2 faults. */
> +void s2mmu_print_fault_info(void)
> +{
> +	unsigned long esr, far, hpfar;
> +
> +	esr = read_sysreg(esr_el2);
> +	far = read_sysreg(far_el2);
> +	hpfar = read_sysreg(hpfar_el2);
> +	printf("Stage-2 Fault Info: ESR=0x%lx FAR=0x%lx HPFAR=0x%lx\n", esr, far, hpfar);
> +}
> -- 
> 2.53.0.851.ga537e3e6e9-goog
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [kvm-unit-tests PATCH v1 2/3] lib: arm64: Add bare-metal guest execution framework
  2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 2/3] lib: arm64: Add bare-metal guest execution framework Jing Zhang
                     ` (2 preceding siblings ...)
  2026-03-24 15:04   ` Joey Gouly
@ 2026-03-24 15:44   ` Wei-Lin Chang
  3 siblings, 0 replies; 10+ messages in thread
From: Wei-Lin Chang @ 2026-03-24 15:44 UTC (permalink / raw)
  To: Jing Zhang, KVM, KVMARM
  Cc: Marc Zyngier, Joey Gouly, Andrew Jones, Alexandru Elisei,
	Oliver Upton

Hi,

On Mon, Mar 16, 2026 at 03:43:48PM -0700, Jing Zhang wrote:
> To test advanced KVM features such as nested virtualization (NV) and
> GICv4 direct interrupt injection, kvm-unit-tests needs the ability to
> act as an L1 hypervisor running at EL2 and manage its own L2 guests.
> 
> Introduce a lightweight guest management library that provides the
> infrastructure to create, configure, and execute nested guests.
> 
> This framework includes:
> - Guest lifecycle management: `guest_create()` and `guest_destroy()`
>   APIs to allocate guest context and setup Stage-2 identity mappings
>   for code and stack using the s2mmu library.
> - Context switching: The `guest_run()` assembly routine handles
>   saving the host (L1) callee-saved registers and loading the guest
>   (L2) GPRs and EL1 system registers.
> - VM-Exit handling: Installs an EL2 trap handler (`guest_hyp_vectors`)
>   to intercept guest exits and route them to `guest_c_exception_handler`
>   to determine whether to return to the host test logic or resume.
> - Guest-internal exceptions: Provides `guest_el1_vectors` to catch
>   Sync, IRQ, FIQ, and SError exceptions occurring entirely within the
>   guest (EL1) without trapping to the host.
> 
> Signed-off-by: Jing Zhang <jingzhangos@google.com>

Thank you for the effort.

To me this feels like too much to add in one go. I think at least the
guest-internal exceptions, and the use of stage-2 (starting from no
stage-2 for the L2), could each be split into separate commits.

> ---
>  arm/Makefile.arm64     |   2 +
>  lib/arm64/asm/guest.h  | 156 ++++++++++++++++++++++++
>  lib/arm64/guest.c      | 197 ++++++++++++++++++++++++++++++
>  lib/arm64/guest_arch.S | 263 +++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 618 insertions(+)
>  create mode 100644 lib/arm64/asm/guest.h
>  create mode 100644 lib/arm64/guest.c
>  create mode 100644 lib/arm64/guest_arch.S
> 
> diff --git a/arm/Makefile.arm64 b/arm/Makefile.arm64
> index 5e50f5ba..9026fd71 100644
> --- a/arm/Makefile.arm64
> +++ b/arm/Makefile.arm64
> @@ -41,6 +41,8 @@ cflatobjs += lib/arm64/processor.o
>  cflatobjs += lib/arm64/spinlock.o
>  cflatobjs += lib/arm64/gic-v3-its.o lib/arm64/gic-v3-its-cmd.o
>  cflatobjs += lib/arm64/stage2_mmu.o
> +cflatobjs += lib/arm64/guest.o
> +cflatobjs += lib/arm64/guest_arch.o
>  
>  ifeq ($(CONFIG_EFI),y)
>  cflatobjs += lib/acpi.o
> diff --git a/lib/arm64/asm/guest.h b/lib/arm64/asm/guest.h
> new file mode 100644
> index 00000000..1d70873d
> --- /dev/null
> +++ b/lib/arm64/asm/guest.h
> @@ -0,0 +1,156 @@
> +/*
> + * Copyright (C) 2026, Google LLC.
> + * Author: Jing Zhang <jingzhangos@google.com>
> + *
> + * SPDX-License-Identifier: LGPL-2.0-or-later
> + */
> +#ifndef _ASMARM64_GUEST_H_
> +#define _ASMARM64_GUEST_H_
> +
> +/* Offsets for assembly (Must match struct guest) */
> +#define GUEST_X_OFFSET			0
> +#define GUEST_ELR_OFFSET		248
> +#define GUEST_SPSR_OFFSET		256
> +#define GUEST_HCR_OFFSET		264
> +#define GUEST_VTTBR_OFFSET		272
> +#define GUEST_SCTLR_OFFSET		280
> +#define GUEST_VBAR_OFFSET		288
> +#define GUEST_SP_EL1_OFFSET		296
> +#define GUEST_ESR_OFFSET		304
> +#define GUEST_FAR_OFFSET		312
> +#define GUEST_HPFAR_OFFSET		320
> +#define GUEST_EXIT_CODE_OFFSET		328
> +#define GUEST_TPIDR_EL1_OFFSET		336
> +#define GUEST_ICH_VMCR_EL2_OFFSET	344
> +
> +#ifndef __ASSEMBLY__
> +
> +#include <libcflat.h>
> +#include <asm/stage2_mmu.h>
> +
> +/* HCR_EL2 Definitions */
> +#define HCR_VM		(1UL << 0)	/* Virtualization Enable */
> +#define HCR_FMO		(1UL << 3)	/* Physical FIQ Routing */
> +#define HCR_IMO		(1UL << 4)	/* Physical IRQ Routing */
> +#define HCR_AMO		(1UL << 5)	/* Physical SError Interrupt Routing */
> +#define HCR_RW		(1UL << 31)	/* Execution State: AArch64 */
> +#define HCR_DC		(1UL << 12)	/* Default Cacheable */
> +#define HCR_E2H		(1UL << 34)	/* EL2 Host */
> +
> +#define HCR_GUEST_FLAGS (HCR_VM | HCR_FMO | HCR_IMO | HCR_AMO | HCR_RW | \
> +			 HCR_DC | HCR_E2H)
> +
> +/* ICH_VMCR_EL2 bit definition */
> +#define ICH_VMCR_PMR_SHIFT	24
> +#define ICH_VMCR_PMR_MASK	(0xffUL << ICH_VMCR_PMR_SHIFT)
> +#define ICH_VMCR_ENG0_SHIFT	0
> +#define ICH_VMCR_ENG0_MASK	(1 << ICH_VMCR_ENG0_SHIFT)
> +#define ICH_VMCR_ENG1_SHIFT	1
> +#define ICH_VMCR_ENG1_MASK	(1 << ICH_VMCR_ENG1_SHIFT)
> +
> +/* Guest stack size */
> +#define GUEST_STACK_SIZE		SZ_64K
> +
> +/*
> + * Result from Handler:
> + * RESUME: Keep guest running (ERET immediately)
> + * EXIT:   Return to Host C caller
> + */
> +enum guest_handler_result {
> +	GUEST_ACTION_RESUME,
> +	GUEST_ACTION_EXIT
> +};
> +
> +struct guest;
> +typedef enum guest_handler_result (*guest_handler_t)(struct guest *guest);
> +
> +/* EL1 (Guest-internal) Exception Vector */
> +enum guest_el1_vector {
> +	GUEST_EL1_SYNC,
> +	GUEST_EL1_IRQ,
> +	GUEST_EL1_FIQ,
> +	GUEST_EL1_SERROR,
> +	GUEST_EL1_MAX
> +};
> +
> +/*
> + * Guest EL1 Exception Frame (pushed to guest stack by asm stub)
> + * We use a simplified frame: x0-x30, elr, spsr. size = 33*8
> + */
> +struct guest_el1_regs {
> +	unsigned long regs[31];
> +	unsigned long elr;
> +	unsigned long spsr;
> +};
> +
> +typedef void (*guest_el1_handler_t)(struct guest_el1_regs *regs, unsigned int esr);
> +
> +/* Exceptions from the Guest (Lower EL using AArch64) */
> +enum guest_vector {
> +	GUEST_VECTOR_SYNC,
> +	GUEST_VECTOR_IRQ,
> +	GUEST_VECTOR_FIQ,
> +	GUEST_VECTOR_SERROR,
> +	GUEST_VECTOR_MAX
> +};
> +
> +/*
> + * Guest Context Structure
> + * This will be pointed to by TPIDR_EL1 while the guest is running.
> + */
> +struct guest_context {
> +	guest_el1_handler_t handlers[GUEST_EL1_MAX];
> +};
> +
> +struct guest {
> +	/* 0x000: General Purpose Registers */
> +	unsigned long x[31]; /* x0..x30 */
> +
> +	/* 0x0F8: Execution State */
> +	unsigned long elr_el2;
> +	unsigned long spsr_el2;
> +
> +	/* 0x108: Control Registers */
> +	unsigned long hcr_el2;
> +	unsigned long vttbr_el2;
> +	unsigned long sctlr_el1;
> +	unsigned long vbar_el1;
> +	unsigned long sp_el1;
> +
> +	/* 0x130: Exit Information */
> +	unsigned long esr_el2;
> +	unsigned long far_el2;
> +	unsigned long hpfar_el2;
> +	unsigned long exit_code; /* enum guest_vector */
> +	unsigned long tpidr_el1;
> +
> +	/* 0x158: GIC Registers */
> +	unsigned long ich_vmcr_el2;
> +
> +	/* 0x160: Exception Handlers */
> +	guest_handler_t handlers[GUEST_VECTOR_MAX];
> +	struct guest_context *guest_context;
> +
> +	struct s2_mmu *s2mmu;
> +};
> +
> +/* API */
> +struct guest *guest_create(int vmid, void (*guest_func)(void), enum s2_granule granule);
> +void guest_destroy(struct guest *guest);
> +
> +/* Configuration */
> +void guest_set_vector(struct guest *guest, void *vector_table);
> +void guest_set_stack(struct guest *guest, void *stack_top);
> +void guest_install_handler(struct guest *guest, enum guest_vector v, guest_handler_t handler);
> +
> +/* Install handler for exceptions INSIDE EL1 */
> +void guest_install_el1_handler(struct guest *guest, enum guest_el1_vector v, guest_el1_handler_t handler);
> +
> +unsigned long guest_c_exception_handler(struct guest *guest, unsigned long vector_offset);
> +void guest_el1_c_handler(struct guest_el1_regs *regs, unsigned int vector);
> +
> +/* Core Run Loop */
> +void guest_run(struct guest *guest);
> +
> +#endif /* __ASSEMBLY__ */
> +#endif /* _ASMARM64_GUEST_H_ */
> diff --git a/lib/arm64/guest.c b/lib/arm64/guest.c
> new file mode 100644
> index 00000000..6c256c11
> --- /dev/null
> +++ b/lib/arm64/guest.c
> @@ -0,0 +1,197 @@
> +/*
> + * Copyright (C) 2026, Google LLC.
> + * Author: Jing Zhang <jingzhangos@google.com>
> + *
> + * SPDX-License-Identifier: LGPL-2.0-or-later
> + */
> +#include <libcflat.h>
> +#include <asm/guest.h>
> +#include <asm/io.h>
> +#include <asm/sysreg.h>
> +#include <asm/barrier.h>
> +#include <alloc_page.h>
> +#include <alloc.h>
> +
> +/* Compile-time checks to ensure Assembly macros match C Struct */
> +_Static_assert(offsetof(struct guest, x) == GUEST_X_OFFSET,
> +	       "GUEST_X_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, elr_el2) == GUEST_ELR_OFFSET,
> +	       "GUEST_ELR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, spsr_el2) == GUEST_SPSR_OFFSET,
> +	       "GUEST_SPSR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, hcr_el2) == GUEST_HCR_OFFSET,
> +	       "GUEST_HCR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, vttbr_el2) == GUEST_VTTBR_OFFSET,
> +	       "GUEST_VTTBR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, sctlr_el1) == GUEST_SCTLR_OFFSET,
> +	       "GUEST_SCTLR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, vbar_el1) == GUEST_VBAR_OFFSET,
> +	       "GUEST_VBAR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, sp_el1) == GUEST_SP_EL1_OFFSET,
> +	       "GUEST_SP_EL1_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, esr_el2) == GUEST_ESR_OFFSET,
> +	       "GUEST_ESR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, far_el2) == GUEST_FAR_OFFSET,
> +	       "GUEST_FAR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, hpfar_el2) == GUEST_HPFAR_OFFSET,
> +	       "GUEST_HPFAR_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, exit_code) == GUEST_EXIT_CODE_OFFSET,
> +	       "GUEST_EXIT_CODE_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, tpidr_el1) == GUEST_TPIDR_EL1_OFFSET,
> +		"GUEST_TPIDR_EL1_OFFSET mismatch");
> +_Static_assert(offsetof(struct guest, ich_vmcr_el2) == GUEST_ICH_VMCR_EL2_OFFSET,
> +		"GUEST_ICH_VMCR_EL2_OFFSET mismatch");
> +
> +/*
> + * C entry point for guest exits taken to EL2.
> + *
> + * Snapshots the trap syndrome into @guest and dispatches to the
> + * registered handler, if any.  Returns 0 to resume the guest (the asm
> + * stub restores state and ERETs) or 1 to exit to the host C caller.
> + */
> +unsigned long guest_c_exception_handler(struct guest *guest, unsigned long vector_offset)
> +{
> +	enum guest_vector vector = (enum guest_vector)guest->exit_code;
> +	guest_handler_t handler;
> +
> +	/* Record trap info for the handler / host to inspect */
> +	guest->esr_el2 = read_sysreg(esr_el2);
> +	guest->far_el2 = read_sysreg(far_el2);
> +	guest->hpfar_el2 = read_sysreg(hpfar_el2);
> +
> +	handler = guest->handlers[vector];
> +	if (handler && handler(guest) == GUEST_ACTION_RESUME)
> +		return 0;
> +
> +	/* Default: exit to caller */
> +	return 1;
> +}
> +
> +/* --- EL1 (Guest-Internal) Vector Handling --- */
> +
> +/* Register a handler for an exception taken entirely inside EL1. */
> +void guest_install_el1_handler(struct guest *guest, enum guest_el1_vector v, guest_el1_handler_t handler)
> +{
> +	if (!guest || !guest->guest_context || v >= GUEST_EL1_MAX)
> +		return;
> +
> +	guest->guest_context->handlers[v] = handler;
> +}
> +
> +/*
> + * Dispatch an exception taken inside the guest at EL1.
> + *
> + * The per-guest context is found through TPIDR_EL1, which guest_run
> + * loaded with the value set up in __guest_create.
> + *
> + * NOTE(review): __guest_create stores the context page's PA in
> + * tpidr_el1; using it directly as a pointer here works only because
> + * the page is identity mapped (IPA == PA) -- confirm this holds for
> + * every caller.
> + *
> + * Unhandled vectors print the syndrome and escape to the host via a
> + * sentinel hypercall.
> + */
> +void guest_el1_c_handler(struct guest_el1_regs *regs, unsigned int vector)
> +{
> +	struct guest_context *ctx = (struct guest_context *)read_sysreg(tpidr_el1);
> +	unsigned int esr = read_sysreg(esr_el1);
> +
> +	if (ctx && vector < GUEST_EL1_MAX && ctx->handlers[vector]) {
> +		ctx->handlers[vector](regs, esr);
> +	} else {
> +		printf("Guest: Unhandled Exception Vector %d, ESR=0x%x\n", vector, esr);
> +		/* Sentinel HVC: trap to the host's EL2 exit handler */
> +		asm volatile("hvc #0xFFFF");
> +	}
> +}
> +
> +extern void guest_el1_vectors(void);
> +
> +static struct guest *__guest_create(struct s2_mmu *s2_ctx, void *entry_point)
> +{
> +	struct guest *guest = calloc(1, sizeof(struct guest));
> +	struct guest_context *guest_ctx;
> +	unsigned long guest_ctx_pa;
> +
> +	/* Allocate the internal context table */
> +	guest_ctx = (void *)alloc_page();
> +	memset(guest_ctx, 0, PAGE_SIZE);
> +	guest->guest_context = guest_ctx;
> +
> +	guest_ctx_pa = virt_to_phys(guest_ctx);
> +	if (s2_ctx)
> +		s2mmu_map(s2_ctx, guest_ctx_pa, guest_ctx_pa, PAGE_SIZE, S2_MAP_RW);
> +
> +	guest->tpidr_el1 = guest_ctx_pa;;

nit: extra semicolon

Thanks,
Wei-Lin Chang

> +
> +	guest->elr_el2 = (unsigned long)entry_point;
> +	guest->spsr_el2 = 0x3C5; /* M=EL1h, DAIF=Masked */
> +	guest->hcr_el2 = HCR_GUEST_FLAGS;
> +
> +	if (s2_ctx) {
> +		guest->vttbr_el2 = virt_to_phys(s2_ctx->pgd);
> +		guest->vttbr_el2 |= ((unsigned long)s2_ctx->vmid << 48);
> +	}
> +
> +	guest->sctlr_el1 = read_sysreg(sctlr_el1);
> +	guest->sctlr_el1 |= SCTLR_EL1_C | SCTLR_EL1_I | SCTLR_EL1_M;
> +
> +	guest->ich_vmcr_el2 = read_sysreg(ich_vmcr_el2);
> +	guest->ich_vmcr_el2 |= (0xFFUL << ICH_VMCR_PMR_SHIFT) | (1UL << ICH_VMCR_ENG1_SHIFT);
> +
> +	guest->vbar_el1 = (unsigned long)guest_el1_vectors;
> +	guest->s2mmu = s2_ctx;
> +
> +	return guest;
> +}
> +
> +/*
> + * Create a guest that will run guest_func() at EL1 behind a fresh
> + * Stage-2 with the given @granule and @vmid.
> + *
> + * Code, stack, the guest context page and the UART are all identity
> + * mapped (IPA == PA), so the guest executes on host physical
> + * addresses.  Returns NULL if the Stage-2 context cannot be created.
> + */
> +struct guest *guest_create(int vmid, void (*guest_func)(void), enum s2_granule granule)
> +{
> +	/* QEMU "virt" machine PL011 UART base; mapped so the guest can printf() */
> +	const unsigned long uart_base = 0x09000000UL;
> +	unsigned long guest_pa, code_base, stack_pa;
> +	unsigned long *stack_page;
> +	struct guest *guest;
> +	struct s2_mmu *ctx;
> +
> +	ctx = s2mmu_init(vmid, granule, true);
> +	if (!ctx)
> +		return NULL;
> +
> +	/*
> +	 * Map the Host's code segment Identity Mapped (IPA=PA).
> +	 * To be safe, we map a large chunk (e.g., 2MB) around the function
> +	 * to capture any helper functions the compiler might generate calls to.
> +	 */
> +	guest_pa = virt_to_phys((void *)guest_func);
> +	code_base = guest_pa & ~(SZ_2M - 1);
> +	s2mmu_map(ctx, code_base, code_base, SZ_2M, S2_MAP_RW);
> +
> +	/*
> +	 * Map Stack
> +	 * Allocate GUEST_STACK_SIZE in the host and identity map it (IPA = PA).
> +	 */
> +	stack_page = alloc_pages(get_order(GUEST_STACK_SIZE >> PAGE_SHIFT));
> +	stack_pa = virt_to_phys(stack_page);
> +	s2mmu_map(ctx, stack_pa, stack_pa, GUEST_STACK_SIZE, S2_MAP_RW);
> +
> +	s2mmu_enable(ctx);
> +
> +	/* Entry point is the PA of the function (Identity Mapped) */
> +	guest = __guest_create(ctx, (void *)guest_pa);
> +
> +	/* Stack grows down from the top of the mapped region */
> +	guest_set_stack(guest, (void *)(stack_pa + GUEST_STACK_SIZE));
> +
> +	/* Map UART identity mapped, printf() available to guest */
> +	s2mmu_map(ctx, uart_base, uart_base, PAGE_SIZE, S2_MAP_DEVICE);
> +
> +	return guest;
> +}
> +
> +/* Tear down a guest: disable and free its Stage-2, then its state. */
> +void guest_destroy(struct guest *guest)
> +{
> +	struct s2_mmu *mmu = guest->s2mmu;
> +
> +	s2mmu_disable(mmu);
> +	s2mmu_destroy(mmu);
> +	if (guest->guest_context)
> +		free_page(guest->guest_context);
> +	free(guest);
> +}
> +
> +/* Set the guest's VBAR_EL1 (loaded into the EL12 alias by guest_run). */
> +void guest_set_vector(struct guest *guest, void *vector_table)
> +{
> +	guest->vbar_el1 = (unsigned long)vector_table;
> +}
> +
> +/* Set the guest's SP_EL1; @stack_top is the highest address (stack grows down). */
> +void guest_set_stack(struct guest *guest, void *stack_top)
> +{
> +	guest->sp_el1 = (unsigned long)stack_top;
> +}
> +
> +/*
> + * Register a host-side handler invoked on guest exits of type @v.
> + * Guarded against NULL @guest for consistency with
> + * guest_install_el1_handler().
> + */
> +void guest_install_handler(struct guest *guest, enum guest_vector v, guest_handler_t handler)
> +{
> +	if (guest && v < GUEST_VECTOR_MAX)
> +		guest->handlers[v] = handler;
> +}
> diff --git a/lib/arm64/guest_arch.S b/lib/arm64/guest_arch.S
> new file mode 100644
> index 00000000..cb7074d7
> --- /dev/null
> +++ b/lib/arm64/guest_arch.S
> @@ -0,0 +1,263 @@
> +/*
> + * Copyright (C) 2026, Google LLC.
> + * Author: Jing Zhang <jingzhangos@google.com>
> + *
> + * SPDX-License-Identifier: LGPL-2.0-or-later
> + */
> +#define __ASSEMBLY__
> +#include <asm/guest.h>
> +
> +/*
> + * void guest_run(struct guest *guest)   -- x0 = struct guest pointer
> + *
> + * World switch from the host (EL2) into the guest (EL1): saves the
> + * host's callee-saved registers, loads the guest's EL1/EL2 system
> + * state and GPRs from the struct, installs the EL2 trap vectors and
> + * ERETs into the guest.
> + *
> + * NOTE(review): the matching restore of the host callee-saved
> + * registers must happen on the guest-exit path (not in this hunk) --
> + * confirm the exit stub pops them in the same order.
> + */
> +.global guest_run
> +guest_run:
> +	/* x0 = struct guest pointer */
> +
> +	/* Save Host Callee-Saved Regs (AAPCS64: x19-x28, fp, lr) */
> +	stp	x29, x30, [sp, #-16]!
> +	stp	x27, x28, [sp, #-16]!
> +	stp	x25, x26, [sp, #-16]!
> +	stp	x23, x24, [sp, #-16]!
> +	stp	x21, x22, [sp, #-16]!
> +	stp	x19, x20, [sp, #-16]!
> +
> +	/* Cache Guest Pointer in TPIDR_EL2 so the exit stubs can find it */
> +	msr	tpidr_el2, x0
> +
> +	/* Configure ICC_SRE_EL2 to allow EL1 access to SysRegs */
> +	/* Bit 3 (Enable) = 1, Bit 0 (SRE) = 1 */
> +	mrs	x1, icc_sre_el2
> +	orr	x1, x1, #1
> +	orr	x1, x1, #(1 << 3)
> +	msr	icc_sre_el2, x1
> +	isb
> +
> +	/* Enable virtual CPU interface (ICH_HCR_EL2.En) */
> +	mrs	x1, ich_hcr_el2
> +	orr	x1, x1, #1
> +	msr	ich_hcr_el2, x1
> +
> +	/* Load Guest System Registers */
> +	ldr	x1, [x0, #GUEST_ELR_OFFSET]
> +	msr	elr_el2, x1
> +	ldr	x1, [x0, #GUEST_SPSR_OFFSET]
> +	msr	spsr_el2, x1
> +	ldr	x1, [x0, #GUEST_HCR_OFFSET]
> +	msr	hcr_el2, x1
> +	ldr	x1, [x0, #GUEST_VTTBR_OFFSET]
> +	msr	vttbr_el2, x1
> +	/*
> +	 * S3_5_c1_c0_0 / S3_5_c12_c0_0 appear to be the SCTLR_EL12 /
> +	 * VBAR_EL12 encodings: with HCR_EL2.E2H set (HCR_GUEST_FLAGS),
> +	 * plain *_el1 accessors would target EL2 state, so the _EL12
> +	 * aliases reach the guest's EL1 registers -- confirm encodings.
> +	 */
> +	ldr	x1, [x0, #GUEST_SCTLR_OFFSET]
> +	msr	S3_5_c1_c0_0, x1
> +	ldr	x1, [x0, #GUEST_VBAR_OFFSET]
> +	msr	S3_5_c12_c0_0, x1
> +	ldr	x1, [x0, #GUEST_SP_EL1_OFFSET]
> +	msr	sp_el1, x1
> +	ldr	x1, [x0, #GUEST_TPIDR_EL1_OFFSET]
> +	msr	tpidr_el1, x1
> +	ldr	x1, [x0, #GUEST_ICH_VMCR_EL2_OFFSET]
> +	msr	ich_vmcr_el2, x1
> +
> +	/* Load Guest GPRs (x0 itself is loaded last, from offset 0) */
> +	ldp	x1, x2, [x0, #8]
> +	ldp	x3, x4, [x0, #24]
> +	ldp	x5, x6, [x0, #40]
> +	ldp	x7, x8, [x0, #56]
> +	ldp	x9, x10, [x0, #72]
> +	ldp	x11, x12, [x0, #88]
> +	ldp	x13, x14, [x0, #104]
> +	ldp	x15, x16, [x0, #120]
> +	ldp	x17, x18, [x0, #136]
> +	ldp	x19, x20, [x0, #152]
> +	ldp	x21, x22, [x0, #168]
> +	ldp	x23, x24, [x0, #184]
> +	ldp	x25, x26, [x0, #200]
> +	ldp	x27, x28, [x0, #216]
> +	ldp	x29, x30, [x0, #232]
> +	ldr	x0, [x0, #0]
> +
> +	/* Install Trap Handler (x29 used as scratch, restored below) */
> +	adrp	x29, guest_hyp_vectors
> +	add	x29, x29, :lo12:guest_hyp_vectors
> +	msr	vbar_el2, x29
> +
> +	/* Restore x29 from struct (via tpidr_el2) */
> +	mrs	x29, tpidr_el2
> +	ldr	x29, [x29, #232]
> +
> +	isb
> +	eret
> +
> +	/*
> +	 * EL2 vector table, 2KB aligned.  The first 0x400 bytes
> +	 * (Current-EL vectors) are skipped, so only the "Lower EL
> +	 * using AArch64" entries are populated: Synchronous at
> +	 * +0x400 (guest_exit_sync) and IRQ at +0x480 (guest_exit_irq,
> +	 * placed there by the .balign 0x80).
> +	 * NOTE(review): the FIQ (+0x500) and SError (+0x580) slots
> +	 * are not stubs -- they overlap the guest_common_exit code
> +	 * below, so a guest FIQ/SError would start executing from the
> +	 * middle of the exit path.  Confirm those cannot be taken.
> +	 */
> +	.align 11
> +guest_hyp_vectors:
> +	.skip 0x400
> +
> +/* Guest synchronous exception: stash x0/x1, record exit code 0 */
> +guest_exit_sync:
> +	stp	x0, x1, [sp, #-16]!
> +	mrs	x0, tpidr_el2
> +	mov	x1, #0
> +	str	x1, [x0, #GUEST_EXIT_CODE_OFFSET]
> +	b	guest_common_exit
> +
> +	.balign 0x80
> +
> +/* Guest IRQ: stash x0/x1, record exit code 1 */
> +guest_exit_irq:
> +	stp	x0, x1, [sp, #-16]!
> +	mrs	x0, tpidr_el2
> +	mov	x1, #1
> +	str	x1, [x0, #GUEST_EXIT_CODE_OFFSET]
> +	b	guest_common_exit
> +
> +/*
> + * Common guest-exit path.  On entry: x0 = struct guest (loaded from
> + * TPIDR_EL2 by the stub), and the guest's x0/x1 sit on the EL2
> + * stack.  Saves the guest GPRs and trap syndrome registers into the
> + * struct, then calls guest_c_exception_handler(guest, x29): a zero
> + * return resumes the guest, non-zero unwinds to guest_run's caller
> + * with that value in x0.
> + */
> +guest_common_exit:
> +	stp	x2, x3, [x0, #16]
> +	stp	x4, x5, [x0, #32]
> +	stp	x6, x7, [x0, #48]
> +	stp	x8, x9, [x0, #64]
> +	stp	x10, x11, [x0, #80]
> +	stp	x12, x13, [x0, #96]
> +	stp	x14, x15, [x0, #112]
> +	stp	x16, x17, [x0, #128]
> +	stp	x18, x19, [x0, #144]
> +	stp	x20, x21, [x0, #160]
> +	stp	x22, x23, [x0, #176]
> +	stp	x24, x25, [x0, #192]
> +	stp	x26, x27, [x0, #208]
> +	stp	x28, x29, [x0, #224]
> +	str	x30, [x0, #240]
> +
> +	/* Pop the guest's x0/x1 (pushed by the vector stub) */
> +	ldp	x2, x3, [sp], #16
> +	stp	x2, x3, [x0, #0]
> +
> +	mrs	x1, elr_el2
> +	str	x1, [x0, #GUEST_ELR_OFFSET]
> +	mrs	x1, spsr_el2
> +	str	x1, [x0, #GUEST_SPSR_OFFSET]
> +	mrs	x1, esr_el2
> +	str	x1, [x0, #GUEST_ESR_OFFSET]
> +	mrs	x1, far_el2
> +	str	x1, [x0, #GUEST_FAR_OFFSET]
> +	mrs	x1, hpfar_el2
> +	str	x1, [x0, #GUEST_HPFAR_OFFSET]
> +	mrs	x1, sp_el1
> +	str	x1, [x0, #GUEST_SP_EL1_OFFSET]
> +	mrs	x1, ich_vmcr_el2
> +	str	x1, [x0, #GUEST_ICH_VMCR_EL2_OFFSET]
> +
> +	/*
> +	 * NOTE(review): the original comment claimed "x29 contains
> +	 * vector offset from entry", but nothing on this path sets
> +	 * x29 to a vector offset -- it still holds the *guest's* x29
> +	 * (saved at offset 232 above).  Verify what
> +	 * guest_c_exception_handler expects as its second argument.
> +	 */
> +	mov	x1, x29
> +	bl	guest_c_exception_handler
> +	cbz	x0, guest_resume_guest
> +
> +	/* EXIT: return value from the handler stays in x0 */
> +	/* Restore Host Callee-Saved Regs */
> +	ldp	x19, x20, [sp], #16
> +	ldp	x21, x22, [sp], #16
> +	ldp	x23, x24, [sp], #16
> +	ldp	x25, x26, [sp], #16
> +	ldp	x27, x28, [sp], #16
> +	ldp	x29, x30, [sp], #16
> +	ret
> +
> +	/* RESUME */
> +/*
> + * Re-enter the guest after a handled exit: reload ELR/SPSR/SP_EL1
> + * and all GPRs from the guest struct (via TPIDR_EL2) and eret.
> + * NOTE(review): unlike guest_run, HCR/VTTBR/SCTLR/VBAR/TPIDR_EL1
> + * and the ICH state are not reloaded here -- presumably still live
> + * in hardware from the initial entry; confirm nothing host-side
> + * modified them while handling the exit.
> + */
> +guest_resume_guest:
> +	mrs	x0, tpidr_el2
> +	ldr	x1, [x0, #GUEST_ELR_OFFSET]
> +	msr	elr_el2, x1
> +	ldr	x1, [x0, #GUEST_SPSR_OFFSET]
> +	msr	spsr_el2, x1
> +	ldr	x1, [x0, #GUEST_SP_EL1_OFFSET]
> +	msr	sp_el1, x1
> +
> +	/* Reload guest GPRs; x0 last since it is the base pointer */
> +	ldp	x1, x2, [x0, #8]
> +	ldp	x3, x4, [x0, #24]
> +	ldp	x5, x6, [x0, #40]
> +	ldp	x7, x8, [x0, #56]
> +	ldp	x9, x10, [x0, #72]
> +	ldp	x11, x12, [x0, #88]
> +	ldp	x13, x14, [x0, #104]
> +	ldp	x15, x16, [x0, #120]
> +	ldp	x17, x18, [x0, #136]
> +	ldp	x19, x20, [x0, #152]
> +	ldp	x21, x22, [x0, #168]
> +	ldp	x23, x24, [x0, #184]
> +	ldp	x25, x26, [x0, #200]
> +	ldp	x27, x28, [x0, #216]
> +	ldp	x29, x30, [x0, #232]
> +	ldr	x0, [x0, #0]
> +	eret
> +
> +/* EL1 Vector Table */
> +/*
> + * In-guest (EL1) exception vectors, 2KB aligned.  The Current-EL-
> + * with-SP0 range (0x000-0x1FF) is skipped; the Current-EL-with-SPx
> + * entries at 0x200/0x280/0x300/0x380 each hold a 12-byte stub
> + * (hence ".skip 0x80 - 12") that saves x29/x30 and passes a vector
> + * index 0..3 to guest_el1_common in x29.
> + * NOTE(review): the trailing ".skip 0x400" leaves the Lower-EL
> + * ranges zero-filled, so a trap from EL0 would execute zero bytes
> + * -- confirm the guest never drops to EL0.
> + */
> +.align 11
> +.global guest_el1_vectors
> +guest_el1_vectors:
> +	/* Current EL with SP0 (0x000-0x1FF): unused */
> +	.skip 0x200
> +	/* Sync (0x200) */
> +	stp	x29, x30, [sp, #-16]!
> +	mov	x29, #0
> +	b	guest_el1_common
> +	.skip 0x80 - 12
> +	/* IRQ (0x280) */
> +	stp	x29, x30, [sp, #-16]!
> +	mov	x29, #1
> +	b	guest_el1_common
> +	.skip 0x80 - 12
> +	/* FIQ (0x300) */
> +	stp	x29, x30, [sp, #-16]!
> +	mov	x29, #2
> +	b	guest_el1_common
> +	.skip 0x80 - 12
> +	/* SError (0x380) */
> +	stp	x29, x30, [sp, #-16]!
> +	mov	x29, #3
> +	b	guest_el1_common
> +	.skip 0x400
> +
> +/*
> + * Common in-guest EL1 trap handler.  The vector stub has already
> + * pushed the original x29/x30 and placed the vector index (0..3)
> + * in x29.  Builds a 264-byte frame holding x0-x28 and x30 (x29 is
> + * on the stub's stack), ELR_EL1 at sp+248 and SPSR_EL1 at sp+256,
> + * then calls guest_el1_c_handler(frame, vector_index).  Changes the
> + * handler makes to the saved ELR/SPSR take effect on the eret.
> + * NOTE(review): 264 is not a multiple of 16, so SP is misaligned
> + * while the frame is live; with SCTLR_EL1.SA set the SP-based
> + * accesses below would fault.  Confirm, or round the frame up to
> + * 272.
> + */
> +guest_el1_common:
> +	sub	sp, sp, #264
> +	stp	x0, x1, [sp, #0]
> +	stp	x2, x3, [sp, #16]
> +	stp	x4, x5, [sp, #32]
> +	stp	x6, x7, [sp, #48]
> +	stp	x8, x9, [sp, #64]
> +	stp	x10, x11, [sp, #80]
> +	stp	x12, x13, [sp, #96]
> +	stp	x14, x15, [sp, #112]
> +	stp	x16, x17, [sp, #128]
> +	stp	x18, x19, [sp, #144]
> +	stp	x20, x21, [sp, #160]
> +	stp	x22, x23, [sp, #176]
> +	stp	x24, x25, [sp, #192]
> +	stp	x26, x27, [sp, #208]
> +	stp	x28, x30, [sp, #224]
> +
> +	mrs	x0, elr_el1
> +	str	x0, [sp, #248]
> +	mrs	x0, spsr_el1
> +	str	x0, [sp, #256]
> +
> +	/* guest_el1_c_handler(frame = sp, vector = x29) */
> +	mov	x0, sp
> +	mov	x1, x29
> +	bl	guest_el1_c_handler
> +
> +	/* Write back possibly-updated return state */
> +	ldr	x0, [sp, #248]
> +	msr	elr_el1, x0
> +	ldr	x0, [sp, #256]
> +	msr	spsr_el1, x0
> +
> +	ldp	x0, x1, [sp, #0]
> +	ldp	x2, x3, [sp, #16]
> +	ldp	x4, x5, [sp, #32]
> +	ldp	x6, x7, [sp, #48]
> +	ldp	x8, x9, [sp, #64]
> +	ldp	x10, x11, [sp, #80]
> +	ldp	x12, x13, [sp, #96]
> +	ldp	x14, x15, [sp, #112]
> +	ldp	x16, x17, [sp, #128]
> +	ldp	x18, x19, [sp, #144]
> +	ldp	x20, x21, [sp, #160]
> +	ldp	x22, x23, [sp, #176]
> +	ldp	x24, x25, [sp, #192]
> +	ldp	x26, x27, [sp, #208]
> +	ldp	x28, x30, [sp, #224]
> +
> +	add	sp, sp, #264
> +	/* Pop the x29/x30 pair pushed by the vector stub */
> +	ldp	x29, x30, [sp], #16
> +	eret
> -- 
> 2.53.0.851.ga537e3e6e9-goog
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2026-03-24 15:44 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-16 22:43 [kvm-unit-tests PATCH v1 0/3] arm64: Add Stage-2 MMU and Nested Guest Framework Jing Zhang
2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 1/3] lib: arm64: Add stage2 page table management library Jing Zhang
2026-03-24 15:12   ` Wei-Lin Chang
2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 2/3] lib: arm64: Add bare-metal guest execution framework Jing Zhang
2026-03-17  1:46   ` Yao Yuan
2026-03-17  8:09   ` Marc Zyngier
2026-03-24 15:04   ` Joey Gouly
2026-03-24 15:44   ` Wei-Lin Chang
2026-03-16 22:43 ` [kvm-unit-tests PATCH v1 3/3] arm64: Add Stage-2 MMU demand paging test Jing Zhang
2026-03-24 11:43 ` [kvm-unit-tests PATCH v1 0/3] arm64: Add Stage-2 MMU and Nested Guest Framework Joey Gouly

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox