Linux-RISC-V Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] Support runtime configuration for per-VM's HGATP mode
@ 2026-01-05 14:32 fangyu.yu
  2026-01-05 14:32 ` [PATCH v2] RISC-V: KVM: " fangyu.yu
  2026-01-05 14:32 ` [PATCH v2] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE fangyu.yu
  0 siblings, 2 replies; 11+ messages in thread
From: fangyu.yu @ 2026-01-05 14:32 UTC (permalink / raw)
  To: pbonzini, corbet, anup, atish.patra, pjw, palmer, aou, alex
  Cc: guoren, ajones, rkrcmar, linux-doc, kvm, kvm-riscv, linux-riscv,
	linux-kernel, Fangyu Yu

From: Fangyu Yu <fangyu.yu@linux.alibaba.com>

Currently, RISC-V KVM hardcodes the G-stage page table format (HGATP mode)
to the maximum mode detected at boot time (e.g., SV57x4 if supported).
However, such a wide GPA space is often unnecessary, just as a host
sometimes doesn't need Sv57.

This patch introduces per-VM configurability of the G-stage mode via a new
KVM capability: KVM_CAP_RISCV_SET_HGATP_MODE. User-space can now explicitly
request a specific HGATP mode (SV39x4, SV48x4, or SV57x4 on 64-bit) during
VM creation.

Fangyu Yu (2):
  RISC-V: KVM: Support runtime configuration for per-VM's HGATP mode
  RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE

 Documentation/virt/kvm/api.rst      | 14 +++++
 arch/riscv/include/asm/kvm_gstage.h | 12 ++---
 arch/riscv/include/asm/kvm_host.h   |  4 ++
 arch/riscv/kvm/gstage.c             | 82 +++++++++++++++++------------
 arch/riscv/kvm/main.c               |  4 +-
 arch/riscv/kvm/mmu.c                | 18 +++++--
 arch/riscv/kvm/vm.c                 | 28 ++++++++--
 arch/riscv/kvm/vmid.c               |  2 +-
 include/uapi/linux/kvm.h            |  1 +
 9 files changed, 113 insertions(+), 52 deletions(-)

-- 
2.50.1


_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2] RISC-V: KVM: Support runtime configuration for per-VM's HGATP mode
  2026-01-05 14:32 [PATCH v2] Support runtime configuration for per-VM's HGATP mode fangyu.yu
@ 2026-01-05 14:32 ` fangyu.yu
  2026-01-15 23:37   ` Andrew Jones
  2026-01-05 14:32 ` [PATCH v2] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE fangyu.yu
  1 sibling, 1 reply; 11+ messages in thread
From: fangyu.yu @ 2026-01-05 14:32 UTC (permalink / raw)
  To: pbonzini, corbet, anup, atish.patra, pjw, palmer, aou, alex
  Cc: guoren, ajones, rkrcmar, linux-doc, kvm, kvm-riscv, linux-riscv,
	linux-kernel, Fangyu Yu

From: Fangyu Yu <fangyu.yu@linux.alibaba.com>

Introduce two per-VM architecture-specific fields to support runtime
configuration of the G-stage page table format:

- kvm->arch.kvm_riscv_gstage_mode: specifies the HGATP mode used by the
  current VM;
- kvm->arch.kvm_riscv_gstage_pgd_levels: the corresponding number of page
  table levels for the selected mode.

These fields replace the previous global variables
kvm_riscv_gstage_mode and kvm_riscv_gstage_pgd_levels, enabling different
virtual machines to independently select their G-stage page table format
instead of being forced to share the maximum mode detected by the kernel
at boot time.

Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
---
 arch/riscv/include/asm/kvm_gstage.h | 12 ++---
 arch/riscv/include/asm/kvm_host.h   |  4 ++
 arch/riscv/kvm/gstage.c             | 82 +++++++++++++++++------------
 arch/riscv/kvm/main.c               |  4 +-
 arch/riscv/kvm/mmu.c                | 18 +++++--
 arch/riscv/kvm/vm.c                 |  2 +-
 arch/riscv/kvm/vmid.c               |  2 +-
 7 files changed, 74 insertions(+), 50 deletions(-)

diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
index 595e2183173e..fdcada123b3f 100644
--- a/arch/riscv/include/asm/kvm_gstage.h
+++ b/arch/riscv/include/asm/kvm_gstage.h
@@ -29,16 +29,11 @@ struct kvm_gstage_mapping {
 #define kvm_riscv_gstage_index_bits	10
 #endif
 
-extern unsigned long kvm_riscv_gstage_mode;
-extern unsigned long kvm_riscv_gstage_pgd_levels;
+extern unsigned long kvm_riscv_gstage_max_mode;
+extern unsigned long kvm_riscv_gstage_max_pgd_levels;
 
 #define kvm_riscv_gstage_pgd_xbits	2
 #define kvm_riscv_gstage_pgd_size	(1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits))
-#define kvm_riscv_gstage_gpa_bits	(HGATP_PAGE_SHIFT + \
-					 (kvm_riscv_gstage_pgd_levels * \
-					  kvm_riscv_gstage_index_bits) + \
-					 kvm_riscv_gstage_pgd_xbits)
-#define kvm_riscv_gstage_gpa_size	((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits))
 
 bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
 			       pte_t **ptepp, u32 *ptep_level);
@@ -69,4 +64,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
 
 void kvm_riscv_gstage_mode_detect(void);
 
+gpa_t kvm_riscv_gstage_gpa_size(struct kvm_arch *k);
+unsigned long kvm_riscv_gstage_gpa_bits(struct kvm_arch *k);
+
 #endif
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 24585304c02b..27ea8e8fd5b0 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -103,6 +103,10 @@ struct kvm_arch {
 
 	/* KVM_CAP_RISCV_MP_STATE_RESET */
 	bool mp_state_reset;
+
+	unsigned long kvm_riscv_gstage_mode;
+	unsigned long kvm_riscv_gstage_pgd_levels;
+	bool gstage_mode_initialized;
 };
 
 struct kvm_cpu_trap {
diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
index b67d60d722c2..06452e4c2ab2 100644
--- a/arch/riscv/kvm/gstage.c
+++ b/arch/riscv/kvm/gstage.c
@@ -12,22 +12,23 @@
 #include <asm/kvm_gstage.h>
 
 #ifdef CONFIG_64BIT
-unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4;
-unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3;
+unsigned long kvm_riscv_gstage_max_mode __ro_after_init = HGATP_MODE_SV39X4;
+unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 3;
 #else
-unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4;
-unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2;
+unsigned long kvm_riscv_gstage_max_mode __ro_after_init = HGATP_MODE_SV32X4;
+unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 2;
 #endif
 
 #define gstage_pte_leaf(__ptep)	\
 	(pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
 
-static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
+static inline unsigned long gstage_pte_index(struct kvm_gstage *gstage,
+					     gpa_t addr, u32 level)
 {
 	unsigned long mask;
 	unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level);
 
-	if (level == (kvm_riscv_gstage_pgd_levels - 1))
+	if (level == (gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1))
 		mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1;
 	else
 		mask = PTRS_PER_PTE - 1;
@@ -40,12 +41,13 @@ static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
 	return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
 }
 
-static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
+static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long page_size,
+				     u32 *out_level)
 {
 	u32 i;
 	unsigned long psz = 1UL << 12;
 
-	for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) {
+	for (i = 0; i < gstage->kvm->arch.kvm_riscv_gstage_pgd_levels; i++) {
 		if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) {
 			*out_level = i;
 			return 0;
@@ -55,21 +57,23 @@ static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
 	return -EINVAL;
 }
 
-static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
+static int gstage_level_to_page_order(struct kvm_gstage *gstage, u32 level,
+				      unsigned long *out_pgorder)
 {
-	if (kvm_riscv_gstage_pgd_levels < level)
+	if (gstage->kvm->arch.kvm_riscv_gstage_pgd_levels < level)
 		return -EINVAL;
 
 	*out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits);
 	return 0;
 }
 
-static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
+static int gstage_level_to_page_size(struct kvm_gstage *gstage, u32 level,
+				     unsigned long *out_pgsize)
 {
 	int rc;
 	unsigned long page_order = PAGE_SHIFT;
 
-	rc = gstage_level_to_page_order(level, &page_order);
+	rc = gstage_level_to_page_order(gstage, level, &page_order);
 	if (rc)
 		return rc;
 
@@ -81,11 +85,11 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
 			       pte_t **ptepp, u32 *ptep_level)
 {
 	pte_t *ptep;
-	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+	u32 current_level = gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1;
 
 	*ptep_level = current_level;
 	ptep = (pte_t *)gstage->pgd;
-	ptep = &ptep[gstage_pte_index(addr, current_level)];
+	ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
 	while (ptep && pte_val(ptep_get(ptep))) {
 		if (gstage_pte_leaf(ptep)) {
 			*ptep_level = current_level;
@@ -97,7 +101,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
 			current_level--;
 			*ptep_level = current_level;
 			ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
-			ptep = &ptep[gstage_pte_index(addr, current_level)];
+			ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
 		} else {
 			ptep = NULL;
 		}
@@ -110,7 +114,7 @@ static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr)
 {
 	unsigned long order = PAGE_SHIFT;
 
-	if (gstage_level_to_page_order(level, &order))
+	if (gstage_level_to_page_order(gstage, level, &order))
 		return;
 	addr &= ~(BIT(order) - 1);
 
@@ -125,9 +129,9 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
 			     struct kvm_mmu_memory_cache *pcache,
 			     const struct kvm_gstage_mapping *map)
 {
-	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+	u32 current_level = gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1;
 	pte_t *next_ptep = (pte_t *)gstage->pgd;
-	pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
+	pte_t *ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
 
 	if (current_level < map->level)
 		return -EINVAL;
@@ -151,7 +155,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
 		}
 
 		current_level--;
-		ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
+		ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
 	}
 
 	if (pte_val(*ptep) != pte_val(map->pte)) {
@@ -175,7 +179,7 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
 	out_map->addr = gpa;
 	out_map->level = 0;
 
-	ret = gstage_page_size_to_level(page_size, &out_map->level);
+	ret = gstage_page_size_to_level(gstage, page_size, &out_map->level);
 	if (ret)
 		return ret;
 
@@ -217,7 +221,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
 	u32 next_ptep_level;
 	unsigned long next_page_size, page_size;
 
-	ret = gstage_level_to_page_size(ptep_level, &page_size);
+	ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
 	if (ret)
 		return;
 
@@ -229,7 +233,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
 	if (ptep_level && !gstage_pte_leaf(ptep)) {
 		next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
 		next_ptep_level = ptep_level - 1;
-		ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
+		ret = gstage_level_to_page_size(gstage, next_ptep_level, &next_page_size);
 		if (ret)
 			return;
 
@@ -263,7 +267,7 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
 
 	while (addr < end) {
 		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
-		ret = gstage_level_to_page_size(ptep_level, &page_size);
+		ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
 		if (ret)
 			break;
 
@@ -297,7 +301,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
 
 	while (addr < end) {
 		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
-		ret = gstage_level_to_page_size(ptep_level, &page_size);
+		ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
 		if (ret)
 			break;
 
@@ -319,41 +323,51 @@ void __init kvm_riscv_gstage_mode_detect(void)
 	/* Try Sv57x4 G-stage mode */
 	csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
 	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
-		kvm_riscv_gstage_mode = HGATP_MODE_SV57X4;
-		kvm_riscv_gstage_pgd_levels = 5;
+		kvm_riscv_gstage_max_mode = HGATP_MODE_SV57X4;
+		kvm_riscv_gstage_max_pgd_levels = 5;
 		goto done;
 	}
 
 	/* Try Sv48x4 G-stage mode */
 	csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
 	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
-		kvm_riscv_gstage_mode = HGATP_MODE_SV48X4;
-		kvm_riscv_gstage_pgd_levels = 4;
+		kvm_riscv_gstage_max_mode = HGATP_MODE_SV48X4;
+		kvm_riscv_gstage_max_pgd_levels = 4;
 		goto done;
 	}
 
 	/* Try Sv39x4 G-stage mode */
 	csr_write(CSR_HGATP, HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
 	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV39X4) {
-		kvm_riscv_gstage_mode = HGATP_MODE_SV39X4;
-		kvm_riscv_gstage_pgd_levels = 3;
+		kvm_riscv_gstage_max_mode = HGATP_MODE_SV39X4;
+		kvm_riscv_gstage_max_pgd_levels = 3;
 		goto done;
 	}
 #else /* CONFIG_32BIT */
 	/* Try Sv32x4 G-stage mode */
 	csr_write(CSR_HGATP, HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
 	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV32X4) {
-		kvm_riscv_gstage_mode = HGATP_MODE_SV32X4;
-		kvm_riscv_gstage_pgd_levels = 2;
+		kvm_riscv_gstage_max_mode = HGATP_MODE_SV32X4;
+		kvm_riscv_gstage_max_pgd_levels = 2;
 		goto done;
 	}
 #endif
 
 	/* KVM depends on !HGATP_MODE_OFF */
-	kvm_riscv_gstage_mode = HGATP_MODE_OFF;
-	kvm_riscv_gstage_pgd_levels = 0;
+	kvm_riscv_gstage_max_mode = HGATP_MODE_OFF;
+	kvm_riscv_gstage_max_pgd_levels = 0;
 
 done:
 	csr_write(CSR_HGATP, 0);
 	kvm_riscv_local_hfence_gvma_all();
 }
+
+unsigned long kvm_riscv_gstage_gpa_bits(struct kvm_arch *k) {
+	return (HGATP_PAGE_SHIFT + (k->kvm_riscv_gstage_pgd_levels *
+		    kvm_riscv_gstage_index_bits) +
+		    kvm_riscv_gstage_pgd_xbits);
+}
+
+gpa_t kvm_riscv_gstage_gpa_size(struct kvm_arch *k) {
+	return ((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits(k)));
+}
diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
index 45536af521f0..56a246e0e791 100644
--- a/arch/riscv/kvm/main.c
+++ b/arch/riscv/kvm/main.c
@@ -105,7 +105,7 @@ static int __init riscv_kvm_init(void)
 		return rc;
 
 	kvm_riscv_gstage_mode_detect();
-	switch (kvm_riscv_gstage_mode) {
+	switch (kvm_riscv_gstage_max_mode) {
 	case HGATP_MODE_SV32X4:
 		str = "Sv32x4";
 		break;
@@ -164,7 +164,7 @@ static int __init riscv_kvm_init(void)
 			 (rc) ? slist : "no features");
 	}
 
-	kvm_info("using %s G-stage page table format\n", str);
+	kvm_info("Max G-stage page table format %s \n", str);
 
 	kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits());
 
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 4ab06697bfc0..574783907162 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -67,7 +67,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
 		if (!writable)
 			map.pte = pte_wrprotect(map.pte);
 
-		ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels);
+		ret = kvm_mmu_topup_memory_cache(&pcache,kvm->arch.kvm_riscv_gstage_pgd_levels);
 		if (ret)
 			goto out;
 
@@ -186,8 +186,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	 * space addressable by the KVM guest GPA space.
 	 */
 	if ((new->base_gfn + new->npages) >=
-	    (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT))
+			(kvm_riscv_gstage_gpa_size(&kvm->arch) >> PAGE_SHIFT)) {
 		return -EFAULT;
+	}
 
 	hva = new->userspace_addr;
 	size = new->npages << PAGE_SHIFT;
@@ -332,7 +333,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 	memset(out_map, 0, sizeof(*out_map));
 
 	/* We need minimum second+third level pages */
-	ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels);
+	ret = kvm_mmu_topup_memory_cache(pcache, kvm->arch.kvm_riscv_gstage_pgd_levels);
 	if (ret) {
 		kvm_err("Failed to topup G-stage cache\n");
 		return ret;
@@ -431,6 +432,11 @@ int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm)
 		return -ENOMEM;
 	kvm->arch.pgd = page_to_virt(pgd_page);
 	kvm->arch.pgd_phys = page_to_phys(pgd_page);
+	if (!kvm->arch.gstage_mode_initialized) {
+		/* user-space didn't set the KVM_CAP_RISCV_SET_HGATP_MODE cap */
+		kvm->arch.kvm_riscv_gstage_mode = kvm_riscv_gstage_max_mode;
+		kvm->arch.kvm_riscv_gstage_pgd_levels = kvm_riscv_gstage_max_pgd_levels;
+	}
 
 	return 0;
 }
@@ -446,10 +452,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
 		gstage.flags = 0;
 		gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
 		gstage.pgd = kvm->arch.pgd;
-		kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false);
+		kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size(&kvm->arch), false);
 		pgd = READ_ONCE(kvm->arch.pgd);
 		kvm->arch.pgd = NULL;
 		kvm->arch.pgd_phys = 0;
+		kvm->arch.kvm_riscv_gstage_mode = HGATP_MODE_OFF;
+		kvm->arch.kvm_riscv_gstage_pgd_levels = 0;
 	}
 	spin_unlock(&kvm->mmu_lock);
 
@@ -459,8 +467,8 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
 
 void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu)
 {
-	unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
 	struct kvm_arch *k = &vcpu->kvm->arch;
+	unsigned long hgatp = k->kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
 
 	hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
 	hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;
diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
index 66d91ae6e9b2..4b2156df40fc 100644
--- a/arch/riscv/kvm/vm.c
+++ b/arch/riscv/kvm/vm.c
@@ -200,7 +200,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = KVM_USER_MEM_SLOTS;
 		break;
 	case KVM_CAP_VM_GPA_BITS:
-		r = kvm_riscv_gstage_gpa_bits;
+		r = kvm_riscv_gstage_gpa_bits(&kvm->arch);
 		break;
 	default:
 		r = 0;
diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
index cf34d448289d..db27430f111e 100644
--- a/arch/riscv/kvm/vmid.c
+++ b/arch/riscv/kvm/vmid.c
@@ -26,7 +26,7 @@ static DEFINE_SPINLOCK(vmid_lock);
 void __init kvm_riscv_gstage_vmid_detect(void)
 {
 	/* Figure-out number of VMID bits in HW */
-	csr_write(CSR_HGATP, (kvm_riscv_gstage_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
+	csr_write(CSR_HGATP, (kvm_riscv_gstage_max_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
 	vmid_bits = csr_read(CSR_HGATP);
 	vmid_bits = (vmid_bits & HGATP_VMID) >> HGATP_VMID_SHIFT;
 	vmid_bits = fls_long(vmid_bits);
-- 
2.50.1


_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v2] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE
  2026-01-05 14:32 [PATCH v2] Support runtime configuration for per-VM's HGATP mode fangyu.yu
  2026-01-05 14:32 ` [PATCH v2] RISC-V: KVM: " fangyu.yu
@ 2026-01-05 14:32 ` fangyu.yu
  2026-01-15 23:56   ` Andrew Jones
                     ` (2 more replies)
  1 sibling, 3 replies; 11+ messages in thread
From: fangyu.yu @ 2026-01-05 14:32 UTC (permalink / raw)
  To: pbonzini, corbet, anup, atish.patra, pjw, palmer, aou, alex
  Cc: guoren, ajones, rkrcmar, linux-doc, kvm, kvm-riscv, linux-riscv,
	linux-kernel, Fangyu Yu

From: Fangyu Yu <fangyu.yu@linux.alibaba.com>

This capability allows userspace to explicitly select the HGATP mode
for the VM. The selected mode must be less than or equal to the max
HGATP mode supported by the hardware. This capability must be enabled
before creating any vCPUs, and can only be set once per VM.

Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
---
 Documentation/virt/kvm/api.rst | 14 ++++++++++++++
 arch/riscv/kvm/vm.c            | 26 ++++++++++++++++++++++++--
 include/uapi/linux/kvm.h       |  1 +
 3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 01a3abef8abb..9e17788e3a9d 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8765,6 +8765,20 @@ helpful if user space wants to emulate instructions which are not
 This capability can be enabled dynamically even if VCPUs were already
 created and are running.
 
+7.47 KVM_CAP_RISCV_SET_HGATP_MODE
+---------------------------------
+
+:Architectures: riscv
+:Type: VM
+:Parameters: args[0] contains the requested HGATP mode
+:Returns: 0 on success, -EINVAL if args[0] is outside the range of HGATP
+          modes supported by the hardware.
+
+This capability allows userspace to explicitly select the HGATP mode for
+the VM. The selected mode must be less than or equal to the maximum HGATP
+mode supported by the hardware. This capability must be enabled before
+creating any vCPUs, and can only be set once per VM.
+
 8. Other capabilities.
 ======================
 
diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
index 4b2156df40fc..e9275023a73a 100644
--- a/arch/riscv/kvm/vm.c
+++ b/arch/riscv/kvm/vm.c
@@ -202,6 +202,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_VM_GPA_BITS:
 		r = kvm_riscv_gstage_gpa_bits(&kvm->arch);
 		break;
+	case KVM_CAP_RISCV_SET_HGATP_MODE:
+#ifdef CONFIG_64BIT
+		r = 1;
+#else/* CONFIG_32BIT */
+		r = 0;
+#endif
+		break;
 	default:
 		r = 0;
 		break;
@@ -212,12 +219,27 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 
 int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 {
+	if (cap->flags)
+		return -EINVAL;
 	switch (cap->cap) {
 	case KVM_CAP_RISCV_MP_STATE_RESET:
-		if (cap->flags)
-			return -EINVAL;
 		kvm->arch.mp_state_reset = true;
 		return 0;
+	case KVM_CAP_RISCV_SET_HGATP_MODE:
+#ifdef CONFIG_64BIT
+		if (cap->args[0] < HGATP_MODE_SV39X4 ||
+			cap->args[0] > kvm_riscv_gstage_max_mode)
+			return -EINVAL;
+		if (kvm->arch.gstage_mode_initialized)
+			return 0;
+		kvm->arch.gstage_mode_initialized = true;
+		kvm->arch.kvm_riscv_gstage_mode = cap->args[0];
+		kvm->arch.kvm_riscv_gstage_pgd_levels = 3 +
+		    kvm->arch.kvm_riscv_gstage_mode - HGATP_MODE_SV39X4;
+		kvm_info("using SV%lluX4 G-stage page table format\n",
+			39 + (cap->args[0] - HGATP_MODE_SV39X4) * 9);
+#endif
+		return 0;
 	default:
 		return -EINVAL;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index dddb781b0507..00c02a880518 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -974,6 +974,7 @@ struct kvm_enable_cap {
 #define KVM_CAP_GUEST_MEMFD_FLAGS 244
 #define KVM_CAP_ARM_SEA_TO_USER 245
 #define KVM_CAP_S390_USER_OPEREXEC 246
+#define KVM_CAP_RISCV_SET_HGATP_MODE 247
 
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
-- 
2.50.1


_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] RISC-V: KVM: Support runtime configuration for per-VM's HGATP mode
  2026-01-05 14:32 ` [PATCH v2] RISC-V: KVM: " fangyu.yu
@ 2026-01-15 23:37   ` Andrew Jones
  2026-01-16 14:29     ` fangyu.yu
  0 siblings, 1 reply; 11+ messages in thread
From: Andrew Jones @ 2026-01-15 23:37 UTC (permalink / raw)
  To: fangyu.yu
  Cc: pbonzini, corbet, anup, atish.patra, pjw, palmer, aou, alex,
	guoren, ajones, rkrcmar, linux-doc, kvm, kvm-riscv, linux-riscv,
	linux-kernel

On Mon, Jan 05, 2026 at 10:32:31PM +0800, fangyu.yu@linux.alibaba.com wrote:
> From: Fangyu Yu <fangyu.yu@linux.alibaba.com>
> 
> Introduces two per-VM architecture-specific fields to support runtime
> configuration of the G-stage page table format:
> 
> - kvm->arch.kvm_riscv_gstage_mode: specifies the HGATP mode used by the
>   current VM;
> - kvm->arch.kvm_riscv_gstage_pgd_levels: the corresponding number of page
>   table levels for the selected mode.
> 
> These fields replace the previous global variables
> kvm_riscv_gstage_mode and kvm_riscv_gstage_pgd_levels, enabling different
> virtual machines to independently select their G-stage page table format
> instead of being forced to share the maximum mode detected by the kernel
> at boot time.
> 
> Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
> ---
>  arch/riscv/include/asm/kvm_gstage.h | 12 ++---
>  arch/riscv/include/asm/kvm_host.h   |  4 ++
>  arch/riscv/kvm/gstage.c             | 82 +++++++++++++++++------------
>  arch/riscv/kvm/main.c               |  4 +-
>  arch/riscv/kvm/mmu.c                | 18 +++++--
>  arch/riscv/kvm/vm.c                 |  2 +-
>  arch/riscv/kvm/vmid.c               |  2 +-
>  7 files changed, 74 insertions(+), 50 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
> index 595e2183173e..fdcada123b3f 100644
> --- a/arch/riscv/include/asm/kvm_gstage.h
> +++ b/arch/riscv/include/asm/kvm_gstage.h
> @@ -29,16 +29,11 @@ struct kvm_gstage_mapping {
>  #define kvm_riscv_gstage_index_bits	10
>  #endif
>  
> -extern unsigned long kvm_riscv_gstage_mode;
> -extern unsigned long kvm_riscv_gstage_pgd_levels;
> +extern unsigned long kvm_riscv_gstage_max_mode;
> +extern unsigned long kvm_riscv_gstage_max_pgd_levels;
>  
>  #define kvm_riscv_gstage_pgd_xbits	2
>  #define kvm_riscv_gstage_pgd_size	(1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits))
> -#define kvm_riscv_gstage_gpa_bits	(HGATP_PAGE_SHIFT + \
> -					 (kvm_riscv_gstage_pgd_levels * \
> -					  kvm_riscv_gstage_index_bits) + \
> -					 kvm_riscv_gstage_pgd_xbits)
> -#define kvm_riscv_gstage_gpa_size	((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits))
>  
>  bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
>  			       pte_t **ptepp, u32 *ptep_level);
> @@ -69,4 +64,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>  
>  void kvm_riscv_gstage_mode_detect(void);
>  
> +gpa_t kvm_riscv_gstage_gpa_size(struct kvm_arch *k);
> +unsigned long kvm_riscv_gstage_gpa_bits(struct kvm_arch *k);
> +
>  #endif
> diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
> index 24585304c02b..27ea8e8fd5b0 100644
> --- a/arch/riscv/include/asm/kvm_host.h
> +++ b/arch/riscv/include/asm/kvm_host.h
> @@ -103,6 +103,10 @@ struct kvm_arch {
>  
>  	/* KVM_CAP_RISCV_MP_STATE_RESET */
>  	bool mp_state_reset;
> +
> +	unsigned long kvm_riscv_gstage_mode;

There's a 1:1 mapping between mode and levels, so we don't need to track
both. Since mode is rarely used, I think something like this would still
provide enough convenience without requiring the extra storage.

 /*
  * Return the HGATP mode that corresponds to the number of G-stage page
  * table levels stored in gstage->kvm->arch.kvm_riscv_gstage_pgd_levels.
  *
  * The table is indexed directly by the level count; slots without an
  * explicit initializer (0 and 1) read as zero.
  *
  * NOTE(review): there is no bounds check, so the level count is assumed
  * to never exceed 5 -- confirm against every writer of the field.
  */
 static inline unsigned long kvm_riscv_gstage_mode(struct kvm_gstage *gstage)
 {
     /* static const: keep one read-only table instead of rebuilding it on
      * the stack at every call. */
     static const unsigned long modes[] = {
         [2] = HGATP_MODE_SV32X4,
         [3] = HGATP_MODE_SV39X4,
         [4] = HGATP_MODE_SV48X4,
         [5] = HGATP_MODE_SV57X4,
     };

     return modes[gstage->kvm->arch.kvm_riscv_gstage_pgd_levels];
 }

> +	unsigned long kvm_riscv_gstage_pgd_levels;
> +	bool gstage_mode_initialized;
>  };
>  
>  struct kvm_cpu_trap {
> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
> index b67d60d722c2..06452e4c2ab2 100644
> --- a/arch/riscv/kvm/gstage.c
> +++ b/arch/riscv/kvm/gstage.c
> @@ -12,22 +12,23 @@
>  #include <asm/kvm_gstage.h>
>  
>  #ifdef CONFIG_64BIT
> -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4;
> -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3;
> +unsigned long kvm_riscv_gstage_max_mode __ro_after_init = HGATP_MODE_SV39X4;

With a kvm_riscv_gstage_mode() function we don't need
kvm_riscv_gstage_max_mode either.

> +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 3;
>  #else
> -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4;
> -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2;
> +unsigned long kvm_riscv_gstage_max_mode __ro_after_init = HGATP_MODE_SV32X4;
> +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 2;
>  #endif
>  
>  #define gstage_pte_leaf(__ptep)	\
>  	(pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
>  
> -static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
> +static inline unsigned long gstage_pte_index(struct kvm_gstage *gstage,
> +					     gpa_t addr, u32 level)
>  {
>  	unsigned long mask;
>  	unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level);
>  
> -	if (level == (kvm_riscv_gstage_pgd_levels - 1))
> +	if (level == (gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1))

nit: we can drop the unnecessary () while touching this line.

>  		mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1;
>  	else
>  		mask = PTRS_PER_PTE - 1;
> @@ -40,12 +41,13 @@ static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
>  	return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
>  }
>  
> -static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
> +static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long page_size,
> +				     u32 *out_level)
>  {
>  	u32 i;
>  	unsigned long psz = 1UL << 12;
>  
> -	for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) {
> +	for (i = 0; i < gstage->kvm->arch.kvm_riscv_gstage_pgd_levels; i++) {
>  		if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) {
>  			*out_level = i;
>  			return 0;
> @@ -55,21 +57,23 @@ static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
>  	return -EINVAL;
>  }
>  
> -static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
> +static int gstage_level_to_page_order(struct kvm_gstage *gstage, u32 level,
> +				      unsigned long *out_pgorder)
>  {
> -	if (kvm_riscv_gstage_pgd_levels < level)
> +	if (gstage->kvm->arch.kvm_riscv_gstage_pgd_levels < level)
>  		return -EINVAL;
>  
>  	*out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits);
>  	return 0;
>  }
>  
> -static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
> +static int gstage_level_to_page_size(struct kvm_gstage *gstage, u32 level,
> +				     unsigned long *out_pgsize)
>  {
>  	int rc;
>  	unsigned long page_order = PAGE_SHIFT;
>  
> -	rc = gstage_level_to_page_order(level, &page_order);
> +	rc = gstage_level_to_page_order(gstage, level, &page_order);
>  	if (rc)
>  		return rc;
>  
> @@ -81,11 +85,11 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
>  			       pte_t **ptepp, u32 *ptep_level)
>  {
>  	pte_t *ptep;
> -	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
> +	u32 current_level = gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1;
>  
>  	*ptep_level = current_level;
>  	ptep = (pte_t *)gstage->pgd;
> -	ptep = &ptep[gstage_pte_index(addr, current_level)];
> +	ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
>  	while (ptep && pte_val(ptep_get(ptep))) {
>  		if (gstage_pte_leaf(ptep)) {
>  			*ptep_level = current_level;
> @@ -97,7 +101,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
>  			current_level--;
>  			*ptep_level = current_level;
>  			ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
> -			ptep = &ptep[gstage_pte_index(addr, current_level)];
> +			ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
>  		} else {
>  			ptep = NULL;
>  		}
> @@ -110,7 +114,7 @@ static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr)
>  {
>  	unsigned long order = PAGE_SHIFT;
>  
> -	if (gstage_level_to_page_order(level, &order))
> +	if (gstage_level_to_page_order(gstage, level, &order))
>  		return;
>  	addr &= ~(BIT(order) - 1);
>  
> @@ -125,9 +129,9 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
>  			     struct kvm_mmu_memory_cache *pcache,
>  			     const struct kvm_gstage_mapping *map)
>  {
> -	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
> +	u32 current_level = gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1;
>  	pte_t *next_ptep = (pte_t *)gstage->pgd;
> -	pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
> +	pte_t *ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
>  
>  	if (current_level < map->level)
>  		return -EINVAL;
> @@ -151,7 +155,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
>  		}
>  
>  		current_level--;
> -		ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
> +		ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
>  	}
>  
>  	if (pte_val(*ptep) != pte_val(map->pte)) {
> @@ -175,7 +179,7 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>  	out_map->addr = gpa;
>  	out_map->level = 0;
>  
> -	ret = gstage_page_size_to_level(page_size, &out_map->level);
> +	ret = gstage_page_size_to_level(gstage, page_size, &out_map->level);
>  	if (ret)
>  		return ret;
>  
> @@ -217,7 +221,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>  	u32 next_ptep_level;
>  	unsigned long next_page_size, page_size;
>  
> -	ret = gstage_level_to_page_size(ptep_level, &page_size);
> +	ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
>  	if (ret)
>  		return;
>  
> @@ -229,7 +233,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>  	if (ptep_level && !gstage_pte_leaf(ptep)) {
>  		next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
>  		next_ptep_level = ptep_level - 1;
> -		ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
> +		ret = gstage_level_to_page_size(gstage, next_ptep_level, &next_page_size);
>  		if (ret)
>  			return;
>  
> @@ -263,7 +267,7 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>  
>  	while (addr < end) {
>  		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
> -		ret = gstage_level_to_page_size(ptep_level, &page_size);
> +		ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
>  		if (ret)
>  			break;
>  
> @@ -297,7 +301,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>  
>  	while (addr < end) {
>  		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
> -		ret = gstage_level_to_page_size(ptep_level, &page_size);
> +		ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
>  		if (ret)
>  			break;
>  
> @@ -319,41 +323,51 @@ void __init kvm_riscv_gstage_mode_detect(void)
>  	/* Try Sv57x4 G-stage mode */
>  	csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
> -		kvm_riscv_gstage_mode = HGATP_MODE_SV57X4;
> -		kvm_riscv_gstage_pgd_levels = 5;
> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV57X4;
> +		kvm_riscv_gstage_max_pgd_levels = 5;
>  		goto done;
>  	}
>  
>  	/* Try Sv48x4 G-stage mode */
>  	csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
> -		kvm_riscv_gstage_mode = HGATP_MODE_SV48X4;
> -		kvm_riscv_gstage_pgd_levels = 4;
> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV48X4;
> +		kvm_riscv_gstage_max_pgd_levels = 4;
>  		goto done;
>  	}
>  
>  	/* Try Sv39x4 G-stage mode */
>  	csr_write(CSR_HGATP, HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV39X4) {
> -		kvm_riscv_gstage_mode = HGATP_MODE_SV39X4;
> -		kvm_riscv_gstage_pgd_levels = 3;
> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV39X4;
> +		kvm_riscv_gstage_max_pgd_levels = 3;
>  		goto done;
>  	}
>  #else /* CONFIG_32BIT */
>  	/* Try Sv32x4 G-stage mode */
>  	csr_write(CSR_HGATP, HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV32X4) {
> -		kvm_riscv_gstage_mode = HGATP_MODE_SV32X4;
> -		kvm_riscv_gstage_pgd_levels = 2;
> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV32X4;
> +		kvm_riscv_gstage_max_pgd_levels = 2;
>  		goto done;
>  	}
>  #endif
>  
>  	/* KVM depends on !HGATP_MODE_OFF */
> -	kvm_riscv_gstage_mode = HGATP_MODE_OFF;
> -	kvm_riscv_gstage_pgd_levels = 0;
> +	kvm_riscv_gstage_max_mode = HGATP_MODE_OFF;
> +	kvm_riscv_gstage_max_pgd_levels = 0;
>  
>  done:
>  	csr_write(CSR_HGATP, 0);
>  	kvm_riscv_local_hfence_gvma_all();
>  }
> +
> +unsigned long kvm_riscv_gstage_gpa_bits(struct kvm_arch *k) {

Did you run checkpatch? I think it requires '{' to be on its own line.

nit: s/k/ka/ would be consistent with other archs, although I see k is
used in riscv's kvm_riscv_mmu_update_hgatp() but that can be fixed up
in this patch since there's a change in the same place too.


> +	return (HGATP_PAGE_SHIFT + (k->kvm_riscv_gstage_pgd_levels *
> +		    kvm_riscv_gstage_index_bits) +
> +		    kvm_riscv_gstage_pgd_xbits);
> +}
> +
> +gpa_t kvm_riscv_gstage_gpa_size(struct kvm_arch *k) {

same comments as above

> +	return ((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits(k)));

 return BIT_ULL(kvm_riscv_gstage_gpa_bits(ka))

(the cast is implicit from return type)

> +}
> diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
> index 45536af521f0..56a246e0e791 100644
> --- a/arch/riscv/kvm/main.c
> +++ b/arch/riscv/kvm/main.c
> @@ -105,7 +105,7 @@ static int __init riscv_kvm_init(void)
>  		return rc;
>  
>  	kvm_riscv_gstage_mode_detect();
> -	switch (kvm_riscv_gstage_mode) {
> +	switch (kvm_riscv_gstage_max_mode) {
>  	case HGATP_MODE_SV32X4:
>  		str = "Sv32x4";
>  		break;
> @@ -164,7 +164,7 @@ static int __init riscv_kvm_init(void)
>  			 (rc) ? slist : "no features");
>  	}
>  
> -	kvm_info("using %s G-stage page table format\n", str);
> +	kvm_info("Max G-stage page table format %s \n", str);
>  
>  	kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits());
>  
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index 4ab06697bfc0..574783907162 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -67,7 +67,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
>  		if (!writable)
>  			map.pte = pte_wrprotect(map.pte);
>  
> -		ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels);
> +		ret = kvm_mmu_topup_memory_cache(&pcache,kvm->arch.kvm_riscv_gstage_pgd_levels);
                                                         ^ missing space

>  		if (ret)
>  			goto out;
>  
> @@ -186,8 +186,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  	 * space addressable by the KVM guest GPA space.
>  	 */
>  	if ((new->base_gfn + new->npages) >=
> -	    (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT))
> +			(kvm_riscv_gstage_gpa_size(&kvm->arch) >> PAGE_SHIFT)) {
>  		return -EFAULT;
> +	}

nit: Remove the unnecessary () and the added braces; the condition will then
fit on a single 100-character line.

>  
>  	hva = new->userspace_addr;
>  	size = new->npages << PAGE_SHIFT;
> @@ -332,7 +333,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
>  	memset(out_map, 0, sizeof(*out_map));
>  
>  	/* We need minimum second+third level pages */
> -	ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels);
> +	ret = kvm_mmu_topup_memory_cache(pcache, kvm->arch.kvm_riscv_gstage_pgd_levels);
>  	if (ret) {
>  		kvm_err("Failed to topup G-stage cache\n");
>  		return ret;
> @@ -431,6 +432,11 @@ int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm)
>  		return -ENOMEM;
>  	kvm->arch.pgd = page_to_virt(pgd_page);
>  	kvm->arch.pgd_phys = page_to_phys(pgd_page);
> +	if (!kvm->arch.gstage_mode_initialized) {
> +		/*user-space didn't set KVM_CAP_RISC_HGATP_MODE cap*/
                  ^ missing space                                  ^ missing space
> +		kvm->arch.kvm_riscv_gstage_mode = kvm_riscv_gstage_max_mode;
> +		kvm->arch.kvm_riscv_gstage_pgd_levels = kvm_riscv_gstage_max_pgd_levels;

Missing 'kvm->arch.gstage_mode_initialized = true' statement.

> +	}
>  
>  	return 0;
>  }
> @@ -446,10 +452,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>  		gstage.flags = 0;
>  		gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
>  		gstage.pgd = kvm->arch.pgd;
> -		kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false);
> +		kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size(&kvm->arch), false);
>  		pgd = READ_ONCE(kvm->arch.pgd);
>  		kvm->arch.pgd = NULL;
>  		kvm->arch.pgd_phys = 0;
> +		kvm->arch.kvm_riscv_gstage_mode = HGATP_MODE_OFF;
> +		kvm->arch.kvm_riscv_gstage_pgd_levels = 0;
>  	}
>  	spin_unlock(&kvm->mmu_lock);
>  
> @@ -459,8 +467,8 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>  
>  void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu)
>  {
> -	unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
>  	struct kvm_arch *k = &vcpu->kvm->arch;
> +	unsigned long hgatp = k->kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
>  
>  	hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
>  	hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;
> diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
> index 66d91ae6e9b2..4b2156df40fc 100644
> --- a/arch/riscv/kvm/vm.c
> +++ b/arch/riscv/kvm/vm.c
> @@ -200,7 +200,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  		r = KVM_USER_MEM_SLOTS;
>  		break;
>  	case KVM_CAP_VM_GPA_BITS:
> -		r = kvm_riscv_gstage_gpa_bits;
> +		r = kvm_riscv_gstage_gpa_bits(&kvm->arch);
>  		break;
>  	default:
>  		r = 0;
> diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
> index cf34d448289d..db27430f111e 100644
> --- a/arch/riscv/kvm/vmid.c
> +++ b/arch/riscv/kvm/vmid.c
> @@ -26,7 +26,7 @@ static DEFINE_SPINLOCK(vmid_lock);
>  void __init kvm_riscv_gstage_vmid_detect(void)
>  {
>  	/* Figure-out number of VMID bits in HW */
> -	csr_write(CSR_HGATP, (kvm_riscv_gstage_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
> +	csr_write(CSR_HGATP, (kvm_riscv_gstage_max_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
>  	vmid_bits = csr_read(CSR_HGATP);
>  	vmid_bits = (vmid_bits & HGATP_VMID) >> HGATP_VMID_SHIFT;
>  	vmid_bits = fls_long(vmid_bits);
> -- 
> 2.50.1
>

Thanks,
drew

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE
  2026-01-05 14:32 ` [PATCH v2] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE fangyu.yu
@ 2026-01-15 23:56   ` Andrew Jones
  2026-01-16 14:29     ` fangyu.yu
  2026-01-16 19:03   ` Andrew Jones
  2026-01-19 13:56   ` Radim Krčmář
  2 siblings, 1 reply; 11+ messages in thread
From: Andrew Jones @ 2026-01-15 23:56 UTC (permalink / raw)
  To: fangyu.yu
  Cc: pbonzini, corbet, anup, atish.patra, pjw, palmer, aou, alex,
	guoren, ajones, rkrcmar, linux-doc, kvm, kvm-riscv, linux-riscv,
	linux-kernel

On Mon, Jan 05, 2026 at 10:32:32PM +0800, fangyu.yu@linux.alibaba.com wrote:
> From: Fangyu Yu <fangyu.yu@linux.alibaba.com>
> 
> This capability allows userspace to explicitly select the HGATP mode
> for the VM. The selected mode must be less than or equal to the max
> HGATP mode supported by the hardware. This capability must be enabled
> before creating any vCPUs, and can only be set once per VM.
> 
> Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
> ---
>  Documentation/virt/kvm/api.rst | 14 ++++++++++++++
>  arch/riscv/kvm/vm.c            | 26 ++++++++++++++++++++++++--
>  include/uapi/linux/kvm.h       |  1 +
>  3 files changed, 39 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 01a3abef8abb..9e17788e3a9d 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -8765,6 +8765,20 @@ helpful if user space wants to emulate instructions which are not
>  This capability can be enabled dynamically even if VCPUs were already
>  created and are running.
>  
> +7.47 KVM_CAP_RISCV_SET_HGATP_MODE
> +---------------------------------
> +
> +:Architectures: riscv
> +:Type: VM
> +:Parameters: args[0] contains the requested HGATP mode
> +:Returns: 0 on success, -EINVAL if arg[0] is outside the range of hgatp
> +          modes supported by the hardware.
> +
> +This capability allows userspace to explicitly select the HGATP mode for
> +the VM. The selected mode must be less than or equal to the maximum HGATP
> +mode supported by the hardware. This capability must be enabled before
> +creating any vCPUs, and can only be set once per VM.

I think I would prefer a KVM_CAP_RISCV_SET_MAX_GPA type of capability. The
reason is that, while one of the results of setting the max GPA will be to
set hgatp, there may be other reasons to track the guest's maximum physical
address too, and KVM userspace shouldn't need to think about each of them
individually.

> +
>  8. Other capabilities.
>  ======================
>  
> diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
> index 4b2156df40fc..e9275023a73a 100644
> --- a/arch/riscv/kvm/vm.c
> +++ b/arch/riscv/kvm/vm.c
> @@ -202,6 +202,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_VM_GPA_BITS:
>  		r = kvm_riscv_gstage_gpa_bits(&kvm->arch);
>  		break;
> +	case KVM_CAP_RISCV_SET_HGATP_MODE:
> +#ifdef CONFIG_64BIT
> +		r = 1;
> +#else/* CONFIG_32BIT */
> +		r = 0;
> +#endif

 r = IS_ENABLED(CONFIG_64BIT) ? 1 : 0;

> +		break;
>  	default:
>  		r = 0;
>  		break;
> @@ -212,12 +219,27 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  
>  int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
>  {
> +	if (cap->flags)
> +		return -EINVAL;

add blank line

>  	switch (cap->cap) {
>  	case KVM_CAP_RISCV_MP_STATE_RESET:
> -		if (cap->flags)
> -			return -EINVAL;
>  		kvm->arch.mp_state_reset = true;
>  		return 0;
> +	case KVM_CAP_RISCV_SET_HGATP_MODE:
> +#ifdef CONFIG_64BIT
> +		if (cap->args[0] < HGATP_MODE_SV39X4 ||
> +			cap->args[0] > kvm_riscv_gstage_max_mode)
> +			return -EINVAL;
> +		if (kvm->arch.gstage_mode_initialized)
> +			return 0;

I think we want to return -EBUSY here, and that should be documented where
the text already states "...can only be set once per VM".

> +		kvm->arch.gstage_mode_initialized = true;

In the previous patch I thought we were missing this, but I see now it
means "user initialized". Let's rename it as such,

 gstage_mode_user_initialized

> +		kvm->arch.kvm_riscv_gstage_mode = cap->args[0];
> +		kvm->arch.kvm_riscv_gstage_pgd_levels = 3 +
> +		    kvm->arch.kvm_riscv_gstage_mode - HGATP_MODE_SV39X4;
> +		kvm_info("using SV%lluX4 G-stage page table format\n",
> +			39 + (cap->args[0] - HGATP_MODE_SV39X4) * 9);
> +#endif
> +		return 0;
>  	default:
>  		return -EINVAL;
>  	}
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index dddb781b0507..00c02a880518 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -974,6 +974,7 @@ struct kvm_enable_cap {
>  #define KVM_CAP_GUEST_MEMFD_FLAGS 244
>  #define KVM_CAP_ARM_SEA_TO_USER 245
>  #define KVM_CAP_S390_USER_OPEREXEC 246
> +#define KVM_CAP_RISCV_SET_HGATP_MODE 247
>  
>  struct kvm_irq_routing_irqchip {
>  	__u32 irqchip;
> -- 
> 2.50.1
>

Thanks,
drew

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: Re: [PATCH v2] RISC-V: KVM: Support runtime configuration for per-VM's HGATP mode
  2026-01-15 23:37   ` Andrew Jones
@ 2026-01-16 14:29     ` fangyu.yu
  0 siblings, 0 replies; 11+ messages in thread
From: fangyu.yu @ 2026-01-16 14:29 UTC (permalink / raw)
  To: andrew.jones
  Cc: ajones, alex, anup, aou, atish.patra, corbet, fangyu.yu, guoren,
	kvm-riscv, kvm, linux-doc, linux-kernel, linux-riscv, palmer,
	pbonzini, pjw, rkrcmar

>> From: Fangyu Yu <fangyu.yu@linux.alibaba.com>
>>
>> Introduces two per-VM architecture-specific fields to support runtime
>> configuration of the G-stage page table format:
>>
>> - kvm->arch.kvm_riscv_gstage_mode: specifies the HGATP mode used by the
>>   current VM;
>> - kvm->arch.kvm_riscv_gstage_pgd_levels: the corresponding number of page
>>   table levels for the selected mode.
>>
>> These fields replace the previous global variables
>> kvm_riscv_gstage_mode and kvm_riscv_gstage_pgd_levels, enabling different
>> virtual machines to independently select their G-stage page table format
>> instead of being forced to share the maximum mode detected by the kernel
>> at boot time.
>>
>> Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
>> ---
>>  arch/riscv/include/asm/kvm_gstage.h | 12 ++---
>>  arch/riscv/include/asm/kvm_host.h   |  4 ++
>>  arch/riscv/kvm/gstage.c             | 82 +++++++++++++++++------------
>>  arch/riscv/kvm/main.c               |  4 +-
>>  arch/riscv/kvm/mmu.c                | 18 +++++--
>>  arch/riscv/kvm/vm.c                 |  2 +-
>>  arch/riscv/kvm/vmid.c               |  2 +-
>>  7 files changed, 74 insertions(+), 50 deletions(-)
>>
>> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
>> index 595e2183173e..fdcada123b3f 100644
>> --- a/arch/riscv/include/asm/kvm_gstage.h
>> +++ b/arch/riscv/include/asm/kvm_gstage.h
>> @@ -29,16 +29,11 @@ struct kvm_gstage_mapping {
>>  #define kvm_riscv_gstage_index_bits	10
>>  #endif
>>
>> -extern unsigned long kvm_riscv_gstage_mode;
>> -extern unsigned long kvm_riscv_gstage_pgd_levels;
>> +extern unsigned long kvm_riscv_gstage_max_mode;
>> +extern unsigned long kvm_riscv_gstage_max_pgd_levels;
>>
>>  #define kvm_riscv_gstage_pgd_xbits	2
>>  #define kvm_riscv_gstage_pgd_size	(1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits))
>> -#define kvm_riscv_gstage_gpa_bits	(HGATP_PAGE_SHIFT + \
>> -					 (kvm_riscv_gstage_pgd_levels * \
>> -					  kvm_riscv_gstage_index_bits) + \
>> -					 kvm_riscv_gstage_pgd_xbits)
>> -#define kvm_riscv_gstage_gpa_size	((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits))
>>
>>  bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
>>  			       pte_t **ptepp, u32 *ptep_level);
>> @@ -69,4 +64,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>>
>>  void kvm_riscv_gstage_mode_detect(void);
>>
>> +gpa_t kvm_riscv_gstage_gpa_size(struct kvm_arch *k);
>> +unsigned long kvm_riscv_gstage_gpa_bits(struct kvm_arch *k);
>> +
>>  #endif
>> diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
>> index 24585304c02b..27ea8e8fd5b0 100644
>> --- a/arch/riscv/include/asm/kvm_host.h
>> +++ b/arch/riscv/include/asm/kvm_host.h
>> @@ -103,6 +103,10 @@ struct kvm_arch {
>>
>>  	/* KVM_CAP_RISCV_MP_STATE_RESET */
>>  	bool mp_state_reset;
>> +
>> +	unsigned long kvm_riscv_gstage_mode;
>
>There's a 1:1 mapping for mode/levels, so we don't need to track both.
>Since mode is rarely used, then I think something like this would still
>provide enough convenience without requiring the storage allocation.
>
> static inline unsigned long kvm_riscv_gstage_mode(struct kvm_gstage *gstage)
> {
>     unsigned long modes[] = {
>         [2] = HGATP_MODE_SV32X4,
>         [3] = HGATP_MODE_SV39X4,
>         [4] = HGATP_MODE_SV48X4,
>         [5] = HGATP_MODE_SV57X4,
>     };
>
>     return modes[gstage->kvm->arch.kvm_riscv_gstage_pgd_levels];
> }

Thanks for the suggestion.

You're right that gstage mode has a 1:1 mapping with pgd_levels, so keeping
both is redundant. In the next revision I'll drop kvm_riscv_gstage_mode and
derive HGATP.MODE from kvm_riscv_gstage_pgd_levels via a small helper.

>> +	unsigned long kvm_riscv_gstage_pgd_levels;
>> +	bool gstage_mode_initialized;
>>  };
>>
>>  struct kvm_cpu_trap {
>> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
>> index b67d60d722c2..06452e4c2ab2 100644
>> --- a/arch/riscv/kvm/gstage.c
>> +++ b/arch/riscv/kvm/gstage.c
>> @@ -12,22 +12,23 @@
>>  #include <asm/kvm_gstage.h>
>>
>>  #ifdef CONFIG_64BIT
>> -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4;
>> -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3;
>> +unsigned long kvm_riscv_gstage_max_mode __ro_after_init = HGATP_MODE_SV39X4;
>
>With a kvm_riscv_gstage_mode() function we don't need
>kvm_riscv_gstage_max_mode either.

Thanks, agreed.

>> +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 3;
>>  #else
>> -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4;
>> -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2;
>> +unsigned long kvm_riscv_gstage_max_mode __ro_after_init = HGATP_MODE_SV32X4;
>> +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 2;
>>  #endif
>>
>>  #define gstage_pte_leaf(__ptep)	\
>>  	(pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
>>
>> -static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
>> +static inline unsigned long gstage_pte_index(struct kvm_gstage *gstage,
>> +					     gpa_t addr, u32 level)
>>  {
>>  	unsigned long mask;
>>  	unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level);
>>
>> -	if (level == (kvm_riscv_gstage_pgd_levels - 1))
>> +	if (level == (gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1))
>
>nit: we can drop the unnecessary () while touching this line.

Ack, will fix.

>>  		mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1;
>>  	else
>>  		mask = PTRS_PER_PTE - 1;
>> @@ -40,12 +41,13 @@ static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
>>  	return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
>>  }
>>
>> -static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
>> +static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long page_size,
>> +				     u32 *out_level)
>>  {
>>  	u32 i;
>>  	unsigned long psz = 1UL << 12;
>>
>> -	for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) {
>> +	for (i = 0; i < gstage->kvm->arch.kvm_riscv_gstage_pgd_levels; i++) {
>>  		if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) {
>>  			*out_level = i;
>>  			return 0;
>> @@ -55,21 +57,23 @@ static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
>>  	return -EINVAL;
>>  }
>>
>> -static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
>> +static int gstage_level_to_page_order(struct kvm_gstage *gstage, u32 level,
>> +				      unsigned long *out_pgorder)
>>  {
>> -	if (kvm_riscv_gstage_pgd_levels < level)
>> +	if (gstage->kvm->arch.kvm_riscv_gstage_pgd_levels < level)
>>  		return -EINVAL;
>>
>>  	*out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits);
>>  	return 0;
>>  }
>>
>> -static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
>> +static int gstage_level_to_page_size(struct kvm_gstage *gstage, u32 level,
>> +				     unsigned long *out_pgsize)
>>  {
>>  	int rc;
>>  	unsigned long page_order = PAGE_SHIFT;
>>
>> -	rc = gstage_level_to_page_order(level, &page_order);
>> +	rc = gstage_level_to_page_order(gstage, level, &page_order);
>>  	if (rc)
>>  		return rc;
>>
>> @@ -81,11 +85,11 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
>>  			       pte_t **ptepp, u32 *ptep_level)
>>  {
>>  	pte_t *ptep;
>> -	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
>> +	u32 current_level = gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1;
>>
>>  	*ptep_level = current_level;
>>  	ptep = (pte_t *)gstage->pgd;
>> -	ptep = &ptep[gstage_pte_index(addr, current_level)];
>> +	ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
>>  	while (ptep && pte_val(ptep_get(ptep))) {
>>  		if (gstage_pte_leaf(ptep)) {
>>  			*ptep_level = current_level;
>> @@ -97,7 +101,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
>>  			current_level--;
>>  			*ptep_level = current_level;
>>  			ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
>> -			ptep = &ptep[gstage_pte_index(addr, current_level)];
>> +			ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
>>  		} else {
>>  			ptep = NULL;
>>  		}
>> @@ -110,7 +114,7 @@ static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr)
>>  {
>>  	unsigned long order = PAGE_SHIFT;
>>
>> -	if (gstage_level_to_page_order(level, &order))
>> +	if (gstage_level_to_page_order(gstage, level, &order))
>>  		return;
>>  	addr &= ~(BIT(order) - 1);
>>
>> @@ -125,9 +129,9 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
>>  			     struct kvm_mmu_memory_cache *pcache,
>>  			     const struct kvm_gstage_mapping *map)
>>  {
>> -	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
>> +	u32 current_level = gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1;
>>  	pte_t *next_ptep = (pte_t *)gstage->pgd;
>> -	pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
>> +	pte_t *ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
>>
>>  	if (current_level < map->level)
>>  		return -EINVAL;
>> @@ -151,7 +155,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
>>  		}
>>
>>  		current_level--;
>> -		ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
>> +		ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
>>  	}
>>
>>  	if (pte_val(*ptep) != pte_val(map->pte)) {
>> @@ -175,7 +179,7 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>>  	out_map->addr = gpa;
>>  	out_map->level = 0;
>>
>> -	ret = gstage_page_size_to_level(page_size, &out_map->level);
>> +	ret = gstage_page_size_to_level(gstage, page_size, &out_map->level);
>>  	if (ret)
>>  		return ret;
>>
>> @@ -217,7 +221,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>>  	u32 next_ptep_level;
>>  	unsigned long next_page_size, page_size;
>>
>> -	ret = gstage_level_to_page_size(ptep_level, &page_size);
>> +	ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
>>  	if (ret)
>>  		return;
>>
>> @@ -229,7 +233,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>>  	if (ptep_level && !gstage_pte_leaf(ptep)) {
>>  		next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
>>  		next_ptep_level = ptep_level - 1;
>> -		ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
>> +		ret = gstage_level_to_page_size(gstage, next_ptep_level, &next_page_size);
>>  		if (ret)
>>  			return;
>>
>> @@ -263,7 +267,7 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>>
>>  	while (addr < end) {
>>  		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
>> -		ret = gstage_level_to_page_size(ptep_level, &page_size);
>> +		ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
>>  		if (ret)
>>  			break;
>>
>> @@ -297,7 +301,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>>
>>  	while (addr < end) {
>>  		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
>> -		ret = gstage_level_to_page_size(ptep_level, &page_size);
>> +		ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
>>  		if (ret)
>>  			break;
>>
>> @@ -319,41 +323,51 @@ void __init kvm_riscv_gstage_mode_detect(void)
>>  	/* Try Sv57x4 G-stage mode */
>>  	csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
>>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
>> -		kvm_riscv_gstage_mode = HGATP_MODE_SV57X4;
>> -		kvm_riscv_gstage_pgd_levels = 5;
>> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV57X4;
>> +		kvm_riscv_gstage_max_pgd_levels = 5;
>>  		goto done;
>>  	}
>>
>>  	/* Try Sv48x4 G-stage mode */
>>  	csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
>>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
>> -		kvm_riscv_gstage_mode = HGATP_MODE_SV48X4;
>> -		kvm_riscv_gstage_pgd_levels = 4;
>> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV48X4;
>> +		kvm_riscv_gstage_max_pgd_levels = 4;
>>  		goto done;
>>  	}
>>
>>  	/* Try Sv39x4 G-stage mode */
>>  	csr_write(CSR_HGATP, HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
>>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV39X4) {
>> -		kvm_riscv_gstage_mode = HGATP_MODE_SV39X4;
>> -		kvm_riscv_gstage_pgd_levels = 3;
>> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV39X4;
>> +		kvm_riscv_gstage_max_pgd_levels = 3;
>>  		goto done;
>>  	}
>>  #else /* CONFIG_32BIT */
>>  	/* Try Sv32x4 G-stage mode */
>>  	csr_write(CSR_HGATP, HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
>>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV32X4) {
>> -		kvm_riscv_gstage_mode = HGATP_MODE_SV32X4;
>> -		kvm_riscv_gstage_pgd_levels = 2;
>> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV32X4;
>> +		kvm_riscv_gstage_max_pgd_levels = 2;
>>  		goto done;
>>  	}
>>  #endif
>>
>>  	/* KVM depends on !HGATP_MODE_OFF */
>> -	kvm_riscv_gstage_mode = HGATP_MODE_OFF;
>> -	kvm_riscv_gstage_pgd_levels = 0;
>> +	kvm_riscv_gstage_max_mode = HGATP_MODE_OFF;
>> +	kvm_riscv_gstage_max_pgd_levels = 0;
>>
>>  done:
>>  	csr_write(CSR_HGATP, 0);
>>  	kvm_riscv_local_hfence_gvma_all();
>>  }
>> +
>> +unsigned long kvm_riscv_gstage_gpa_bits(struct kvm_arch *k) {
>
>Did you run checkpatch? I think it requires '{' to be on its own line.
>
>nit: s/k/ka/ would be consistent with other archs, although I see k is
>used in riscv's kvm_riscv_mmu_update_hgatp() but that can be fixed up
>in this patch since there's a change in the same place too.

Thanks for catching that.

Yes, checkpatch complains about the opening brace placement here. I'll fix
the style by moving '{' onto its own line. I'll also rename the argument
from 'k' to 'ka' for consistency (and update the existing usage in
kvm_riscv_mmu_update_hgatp() in the same patch since we're touching it
anyway).

I'll apply the same fixes to kvm_riscv_gstage_gpa_size() as well.

>
>> +	return (HGATP_PAGE_SHIFT + (k->kvm_riscv_gstage_pgd_levels *
>> +		    kvm_riscv_gstage_index_bits) +
>> +		    kvm_riscv_gstage_pgd_xbits);
>> +}
>> +
>> +gpa_t kvm_riscv_gstage_gpa_size(struct kvm_arch *k) {
>
>same comments as above
>
>> +	return ((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits(k)));
>
> return BIT_ULL(kvm_riscv_gstage_gpa_bits(ka))
>
>(the cast is implicit from return type)
>
>> +}
>> diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
>> index 45536af521f0..56a246e0e791 100644
>> --- a/arch/riscv/kvm/main.c
>> +++ b/arch/riscv/kvm/main.c
>> @@ -105,7 +105,7 @@ static int __init riscv_kvm_init(void)
>>  		return rc;
>>
>>  	kvm_riscv_gstage_mode_detect();
>> -	switch (kvm_riscv_gstage_mode) {
>> +	switch (kvm_riscv_gstage_max_mode) {
>>  	case HGATP_MODE_SV32X4:
>>  		str = "Sv32x4";
>>  		break;
>> @@ -164,7 +164,7 @@ static int __init riscv_kvm_init(void)
>>  			 (rc) ? slist : "no features");
>>  	}
>>
>> -	kvm_info("using %s G-stage page table format\n", str);
>> +	kvm_info("Max G-stage page table format %s \n", str);
>>
>>  	kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits());
>>
>> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
>> index 4ab06697bfc0..574783907162 100644
>> --- a/arch/riscv/kvm/mmu.c
>> +++ b/arch/riscv/kvm/mmu.c
>> @@ -67,7 +67,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
>>  		if (!writable)
>>  			map.pte = pte_wrprotect(map.pte);
>>
>> -		ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels);
>> +		ret = kvm_mmu_topup_memory_cache(&pcache,kvm->arch.kvm_riscv_gstage_pgd_levels);
>                                                         ^ missing space
>
>>  		if (ret)
>>  			goto out;
>>
>> @@ -186,8 +186,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>  	 * space addressable by the KVM guest GPA space.
>>  	 */
>>  	if ((new->base_gfn + new->npages) >=
>> -	    (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT))
>> +			(kvm_riscv_gstage_gpa_size(&kvm->arch) >> PAGE_SHIFT)) {
>>  		return -EFAULT;
>> +	}
>
>nit: Remove the unnecessary () and the '{' and the condition will fit on
>one 100 char line.

Ack.

>>
>>  	hva = new->userspace_addr;
>>  	size = new->npages << PAGE_SHIFT;
>> @@ -332,7 +333,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
>>  	memset(out_map, 0, sizeof(*out_map));
>>
>>  	/* We need minimum second+third level pages */
>> -	ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels);
>> +	ret = kvm_mmu_topup_memory_cache(pcache, kvm->arch.kvm_riscv_gstage_pgd_levels);
>>  	if (ret) {
>>  		kvm_err("Failed to topup G-stage cache\n");
>>  		return ret;
>> @@ -431,6 +432,11 @@ int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm)
>>  		return -ENOMEM;
>>  	kvm->arch.pgd = page_to_virt(pgd_page);
>>  	kvm->arch.pgd_phys = page_to_phys(pgd_page);
>> +	if (!kvm->arch.gstage_mode_initialized) {
>> +		/*user-space didn't set KVM_CAP_RISC_HGATP_MODE cap*/
>                  ^ missing space                                  ^ missing space
>> +		kvm->arch.kvm_riscv_gstage_mode = kvm_riscv_gstage_max_mode;
>> +		kvm->arch.kvm_riscv_gstage_pgd_levels = kvm_riscv_gstage_max_pgd_levels;
>
>Missing 'kvm->arch.gstage_mode_initialized = true' statement.

The 'kvm->arch.gstage_mode_initialized = true' assignment is done in the
following commit of this series (patch 2/2).

>> +	}
>>
>>  	return 0;
>>  }
>> @@ -446,10 +452,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>>  		gstage.flags = 0;
>>  		gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
>>  		gstage.pgd = kvm->arch.pgd;
>> -		kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false);
>> +		kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size(&kvm->arch), false);
>>  		pgd = READ_ONCE(kvm->arch.pgd);
>>  		kvm->arch.pgd = NULL;
>>  		kvm->arch.pgd_phys = 0;
>> +		kvm->arch.kvm_riscv_gstage_mode = HGATP_MODE_OFF;
>> +		kvm->arch.kvm_riscv_gstage_pgd_levels = 0;
>>  	}
>>  	spin_unlock(&kvm->mmu_lock);
>>
>> @@ -459,8 +467,8 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>>
>>  void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu)
>>  {
>> -	unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
>>  	struct kvm_arch *k = &vcpu->kvm->arch;
>> +	unsigned long hgatp = k->kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
>>
>>  	hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
>>  	hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;
>> diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
>> index 66d91ae6e9b2..4b2156df40fc 100644
>> --- a/arch/riscv/kvm/vm.c
>> +++ b/arch/riscv/kvm/vm.c
>> @@ -200,7 +200,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>>  		r = KVM_USER_MEM_SLOTS;
>>  		break;
>>  	case KVM_CAP_VM_GPA_BITS:
>> -		r = kvm_riscv_gstage_gpa_bits;
>> +		r = kvm_riscv_gstage_gpa_bits(&kvm->arch);
>>  		break;
>>  	default:
>>  		r = 0;
>> diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
>> index cf34d448289d..db27430f111e 100644
>> --- a/arch/riscv/kvm/vmid.c
>> +++ b/arch/riscv/kvm/vmid.c
>> @@ -26,7 +26,7 @@ static DEFINE_SPINLOCK(vmid_lock);
>>  void __init kvm_riscv_gstage_vmid_detect(void)
>>  {
>>  	/* Figure-out number of VMID bits in HW */
>> -	csr_write(CSR_HGATP, (kvm_riscv_gstage_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
>> +	csr_write(CSR_HGATP, (kvm_riscv_gstage_max_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
>>  	vmid_bits = csr_read(CSR_HGATP);
>>  	vmid_bits = (vmid_bits & HGATP_VMID) >> HGATP_VMID_SHIFT;
>>  	vmid_bits = fls_long(vmid_bits);
>> --
>> 2.50.1
>>
>
>Thanks,
>drew
>
Thanks,
Fangyu

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: Re: [PATCH v2] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE
  2026-01-15 23:56   ` Andrew Jones
@ 2026-01-16 14:29     ` fangyu.yu
  0 siblings, 0 replies; 11+ messages in thread
From: fangyu.yu @ 2026-01-16 14:29 UTC (permalink / raw)
  To: andrew.jones
  Cc: ajones, alex, anup, aou, atish.patra, corbet, fangyu.yu, guoren,
	kvm-riscv, kvm, linux-doc, linux-kernel, linux-riscv, palmer,
	pbonzini, pjw, rkrcmar

>> From: Fangyu Yu <fangyu.yu@linux.alibaba.com>
>>
>> This capability allows userspace to explicitly select the HGATP mode
>> for the VM. The selected mode must be less than or equal to the max
>> HGATP mode supported by the hardware. This capability must be enabled
>> before creating any vCPUs, and can only be set once per VM.
>>
>> Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
>> ---
>>  Documentation/virt/kvm/api.rst | 14 ++++++++++++++
>>  arch/riscv/kvm/vm.c            | 26 ++++++++++++++++++++++++--
>>  include/uapi/linux/kvm.h       |  1 +
>>  3 files changed, 39 insertions(+), 2 deletions(-)
>>
>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>> index 01a3abef8abb..9e17788e3a9d 100644
>> --- a/Documentation/virt/kvm/api.rst
>> +++ b/Documentation/virt/kvm/api.rst
>> @@ -8765,6 +8765,20 @@ helpful if user space wants to emulate instructions which are not
>>  This capability can be enabled dynamically even if VCPUs were already
>>  created and are running.
>>
>> +7.47 KVM_CAP_RISCV_SET_HGATP_MODE
>> +---------------------------------
>> +
>> +:Architectures: riscv
>> +:Type: VM
>> +:Parameters: args[0] contains the requested HGATP mode
>> +:Returns: 0 on success, -EINVAL if arg[0] is outside the range of hgatp
>> +          modes supported by the hardware.
>> +
>> +This capability allows userspace to explicitly select the HGATP mode for
>> +the VM. The selected mode must be less than or equal to the maximum HGATP
>> +mode supported by the hardware. This capability must be enabled before
>> +creating any vCPUs, and can only be set once per VM.
>
>I think I would prefer a KVM_CAP_RISCV_SET_MAX_GPA type of capability. The
>reason is because, while one of the results of the max-gpa being set will
>be to set hgatp, there may be other reasons to track the guest's maximum
>physical address too and kvm userspace shouldn't need to think about each
>individually.

That makes sense, thanks.

>> +
>>  8. Other capabilities.
>>  ======================
>>
>> diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
>> index 4b2156df40fc..e9275023a73a 100644
>> --- a/arch/riscv/kvm/vm.c
>> +++ b/arch/riscv/kvm/vm.c
>> @@ -202,6 +202,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>>  	case KVM_CAP_VM_GPA_BITS:
>>  		r = kvm_riscv_gstage_gpa_bits(&kvm->arch);
>>  		break;
>> +	case KVM_CAP_RISCV_SET_HGATP_MODE:
>> +#ifdef CONFIG_64BIT
>> +		r = 1;
>> +#else/* CONFIG_32BIT */
>> +		r = 0;
>> +#endif
>
> r = IS_ENABLED(CONFIG_64BIT) ? 1 : 0;

Ack.

>> +		break;
>>  	default:
>>  		r = 0;
>>  		break;
>> @@ -212,12 +219,27 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>>
>>  int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
>>  {
>> +	if (cap->flags)
>> +		return -EINVAL;
>
>add blank line

Ack, will add a blank line after the flags check.

>
>>  	switch (cap->cap) {
>>  	case KVM_CAP_RISCV_MP_STATE_RESET:
>> -		if (cap->flags)
>> -			return -EINVAL;
>>  		kvm->arch.mp_state_reset = true;
>>  		return 0;
>> +	case KVM_CAP_RISCV_SET_HGATP_MODE:
>> +#ifdef CONFIG_64BIT
>> +		if (cap->args[0] < HGATP_MODE_SV39X4 ||
>> +			cap->args[0] > kvm_riscv_gstage_max_mode)
>> +			return -EINVAL;
>> +		if (kvm->arch.gstage_mode_initialized)
>> +			return 0;
>
>I think we want to return -EBUSY here and it should be documented where it
>already states "...can only be set once per VM"

Agreed.

>> +		kvm->arch.gstage_mode_initialized = true;
>
>In the previous patch I thought we were missing this, but I see now it
>means "user initialized". Let's rename it as such,
>
> gstage_mode_user_initialized

Agreed.

>> +		kvm->arch.kvm_riscv_gstage_mode = cap->args[0];
>> +		kvm->arch.kvm_riscv_gstage_pgd_levels = 3 +
>> +		    kvm->arch.kvm_riscv_gstage_mode - HGATP_MODE_SV39X4;
>> +		kvm_info("using SV%lluX4 G-stage page table format\n",
>> +			39 + (cap->args[0] - HGATP_MODE_SV39X4) * 9);
>> +#endif
>> +		return 0;
>>  	default:
>>  		return -EINVAL;
>>  	}
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index dddb781b0507..00c02a880518 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -974,6 +974,7 @@ struct kvm_enable_cap {
>>  #define KVM_CAP_GUEST_MEMFD_FLAGS 244
>>  #define KVM_CAP_ARM_SEA_TO_USER 245
>>  #define KVM_CAP_S390_USER_OPEREXEC 246
>> +#define KVM_CAP_RISCV_SET_HGATP_MODE 247
>>
>>  struct kvm_irq_routing_irqchip {
>>  	__u32 irqchip;
>> --
>> 2.50.1
>>
>
>Thanks,
>drew
>
Thanks,
Fangyu

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE
  2026-01-05 14:32 ` [PATCH v2] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE fangyu.yu
  2026-01-15 23:56   ` Andrew Jones
@ 2026-01-16 19:03   ` Andrew Jones
  2026-01-20 14:22     ` fangyu.yu
  2026-01-19 13:56   ` Radim Krčmář
  2 siblings, 1 reply; 11+ messages in thread
From: Andrew Jones @ 2026-01-16 19:03 UTC (permalink / raw)
  To: fangyu.yu
  Cc: pbonzini, corbet, anup, atish.patra, pjw, palmer, aou, alex,
	guoren, ajones, rkrcmar, linux-doc, kvm, kvm-riscv, linux-riscv,
	linux-kernel

On Mon, Jan 05, 2026 at 10:32:32PM +0800, fangyu.yu@linux.alibaba.com wrote:
...
> +	case KVM_CAP_RISCV_SET_HGATP_MODE:
> +#ifdef CONFIG_64BIT
> +		if (cap->args[0] < HGATP_MODE_SV39X4 ||
> +			cap->args[0] > kvm_riscv_gstage_max_mode)
> +			return -EINVAL;
> +		if (kvm->arch.gstage_mode_initialized)
> +			return 0;
> +		kvm->arch.gstage_mode_initialized = true;
> +		kvm->arch.kvm_riscv_gstage_mode = cap->args[0];
> +		kvm->arch.kvm_riscv_gstage_pgd_levels = 3 +
> +		    kvm->arch.kvm_riscv_gstage_mode - HGATP_MODE_SV39X4;
> +		kvm_info("using SV%lluX4 G-stage page table format\n",
> +			39 + (cap->args[0] - HGATP_MODE_SV39X4) * 9);

I don't think we want this kvm_info line, particularly if it doesn't also
include a VM ID in some form to allow readers to know which VM is using
the selected format. Let's either drop it or change it to kvm_debug and
include a VM ID.

Thanks,
drew

> +#endif
> +		return 0;
>  	default:
>  		return -EINVAL;
>  	}
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index dddb781b0507..00c02a880518 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -974,6 +974,7 @@ struct kvm_enable_cap {
>  #define KVM_CAP_GUEST_MEMFD_FLAGS 244
>  #define KVM_CAP_ARM_SEA_TO_USER 245
>  #define KVM_CAP_S390_USER_OPEREXEC 246
> +#define KVM_CAP_RISCV_SET_HGATP_MODE 247
>  
>  struct kvm_irq_routing_irqchip {
>  	__u32 irqchip;
> -- 
> 2.50.1
> 
> 
> -- 
> kvm-riscv mailing list
> kvm-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kvm-riscv

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE
  2026-01-05 14:32 ` [PATCH v2] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE fangyu.yu
  2026-01-15 23:56   ` Andrew Jones
  2026-01-16 19:03   ` Andrew Jones
@ 2026-01-19 13:56   ` Radim Krčmář
  2026-01-20 14:22     ` fangyu.yu
  2 siblings, 1 reply; 11+ messages in thread
From: Radim Krčmář @ 2026-01-19 13:56 UTC (permalink / raw)
  To: fangyu.yu, pbonzini, corbet, anup, atish.patra, pjw, palmer, aou,
	alex
  Cc: guoren, ajones, rkrcmar, linux-doc, kvm, kvm-riscv, linux-riscv,
	linux-kernel

2026-01-05T22:32:32+08:00, <fangyu.yu@linux.alibaba.com>:
> From: Fangyu Yu <fangyu.yu@linux.alibaba.com>
>
> This capability allows userspace to explicitly select the HGATP mode
> for the VM. The selected mode must be less than or equal to the max
> HGATP mode supported by the hardware. This capability must be enabled
> before creating any vCPUs, and can only be set once per VM.
>
> Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
> ---
> diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
> @@ -212,12 +219,27 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  
>  int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
>  {
> +	if (cap->flags)
> +		return -EINVAL;
>  	switch (cap->cap) {
> +	case KVM_CAP_RISCV_SET_HGATP_MODE:
> +#ifdef CONFIG_64BIT
> +		if (cap->args[0] < HGATP_MODE_SV39X4 ||
> +			cap->args[0] > kvm_riscv_gstage_max_mode)
> +			return -EINVAL;
> +		if (kvm->arch.gstage_mode_initialized)
> +			return 0;

"must be enabled before creating any vCPUs" check is missing.

> +		kvm->arch.gstage_mode_initialized = true;
> +		kvm->arch.kvm_riscv_gstage_mode = cap->args[0];
> +		kvm->arch.kvm_riscv_gstage_pgd_levels = 3 +
> +		    kvm->arch.kvm_riscv_gstage_mode - HGATP_MODE_SV39X4;

Even before creating VCPUs, I don't see enough protections to make this
work.

Userspace can only provide a hint about the physical address space size
before any other KVM code could have acted on the information.
It would be a serious issue if some code operated on hgatp as if it
were X and other code as if it were Y.

The simplest solution would be to ensure that the CAP_SET VM ioctl can
only be executed before any other IOCTL, but a change in generic code to
achieve it would be frowned upon...
I would recommend looking at kvm_are_all_memslots_empty() first, as it's
quite likely that it could be sufficient for the purposes of changing
hgatp.

Thanks.

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: Re: [PATCH v2] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE
  2026-01-16 19:03   ` Andrew Jones
@ 2026-01-20 14:22     ` fangyu.yu
  0 siblings, 0 replies; 11+ messages in thread
From: fangyu.yu @ 2026-01-20 14:22 UTC (permalink / raw)
  To: andrew.jones
  Cc: ajones, alex, anup, aou, atish.patra, corbet, fangyu.yu, guoren,
	kvm-riscv, kvm, linux-doc, linux-kernel, linux-riscv, palmer,
	pbonzini, pjw, rkrcmar

>...
>> +	case KVM_CAP_RISCV_SET_HGATP_MODE:
>> +#ifdef CONFIG_64BIT
>> +		if (cap->args[0] < HGATP_MODE_SV39X4 ||
>> +			cap->args[0] > kvm_riscv_gstage_max_mode)
>> +			return -EINVAL;
>> +		if (kvm->arch.gstage_mode_initialized)
>> +			return 0;
>> +		kvm->arch.gstage_mode_initialized = true;
>> +		kvm->arch.kvm_riscv_gstage_mode = cap->args[0];
>> +		kvm->arch.kvm_riscv_gstage_pgd_levels = 3 +
>> +		    kvm->arch.kvm_riscv_gstage_mode - HGATP_MODE_SV39X4;
>> +		kvm_info("using SV%lluX4 G-stage page table format\n",
>> +			39 + (cap->args[0] - HGATP_MODE_SV39X4) * 9);
>
>I don't think we want this kvm_info line, particularly if it doesn't also
>include a VM ID in some form to allow readers to know which VM is using
>the selected format. Let's either drop it or change it to kvm_debug and
>include a VM ID.

Agreed, I will switch it to kvm_debug() and include a VM ID.

>
>Thanks,
>drew
>
>> +#endif
>> +		return 0;
>>  	default:
>>  		return -EINVAL;
>>  	}
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index dddb781b0507..00c02a880518 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -974,6 +974,7 @@ struct kvm_enable_cap {
>>  #define KVM_CAP_GUEST_MEMFD_FLAGS 244
>>  #define KVM_CAP_ARM_SEA_TO_USER 245
>>  #define KVM_CAP_S390_USER_OPEREXEC 246
>> +#define KVM_CAP_RISCV_SET_HGATP_MODE 247
>>  
>>  struct kvm_irq_routing_irqchip {
>>  	__u32 irqchip;
>> -- 
>> 2.50.1
>
>

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: Re: [PATCH v2] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE
  2026-01-19 13:56   ` Radim Krčmář
@ 2026-01-20 14:22     ` fangyu.yu
  0 siblings, 0 replies; 11+ messages in thread
From: fangyu.yu @ 2026-01-20 14:22 UTC (permalink / raw)
  To: radim.krcmar
  Cc: ajones, alex, anup, aou, atish.patra, corbet, fangyu.yu, guoren,
	kvm-riscv, kvm, linux-doc, linux-kernel, linux-riscv, palmer,
	pbonzini, pjw, rkrcmar

>> From: Fangyu Yu <fangyu.yu@linux.alibaba.com>
>>
>> This capability allows userspace to explicitly select the HGATP mode
>> for the VM. The selected mode must be less than or equal to the max
>> HGATP mode supported by the hardware. This capability must be enabled
>> before creating any vCPUs, and can only be set once per VM.
>>
>> Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
>> ---
>> diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
>> @@ -212,12 +219,27 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>>  
>>  int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
>>  {
>> +	if (cap->flags)
>> +		return -EINVAL;
>>  	switch (cap->cap) {
>> +	case KVM_CAP_RISCV_SET_HGATP_MODE:
>> +#ifdef CONFIG_64BIT
>> +		if (cap->args[0] < HGATP_MODE_SV39X4 ||
>> +			cap->args[0] > kvm_riscv_gstage_max_mode)
>> +			return -EINVAL;
>> +		if (kvm->arch.gstage_mode_initialized)
>> +			return 0;
>
>"must be enabled before creating any vCPUs" check is missing.

Agreed, I'll add the missing "must be enabled before creating any vCPUs" check by
rejecting the capability once kvm->created_vcpus is non-zero.

>
>> +		kvm->arch.gstage_mode_initialized = true;
>> +		kvm->arch.kvm_riscv_gstage_mode = cap->args[0];
>> +		kvm->arch.kvm_riscv_gstage_pgd_levels = 3 +
>> +		    kvm->arch.kvm_riscv_gstage_mode - HGATP_MODE_SV39X4;
>
>Even before creating VCPUs, I don't see enough protections to make this
>work.
>
>Userspace can only provide a hint about the physical address space size
>before any other KVM code could have acted on the information.
>It would be a serious issue if some code would operate on hgatp as if it
>were X and others as Y.
>
>The simplest solution would be to ensure that the CAP_SET VM ioctl can
>only be executed before any other IOCTL, but a change in generic code to
>achieve it would be frowned upon...
>I would recommend looking at kvm_are_all_memslots_empty() first, as it's
>quite likely that it could be sufficient for the purposes of changing
>hgatp.

Using kvm_are_all_memslots_empty() is a good idea; I will add a check
based on it in the next version.

>
>Thanks.

Thanks,
Fangyu

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2026-01-20 14:23 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-05 14:32 [PATCH v2] Support runtime configuration for per-VM's HGATP mode fangyu.yu
2026-01-05 14:32 ` [PATCH v2] RISC-V: KVM: " fangyu.yu
2026-01-15 23:37   ` Andrew Jones
2026-01-16 14:29     ` fangyu.yu
2026-01-05 14:32 ` [PATCH v2] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE fangyu.yu
2026-01-15 23:56   ` Andrew Jones
2026-01-16 14:29     ` fangyu.yu
2026-01-16 19:03   ` Andrew Jones
2026-01-20 14:22     ` fangyu.yu
2026-01-19 13:56   ` Radim Krčmář
2026-01-20 14:22     ` fangyu.yu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox