* [PATCH v3 1/3] KVM: arm64: skip pKVM cache flushes for non cacheable mappings
2026-06-24 16:00 [PATCH v3 0/3] KVM: arm64: fix pKVM mapping cache corner cases Bradley Morgan
@ 2026-06-24 16:00 ` Bradley Morgan
2026-06-24 16:00 ` [PATCH v3 2/3] KVM: arm64: top up pKVM mapping cache for permission faults Bradley Morgan
2026-06-24 16:00 ` [PATCH v3 3/3] KVM: arm64: top up stage 2 memcache for dirty logging faults Bradley Morgan
2 siblings, 0 replies; 7+ messages in thread
From: Bradley Morgan @ 2026-06-24 16:00 UTC (permalink / raw)
To: Marc Zyngier, Oliver Upton
Cc: Fuad Tabba, Joey Gouly, Steffen Eiden, Suzuki K Poulose,
Zenghui Yu, Catalin Marinas, Will Deacon, Quentin Perret,
Vincent Donnefort, Gavin Shan, Alexandru Elisei, linux-arm-kernel,
kvmarm, linux-kernel, Bradley Morgan
pKVM keeps its own mapping list for stage 2 operations. Its flush path
uses that list directly, so it lost the PTE attribute check done by the
generic stage 2 walker.
Record whether a mapping is cacheable and skip cache maintenance for
mappings that are not cacheable.
Fixes: e912efed485a ("KVM: arm64: Introduce the EL1 pKVM MMU")
Signed-off-by: Bradley Morgan <include@grrlz.net>
---
arch/arm64/kvm/pkvm.c | 51 ++++++++++++++++++++++++++++++++++---------
1 file changed, 41 insertions(+), 10 deletions(-)
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 428723b1b0f5..ca6e823028c2 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -302,9 +302,32 @@ static u64 __pkvm_mapping_start(struct pkvm_mapping *m)
return m->gfn * PAGE_SIZE;
}
+#define PKVM_MAPPING_NR_PAGES_MASK GENMASK_ULL(47, 0)
+#define PKVM_MAPPING_CACHEABLE BIT_ULL(48)
+
+static u64 pkvm_mapping_nr_pages(struct pkvm_mapping *m)
+{
+ return m->nr_pages & PKVM_MAPPING_NR_PAGES_MASK;
+}
+
+static bool pkvm_mapping_is_cacheable(struct pkvm_mapping *m)
+{
+ return m->nr_pages & PKVM_MAPPING_CACHEABLE;
+}
+
+static void pkvm_mapping_set_nr_pages(struct pkvm_mapping *m, u64 nr_pages,
+ bool cacheable)
+{
+ WARN_ON_ONCE(nr_pages & ~PKVM_MAPPING_NR_PAGES_MASK);
+
+ m->nr_pages = nr_pages & PKVM_MAPPING_NR_PAGES_MASK;
+ if (cacheable)
+ m->nr_pages |= PKVM_MAPPING_CACHEABLE;
+}
+
static u64 __pkvm_mapping_end(struct pkvm_mapping *m)
{
- return (m->gfn + m->nr_pages) * PAGE_SIZE - 1;
+ return (m->gfn + pkvm_mapping_nr_pages(m)) * PAGE_SIZE - 1;
}
INTERVAL_TREE_DEFINE(struct pkvm_mapping, node, u64, __subtree_last,
@@ -350,7 +373,7 @@ static int __pkvm_pgtable_stage2_reclaim(struct kvm_pgtable *pgt, u64 start, u64
continue;
page = pfn_to_page(mapping->pfn);
- WARN_ON_ONCE(mapping->nr_pages != 1);
+ WARN_ON_ONCE(pkvm_mapping_nr_pages(mapping) != 1);
unpin_user_pages_dirty_lock(&page, 1, true);
account_locked_vm(kvm->mm, 1, false);
pkvm_mapping_remove(mapping, &pgt->pkvm_mappings);
@@ -369,7 +392,7 @@ static int __pkvm_pgtable_stage2_unshare(struct kvm_pgtable *pgt, u64 start, u64
for_each_mapping_in_range_safe(pgt, start, end, mapping) {
ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn,
- mapping->nr_pages);
+ pkvm_mapping_nr_pages(mapping));
if (WARN_ON(ret))
return ret;
pkvm_mapping_remove(mapping, &pgt->pkvm_mappings);
@@ -448,7 +471,7 @@ int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
* permission faults are handled in the relax_perms() path.
*/
if (mapping) {
- if (size == (mapping->nr_pages * PAGE_SIZE))
+ if (size == (pkvm_mapping_nr_pages(mapping) * PAGE_SIZE))
return -EAGAIN;
/*
@@ -472,7 +495,9 @@ int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
swap(mapping, cache->mapping);
mapping->gfn = gfn;
mapping->pfn = pfn;
- mapping->nr_pages = size / PAGE_SIZE;
+ pkvm_mapping_set_nr_pages(mapping, size / PAGE_SIZE,
+ !(prot & (KVM_PGTABLE_PROT_DEVICE |
+ KVM_PGTABLE_PROT_NORMAL_NC)));
pkvm_mapping_insert(mapping, &pgt->pkvm_mappings);
return ret;
@@ -503,7 +528,7 @@ int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
lockdep_assert_held(&kvm->mmu_lock);
for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) {
ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn,
- mapping->nr_pages);
+ pkvm_mapping_nr_pages(mapping));
if (WARN_ON(ret))
break;
}
@@ -517,9 +542,13 @@ int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
struct pkvm_mapping *mapping;
lockdep_assert_held(&kvm->mmu_lock);
- for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
+ for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) {
+ if (!pkvm_mapping_is_cacheable(mapping))
+ continue;
+
__clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn),
- PAGE_SIZE * mapping->nr_pages);
+ PAGE_SIZE * pkvm_mapping_nr_pages(mapping));
+ }
return 0;
}
@@ -536,8 +565,10 @@ bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64
lockdep_assert_held(&kvm->mmu_lock);
for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
- young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn,
- mapping->nr_pages, mkold);
+ young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest,
+ handle, mapping->gfn,
+ pkvm_mapping_nr_pages(mapping),
+ mkold);
return young;
}
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH v3 2/3] KVM: arm64: top up pKVM mapping cache for permission faults
2026-06-24 16:00 [PATCH v3 0/3] KVM: arm64: fix pKVM mapping cache corner cases Bradley Morgan
2026-06-24 16:00 ` [PATCH v3 1/3] KVM: arm64: skip pKVM cache flushes for non cacheable mappings Bradley Morgan
@ 2026-06-24 16:00 ` Bradley Morgan
2026-06-24 16:00 ` [PATCH v3 3/3] KVM: arm64: top up stage 2 memcache for dirty logging faults Bradley Morgan
2 siblings, 0 replies; 7+ messages in thread
From: Bradley Morgan @ 2026-06-24 16:00 UTC (permalink / raw)
To: Marc Zyngier, Oliver Upton
Cc: Fuad Tabba, Joey Gouly, Steffen Eiden, Suzuki K Poulose,
Zenghui Yu, Catalin Marinas, Will Deacon, Quentin Perret,
Vincent Donnefort, Gavin Shan, Alexandru Elisei, linux-arm-kernel,
kvmarm, linux-kernel, Bradley Morgan, stable
Permission faults normally only relax an existing leaf, so the fault path
does not top up the memcache.
With pKVM, a permission fault can also replace page mappings with a
PMD mapping. That path needs a fresh pkvm_mapping object, and can
dereference a NULL cache->mapping if the cache was not topped up.
Allocate just that object for pKVM permission faults.
The issue was discovered [1] by Sashiko.
Link: https://lore.kernel.org/all/20260623161545.EA08E1F000E9@smtp.kernel.org/ [1]
Fixes: db14091d8f75 ("KVM: arm64: Stage-2 huge mappings for np-guests")
Cc: stable@vger.kernel.org
Signed-off-by: Bradley Morgan <include@grrlz.net>
---
arch/arm64/kvm/mmu.c | 29 ++++++++++++++++++++++-------
1 file changed, 22 insertions(+), 7 deletions(-)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 6c941aaa10c6..3f57f6825a33 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1177,17 +1177,26 @@ void free_hyp_memcache(struct kvm_hyp_memcache *mc)
__free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc);
}
+static int topup_hyp_memcache_mapping(struct kvm_hyp_memcache *mc)
+{
+ if (mc->mapping)
+ return 0;
+
+ mc->mapping = kzalloc_obj(struct pkvm_mapping,
+ GFP_KERNEL_ACCOUNT);
+ return mc->mapping ? 0 : -ENOMEM;
+}
+
int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
{
+ int ret;
+
if (!is_protected_kvm_enabled())
return 0;
- if (!mc->mapping) {
- mc->mapping = kzalloc_obj(struct pkvm_mapping,
- GFP_KERNEL_ACCOUNT);
- if (!mc->mapping)
- return -ENOMEM;
- }
+ ret = topup_hyp_memcache_mapping(mc);
+ if (ret)
+ return ret;
return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
kvm_host_pa, mc);
@@ -2113,7 +2122,9 @@ static int user_mem_abort(const struct kvm_s2_fault_desc *s2fd)
* Permission faults just need to update the existing leaf entry,
* and so normally don't require allocations from the memcache. The
* only exception to this is when dirty logging is enabled at runtime
- * and a write fault needs to collapse a block entry into a table.
+ * and a write fault needs to collapse a block entry into a table. With
+ * pKVM, they may still need a fresh mapping object if the fault turns
+ * page entries into a block entry.
*/
memcache = get_mmu_memcache(s2fd->vcpu);
if (!perm_fault || (memslot_is_logging(s2fd->memslot) &&
@@ -2121,6 +2132,10 @@ static int user_mem_abort(const struct kvm_s2_fault_desc *s2fd)
ret = topup_mmu_memcache(s2fd->vcpu, memcache);
if (ret)
return ret;
+ } else if (is_protected_kvm_enabled()) {
+ ret = topup_hyp_memcache_mapping(memcache);
+ if (ret)
+ return ret;
}
/*
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH v3 3/3] KVM: arm64: top up stage 2 memcache for dirty logging faults
2026-06-24 16:00 [PATCH v3 0/3] KVM: arm64: fix pKVM mapping cache corner cases Bradley Morgan
2026-06-24 16:00 ` [PATCH v3 1/3] KVM: arm64: skip pKVM cache flushes for non cacheable mappings Bradley Morgan
2026-06-24 16:00 ` [PATCH v3 2/3] KVM: arm64: top up pKVM mapping cache for permission faults Bradley Morgan
@ 2026-06-24 16:00 ` Bradley Morgan
2026-06-24 17:39 ` Bradley Morgan
2 siblings, 1 reply; 7+ messages in thread
From: Bradley Morgan @ 2026-06-24 16:00 UTC (permalink / raw)
To: Marc Zyngier, Oliver Upton
Cc: Fuad Tabba, Joey Gouly, Steffen Eiden, Suzuki K Poulose,
Zenghui Yu, Catalin Marinas, Will Deacon, Quentin Perret,
Vincent Donnefort, Gavin Shan, Alexandru Elisei, linux-arm-kernel,
kvmarm, linux-kernel, Bradley Morgan, stable
Dirty logging forces new stage 2 mappings down to page size, but
it does not always remove an existing block mapping before the next
fault. Eager splitting is best effort and is disabled by default.
A permission fault on such a block can still need a page table page
to install the smaller mapping. Top up the memcache for any permission
fault while dirty logging is active, not only for write faults.
The issue was discovered [1] by Sashiko.
Link: https://lore.kernel.org/all/59984F6D-06F2-4302-BDD7-92DF334E8FA0@grrlz.net/T/#t [1]
Fixes: 6f745f1bb5bf ("KVM: arm64: Convert user_mem_abort() to generic page-table API")
Cc: stable@vger.kernel.org
Signed-off-by: Bradley Morgan <include@grrlz.net>
---
arch/arm64/kvm/mmu.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 3f57f6825a33..8911e319e6fa 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -2122,13 +2122,12 @@ static int user_mem_abort(const struct kvm_s2_fault_desc *s2fd)
* Permission faults just need to update the existing leaf entry,
* and so normally don't require allocations from the memcache. The
* only exception to this is when dirty logging is enabled at runtime
- * and a write fault needs to collapse a block entry into a table. With
- * pKVM, they may still need a fresh mapping object if the fault turns
- * page entries into a block entry.
+ * and a fault needs to collapse a block entry into a table. With pKVM,
+ * they may still need a fresh mapping object if the fault turns page
+ * entries into a block entry.
*/
memcache = get_mmu_memcache(s2fd->vcpu);
- if (!perm_fault || (memslot_is_logging(s2fd->memslot) &&
- kvm_is_write_fault(s2fd->vcpu))) {
+ if (!perm_fault || memslot_is_logging(s2fd->memslot)) {
ret = topup_mmu_memcache(s2fd->vcpu, memcache);
if (ret)
return ret;
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread