From: Jack Thomson <jackabt.amazon@gmail.com>
To: maz@kernel.org, oliver.upton@linux.dev, pbonzini@redhat.com
Cc: joey.gouly@arm.com, suzuki.poulose@arm.com, yuzenghui@huawei.com,
catalin.marinas@arm.com, will@kernel.org, shuah@kernel.org,
linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org,
isaku.yamahata@intel.com, xmarcalx@amazon.co.uk,
kalyazin@amazon.co.uk, jackabt@amazon.com
Subject: [PATCH v4 1/3] KVM: arm64: Add pre_fault_memory implementation
Date: Tue, 13 Jan 2026 15:26:40 +0000 [thread overview]
Message-ID: <20260113152643.18858-2-jackabt.amazon@gmail.com> (raw)
In-Reply-To: <20260113152643.18858-1-jackabt.amazon@gmail.com>
From: Jack Thomson <jackabt@amazon.com>
Add kvm_arch_vcpu_pre_fault_memory() for arm64. The implementation hands
off the stage-2 faulting logic to either gmem_abort() or
user_mem_abort().
Add an optional page_size output parameter to user_mem_abort() to
return the VMA page size, which is needed when pre-faulting.
Update the documentation to clarify the x86-specific behaviour.
Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
Documentation/virt/kvm/api.rst | 3 +-
arch/arm64/kvm/Kconfig | 1 +
arch/arm64/kvm/arm.c | 1 +
arch/arm64/kvm/mmu.c | 79 ++++++++++++++++++++++++++++++++--
4 files changed, 79 insertions(+), 5 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 01a3abef8abb..44cfd9e736bb 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6493,7 +6493,8 @@ Errors:
KVM_PRE_FAULT_MEMORY populates KVM's stage-2 page tables used to map memory
for the current vCPU state. KVM maps memory as if the vCPU generated a
stage-2 read page fault, e.g. faults in memory as needed, but doesn't break
-CoW. However, KVM does not mark any newly created stage-2 PTE as Accessed.
+CoW. However, on x86, KVM does not mark any newly created stage-2 PTE as
+Accessed.
In the case of confidential VM types where there is an initial set up of
private guest memory before the guest is 'finalized'/measured, this ioctl
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 4f803fd1c99a..6872aaabe16c 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -25,6 +25,7 @@ menuconfig KVM
select HAVE_KVM_CPU_RELAX_INTERCEPT
select KVM_MMIO
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ select KVM_GENERIC_PRE_FAULT_MEMORY
select VIRT_XFER_TO_GUEST_WORK
select KVM_VFIO
select HAVE_KVM_DIRTY_RING_ACQ_REL
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 4f80da0c0d1d..19bac68f737f 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -332,6 +332,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_COUNTER_OFFSET:
case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
case KVM_CAP_ARM_SEA_TO_USER:
+ case KVM_CAP_PRE_FAULT_MEMORY:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 48d7c372a4cd..499b131f794e 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1642,8 +1642,8 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_s2_trans *nested,
- struct kvm_memory_slot *memslot, unsigned long hva,
- bool fault_is_perm)
+ struct kvm_memory_slot *memslot, unsigned long *page_size,
+ unsigned long hva, bool fault_is_perm)
{
int ret = 0;
bool topup_memcache;
@@ -1923,6 +1923,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
kvm_release_faultin_page(kvm, page, !!ret, writable);
kvm_fault_unlock(kvm);
+ if (page_size)
+ *page_size = vma_pagesize;
+
/* Mark the page dirty only if the fault is handled successfully */
if (writable && !ret)
mark_page_dirty_in_slot(kvm, memslot, gfn);
@@ -2196,8 +2199,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
esr_fsc_is_permission_fault(esr));
else
- ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
- esr_fsc_is_permission_fault(esr));
+ ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, NULL,
+ hva, esr_fsc_is_permission_fault(esr));
if (ret == 0)
ret = 1;
out:
@@ -2573,3 +2576,71 @@ void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}
+
+/*
+ * kvm_arch_vcpu_pre_fault_memory() - arm64 backend for KVM_PRE_FAULT_MEMORY.
+ * @vcpu:  vCPU whose current stage-2 MMU (vcpu->arch.hw_mmu) is populated.
+ * @range: userspace request; only ->gpa and ->size are read here.
+ *
+ * Synthesises a last-level data-abort translation fault at @range->gpa and
+ * hands it to gmem_abort() or user_mem_abort(), exactly one mapping per call.
+ *
+ * Return: the number of bytes of @range covered by the mapping that was
+ * handled (never more than @range->size), or a negative errno:
+ *   -EOPNOTSUPP  the vCPU is protected,
+ *   -ENOENT      the IPA is beyond the stage-2 input range or has no memslot,
+ *   -EFAULT      the nested stage-2 walk failed or the HVA lookup failed,
+ *   or whatever negative value the abort handler returned.
+ *
+ * NOTE(review): a return of 0 from the abort handlers is treated as "mapping
+ * installed". Confirm that they cannot return 0 for a swallowed retry,
+ * otherwise progress is reported that was not made.
+ */
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+				    struct kvm_pre_fault_memory *range)
+{
+	struct kvm_vcpu_fault_info *fault_info = &vcpu->arch.fault;
+	struct kvm_s2_trans nested_trans, *nested = NULL;
+	unsigned long page_size = PAGE_SIZE;
+	struct kvm_memory_slot *memslot;
+	/* Address as seen by the stage-2 being populated (input of hw_mmu). */
+	phys_addr_t fault_ipa = range->gpa;
+	/* Address used to find the backing memory; differs only when nested. */
+	phys_addr_t ipa = fault_ipa;
+	phys_addr_t end;
+	hva_t hva;
+	gfn_t gfn;
+	int ret;
+
+	if (vcpu_is_protected(vcpu))
+		return -EOPNOTSUPP;
+
+	/*
+	 * The faulting address is an input to hw_mmu, so it is the one that
+	 * must fit in hw_mmu's input range. Check it before any translation.
+	 */
+	if (fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu))
+		return -ENOENT;
+
+	/*
+	 * We may prefault on a shadow stage 2 page table if we are
+	 * running a nested guest. In this case, we have to resolve the L2
+	 * IPA to the L1 IPA first, before knowing what kind of memory should
+	 * back the L1 IPA.
+	 *
+	 * If the shadow stage 2 page table walk faults, then we return
+	 * -EFAULT.
+	 */
+	if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
+	    vcpu->arch.hw_mmu->nested_stage2_enabled) {
+		ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
+		if (ret)
+			return -EFAULT;
+
+		ipa = kvm_s2_trans_output(&nested_trans);
+		nested = &nested_trans;
+	}
+
+	/*
+	 * Generate a synthetic abort for the pre-fault address. HPFAR_EL2
+	 * carries the faulting (untranslated) IPA, not the nested walk output.
+	 */
+	fault_info->esr_el2 = (ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT) |
+			      ESR_ELx_FSC_FAULT_L(KVM_PGTABLE_LAST_LEVEL);
+	fault_info->hpfar_el2 = HPFAR_EL2_NS |
+				FIELD_PREP(HPFAR_EL2_FIPA, fault_ipa >> 12);
+
+	/* The memslot lookup uses the (possibly translated) backing address. */
+	gfn = gpa_to_gfn(ipa);
+	memslot = gfn_to_memslot(vcpu->kvm, gfn);
+	if (!memslot)
+		return -ENOENT;
+
+	/*
+	 * Mirror kvm_handle_guest_abort(): the handlers are given the
+	 * faulting IPA together with the nested translation, rather than the
+	 * already-translated address. (Presumably they derive the backing
+	 * address from @nested themselves; verify against their bodies.)
+	 */
+	if (kvm_slot_has_gmem(memslot)) {
+		/* gmem currently only supports PAGE_SIZE mappings */
+		ret = gmem_abort(vcpu, fault_ipa, nested, memslot, false);
+	} else {
+		hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL);
+		if (kvm_is_error_hva(hva))
+			return -EFAULT;
+
+		ret = user_mem_abort(vcpu, fault_ipa, nested, memslot,
+				     &page_size, hva, false);
+	}
+
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Report progress up to the end of the block containing the
+	 * requested address, clamped to what userspace asked for.
+	 */
+	end = ALIGN_DOWN(range->gpa, page_size) + page_size;
+	return min(range->size, end - range->gpa);
+}
--
2.43.0
next prev parent reply other threads:[~2026-01-13 15:27 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-13 15:26 [PATCH v4 0/3] KVM ARM64 pre_fault_memory Jack Thomson
2026-01-13 15:26 ` Jack Thomson [this message]
2026-01-15 9:51 ` [PATCH v4 1/3] KVM: arm64: Add pre_fault_memory implementation Marc Zyngier
2026-01-16 14:33 ` Thomson, Jack
2026-01-18 10:29 ` Marc Zyngier
2026-01-19 11:10 ` Thomson, Jack
2026-01-13 15:26 ` [PATCH v4 2/3] KVM: selftests: Enable pre_fault_memory_test for arm64 Jack Thomson
2026-01-13 15:26 ` [PATCH v4 3/3] KVM: selftests: Add option for different backing in pre-fault tests Jack Thomson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260113152643.18858-2-jackabt.amazon@gmail.com \
--to=jackabt.amazon@gmail.com \
--cc=catalin.marinas@arm.com \
--cc=isaku.yamahata@intel.com \
--cc=jackabt@amazon.com \
--cc=joey.gouly@arm.com \
--cc=kalyazin@amazon.co.uk \
--cc=kvmarm@lists.linux.dev \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=maz@kernel.org \
--cc=oliver.upton@linux.dev \
--cc=pbonzini@redhat.com \
--cc=shuah@kernel.org \
--cc=suzuki.poulose@arm.com \
--cc=will@kernel.org \
--cc=xmarcalx@amazon.co.uk \
--cc=yuzenghui@huawei.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.