From: Jack Thomson <jackabt.amazon@gmail.com>
To: maz@kernel.org, oliver.upton@linux.dev, pbonzini@redhat.com
Cc: joey.gouly@arm.com, suzuki.poulose@arm.com, yuzenghui@huawei.com,
catalin.marinas@arm.com, will@kernel.org, shuah@kernel.org,
linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org,
isaku.yamahata@intel.com, roypat@amazon.co.uk,
kalyazin@amazon.co.uk, jackabt@amazon.com
Subject: [PATCH v2 1/4] KVM: arm64: Add pre_fault_memory implementation
Date: Mon, 13 Oct 2025 16:14:58 +0100 [thread overview]
Message-ID: <20251013151502.6679-2-jackabt.amazon@gmail.com> (raw)
In-Reply-To: <20251013151502.6679-1-jackabt.amazon@gmail.com>
From: Jack Thomson <jackabt@amazon.com>
Add kvm_arch_vcpu_pre_fault_memory() for arm64. The implementation hands
off the stage-2 faulting logic to either gmem_abort() or
user_mem_abort().
Add an optional page_size output parameter to user_mem_abort() to
return the VMA page size, which is needed when pre-faulting.
Update the API documentation to clarify that leaving newly created
stage-2 PTEs unmarked as Accessed is x86-specific behaviour.
Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
Documentation/virt/kvm/api.rst | 3 +-
arch/arm64/kvm/Kconfig | 1 +
arch/arm64/kvm/arm.c | 1 +
arch/arm64/kvm/mmu.c | 73 ++++++++++++++++++++++++++++++++--
4 files changed, 73 insertions(+), 5 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index c17a87a0a5ac..9e8cc4eb505d 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6461,7 +6461,8 @@ Errors:
KVM_PRE_FAULT_MEMORY populates KVM's stage-2 page tables used to map memory
for the current vCPU state. KVM maps memory as if the vCPU generated a
stage-2 read page fault, e.g. faults in memory as needed, but doesn't break
-CoW. However, KVM does not mark any newly created stage-2 PTE as Accessed.
+CoW. However, on x86, KVM does not mark any newly created stage-2 PTE as
+Accessed.
In the case of confidential VM types where there is an initial set up of
private guest memory before the guest is 'finalized'/measured, this ioctl
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index bff62e75d681..1ac0605f86cb 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -25,6 +25,7 @@ menuconfig KVM
select HAVE_KVM_CPU_RELAX_INTERCEPT
select KVM_MMIO
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ select KVM_GENERIC_PRE_FAULT_MEMORY
select KVM_XFER_TO_GUEST_WORK
select KVM_VFIO
select HAVE_KVM_DIRTY_RING_ACQ_REL
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 888f7c7abf54..65654a742864 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -322,6 +322,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_IRQFD_RESAMPLE:
case KVM_CAP_COUNTER_OFFSET:
case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
+ case KVM_CAP_PRE_FAULT_MEMORY:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index a36426ccd9b5..82f122e4b08c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1597,8 +1597,8 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_s2_trans *nested,
- struct kvm_memory_slot *memslot, unsigned long hva,
- bool fault_is_perm)
+ struct kvm_memory_slot *memslot, long *page_size,
+ unsigned long hva, bool fault_is_perm)
{
int ret = 0;
bool topup_memcache;
@@ -1871,6 +1871,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
kvm_release_faultin_page(kvm, page, !!ret, writable);
kvm_fault_unlock(kvm);
+ if (page_size)
+ *page_size = vma_pagesize;
+
/* Mark the page dirty only if the fault is handled successfully */
if (writable && !ret)
mark_page_dirty_in_slot(kvm, memslot, gfn);
@@ -2069,8 +2072,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
esr_fsc_is_permission_fault(esr));
else
- ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
- esr_fsc_is_permission_fault(esr));
+ ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, NULL,
+ hva, esr_fsc_is_permission_fault(esr));
if (ret == 0)
ret = 1;
out:
@@ -2446,3 +2449,65 @@ void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}
+
+/**
+ * kvm_arch_vcpu_pre_fault_memory() - Pre-populate stage-2 for one address.
+ * @vcpu: vCPU on whose behalf the fault is handled; its hw_mmu bounds the IPA.
+ * @range: Pre-fault request; only ->gpa and ->size are read here.
+ *
+ * Looks up the memslot backing @range->gpa, installs a synthetic data-abort
+ * syndrome in vcpu->arch.fault and hands off to gmem_abort() or
+ * user_mem_abort(). The vCPU's previous fault information is put back before
+ * returning on every path that modified it.
+ *
+ * Return: the number of bytes from @range->gpa up to the end of the
+ * page_size-aligned block containing it, capped at @range->size. -ENOENT if
+ * the IPA is beyond kvm_phys_size() or has no memslot, -EFAULT if the memslot
+ * yields an error HVA, otherwise the negative error from the abort handler.
+ */
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+				    struct kvm_pre_fault_memory *range)
+{
+	int idx;
+	/*
+	 * Must be as wide as the return type: the success value is derived
+	 * from a u64 size and a long page_size, so an int could truncate it.
+	 */
+	long ret;
+	hva_t hva;
+	phys_addr_t end;
+	struct kvm_memory_slot *memslot;
+	struct kvm_vcpu_fault_info stored_fault, *fault_info;
+
+	/* Stays PAGE_SIZE unless user_mem_abort() reports something else. */
+	long page_size = PAGE_SIZE;
+	phys_addr_t ipa = range->gpa;
+	gfn_t gfn = gpa_to_gfn(range->gpa);
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	if (ipa >= kvm_phys_size(vcpu->arch.hw_mmu)) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	memslot = gfn_to_memslot(vcpu->kvm, gfn);
+	if (!memslot) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	/* Save the live fault state; it is overwritten below. */
+	fault_info = &vcpu->arch.fault;
+	stored_fault = *fault_info;
+
+	/*
+	 * Generate a synthetic abort for the pre-fault address.
+	 *
+	 * NOTE(review): this uses the "current EL" data abort class; confirm
+	 * that ESR_ELx_EC_DABT_CUR rather than ESR_ELx_EC_DABT_LOW is intended
+	 * for an abort attributed to the guest.
+	 */
+	fault_info->esr_el2 = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_DABT_CUR);
+	fault_info->esr_el2 &= ~ESR_ELx_ISV;
+	fault_info->esr_el2 |= ESR_ELx_FSC_FAULT_L(KVM_PGTABLE_LAST_LEVEL);
+
+	fault_info->hpfar_el2 = HPFAR_EL2_NS |
+		FIELD_PREP(HPFAR_EL2_FIPA, ipa >> 12);
+
+	if (kvm_slot_has_gmem(memslot)) {
+		ret = gmem_abort(vcpu, ipa, NULL, memslot, false);
+	} else {
+		hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL);
+		if (kvm_is_error_hva(hva)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		ret = user_mem_abort(vcpu, ipa, NULL, memslot, &page_size, hva,
+				     false);
+	}
+
+	if (ret < 0)
+		goto out;
+
+	/* Report progress up to the end of the block containing the GPA. */
+	end = (range->gpa & ~(page_size - 1)) + page_size;
+	ret = min(range->size, end - range->gpa);
+
+out:
+	*fault_info = stored_fault;
+out_unlock:
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+	return ret;
+}
--
2.43.0
next prev parent reply other threads:[~2025-10-13 15:15 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-13 15:14 [PATCH v2 0/4] KVM ARM64 pre_fault_memory Jack Thomson
2025-10-13 15:14 ` Jack Thomson [this message]
2025-10-16 14:01 ` [PATCH v2 1/4] KVM: arm64: Add pre_fault_memory implementation Suzuki K Poulose
2025-10-13 15:14 ` [PATCH v2 2/4] KVM: selftests: Fix unaligned mmap allocations Jack Thomson
2025-10-23 17:16 ` Sean Christopherson
2025-10-28 11:44 ` Thomson, Jack
2025-11-03 21:08 ` Sean Christopherson
2025-11-04 11:40 ` Thomson, Jack
2025-11-04 20:19 ` Sean Christopherson
2025-11-13 11:34 ` Thomson, Jack
2025-10-13 15:15 ` [PATCH v2 3/4] KVM: selftests: Enable pre_fault_memory_test for arm64 Jack Thomson
2025-10-13 15:15 ` [PATCH v2 4/4] KVM: selftests: Add option for different backing in pre-fault tests Jack Thomson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251013151502.6679-2-jackabt.amazon@gmail.com \
--to=jackabt.amazon@gmail.com \
--cc=catalin.marinas@arm.com \
--cc=isaku.yamahata@intel.com \
--cc=jackabt@amazon.com \
--cc=joey.gouly@arm.com \
--cc=kalyazin@amazon.co.uk \
--cc=kvmarm@lists.linux.dev \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=maz@kernel.org \
--cc=oliver.upton@linux.dev \
--cc=pbonzini@redhat.com \
--cc=roypat@amazon.co.uk \
--cc=shuah@kernel.org \
--cc=suzuki.poulose@arm.com \
--cc=will@kernel.org \
--cc=yuzenghui@huawei.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.