* [PATCH v3 01/15] KVM: x86/mmu: Move "struct kvm_page_fault" definition to asm/kvm_host.h
From: James Houghton @ 2025-06-18 4:24 UTC
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
From: Sean Christopherson <seanjc@google.com>
Make "struct kvm_page_fault" globally visible via asm/kvm_host.h so that
the structure can be referenced by common KVM.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
---
arch/x86/include/asm/kvm_host.h | 68 ++++++++++++++++++++++++++++++++-
arch/x86/kvm/mmu/mmu_internal.h | 67 --------------------------------
2 files changed, 67 insertions(+), 68 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b4a391929cdba..f9d3333f6d64b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -443,7 +443,73 @@ struct kvm_mmu_root_info {
#define KVM_HAVE_MMU_RWLOCK
struct kvm_mmu_page;
-struct kvm_page_fault;
+
+struct kvm_page_fault {
+ /* arguments to kvm_mmu_do_page_fault. */
+ const gpa_t addr;
+ const u64 error_code;
+ const bool prefetch;
+
+ /* Derived from error_code. */
+ const bool exec;
+ const bool write;
+ const bool present;
+ const bool rsvd;
+ const bool user;
+
+ /* Derived from mmu and global state. */
+ const bool is_tdp;
+ const bool is_private;
+ const bool nx_huge_page_workaround_enabled;
+
+ /*
+ * Whether a >4KB mapping can be created or is forbidden due to NX
+ * hugepages.
+ */
+ bool huge_page_disallowed;
+
+ /*
+ * Maximum page size that can be created for this fault; input to
+ * FNAME(fetch), direct_map() and kvm_tdp_mmu_map().
+ */
+ u8 max_level;
+
+ /*
+ * Page size that can be created based on the max_level and the
+ * page size used by the host mapping.
+ */
+ u8 req_level;
+
+ /*
+ * Page size that will be created based on the req_level and
+ * huge_page_disallowed.
+ */
+ u8 goal_level;
+
+ /*
+ * Shifted addr, or result of guest page table walk if addr is a gva. In
+ * the case of VMs where memslots can be mapped at multiple GPA aliases
+ * (i.e. TDX), the gfn field does not contain the bit that selects between
+ * the aliases (i.e. the shared bit for TDX).
+ */
+ gfn_t gfn;
+
+ /* The memslot containing gfn. May be NULL. */
+ struct kvm_memory_slot *slot;
+
+ /* Outputs of kvm_mmu_faultin_pfn(). */
+ unsigned long mmu_seq;
+ kvm_pfn_t pfn;
+ struct page *refcounted_page;
+ bool map_writable;
+
+ /*
+ * Indicates the guest is trying to write a gfn that contains one or
+ * more of the PTEs used to translate the write itself, i.e. the access
+ * is changing its own translation in the guest page tables.
+ */
+ bool write_fault_to_shadow_pgtable;
+};
/*
* x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index db8f33e4de624..384fc4d0bfec0 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -230,73 +230,6 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
}
-struct kvm_page_fault {
- /* arguments to kvm_mmu_do_page_fault. */
- const gpa_t addr;
- const u64 error_code;
- const bool prefetch;
-
- /* Derived from error_code. */
- const bool exec;
- const bool write;
- const bool present;
- const bool rsvd;
- const bool user;
-
- /* Derived from mmu and global state. */
- const bool is_tdp;
- const bool is_private;
- const bool nx_huge_page_workaround_enabled;
-
- /*
- * Whether a >4KB mapping can be created or is forbidden due to NX
- * hugepages.
- */
- bool huge_page_disallowed;
-
- /*
- * Maximum page size that can be created for this fault; input to
- * FNAME(fetch), direct_map() and kvm_tdp_mmu_map().
- */
- u8 max_level;
-
- /*
- * Page size that can be created based on the max_level and the
- * page size used by the host mapping.
- */
- u8 req_level;
-
- /*
- * Page size that will be created based on the req_level and
- * huge_page_disallowed.
- */
- u8 goal_level;
-
- /*
- * Shifted addr, or result of guest page table walk if addr is a gva. In
- * the case of VMs where memslots can be mapped at multiple GPA aliases
- * (i.e. TDX), the gfn field does not contain the bit that selects between
- * the aliases (i.e. the shared bit for TDX).
- */
- gfn_t gfn;
-
- /* The memslot containing gfn. May be NULL. */
- struct kvm_memory_slot *slot;
-
- /* Outputs of kvm_mmu_faultin_pfn(). */
- unsigned long mmu_seq;
- kvm_pfn_t pfn;
- struct page *refcounted_page;
- bool map_writable;
-
- /*
- * Indicates the guest is trying to write a gfn that contains one or
- * more of the PTEs used to translate the write itself, i.e. the access
- * is changing its own translation in the guest page tables.
- */
- bool write_fault_to_shadow_pgtable;
-};
-
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
/*
--
2.50.0.rc2.692.g299adb8693-goog
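With the definition now visible via asm/kvm_host.h, common KVM code can take
the structure directly. A minimal sketch of the kind of generic helper this
move unblocks (the helper name is illustrative, not something this patch
adds; gfn is a real field of the structure above):

--
static inline gpa_t kvm_fault_gpa(const struct kvm_page_fault *fault)
{
	/* Recover the guest physical address from the faulting gfn. */
	return (gpa_t)fault->gfn << PAGE_SHIFT;
}
--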
* [PATCH v3 02/15] KVM: arm64: Add "struct kvm_page_fault" to gather common fault variables
From: James Houghton @ 2025-06-18 4:24 UTC
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
From: Sean Christopherson <seanjc@google.com>
Introduce "struct kvm_page_fault" and use it in user_mem_abort() in lieu
of a collection of local variables. Providing "struct kvm_page_fault"
will allow common KVM to provide APIs to take in said structure, e.g. when
preparing memory fault exits.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
---
arch/arm64/include/asm/kvm_host.h | 9 +++++++++
arch/arm64/kvm/mmu.c | 32 +++++++++++++++++--------------
2 files changed, 27 insertions(+), 14 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 6ce2c51734820..ae83d95d11b74 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -413,6 +413,15 @@ struct kvm_vcpu_fault_info {
u64 disr_el1; /* Deferred [SError] Status Register */
};
+struct kvm_page_fault {
+ const bool exec;
+ const bool write;
+ const bool is_private;
+
+ gfn_t gfn;
+ struct kvm_memory_slot *slot;
+};
+
/*
* VNCR() just places the VNCR_capable registers in the enum after
* __VNCR_START__, and the value (after correction) to be an 8-byte offset
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 2942ec92c5a4a..0c209f2e1c7b2 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1476,8 +1476,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
bool fault_is_perm)
{
int ret = 0;
- bool write_fault, writable, force_pte = false;
- bool exec_fault, mte_allowed;
+ bool writable, force_pte = false;
+ bool mte_allowed;
bool device = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
phys_addr_t ipa = fault_ipa;
@@ -1485,7 +1485,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct vm_area_struct *vma;
short vma_shift;
void *memcache;
- gfn_t gfn;
kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot);
long vma_pagesize, fault_granule;
@@ -1494,13 +1493,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct page *page;
enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
+ struct kvm_page_fault fault = {
+ .write = kvm_is_write_fault(vcpu),
+ .exec = kvm_vcpu_trap_is_exec_fault(vcpu),
+
+ .slot = memslot,
+ };
+
if (fault_is_perm)
fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
- write_fault = kvm_is_write_fault(vcpu);
- exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
- VM_BUG_ON(write_fault && exec_fault);
+ VM_BUG_ON(fault.write && fault.exec);
- if (fault_is_perm && !write_fault && !exec_fault) {
+ if (fault_is_perm && !fault.write && !fault.exec) {
kvm_err("Unexpected L2 read permission error\n");
return -EFAULT;
}
@@ -1516,7 +1520,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* only exception to this is when dirty logging is enabled at runtime
* and a write fault needs to collapse a block entry into a table.
*/
- if (!fault_is_perm || (logging_active && write_fault)) {
+ if (!fault_is_perm || (logging_active && fault.write)) {
int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
if (!is_protected_kvm_enabled())
@@ -1614,7 +1618,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
ipa &= ~(vma_pagesize - 1);
}
- gfn = ipa >> PAGE_SHIFT;
+ fault.gfn = ipa >> PAGE_SHIFT;
mte_allowed = kvm_vma_mte_allowed(vma);
vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
@@ -1633,7 +1637,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
mmu_seq = vcpu->kvm->mmu_invalidate_seq;
mmap_read_unlock(current->mm);
- pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
+ pfn = __kvm_faultin_pfn(memslot, fault.gfn, fault.write ? FOLL_WRITE : 0,
&writable, &page);
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(hva, vma_shift);
@@ -1654,7 +1658,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* change things at the last minute.
*/
device = true;
- } else if (logging_active && !write_fault) {
+ } else if (logging_active && !fault.write) {
/*
* Only actually map the page as writable if this was a write
* fault.
@@ -1662,7 +1666,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
writable = false;
}
- if (exec_fault && device)
+ if (fault.exec && device)
return -ENOEXEC;
/*
@@ -1722,7 +1726,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (writable)
prot |= KVM_PGTABLE_PROT_W;
- if (exec_fault)
+ if (fault.exec)
prot |= KVM_PGTABLE_PROT_X;
if (device) {
@@ -1759,7 +1763,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
/* Mark the page dirty only if the fault is handled successfully */
if (writable && !ret)
- mark_page_dirty_in_slot(kvm, memslot, gfn);
+ mark_page_dirty_in_slot(kvm, memslot, fault.gfn);
return ret != -EAGAIN ? ret : 0;
}
--
2.50.0.rc2.692.g299adb8693-goog
* Re: [PATCH v3 02/15] KVM: arm64: Add "struct kvm_page_fault" to gather common fault variables
From: Oliver Upton @ 2025-06-18 19:26 UTC
To: James Houghton
Cc: Paolo Bonzini, Sean Christopherson, Jonathan Corbet, Marc Zyngier,
Yan Zhao, Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
On Wed, Jun 18, 2025 at 04:24:11AM +0000, James Houghton wrote:
> From: Sean Christopherson <seanjc@google.com>
>
> Introduce "struct kvm_page_fault" and use it in user_mem_abort() in lieu
> of a collection of local variables. Providing "struct kvm_page_fault"
> will allow common KVM to provide APIs to take in said structure, e.g. when
> preparing memory fault exits.
>
> No functional change intended.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
> arch/arm64/include/asm/kvm_host.h | 9 +++++++++
> arch/arm64/kvm/mmu.c | 32 +++++++++++++++++--------------
> 2 files changed, 27 insertions(+), 14 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 6ce2c51734820..ae83d95d11b74 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -413,6 +413,15 @@ struct kvm_vcpu_fault_info {
> u64 disr_el1; /* Deferred [SError] Status Register */
> };
>
> +struct kvm_page_fault {
> + const bool exec;
> + const bool write;
> + const bool is_private;
> +
> + gfn_t gfn;
> + struct kvm_memory_slot *slot;
> +};
> +
So this seems to cherry-pick "interesting" values into the structure but
leaves the rest of the abort context scattered about in locals. If we're
going to do something like this I'd rather have a wholesale refactoring
than just the bits that intersect with x86 (more on that later...)
Thanks,
Oliver
* Re: [PATCH v3 02/15] KVM: arm64: Add "struct kvm_page_fault" to gather common fault variables
From: Sean Christopherson @ 2025-06-18 21:17 UTC
To: Oliver Upton
Cc: James Houghton, Paolo Bonzini, Jonathan Corbet, Marc Zyngier,
Yan Zhao, Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
On Wed, Jun 18, 2025, Oliver Upton wrote:
> On Wed, Jun 18, 2025 at 04:24:11AM +0000, James Houghton wrote:
> > From: Sean Christopherson <seanjc@google.com>
> >
> > Introduce "struct kvm_page_fault" and use it in user_mem_abort() in lieu
> > of a collection of local variables. Providing "struct kvm_page_fault"
> > will allow common KVM to provide APIs to take in said structure, e.g. when
> > preparing memory fault exits.
> >
> > No functional change intended.
> >
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > Signed-off-by: James Houghton <jthoughton@google.com>
> > ---
> > arch/arm64/include/asm/kvm_host.h | 9 +++++++++
> > arch/arm64/kvm/mmu.c | 32 +++++++++++++++++--------------
> > 2 files changed, 27 insertions(+), 14 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index 6ce2c51734820..ae83d95d11b74 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -413,6 +413,15 @@ struct kvm_vcpu_fault_info {
> > u64 disr_el1; /* Deferred [SError] Status Register */
> > };
> >
> > +struct kvm_page_fault {
> > + const bool exec;
> > + const bool write;
> > + const bool is_private;
> > +
> > + gfn_t gfn;
> > + struct kvm_memory_slot *slot;
> > +};
> > +
>
> So this seems to cherry-pick "interesting" values into the structure but
s/interesting/necessary. I literally grabbed only the fields that were needed
to handle the userfault :-)
> leaves the rest of the abort context scattered about in locals. If we're
> going to do something like this I'd rather have a wholesale refactoring
> than just the bits that intersect with x86 (more on that later...)
Definitely no objection from me. How about I send a standalone patch so that we
can iterate on just that without James having to respin the entire series (for
changes that have no meaningful impact)? I'm anticipating we'll need a few rounds
to strike the right balance between what was done here and "throw everything into
kvm_page_fault". E.g. we probably don't want "vma" in kvm_page_fault.
Hmm, yeah, and I suspect/hope that moving more things into kvm_page_fault will
allow for encapsulating the mmap_read_lock() section in a helper without having
a bajillion boolean flags being passed around. That would obviate the need to
nullify vma, because it would simply go out of scope.
* [PATCH v3 03/15] KVM: arm64: x86: Require "struct kvm_page_fault" for memory fault exits
From: James Houghton @ 2025-06-18 4:24 UTC
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
From: Sean Christopherson <seanjc@google.com>
Now that both arm64 and x86 define "struct kvm_page_fault" with a base set
of fields, rework kvm_prepare_memory_fault_exit() to take a kvm_page_fault
structure instead of passing in a pile of parameters. Guard the related
code with CONFIG_KVM_GENERIC_PAGE_FAULT to play nice with architectures
that don't yet support kvm_page_fault.
Rather than define a common kvm_page_fault and kvm_arch_page_fault child,
simply assert that the handful of required fields are provided by the
arch-defined structure. Unlike vCPU and VMs, the number of common fields
is expected to be small, and letting arch code fully define the structure
allows for maximum flexibility with respect to const, layout, etc.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
---
arch/arm64/kvm/Kconfig | 1 +
arch/x86/kvm/Kconfig | 1 +
arch/x86/kvm/mmu/mmu.c | 8 ++++----
arch/x86/kvm/mmu/mmu_internal.h | 10 +---------
include/linux/kvm_host.h | 26 ++++++++++++++++++++------
virt/kvm/Kconfig | 3 +++
6 files changed, 30 insertions(+), 19 deletions(-)
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 713248f240e03..3c299735b1668 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -37,6 +37,7 @@ menuconfig KVM
select HAVE_KVM_VCPU_RUN_PID_CHANGE
select SCHED_INFO
select GUEST_PERF_EVENTS if PERF_EVENTS
+ select KVM_GENERIC_PAGE_FAULT
help
Support hosting virtualized guest machines.
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2eeffcec53828..2d5966f15738d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -48,6 +48,7 @@ config KVM_X86
select KVM_GENERIC_PRE_FAULT_MEMORY
select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM
select KVM_WERROR if WERROR
+ select KVM_GENERIC_PAGE_FAULT
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index cbc84c6abc2e3..a4439e9e07268 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3429,7 +3429,7 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
gva_t gva = fault->is_tdp ? 0 : fault->addr;
if (fault->is_private) {
- kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ kvm_prepare_memory_fault_exit(vcpu, fault);
return -EFAULT;
}
@@ -4499,14 +4499,14 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
int max_order, r;
if (!kvm_slot_can_be_private(fault->slot)) {
- kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ kvm_prepare_memory_fault_exit(vcpu, fault);
return -EFAULT;
}
r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
&fault->refcounted_page, &max_order);
if (r) {
- kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ kvm_prepare_memory_fault_exit(vcpu, fault);
return r;
}
@@ -4586,7 +4586,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
* private vs. shared mismatch.
*/
if (fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) {
- kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ kvm_prepare_memory_fault_exit(vcpu, fault);
return -EFAULT;
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 384fc4d0bfec0..c15060ed6e8be 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -269,14 +269,6 @@ enum {
*/
static_assert(RET_PF_CONTINUE == 0);
-static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
- struct kvm_page_fault *fault)
-{
- kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
- PAGE_SIZE, fault->write, fault->exec,
- fault->is_private);
-}
-
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
u64 err, bool prefetch,
int *emulation_type, u8 *level)
@@ -329,7 +321,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
*/
if (r == RET_PF_EMULATE && fault.is_private) {
pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n");
- kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
+ kvm_prepare_memory_fault_exit(vcpu, &fault);
return -EFAULT;
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3bde4fb5c6aa4..9a85500cd5c50 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2497,20 +2497,34 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
/* Max number of entries allowed for each kvm dirty ring */
#define KVM_DIRTY_RING_MAX_ENTRIES 65536
+#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
+
+#define KVM_ASSERT_TYPE_IS(type_t, x) \
+do { \
+ type_t __maybe_unused tmp; \
+ \
+ BUILD_BUG_ON(!__types_ok(tmp, x) || !__typecheck(tmp, x)); \
+} while (0)
+
static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
- gpa_t gpa, gpa_t size,
- bool is_write, bool is_exec,
- bool is_private)
+ struct kvm_page_fault *fault)
{
+ KVM_ASSERT_TYPE_IS(gfn_t, fault->gfn);
+ KVM_ASSERT_TYPE_IS(bool, fault->exec);
+ KVM_ASSERT_TYPE_IS(bool, fault->write);
+ KVM_ASSERT_TYPE_IS(bool, fault->is_private);
+ KVM_ASSERT_TYPE_IS(struct kvm_memory_slot *, fault->slot);
+
vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
- vcpu->run->memory_fault.gpa = gpa;
- vcpu->run->memory_fault.size = size;
+ vcpu->run->memory_fault.gpa = fault->gfn << PAGE_SHIFT;
+ vcpu->run->memory_fault.size = PAGE_SIZE;
/* RWX flags are not (yet) defined or communicated to userspace. */
vcpu->run->memory_fault.flags = 0;
- if (is_private)
+ if (fault->is_private)
vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
}
+#endif
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 727b542074e7e..28ed6b241578b 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -128,3 +128,6 @@ config HAVE_KVM_ARCH_GMEM_PREPARE
config HAVE_KVM_ARCH_GMEM_INVALIDATE
bool
depends on KVM_PRIVATE_MEM
+
+config KVM_GENERIC_PAGE_FAULT
+ bool
--
2.50.0.rc2.692.g299adb8693-goog
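To illustrate what the type assertions buy (hypothetical arch code, not from
the series): declaring a required field with the wrong type now breaks the
build instead of compiling quietly.

--
/* Hypothetical misuse that KVM_ASSERT_TYPE_IS() is designed to catch: */
struct kvm_page_fault {
	int write;	/* the common contract requires bool */
	/* ... */
};

/*
 * Any call to kvm_prepare_memory_fault_exit() then fails to compile via
 * the BUILD_BUG_ON in KVM_ASSERT_TYPE_IS(bool, fault->write).
 */
--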
* Re: [PATCH v3 03/15] KVM: arm64: x86: Require "struct kvm_page_fault" for memory fault exits
From: Oliver Upton @ 2025-06-18 20:00 UTC
To: James Houghton
Cc: Paolo Bonzini, Sean Christopherson, Jonathan Corbet, Marc Zyngier,
Yan Zhao, Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
On Wed, Jun 18, 2025 at 04:24:12AM +0000, James Houghton wrote:
> +#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
> +
> +#define KVM_ASSERT_TYPE_IS(type_t, x) \
> +do { \
> + type_t __maybe_unused tmp; \
> + \
> + BUILD_BUG_ON(!__types_ok(tmp, x) || !__typecheck(tmp, x)); \
> +} while (0)
> +
> static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
> - gpa_t gpa, gpa_t size,
> - bool is_write, bool is_exec,
> - bool is_private)
> + struct kvm_page_fault *fault)
> {
> + KVM_ASSERT_TYPE_IS(gfn_t, fault->gfn);
> + KVM_ASSERT_TYPE_IS(bool, fault->exec);
> + KVM_ASSERT_TYPE_IS(bool, fault->write);
> + KVM_ASSERT_TYPE_IS(bool, fault->is_private);
> + KVM_ASSERT_TYPE_IS(struct kvm_memory_slot *, fault->slot);
> +
> vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> - vcpu->run->memory_fault.gpa = gpa;
> - vcpu->run->memory_fault.size = size;
> + vcpu->run->memory_fault.gpa = fault->gfn << PAGE_SHIFT;
> + vcpu->run->memory_fault.size = PAGE_SIZE;
>
> /* RWX flags are not (yet) defined or communicated to userspace. */
> vcpu->run->memory_fault.flags = 0;
> - if (is_private)
> + if (fault->is_private)
> vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
> }
> +#endif
This *is not* the right direction of travel for arm64. Stage-2 aborts /
EPT violations / etc. are extremely architecture-specific events.
What I would like to see on arm64 is that for every "KVM_EXIT_MEMORY_FAULT"
we provide as much syndrome information as possible. That could imply
some combination of a sanitised view of ESR_EL2 and, where it is
unambiguous, common fault flags that have shared definitions with x86.
This could incur some minor code duplication, but even then we can share
helpers for the software bits (like userfault).
FEAT_MTE_PERM is a very good example for this. There exists a "Tag"
permission at stage-2 which is unrelated to any of the 'normal'
read/write permissions. There's also the MostlyReadOnly permission from
FEAT_THE which grants write permission to a specific set of instructions.
I don't want to paper over these nuances and will happily maintain an
arm64-specific flavor of "kvm_prepare_memory_fault_exit()"/
Thanks,
Oliver
* Re: [PATCH v3 03/15] KVM: arm64: x86: Require "struct kvm_page_fault" for memory fault exits
From: Sean Christopherson @ 2025-06-18 20:47 UTC
To: Oliver Upton
Cc: James Houghton, Paolo Bonzini, Jonathan Corbet, Marc Zyngier,
Yan Zhao, Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
On Wed, Jun 18, 2025, Oliver Upton wrote:
> On Wed, Jun 18, 2025 at 04:24:12AM +0000, James Houghton wrote:
> > +#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
> > +
> > +#define KVM_ASSERT_TYPE_IS(type_t, x) \
> > +do { \
> > + type_t __maybe_unused tmp; \
> > + \
> > + BUILD_BUG_ON(!__types_ok(tmp, x) || !__typecheck(tmp, x)); \
> > +} while (0)
> > +
> > static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
> > - gpa_t gpa, gpa_t size,
> > - bool is_write, bool is_exec,
> > - bool is_private)
> > + struct kvm_page_fault *fault)
> > {
> > + KVM_ASSERT_TYPE_IS(gfn_t, fault->gfn);
> > + KVM_ASSERT_TYPE_IS(bool, fault->exec);
> > + KVM_ASSERT_TYPE_IS(bool, fault->write);
> > + KVM_ASSERT_TYPE_IS(bool, fault->is_private);
> > + KVM_ASSERT_TYPE_IS(struct kvm_memory_slot *, fault->slot);
> > +
> > vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> > - vcpu->run->memory_fault.gpa = gpa;
> > - vcpu->run->memory_fault.size = size;
> > + vcpu->run->memory_fault.gpa = fault->gfn << PAGE_SHIFT;
> > + vcpu->run->memory_fault.size = PAGE_SIZE;
> >
> > /* RWX flags are not (yet) defined or communicated to userspace. */
> > vcpu->run->memory_fault.flags = 0;
> > - if (is_private)
> > + if (fault->is_private)
> > vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
> > }
> > +#endif
>
> This *is not* the right direction of travel for arm64. Stage-2 aborts /
> EPT violations / etc. are extremely architecture-specific events.
Yes and no. 100% agreed there are arch/vendor specific aspects of stage-2 faults,
but there are most definitely commonalites as well.
> What I would like to see on arm64 is that for every "KVM_EXIT_MEMORY_FAULT"
> we provide as much syndrome information as possible. That could imply
> some combination of a sanitised view of ESR_EL2 and, where it is
> unambiguous, common fault flags that have shared definitions with x86.
Me confused, this is what the above does? "struct kvm_page_fault" is arch
specific, e.g. x86 has a whole pile of stuff in there beyond gfn, exec, write,
is_private, and slot.
The approach is non-standard, but I think my justification/reasoning for having
the structure be arch-defined still holds:
: Rather than define a common kvm_page_fault and kvm_arch_page_fault child,
: simply assert that the handful of required fields are provided by the
: arch-defined structure. Unlike vCPU and VMs, the number of common fields
: is expected to be small, and letting arch code fully define the structure
: allows for maximum flexibility with respect to const, layout, etc.
If we could use anonymous struct field, i.e. could embed a kvm_arch_page_fault
without having to bounce through an "arch" field, I would vote for the approach.
Sadly, AFAIK, we can't yet use those in the kernel.
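Written out, the approach being wished for looks something like this (a
sketch only; anonymous members of a *named* struct type are a GNU/MS
extension rather than standard C11, which is why it's off the table):

--
struct kvm_page_fault_common {
	gfn_t gfn;
	bool exec, write, is_private;
	struct kvm_memory_slot *slot;
};

struct kvm_page_fault {
	struct kvm_page_fault_common;	/* anonymous member: extension only */
	/* arch-specific fields would follow */
};
--

With that extension, arch code could keep writing fault->gfn instead of
bouncing through fault->common.gfn.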
> This could incur some minor code duplication, but even then we can share
> helpers for the software bits (like userfault).
Again, that is what is proposed here.
> FEAT_MTE_PERM is a very good example for this. There exists a "Tag"
> permission at stage-2 which is unrelated to any of the 'normal'
> read/write permissions. There's also the MostlyReadOnly permission from
> FEAT_THE which grants write permission to a specific set of instructions.
>
> I don't want to paper over these nuances and will happily maintain an
> arm64-specific flavor of "kvm_prepare_memory_fault_exit()"
Nothing prevents arm64 (or any arch) from wrapping kvm_prepare_memory_fault_exit()
and/or taking action after it's invoked. That's not an accident; the "prepare
exit" helpers (x86 has a few more) were specifically designed to not be used as
the "return" to userspace. E.g. this one returns "void" instead of -EFAULT
specifically so that the caller isn't "required" to ignore the return if the
caller wants to populate (or change, but hopefully that's never the case) fields
after calling kvm_prepare_memory_fault_exit(), and so that arch can return an
entirely different error code, e.g. -EHWPOISON when appropriate.
And it's not just kvm_prepare_memory_fault_exit() that I want to use kvm_page_fault;
kvm_faultin_pfn() is another case where having a common "struct kvm_page_fault"
would clean up some ugly/annoying boilerplate.
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH v3 03/15] KVM: arm64: x86: Require "struct kvm_page_fault" for memory fault exits
From: Oliver Upton @ 2025-06-18 23:14 UTC
To: Sean Christopherson
Cc: James Houghton, Paolo Bonzini, Jonathan Corbet, Marc Zyngier,
Yan Zhao, Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
On Wed, Jun 18, 2025 at 01:47:36PM -0700, Sean Christopherson wrote:
> On Wed, Jun 18, 2025, Oliver Upton wrote:
> > On Wed, Jun 18, 2025 at 04:24:12AM +0000, James Houghton wrote:
> > > +#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
> > > +
> > > +#define KVM_ASSERT_TYPE_IS(type_t, x) \
> > > +do { \
> > > + type_t __maybe_unused tmp; \
> > > + \
> > > + BUILD_BUG_ON(!__types_ok(tmp, x) || !__typecheck(tmp, x)); \
> > > +} while (0)
> > > +
> > > static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
> > > - gpa_t gpa, gpa_t size,
> > > - bool is_write, bool is_exec,
> > > - bool is_private)
> > > + struct kvm_page_fault *fault)
> > > {
> > > + KVM_ASSERT_TYPE_IS(gfn_t, fault->gfn);
> > > + KVM_ASSERT_TYPE_IS(bool, fault->exec);
> > > + KVM_ASSERT_TYPE_IS(bool, fault->write);
> > > + KVM_ASSERT_TYPE_IS(bool, fault->is_private);
> > > + KVM_ASSERT_TYPE_IS(struct kvm_memory_slot *, fault->slot);
> > > +
> > > vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> > > - vcpu->run->memory_fault.gpa = gpa;
> > > - vcpu->run->memory_fault.size = size;
> > > + vcpu->run->memory_fault.gpa = fault->gfn << PAGE_SHIFT;
> > > + vcpu->run->memory_fault.size = PAGE_SIZE;
> > >
> > > /* RWX flags are not (yet) defined or communicated to userspace. */
> > > vcpu->run->memory_fault.flags = 0;
> > > - if (is_private)
> > > + if (fault->is_private)
> > > vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
> > > }
> > > +#endif
> >
> > This *is not* the right direction of travel for arm64. Stage-2 aborts /
> > EPT violations / etc. are extremely architecture-specific events.
>
> Yes and no. 100% agreed there are arch/vendor specific aspects of stage-2 faults,
> but there are most definitely commonalites as well.
And I agree those commonalities should be expressed with the same flags
where possible.
> > What I would like to see on arm64 is that for every "KVM_EXIT_MEMORY_FAULT"
> > we provide as much syndrome information as possible. That could imply
> > some combination of a sanitised view of ESR_EL2 and, where it is
> > unambiguous, common fault flags that have shared definitions with x86.
>
> Me confused, this is what the above does? "struct kvm_page_fault" is arch
> specific, e.g. x86 has a whole pile of stuff in there beyond gfn, exec, write,
> is_private, and slot.
Right, but now I need to remember that some of the hardware syndrome
(exec, write) is handled in the arch-neutral code and the rest belongs
to the arch.
> The approach is non-standard, but I think my justification/reasoning for having
> the structure be arch-defined still holds:
>
> : Rather than define a common kvm_page_fault and kvm_arch_page_fault child,
> : simply assert that the handful of required fields are provided by the
> : arch-defined structure. Unlike vCPU and VMs, the number of common fields
> : is expected to be small, and letting arch code fully define the structure
> : allows for maximum flexibility with respect to const, layout, etc.
>
> If we could use anonymous struct field, i.e. could embed a kvm_arch_page_fault
> without having to bounce through an "arch" field, I would vote for the approach.
> Sadly, AFAIK, we can't yet use those in the kernel.
The general impression is that this is an unnecessary amount of
complexity for doing something trivial (computing flags).
> > This could incur some minor code duplication, but even then we can share
> > helpers for the software bits (like userfault).
>
> Again, that is what is proposed here.
>
> > FEAT_MTE_PERM is a very good example for this. There exists a "Tag"
> > permission at stage-2 which is unrelated to any of the 'normal'
> > read/write permissions. There's also the MostlyReadOnly permission from
> > FEAT_THE which grants write permission to a specific set of instructions.
> >
> > I don't want to paper over these nuances and will happily maintain an
> > arm64-specific flavor of "kvm_prepare_memory_fault_exit()"
>
> Nothing prevents arm64 (or any arch) from wrapping kvm_prepare_memory_fault_exit()
> and/or taking action after it's invoked. That's not an accident; the "prepare
> exit" helpers (x86 has a few more) were specifically designed to not be used as
> the "return" to userspace. E.g. this one returns "void" instead of -EFAULT
> specifically so that the caller isn't "required" to ignore the return if the
> caller wants to populate (or change, but hopefully that's never the case) fields
> after calling kvm_prepare_memory_fault_exit(), and so that arch can return an
> entirely different error code, e.g. -EHWPOISON when appropriate.
IMO, this does not achieve the desired layering / ownership of memory
fault triage. This would be better organized as the arch code computing
all of the flags relating to the hardware syndrome (even boring ones
like RWX) and arch-neutral code potentially lending a hand with the
software bits.
With this I either need to genericize the horrors of the Arm
architecture in the common thing or keep track of what parts of the
hardware flags are owned by arch v. non-arch. SW v. HW fault context is
a cleaner split, IMO.
> And it's not just kvm_prepare_memory_fault_exit() that I want to use kvm_page_fault;
> kvm_faultin_pfn() is another case where having a common "struct kvm_page_fault"
> would clean up some ugly/annoying boilerplate.
That might be a better starting point for unifying these things, esp.
since kvm_faultin_pfn() doesn't have UAPI implications hiding behind it
and is already using common parameters.
Thanks,
Oliver
* Re: [PATCH v3 03/15] KVM: arm64: x86: Require "struct kvm_page_fault" for memory fault exits
From: Sean Christopherson @ 2025-06-19 1:22 UTC
To: Oliver Upton
Cc: James Houghton, Paolo Bonzini, Jonathan Corbet, Marc Zyngier,
Yan Zhao, Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
On Wed, Jun 18, 2025, Oliver Upton wrote:
> On Wed, Jun 18, 2025 at 01:47:36PM -0700, Sean Christopherson wrote:
> > On Wed, Jun 18, 2025, Oliver Upton wrote:
> > > What I would like to see on arm64 is that for every "KVM_EXIT_MEMORY_FAULT"
> > > we provide as much syndrome information as possible. That could imply
> > > some combination of a sanitised view of ESR_EL2 and, where it is
> > > unambiguous, common fault flags that have shared definitions with x86.
> >
> > Me confused, this is what the above does? "struct kvm_page_fault" is arch
> > specific, e.g. x86 has a whole pile of stuff in there beyond gfn, exec, write,
> > is_private, and slot.
>
> Right, but now I need to remember that some of the hardware syndrome
> (exec, write) is handled in the arch-neutral code and the rest belongs
> to the arch.
Yeah, can't argue there.
> > The approach is non-standard, but I think my justification/reasoning for having
> > the structure be arch-defined still holds:
> >
> > : Rather than define a common kvm_page_fault and kvm_arch_page_fault child,
> > : simply assert that the handful of required fields are provided by the
> > : arch-defined structure. Unlike vCPU and VMs, the number of common fields
> > : is expected to be small, and letting arch code fully define the structure
> > : allows for maximum flexibility with respect to const, layout, etc.
> >
> > If we could use anonymous struct field, i.e. could embed a kvm_arch_page_fault
> > without having to bounce through an "arch" field, I would vote for the approach.
> > Sadly, AFAIK, we can't yet use those in the kernel.
>
> The general impression is that this is an unnecessary amount of complexity
> for doing something trivial (computing flags).
It looks pretty though!
> > Nothing prevents arm64 (or any arch) from wrapping kvm_prepare_memory_fault_exit()
> > and/or taking action after it's invoked. That's not an accident; the "prepare
> > exit" helpers (x86 has a few more) were specifically designed to not be used as
> > the "return" to userspace. E.g. this one returns "void" instead of -EFAULT
> > specifically so that the caller isn't "required" to ignore the return if the
> > caller wants to populate (or change, but hopefully that's never the case) fields
> > after calling kvm_prepare_memory_fault_exit(), and so that arch can return an
> > entirely different error code, e.g. -EHWPOISON when appropriate.
>
> IMO, this does not achieve the desired layering / ownership of memory
> fault triage. This would be better organized as the arch code computing
> all of the flags relating to the hardware syndrome (even boring ones
> like RWX)
Just to make sure I'm not misinterpreting things, by "computing all of the flags",
you mean computing KVM_MEMORY_EXIT_FLAG_xxx flags that are derived from hardware
state, correct?
> and arch-neutral code potentially lending a hand with the software bits.
>
> With this I either need to genericize the horrors of the Arm
> architecture in the common thing or keep track of what parts of the
> hardware flags are owned by arch v. non-arch. SW v. HW fault context is
> a cleaner split, IMO.
The problem I'm struggling with is where to draw the line. If we leave hardware
state to arch code, then we're not left with much. Hmm, but it really is just
the gfn/gpa that's needed in common code to avoid true ugliness. The size is
technically arch specific, but the reported size is effectively a placeholder,
i.e. it's always PAGE_SIZE, and probably always will be PAGE_SIZE, but we wanted
to give ourselves an out if necessary.
Would you be ok having common code fill gpa and size? If so, then we can do this:
--
void kvm_arch_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault);
static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
KVM_ASSERT_TYPE_IS(gfn_t, fault->gfn);
vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
vcpu->run->memory_fault.gpa = fault->gfn << PAGE_SHIFT;
vcpu->run->memory_fault.size = PAGE_SIZE;
vcpu->run->memory_fault.flags = 0;
kvm_arch_prepare_memory_fault_exit(vcpu, fault);
}
--
where arm64's arch hook is empty, and x86's is:
--
static inline void kvm_arch_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
if (fault->is_private)
vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
}
--
It's not perfect, but it should be much easier to describe the contract, and
common code can still pass around a kvm_page_fault structure instead of a horde
of booleans.
* [PATCH v3 04/15] KVM: Add common infrastructure for KVM Userfaults
From: James Houghton @ 2025-06-18 4:24 UTC
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
KVM Userfault consists of a bitmap in userspace that describes which
pages the user wants exits on (when KVM_MEM_USERFAULT is enabled). To
get those exits, the memslot where KVM_MEM_USERFAULT is being enabled
must drop (at least) all of the translations that the bitmap says should
generate faults. Today, simply drop all translations for the memslot. Do
so with a new arch interface, kvm_arch_userfault_enabled(), which can be
specialized in the future by any architecture for which optimizations
make sense.
Make some changes to kvm_set_memory_region() to support setting
KVM_MEM_USERFAULT on KVM_MEM_GUEST_MEMFD memslots, including relaxing
the restrictions on guest_memfd memslots from permitting only deletion to
disallowing only moves.
Signed-off-by: James Houghton <jthoughton@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
include/linux/kvm_host.h | 23 ++++++++++++++++++
include/uapi/linux/kvm.h | 5 +++-
virt/kvm/kvm_main.c | 51 ++++++++++++++++++++++++++++++++++++----
3 files changed, 74 insertions(+), 5 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9a85500cd5c50..bd5fb5ae10d05 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -597,6 +597,7 @@ struct kvm_memory_slot {
unsigned long *dirty_bitmap;
struct kvm_arch_memory_slot arch;
unsigned long userspace_addr;
+ unsigned long __user *userfault_bitmap;
u32 flags;
short id;
u16 as_id;
@@ -1236,6 +1237,20 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm);
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot);
+#ifndef __KVM_HAVE_ARCH_USERFAULT_ENABLED
+static inline void kvm_arch_userfault_enabled(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ /*
+ * kvm_arch_userfault_enabled() must ensure that new faults on pages
+ * marked as userfault will exit to userspace. Dropping all
+ * translations is sufficient; architectures may choose to optimize
+ * this.
+ */
+ return kvm_arch_flush_shadow_memslot(kvm, slot);
+}
+#endif
+
int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn,
struct page **pages, int nr_pages);
@@ -2524,6 +2539,14 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
if (fault->is_private)
vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
}
+
+bool kvm_do_userfault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+
+static inline bool kvm_is_userfault_memslot(struct kvm_memory_slot *memslot)
+{
+ return memslot && memslot->flags & KVM_MEM_USERFAULT;
+}
+
#endif
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d00b85cb168c3..e3b871506ec85 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -40,7 +40,8 @@ struct kvm_userspace_memory_region2 {
__u64 guest_memfd_offset;
__u32 guest_memfd;
__u32 pad1;
- __u64 pad2[14];
+ __u64 userfault_bitmap;
+ __u64 pad2[13];
};
/*
@@ -51,6 +52,7 @@ struct kvm_userspace_memory_region2 {
#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
#define KVM_MEM_READONLY (1UL << 1)
#define KVM_MEM_GUEST_MEMFD (1UL << 2)
+#define KVM_MEM_USERFAULT (1UL << 3)
/* for KVM_IRQ_LINE */
struct kvm_irq_level {
@@ -443,6 +445,7 @@ struct kvm_run {
/* KVM_EXIT_MEMORY_FAULT */
struct {
#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3)
+#define KVM_MEMORY_EXIT_FLAG_USERFAULT (1ULL << 4)
__u64 flags;
__u64 gpa;
__u64 size;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index eec82775c5bfb..bef6760cd1c0e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1747,6 +1747,14 @@ static void kvm_commit_memory_region(struct kvm *kvm,
if (old->dirty_bitmap && !new->dirty_bitmap)
kvm_destroy_dirty_bitmap(old);
+ /*
+ * If KVM_MEM_USERFAULT is being enabled for the slot, drop the
+ * translations that are marked as userfault.
+ */
+ if (!(old_flags & KVM_MEM_USERFAULT) &&
+ (new_flags & KVM_MEM_USERFAULT))
+ kvm_arch_userfault_enabled(kvm, old);
+
/*
* The final quirk. Free the detached, old slot, but only its
* memory, not any metadata. Metadata, including arch specific
@@ -2039,6 +2047,12 @@ static int kvm_set_memory_region(struct kvm *kvm,
if (id < KVM_USER_MEM_SLOTS &&
(mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
return -EINVAL;
+ if (mem->flags & KVM_MEM_USERFAULT &&
+ ((mem->userfault_bitmap != untagged_addr(mem->userfault_bitmap)) ||
+ !access_ok(u64_to_user_ptr(mem->userfault_bitmap),
+ DIV_ROUND_UP(mem->memory_size >> PAGE_SHIFT, BITS_PER_LONG)
+ * sizeof(long))))
+ return -EINVAL;
slots = __kvm_memslots(kvm, as_id);
@@ -2071,14 +2085,15 @@ static int kvm_set_memory_region(struct kvm *kvm,
if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
return -EINVAL;
} else { /* Modify an existing slot. */
- /* Private memslots are immutable, they can only be deleted. */
- if (mem->flags & KVM_MEM_GUEST_MEMFD)
- return -EINVAL;
if ((mem->userspace_addr != old->userspace_addr) ||
(npages != old->npages) ||
((mem->flags ^ old->flags) & KVM_MEM_READONLY))
return -EINVAL;
+ /* Moving a guest_memfd memslot isn't supported. */
+ if (base_gfn != old->base_gfn && mem->flags & KVM_MEM_GUEST_MEMFD)
+ return -EINVAL;
+
if (base_gfn != old->base_gfn)
change = KVM_MR_MOVE;
else if (mem->flags != old->flags)
@@ -2102,11 +2117,13 @@ static int kvm_set_memory_region(struct kvm *kvm,
new->npages = npages;
new->flags = mem->flags;
new->userspace_addr = mem->userspace_addr;
- if (mem->flags & KVM_MEM_GUEST_MEMFD) {
+ if (mem->flags & KVM_MEM_GUEST_MEMFD && change == KVM_MR_CREATE) {
r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
if (r)
goto out;
}
+ if (mem->flags & KVM_MEM_USERFAULT)
+ new->userfault_bitmap = u64_to_user_ptr(mem->userfault_bitmap);
r = kvm_set_memslot(kvm, old, new, change);
if (r)
@@ -4980,6 +4997,32 @@ static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
return cleared;
}
+#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
+bool kvm_do_userfault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_memory_slot *slot = fault->slot;
+ unsigned long __user *user_chunk;
+ unsigned long chunk;
+ gfn_t offset;
+
+ if (!kvm_is_userfault_memslot(slot))
+ return false;
+
+ offset = fault->gfn - slot->base_gfn;
+ user_chunk = slot->userfault_bitmap + (offset / BITS_PER_LONG);
+
+ if (__get_user(chunk, user_chunk))
+ return true;
+
+ if (!test_bit(offset % BITS_PER_LONG, &chunk))
+ return false;
+
+ kvm_prepare_memory_fault_exit(vcpu, fault);
+ vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_USERFAULT;
+ return true;
+}
+#endif
+
int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
--
2.50.0.rc2.692.g299adb8693-goog
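A hedged sketch of how userspace might enable the new flag (slot id, page
size, and the helper name are illustrative; the userfault_bitmap field exists
only with this series applied, and carries one bit per page of the slot,
matching the access_ok() check above). Needs <stdlib.h>, <string.h>,
<sys/ioctl.h> and <linux/kvm.h>:

--
static int enable_userfault(int vm_fd, __u64 gpa, void *hva, __u64 size)
{
	size_t npages = size / 4096;	/* assumes 4KiB guest pages */
	size_t nlongs = (npages + 8 * sizeof(long) - 1) / (8 * sizeof(long));
	unsigned long *bitmap = calloc(nlongs, sizeof(long));

	if (!bitmap)
		return -1;

	/* Set every bit: first access to any page exits to userspace. */
	memset(bitmap, 0xff, nlongs * sizeof(long));

	struct kvm_userspace_memory_region2 region = {
		.slot		  = 0,
		.flags		  = KVM_MEM_USERFAULT,
		.guest_phys_addr  = gpa,
		.memory_size	  = size,
		.userspace_addr	  = (__u64)(unsigned long)hva,
		.userfault_bitmap = (__u64)(unsigned long)bitmap,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
}
--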
* Re: [PATCH v3 04/15] KVM: Add common infrastructure for KVM Userfaults
From: Oliver Upton @ 2025-06-18 19:40 UTC
To: James Houghton
Cc: Paolo Bonzini, Sean Christopherson, Jonathan Corbet, Marc Zyngier,
Yan Zhao, Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
On Wed, Jun 18, 2025 at 04:24:13AM +0000, James Houghton wrote:
> KVM Userfault consists of a bitmap in userspace that describes which
> pages the user wants exits on (when KVM_MEM_USERFAULT is enabled). To
> get those exits, the memslot where KVM_MEM_USERFAULT is being enabled
> must drop (at least) all of the translations that the bitmap says should
> generate faults. Today, simply drop all translations for the memslot. Do
> so with a new arch interface, kvm_arch_userfault_enabled(), which can be
> specialized in the future by any architecture for which optimizations
> make sense.
>
> Make some changes to kvm_set_memory_region() to support setting
> KVM_MEM_USERFAULT on KVM_MEM_GUEST_MEMFD memslots, including relaxing
> the restrictions on guest_memfd memslots from permitting only deletion to
> disallowing only moves.
>
> Signed-off-by: James Houghton <jthoughton@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> +#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
> +bool kvm_do_userfault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
The polarity of the return here feels weird. If we want a value of 0 to
indicate success then int is a better return type.
> +{
> + struct kvm_memory_slot *slot = fault->slot;
> + unsigned long __user *user_chunk;
> + unsigned long chunk;
> + gfn_t offset;
> +
> + if (!kvm_is_userfault_memslot(slot))
> + return false;
> +
> + offset = fault->gfn - slot->base_gfn;
> + user_chunk = slot->userfault_bitmap + (offset / BITS_PER_LONG);
> +
> + if (__get_user(chunk, user_chunk))
> + return true;
> +
I see that the documentation suggests userspace perform a store-release
to update the bitmap. That's the right idea but we need a load-acquire
on the consumer side for that to do something meaningful.
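Concretely, the pairing being asked for might look like the sketch below
(illustrative only; "bitmap" and "off" are hypothetical names, and the
kernel half is conceptual since __get_user() alone provides no ordering):

--
/*
 * Userspace: populate the page, then clear its bit with release semantics
 * so a vCPU that observes the cleared bit also observes the contents.
 */
#define LONG_BITS	(8 * sizeof(unsigned long))

__atomic_fetch_and(&bitmap[off / LONG_BITS],
		   ~(1UL << (off % LONG_BITS)), __ATOMIC_RELEASE);

/*
 * Kernel: the plain __get_user() above would need a read barrier after it,
 * e.g. smp_rmb(), so later reads of page contents are ordered against the
 * bitmap load, approximating a load-acquire.
 */
--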
> + if (!test_bit(offset % BITS_PER_LONG, &chunk))
> + return false;
> +
> + kvm_prepare_memory_fault_exit(vcpu, fault);
> + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_USERFAULT;
> + return true;
> +}
> +#endif
> +
> int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> struct kvm_enable_cap *cap)
> {
> --
> 2.50.0.rc2.692.g299adb8693-goog
>
Thanks,
Oliver
* Re: [PATCH v3 04/15] KVM: Add common infrastructure for KVM Userfaults
From: Sean Christopherson @ 2025-06-18 20:33 UTC
To: Oliver Upton
Cc: James Houghton, Paolo Bonzini, Jonathan Corbet, Marc Zyngier,
Yan Zhao, Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
On Wed, Jun 18, 2025, Oliver Upton wrote:
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
No need for my SoB.
> > +#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
> > +bool kvm_do_userfault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>
> The polarity of the return here feels weird. If we want a value of 0 to
> indicate success then int is a better return type.
The boolean is my fault/suggestion. My thinking is that it would make the callers
more intuitive, e.g. so that this reads "if do userfault, then exit to userspace
with -EFAULT".
if (kvm_do_userfault(vcpu, fault))
return -EFAULT;
> > +{
> > + struct kvm_memory_slot *slot = fault->slot;
> > + unsigned long __user *user_chunk;
> > + unsigned long chunk;
> > + gfn_t offset;
> > +
> > + if (!kvm_is_userfault_memslot(slot))
> > + return false;
> > +
> > + offset = fault->gfn - slot->base_gfn;
> > + user_chunk = slot->userfault_bitmap + (offset / BITS_PER_LONG);
> > +
> > + if (__get_user(chunk, user_chunk))
> > + return true;
And this path is the other motivation for returning a boolean. To me, returning "success"
when a uaccess fails looks all kinds of wrong:
if (__get_user(chunk, user_chunk))
return 0;
That said, I don't have a super strong preference; normally I'm fanatical about
not returning booleans. :-D
* Re: [PATCH v3 04/15] KVM: Add common infrastructure for KVM Userfaults
From: James Houghton @ 2025-06-18 20:41 UTC
To: Sean Christopherson
Cc: Oliver Upton, Paolo Bonzini, Jonathan Corbet, Marc Zyngier,
Yan Zhao, Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
On Wed, Jun 18, 2025 at 1:33 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, Jun 18, 2025, Oliver Upton wrote:
> > > Signed-off-by: Sean Christopherson <seanjc@google.com>
>
> No need for my SoB.
Would you like me to drop your SoB from all of the patches that are
not From: you (patches 4-7)?
* Re: [PATCH v3 04/15] KVM: Add common infrastructure for KVM Userfaults
From: Oliver Upton @ 2025-06-18 22:43 UTC
To: Sean Christopherson
Cc: James Houghton, Paolo Bonzini, Jonathan Corbet, Marc Zyngier,
Yan Zhao, Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
On Wed, Jun 18, 2025 at 01:33:17PM -0700, Sean Christopherson wrote:
> On Wed, Jun 18, 2025, Oliver Upton wrote:
> > > Signed-off-by: Sean Christopherson <seanjc@google.com>
>
> No need for my SoB.
>
> > > +#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
> > > +bool kvm_do_userfault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> >
> > The polarity of the return here feels weird. If we want a value of 0 to
> > indicate success then int is a better return type.
>
> The boolean is my fault/suggestion. My thinking is that it would make the callers
> more intuitive, e.g. so that this reads "if do userfault, then exit to userspace
> with -EFAULT".
>
> if (kvm_do_userfault(vcpu, fault))
> return -EFAULT;
Agreed, this reads correctly. My only issue is that when I read the
function signature, "bool" is usually wired the other way around.
> > > +{
> > > + struct kvm_memory_slot *slot = fault->slot;
> > > + unsigned long __user *user_chunk;
> > > + unsigned long chunk;
> > > + gfn_t offset;
> > > +
> > > + if (!kvm_is_userfault_memslot(slot))
> > > + return false;
> > > +
> > > + offset = fault->gfn - slot->base_gfn;
> > > + user_chunk = slot->userfault_bitmap + (offset / BITS_PER_LONG);
> > > +
> > > + if (__get_user(chunk, user_chunk))
> > > + return true;
>
> And this path is the other motivation for returning a boolean. To me, returning
> "success" when a uaccess fails looks all kinds of wrong:
>
> if (__get_user(chunk, user_chunk))
> return 0;
Yeah, that's gross. Although I would imagine we want to express
"failure" here, game over, out to userspace for resolution. So maybe:
if (__get_user(chunk, user_chunk))
return -EFAULT;
> That said, I don't have a super strong preference; normally I'm fanatical about
> not returning booleans. :-D
+1, it isn't _that_ big of a deal, just noticed it as part of review.
Thanks,
Oliver
* Re: [PATCH v3 04/15] KVM: Add common infrastructure for KVM Userfaults
2025-06-18 22:43 ` Oliver Upton
@ 2025-06-19 1:27 ` Sean Christopherson
0 siblings, 0 replies; 31+ messages in thread
From: Sean Christopherson @ 2025-06-19 1:27 UTC (permalink / raw)
To: Oliver Upton
Cc: James Houghton, Paolo Bonzini, Jonathan Corbet, Marc Zyngier,
Yan Zhao, Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
On Wed, Jun 18, 2025, Oliver Upton wrote:
> On Wed, Jun 18, 2025 at 01:33:17PM -0700, Sean Christopherson wrote:
> > On Wed, Jun 18, 2025, Oliver Upton wrote:
> > And this path is the other motivation for returning a boolean. To me, returning
> > "success" when a uaccess fails looks all kinds of wrong:
> >
> > if (__get_user(chunk, user_chunk))
> > return 0;
>
> Yeah, that's gross. Although I would imagine we want to express
> "failure" here, game over, out to userspace for resolution. So maybe:
>
> if (__get_user(chunk, user_chunk))
> return -EFAULT;
I toyed with that idea too, but if kvm_do_userfault() returns a value, it bugs
me to no end that the callers blindly convert all failures to -EFAULT. To
avoid that, callers would have to be:
r = kvm_do_userfault(vcpu, &fault);
if (r)
return r;
And that just annoyed me. :-) But I'm a-ok with that direction if that's
preferable to the boolean return.
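
For reference, a minimal sketch of the int-returning alternative being
discussed, reusing the helper names from this series; illustrative only, not
code posted to the thread:

/* Sketch: 0 = not a userfault; -EFAULT = exit to userspace. */
int kvm_do_userfault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
        struct kvm_memory_slot *slot = fault->slot;
        unsigned long __user *user_chunk;
        unsigned long chunk;
        gfn_t offset;

        if (!kvm_is_userfault_memslot(slot))
                return 0;

        offset = fault->gfn - slot->base_gfn;
        user_chunk = slot->userfault_bitmap + (offset / BITS_PER_LONG);

        /* A failed uaccess also sends the vCPU out to userspace. */
        if (__get_user(chunk, user_chunk))
                return -EFAULT;

        if (!test_bit(offset % BITS_PER_LONG, &chunk))
                return 0;

        kvm_prepare_memory_fault_exit(vcpu, fault);
        vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_USERFAULT;
        return -EFAULT;
}

Callers would then propagate the error rather than hardcoding it:

        r = kvm_do_userfault(vcpu, &fault);
        if (r)
                return r;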
* Re: [PATCH v3 04/15] KVM: Add common infrastructure for KVM Userfaults
2025-06-18 19:40 ` Oliver Upton
2025-06-18 20:33 ` Sean Christopherson
@ 2025-06-18 20:38 ` James Houghton
1 sibling, 0 replies; 31+ messages in thread
From: James Houghton @ 2025-06-18 20:38 UTC (permalink / raw)
To: Oliver Upton
Cc: Paolo Bonzini, Sean Christopherson, Jonathan Corbet, Marc Zyngier,
Yan Zhao, Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
On Wed, Jun 18, 2025 at 12:41 PM Oliver Upton <oliver.upton@linux.dev> wrote:
>
> On Wed, Jun 18, 2025 at 04:24:13AM +0000, James Houghton wrote:
> > KVM Userfault consists of a bitmap in userspace that describes which
> > pages the user wants exits on (when KVM_MEM_USERFAULT is enabled). To
> > get those exits, the memslot where KVM_MEM_USERFAULT is being enabled
> > must drop (at least) all of the translations that the bitmap says should
> > generate faults. Today, simply drop all translations for the memslot. Do
> > so with a new arch interface, kvm_arch_userfault_enabled(), which can be
> > specialized in the future by any architecture for which optimizations
> > make sense.
> >
> > Make some changes to kvm_set_memory_region() to support setting
> > KVM_MEM_USERFAULT on KVM_MEM_GUEST_MEMFD memslots, including relaxing
> > the restrictions on guest_memfd memslots from allowing only deletion to
> > disallowing only moves.
> >
> > Signed-off-by: James Houghton <jthoughton@google.com>
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > ---
>
> > +#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
> > +bool kvm_do_userfault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>
> The polarity of the return here feels weird. If we want a value of 0 to
> indicate success then int is a better return type.
The way it's written now feels fine to me. I'm happy to change it to
an int (where we return -EFAULT instead of 'true' and 0 instead of
'false').
> > +{
> > + struct kvm_memory_slot *slot = fault->slot;
> > + unsigned long __user *user_chunk;
> > + unsigned long chunk;
> > + gfn_t offset;
> > +
> > + if (!kvm_is_userfault_memslot(slot))
> > + return false;
> > +
> > + offset = fault->gfn - slot->base_gfn;
> > + user_chunk = slot->userfault_bitmap + (offset / BITS_PER_LONG);
> > +
> > + if (__get_user(chunk, user_chunk))
> > + return true;
> > +
>
> I see that the documentation suggests userspace perform a store-release
> to update the bitmap. That's the right idea but we need a load-acquire
> on the consumer side for that to do something meaningful.
Indeed, the below test_bit() should be test_bit_acquire(), thank you!
(N.B. I don't think the current code could result in an observable
bug, given that the later write of the PTE has a control dependency
here. But it is certainly written incorrectly.)
> > + if (!test_bit(offset % BITS_PER_LONG, &chunk))
> > + return false;
> > +
> > + kvm_prepare_memory_fault_exit(vcpu, fault);
> > + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_USERFAULT;
> > + return true;
> > +}
> > +#endif
> > +
> > int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> > struct kvm_enable_cap *cap)
> > {
> > --
> > 2.50.0.rc2.692.g299adb8693-goog
> >
>
> Thanks,
> Oliver
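
To make the ordering fix above concrete, a minimal sketch of the
release/acquire pairing; the userspace side mirrors the selftest added later
in this series, and the variable names are illustrative:

/* Userspace: publish the page contents, then clear the userfault bit
 * with release semantics so that the contents are visible to KVM before
 * the cleared bit is. */
memcpy(hva, page_contents, page_size);
atomic_fetch_and_explicit((_Atomic unsigned long *)bitmap_chunk,
                          ~(1ul << (page % BITS_PER_LONG)),
                          memory_order_release);

/* KVM: pair the store-release with a load-acquire when consuming the
 * bitmap, as suggested above. */
if (!test_bit_acquire(offset % BITS_PER_LONG, &chunk))
        return false;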
* [PATCH v3 05/15] KVM: x86: Add support for KVM userfault exits
2025-06-18 4:24 [PATCH v3 00/15] KVM: Introduce KVM Userfault James Houghton
` (3 preceding siblings ...)
2025-06-18 4:24 ` [PATCH v3 04/15] KVM: Add common infrastructure for KVM Userfaults James Houghton
@ 2025-06-18 4:24 ` James Houghton
2025-07-30 21:11 ` James Houghton
2025-06-18 4:24 ` [PATCH v3 06/15] KVM: arm64: " James Houghton
` (11 subsequent siblings)
16 siblings, 1 reply; 31+ messages in thread
From: James Houghton @ 2025-06-18 4:24 UTC (permalink / raw)
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
Only a few changes are needed to support KVM userfault exits on x86:
1. Adjust kvm_mmu_hugepage_adjust() to force pages to be mapped at 4K
while KVM_MEM_USERFAULT is enabled.
2. Return -EFAULT when kvm_do_userfault() reports that the page is
userfault. (Upon failure to read from the bitmap, kvm_do_userfault()
will return true without setting up a memory fault exit, so we'll
return a bare -EFAULT.)
For hugepage recovery, the behavior when disabling KVM_MEM_USERFAULT
should match the behavior when disabling KVM_MEM_LOG_DIRTY_PAGES; make
changes to kvm_mmu_slot_apply_flags() to recover hugepages when
KVM_MEM_USERFAULT is disabled.
Signed-off-by: James Houghton <jthoughton@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/mmu/mmu.c | 5 ++++-
arch/x86/kvm/x86.c | 27 +++++++++++++++++----------
2 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a4439e9e07268..49eb6b9b268cb 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3304,7 +3304,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (is_error_noslot_pfn(fault->pfn))
return;
- if (kvm_slot_dirty_track_enabled(slot))
+ if (kvm_slot_dirty_track_enabled(slot) || kvm_is_userfault_memslot(slot))
return;
/*
@@ -4522,6 +4522,9 @@ static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
{
unsigned int foll = fault->write ? FOLL_WRITE : 0;
+ if (kvm_do_userfault(vcpu, fault))
+ return -EFAULT;
+
if (fault->is_private)
return kvm_mmu_faultin_pfn_private(vcpu, fault);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b58a74c1722de..fa279ba38115c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13152,12 +13152,27 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
u32 new_flags = new ? new->flags : 0;
bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
+ /*
+ * Recover hugepages when userfault is toggled off, as KVM forces 4KiB
+ * mappings when userfault is enabled. See below for why CREATE, MOVE,
+ * and DELETE don't need special handling. Note, common KVM handles
+ * zapping SPTEs when userfault is toggled on.
+ */
+ if (change == KVM_MR_FLAGS_ONLY && (old_flags & KVM_MEM_USERFAULT) &&
+ !(new_flags & KVM_MEM_USERFAULT))
+ kvm_mmu_recover_huge_pages(kvm, new);
+
+ /*
+ * Nothing more to do if dirty logging isn't being toggled.
+ */
+ if (!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES))
+ return;
+
/*
* Update CPU dirty logging if dirty logging is being toggled. This
* applies to all operations.
*/
- if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
- kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
+ kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
/*
* Nothing more to do for RO slots (which can't be dirtied and can't be
@@ -13177,14 +13192,6 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
return;
- /*
- * READONLY and non-flags changes were filtered out above, and the only
- * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
- * logging isn't being toggled on or off.
- */
- if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
- return;
-
if (!log_dirty_pages) {
/*
* Recover huge page mappings in the slot now that dirty logging
--
2.50.0.rc2.692.g299adb8693-goog
* Re: [PATCH v3 05/15] KVM: x86: Add support for KVM userfault exits
2025-06-18 4:24 ` [PATCH v3 05/15] KVM: x86: Add support for KVM userfault exits James Houghton
@ 2025-07-30 21:11 ` James Houghton
0 siblings, 0 replies; 31+ messages in thread
From: James Houghton @ 2025-07-30 21:11 UTC (permalink / raw)
To: jthoughton
Cc: amoorthy, corbet, dmatlack, kalyazin, kvm, kvmarm,
linux-arm-kernel, linux-doc, linux-kernel, maz, oliver.upton,
pbonzini, peterx, pgonda, seanjc, wei.w.wang, yan.y.zhao
On Tue, Jun 17, 2025 at 9:24 PM James Houghton <jthoughton@google.com> wrote:
>
> Only a few changes are needed to support KVM userfault exits on x86:
>
> 1. Adjust kvm_mmu_hugepage_adjust() to force pages to be mapped at 4K
> while KVM_MEM_USERFAULT is enabled.
> 2. Return -EFAULT when kvm_do_userfault() reports that the page is
> userfault. (Upon failure to read from the bitmap, kvm_do_userfault()
> will return true without setting up a memory fault exit, so we'll
> return a bare -EFAULT.)
>
> For hugepage recovery, the behavior when disabling KVM_MEM_USERFAULT
> should match the behavior when disabling KVM_MEM_LOG_DIRTY_PAGES; make
> changes to kvm_mmu_slot_apply_flags() to recover hugepages when
> KVM_MEM_USERFAULT is disabled.
>
> Signed-off-by: James Houghton <jthoughton@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
This patch fails to remove the WARN in recover_huge_pages_range(). The
diff below will be applied to the next version of this patch, whenever it
comes.
This WARN can be hit by enabling KVM_MEM_LOG_DIRTY_PAGES and
KVM_MEM_USERFAULT, then disabling KVM_MEM_USERFAULT.
I've been having offline discussions with Sean about this series; I'm
waiting for him to rework the KVM_GENERIC_PAGE_FAULT bits. I'll dedicate
some more time to the QEMU side of things too.
Thanks.
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7f3d7229b2c1f..2d83ddb233a9a 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1779,7 +1779,7 @@ static void recover_huge_pages_range(struct kvm *kvm,
u64 huge_spte;
int r;
- if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
+ if (kvm_slot_dirty_track_enabled(slot))
return;
rcu_read_lock();
* [PATCH v3 06/15] KVM: arm64: Add support for KVM userfault exits
2025-06-18 4:24 [PATCH v3 00/15] KVM: Introduce KVM Userfault James Houghton
` (4 preceding siblings ...)
2025-06-18 4:24 ` [PATCH v3 05/15] KVM: x86: Add support for KVM userfault exits James Houghton
@ 2025-06-18 4:24 ` James Houghton
2025-06-18 4:24 ` [PATCH v3 07/15] KVM: Enable and advertise " James Houghton
` (10 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: James Houghton @ 2025-06-18 4:24 UTC (permalink / raw)
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
To support KVM userfault exits with arm64:
1. Force mappings to be 4K while KVM_MEM_USERFAULT is enabled.
2. Return -EFAULT when kvm_do_userfault() reports that the page is
userfault (or that reading the bitmap failed).
kvm_arch_commit_memory_region() was written assuming that, for
KVM_MR_FLAGS_ONLY changes, KVM_MEM_LOG_DIRTY_PAGES must be being
toggled. This is no longer the case, so adjust the logic appropriately.
Signed-off-by: James Houghton <jthoughton@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/arm64/kvm/mmu.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 0c209f2e1c7b2..d75a6685d6842 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1548,7 +1548,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* logging_active is guaranteed to never be true for VM_PFNMAP
* memslots.
*/
- if (logging_active) {
+ if (logging_active || is_protected_kvm_enabled() ||
+ kvm_is_userfault_memslot(memslot)) {
force_pte = true;
vma_shift = PAGE_SHIFT;
} else {
@@ -1637,6 +1638,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
mmu_seq = vcpu->kvm->mmu_invalidate_seq;
mmap_read_unlock(current->mm);
+ if (kvm_do_userfault(vcpu, &fault))
+ return -EFAULT;
+
pfn = __kvm_faultin_pfn(memslot, fault.gfn, fault.write ? FOLL_WRITE : 0,
&writable, &page);
if (pfn == KVM_PFN_ERR_HWPOISON) {
@@ -2134,15 +2138,19 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+ u32 old_flags = old ? old->flags : 0;
+ u32 new_flags = new ? new->flags : 0;
+
+ /* Nothing to do if not toggling dirty logging. */
+ if (!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES))
+ return;
/*
* At this point memslot has been committed and there is an
* allocated dirty_bitmap[], dirty pages will be tracked while the
* memory slot is write protected.
*/
- if (log_dirty_pages) {
-
+ if (new_flags & KVM_MEM_LOG_DIRTY_PAGES) {
if (change == KVM_MR_DELETE)
return;
--
2.50.0.rc2.692.g299adb8693-goog
* [PATCH v3 07/15] KVM: Enable and advertise support for KVM userfault exits
2025-06-18 4:24 [PATCH v3 00/15] KVM: Introduce KVM Userfault James Houghton
` (5 preceding siblings ...)
2025-06-18 4:24 ` [PATCH v3 06/15] KVM: arm64: " James Houghton
@ 2025-06-18 4:24 ` James Houghton
2025-06-18 4:24 ` [PATCH v3 08/15] KVM: selftests: Fix vm_mem_region_set_flags docstring James Houghton
` (9 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: James Houghton @ 2025-06-18 4:24 UTC (permalink / raw)
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
Now that all architectures (arm64 and x86) that utilize "generic" page
faults also support userfault exits, advertise support for
KVM_CAP_USERFAULT and let userspace set KVM_MEM_USERFAULT in memslots.
Signed-off-by: James Houghton <jthoughton@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
include/uapi/linux/kvm.h | 1 +
virt/kvm/kvm_main.c | 6 ++++++
2 files changed, 7 insertions(+)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index e3b871506ec85..0ba265f99f033 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -937,6 +937,7 @@ struct kvm_enable_cap {
#define KVM_CAP_ARM_EL2 240
#define KVM_CAP_ARM_EL2_E2H0 241
#define KVM_CAP_RISCV_MP_STATE_RESET 242
+#define KVM_CAP_USERFAULT 243
struct kvm_irq_routing_irqchip {
__u32 irqchip;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bef6760cd1c0e..2962be09d5ebf 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1604,6 +1604,9 @@ static int check_memory_region_flags(struct kvm *kvm,
!(mem->flags & KVM_MEM_GUEST_MEMFD))
valid_flags |= KVM_MEM_READONLY;
+ if (IS_ENABLED(CONFIG_KVM_GENERIC_PAGE_FAULT))
+ valid_flags |= KVM_MEM_USERFAULT;
+
if (mem->flags & ~valid_flags)
return -EINVAL;
@@ -4881,6 +4884,9 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_CHECK_EXTENSION_VM:
case KVM_CAP_ENABLE_CAP_VM:
case KVM_CAP_HALT_POLL:
+#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
+ case KVM_CAP_USERFAULT:
+#endif
return 1;
#ifdef CONFIG_KVM_MMIO
case KVM_CAP_COALESCED_MMIO:
--
2.50.0.rc2.692.g299adb8693-goog
* [PATCH v3 08/15] KVM: selftests: Fix vm_mem_region_set_flags docstring
2025-06-18 4:24 [PATCH v3 00/15] KVM: Introduce KVM Userfault James Houghton
` (6 preceding siblings ...)
2025-06-18 4:24 ` [PATCH v3 07/15] KVM: Enable and advertise " James Houghton
@ 2025-06-18 4:24 ` James Houghton
2025-06-18 4:24 ` [PATCH v3 09/15] KVM: selftests: Fix prefault_mem logic James Houghton
` (8 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: James Houghton @ 2025-06-18 4:24 UTC (permalink / raw)
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
`flags` is what region->region.flags gets set to.
Signed-off-by: James Houghton <jthoughton@google.com>
---
tools/testing/selftests/kvm/lib/kvm_util.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index a055343a7bf75..ca1aa1699f8aa 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1200,7 +1200,7 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot)
*
* Input Args:
* vm - Virtual Machine
- * flags - Starting guest physical address
+ * flags - Flags for the memslot
*
* Output Args: None
*
--
2.50.0.rc2.692.g299adb8693-goog
* [PATCH v3 09/15] KVM: selftests: Fix prefault_mem logic
2025-06-18 4:24 [PATCH v3 00/15] KVM: Introduce KVM Userfault James Houghton
` (7 preceding siblings ...)
2025-06-18 4:24 ` [PATCH v3 08/15] KVM: selftests: Fix vm_mem_region_set_flags docstring James Houghton
@ 2025-06-18 4:24 ` James Houghton
2025-06-18 4:24 ` [PATCH v3 10/15] KVM: selftests: Add va_start/end into uffd_desc James Houghton
` (7 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: James Houghton @ 2025-06-18 4:24 UTC (permalink / raw)
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
The previous logic didn't handle the case where memory was partitioned
AND we were using a single userfaultfd. It would only prefault the first
vCPU's memory and not the rest.
Signed-off-by: James Houghton <jthoughton@google.com>
---
tools/testing/selftests/kvm/demand_paging_test.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index 0202b78f8680a..315f5c9037b40 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -172,11 +172,13 @@ static void run_test(enum vm_guest_mode mode, void *arg)
memset(guest_data_prototype, 0xAB, demand_paging_size);
if (p->uffd_mode == UFFDIO_REGISTER_MODE_MINOR) {
- num_uffds = p->single_uffd ? 1 : nr_vcpus;
- for (i = 0; i < num_uffds; i++) {
+ for (i = 0; i < nr_vcpus; i++) {
vcpu_args = &memstress_args.vcpu_args[i];
prefault_mem(addr_gpa2alias(vm, vcpu_args->gpa),
vcpu_args->pages * memstress_args.guest_page_size);
+ if (!p->partition_vcpu_memory_access)
+ /* We prefaulted everything */
+ break;
}
}
--
2.50.0.rc2.692.g299adb8693-goog
* [PATCH v3 10/15] KVM: selftests: Add va_start/end into uffd_desc
2025-06-18 4:24 [PATCH v3 00/15] KVM: Introduce KVM Userfault James Houghton
` (8 preceding siblings ...)
2025-06-18 4:24 ` [PATCH v3 09/15] KVM: selftests: Fix prefault_mem logic James Houghton
@ 2025-06-18 4:24 ` James Houghton
2025-06-18 4:24 ` [PATCH v3 11/15] KVM: selftests: Add KVM Userfault mode to demand_paging_test James Houghton
` (6 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: James Houghton @ 2025-06-18 4:24 UTC (permalink / raw)
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
This will be used for the selftest to look up which userfaultfd we
should be using when handling a KVM Userfault (in the event KVM
Userfault and userfaultfd are being used together).
Signed-off-by: James Houghton <jthoughton@google.com>
---
tools/testing/selftests/kvm/include/userfaultfd_util.h | 2 ++
tools/testing/selftests/kvm/lib/userfaultfd_util.c | 2 ++
2 files changed, 4 insertions(+)
diff --git a/tools/testing/selftests/kvm/include/userfaultfd_util.h b/tools/testing/selftests/kvm/include/userfaultfd_util.h
index 60f7f9d435dc2..b62fecdfe745c 100644
--- a/tools/testing/selftests/kvm/include/userfaultfd_util.h
+++ b/tools/testing/selftests/kvm/include/userfaultfd_util.h
@@ -30,6 +30,8 @@ struct uffd_desc {
int *pipefds;
pthread_t *readers;
struct uffd_reader_args *reader_args;
+ void *va_start;
+ void *va_end;
};
struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay,
diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c
index 5bde176cedd59..31d38b3a9d127 100644
--- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c
+++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c
@@ -152,6 +152,8 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay,
expected_ioctls, "missing userfaultfd ioctls");
uffd_desc->uffd = uffd;
+ uffd_desc->va_start = hva;
+ uffd_desc->va_end = (char *)hva + len;
for (i = 0; i < uffd_desc->num_readers; ++i) {
int pipes[2];
--
2.50.0.rc2.692.g299adb8693-goog
* [PATCH v3 11/15] KVM: selftests: Add KVM Userfault mode to demand_paging_test
2025-06-18 4:24 [PATCH v3 00/15] KVM: Introduce KVM Userfault James Houghton
` (9 preceding siblings ...)
2025-06-18 4:24 ` [PATCH v3 10/15] KVM: selftests: Add va_start/end into uffd_desc James Houghton
@ 2025-06-18 4:24 ` James Houghton
2025-06-18 4:24 ` [PATCH v3 12/15] KVM: selftests: Inform set_memory_region_test of KVM_MEM_USERFAULT James Houghton
` (5 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: James Houghton @ 2025-06-18 4:24 UTC (permalink / raw)
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
Add a way for the KVM_RUN loop to handle EFAULT exits when they are for
KVM_MEMORY_EXIT_FLAG_USERFAULT. In this case, preemptively handle the
UFFDIO_COPY or UFFDIO_CONTINUE if userfaultfd is also in use. This saves
the trip through the userfaultfd poll/read/WAKE loop.
When preemptively handling UFFDIO_COPY/CONTINUE, do so with
MODE_DONTWAKE, as there will not be a thread to wake. If a thread *does*
take the userfaultfd slow path, we will get a regular userfault, and we
will call handle_uffd_page_request() which will do a full wake-up. In
the EEXIST case, a wake-up will not occur. Make sure to call UFFDIO_WAKE
explicitly in this case.
When handling KVM userfaults, make sure to clear bits in the bitmap with
memory_order_release. Although it wouldn't affect the functionality of
the test (because memstress doesn't actually require any particular
guest memory contents), it is what userspace normally needs to do.
Add `-k` to set the test to use KVM Userfault.
Add the vm_mem_region_set_flags_userfault() helper for setting
`userfault_bitmap` and KVM_MEM_USERFAULT at the same time.
Signed-off-by: James Houghton <jthoughton@google.com>
---
.../selftests/kvm/demand_paging_test.c | 139 +++++++++++++++++-
.../testing/selftests/kvm/include/kvm_util.h | 5 +
tools/testing/selftests/kvm/lib/kvm_util.c | 40 ++++-
3 files changed, 176 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index 315f5c9037b40..183c707310933 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -12,7 +12,9 @@
#include <time.h>
#include <pthread.h>
#include <linux/userfaultfd.h>
+#include <linux/bitmap.h>
#include <sys/syscall.h>
+#include <stdatomic.h>
#include "kvm_util.h"
#include "test_util.h"
@@ -24,11 +26,21 @@
#ifdef __NR_userfaultfd
static int nr_vcpus = 1;
+static int num_uffds;
static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
static size_t demand_paging_size;
+static size_t host_page_size;
static char *guest_data_prototype;
+static struct {
+ bool enabled;
+ int uffd_mode; /* set if userfaultfd is also in use */
+ struct uffd_desc **uffd_descs;
+} kvm_userfault_data;
+
+static void resolve_kvm_userfault(u64 gpa, u64 size);
+
static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
{
struct kvm_vcpu *vcpu = vcpu_args->vcpu;
@@ -41,8 +53,22 @@ static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
clock_gettime(CLOCK_MONOTONIC, &start);
/* Let the guest access its memory */
+restart:
ret = _vcpu_run(vcpu);
- TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret);
+ if (ret < 0 && errno == EFAULT && kvm_userfault_data.enabled) {
+ /* Check for userfault. */
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_MEMORY_FAULT,
+ "Got invalid exit reason: %x", run->exit_reason);
+ TEST_ASSERT(run->memory_fault.flags ==
+ KVM_MEMORY_EXIT_FLAG_USERFAULT,
+ "Got invalid memory fault exit: %llx",
+ run->memory_fault.flags);
+ resolve_kvm_userfault(run->memory_fault.gpa,
+ run->memory_fault.size);
+ goto restart;
+ } else
+ TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret);
+
if (get_ucall(vcpu, NULL) != UCALL_SYNC) {
TEST_ASSERT(false,
"Invalid guest sync status: exit_reason=%s",
@@ -54,11 +80,10 @@ static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
ts_diff.tv_sec, ts_diff.tv_nsec);
}
-static int handle_uffd_page_request(int uffd_mode, int uffd,
- struct uffd_msg *msg)
+static int resolve_uffd_page_request(int uffd_mode, int uffd, uint64_t addr,
+ bool wake)
{
pid_t tid = syscall(__NR_gettid);
- uint64_t addr = msg->arg.pagefault.address;
struct timespec start;
struct timespec ts_diff;
int r;
@@ -71,7 +96,7 @@ static int handle_uffd_page_request(int uffd_mode, int uffd,
copy.src = (uint64_t)guest_data_prototype;
copy.dst = addr;
copy.len = demand_paging_size;
- copy.mode = 0;
+ copy.mode = wake ? 0 : UFFDIO_COPY_MODE_DONTWAKE;
r = ioctl(uffd, UFFDIO_COPY, &copy);
/*
@@ -96,6 +121,7 @@ static int handle_uffd_page_request(int uffd_mode, int uffd,
cont.range.start = addr;
cont.range.len = demand_paging_size;
+ cont.mode = wake ? 0 : UFFDIO_CONTINUE_MODE_DONTWAKE;
r = ioctl(uffd, UFFDIO_CONTINUE, &cont);
/*
@@ -119,6 +145,20 @@ static int handle_uffd_page_request(int uffd_mode, int uffd,
TEST_FAIL("Invalid uffd mode %d", uffd_mode);
}
+ if (r < 0 && wake) {
+ /*
+ * No wake-up occurs when UFFDIO_COPY/CONTINUE fails, but we
+ * have a thread waiting. Wake it up.
+ */
+ struct uffdio_range range = {0};
+
+ range.start = addr;
+ range.len = demand_paging_size;
+
+ TEST_ASSERT(ioctl(uffd, UFFDIO_WAKE, &range) == 0,
+ "UFFDIO_WAKE failed: 0x%lx", addr);
+ }
+
ts_diff = timespec_elapsed(start);
PER_PAGE_DEBUG("UFFD page-in %d \t%ld ns\n", tid,
@@ -129,6 +169,58 @@ static int handle_uffd_page_request(int uffd_mode, int uffd,
return 0;
}
+static int handle_uffd_page_request(int uffd_mode, int uffd,
+ struct uffd_msg *msg)
+{
+ uint64_t addr = msg->arg.pagefault.address;
+
+ return resolve_uffd_page_request(uffd_mode, uffd, addr, true);
+}
+
+static void resolve_kvm_userfault(u64 gpa, u64 size)
+{
+ struct kvm_vm *vm = memstress_args.vm;
+ struct userspace_mem_region *region;
+ unsigned long *bitmap_chunk;
+ u64 page, gpa_offset;
+
+ region = (struct userspace_mem_region *) userspace_mem_region_find(
+ vm, gpa, (gpa + size - 1));
+
+ if (kvm_userfault_data.uffd_mode) {
+ /*
+ * Resolve userfaults early, without needing to read them
+ * off the userfaultfd.
+ */
+ uint64_t hva = (uint64_t)addr_gpa2hva(vm, gpa);
+ struct uffd_desc **descs = kvm_userfault_data.uffd_descs;
+ int i, fd;
+
+ for (i = 0; i < num_uffds; ++i)
+ if (hva >= (uint64_t)descs[i]->va_start &&
+ hva < (uint64_t)descs[i]->va_end)
+ break;
+
+ TEST_ASSERT(i < num_uffds,
+ "Did not find userfaultfd for hva: %lx", hva);
+
+ fd = kvm_userfault_data.uffd_descs[i]->uffd;
+ resolve_uffd_page_request(kvm_userfault_data.uffd_mode, fd,
+ hva, false);
+ } else {
+ uint64_t hva = (uint64_t)addr_gpa2hva(vm, gpa);
+
+ memcpy((char *)hva, guest_data_prototype, demand_paging_size);
+ }
+
+ gpa_offset = gpa - region->region.guest_phys_addr;
+ page = gpa_offset / host_page_size;
+ bitmap_chunk = (unsigned long *)region->region.userfault_bitmap +
+ page / BITS_PER_LONG;
+ atomic_fetch_and_explicit((_Atomic unsigned long *)bitmap_chunk,
+ ~(1ul << (page % BITS_PER_LONG)), memory_order_release);
+}
+
struct test_params {
int uffd_mode;
bool single_uffd;
@@ -136,6 +228,7 @@ struct test_params {
int readers_per_uffd;
enum vm_mem_backing_src_type src_type;
bool partition_vcpu_memory_access;
+ bool kvm_userfault;
};
static void prefault_mem(void *alias, uint64_t len)
@@ -149,6 +242,25 @@ static void prefault_mem(void *alias, uint64_t len)
}
}
+static void enable_userfault(struct kvm_vm *vm, int slots)
+{
+ for (int i = 0; i < slots; ++i) {
+ int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
+ struct userspace_mem_region *region;
+ unsigned long *userfault_bitmap;
+ int flags = KVM_MEM_USERFAULT;
+
+ region = memslot2region(vm, slot);
+ userfault_bitmap = bitmap_zalloc(region->mmap_size /
+ host_page_size);
+ /* everything is userfault initially */
+ memset(userfault_bitmap, -1, region->mmap_size / host_page_size / CHAR_BIT);
+ printf("Setting bitmap: %p\n", userfault_bitmap);
+ vm_mem_region_set_flags_userfault(vm, slot, flags,
+ userfault_bitmap);
+ }
+}
+
static void run_test(enum vm_guest_mode mode, void *arg)
{
struct memstress_vcpu_args *vcpu_args;
@@ -159,12 +271,13 @@ static void run_test(enum vm_guest_mode mode, void *arg)
struct timespec ts_diff;
double vcpu_paging_rate;
struct kvm_vm *vm;
- int i, num_uffds = 0;
+ int i;
vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
p->src_type, p->partition_vcpu_memory_access);
demand_paging_size = get_backing_src_pagesz(p->src_type);
+ host_page_size = getpagesize();
guest_data_prototype = malloc(demand_paging_size);
TEST_ASSERT(guest_data_prototype,
@@ -208,6 +321,14 @@ static void run_test(enum vm_guest_mode mode, void *arg)
}
}
+ if (p->kvm_userfault) {
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_USERFAULT));
+ kvm_userfault_data.enabled = true;
+ kvm_userfault_data.uffd_mode = p->uffd_mode;
+ kvm_userfault_data.uffd_descs = uffd_descs;
+ enable_userfault(vm, 1);
+ }
+
pr_info("Finished creating vCPUs and starting uffd threads\n");
clock_gettime(CLOCK_MONOTONIC, &start);
@@ -265,6 +386,7 @@ static void help(char *name)
printf(" -v: specify the number of vCPUs to run.\n");
printf(" -o: Overlap guest memory accesses instead of partitioning\n"
" them into a separate region of memory for each vCPU.\n");
+ printf(" -k: Use KVM Userfault\n");
puts("");
exit(0);
}
@@ -283,7 +405,7 @@ int main(int argc, char *argv[])
guest_modes_append_default();
- while ((opt = getopt(argc, argv, "ahom:u:d:b:s:v:c:r:")) != -1) {
+ while ((opt = getopt(argc, argv, "ahokm:u:d:b:s:v:c:r:")) != -1) {
switch (opt) {
case 'm':
guest_modes_cmdline(optarg);
@@ -326,6 +448,9 @@ int main(int argc, char *argv[])
"Invalid number of readers per uffd %d: must be >=1",
p.readers_per_uffd);
break;
+ case 'k':
+ p.kvm_userfault = true;
+ break;
case 'h':
default:
help(argv[0]);
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index bee65ca087217..5642d075900f0 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -630,6 +630,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
uint64_t guest_paddr, uint32_t slot, uint64_t npages,
uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset);
+struct userspace_mem_region *
+userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end);
#ifndef vm_arch_has_protected_memory
static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm)
@@ -639,6 +641,9 @@ static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm)
#endif
void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
+void vm_mem_region_set_flags_userfault(struct kvm_vm *vm, uint32_t slot,
+ uint32_t flags,
+ unsigned long *userfault_bitmap);
void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index ca1aa1699f8aa..3c215df1d2d84 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -694,7 +694,7 @@ void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[],
* of the regions is returned. Null is returned only when no overlapping
* region exists.
*/
-static struct userspace_mem_region *
+struct userspace_mem_region *
userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end)
{
struct rb_node *node;
@@ -1225,6 +1225,44 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
ret, errno, slot, flags);
}
+/*
+ * VM Memory Region Flags Set with a userfault bitmap
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * flags - Flags for the memslot
+ * userfault_bitmap - The bitmap to use for KVM_MEM_USERFAULT
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the flags of the memory region specified by the value of slot,
+ * to the values given by flags. This helper adds a way to provide a
+ * userfault_bitmap.
+ */
+void vm_mem_region_set_flags_userfault(struct kvm_vm *vm, uint32_t slot,
+ uint32_t flags,
+ unsigned long *userfault_bitmap)
+{
+ int ret;
+ struct userspace_mem_region *region;
+
+ region = memslot2region(vm, slot);
+
+ TEST_ASSERT(!userfault_bitmap ^ (flags & KVM_MEM_USERFAULT),
+ "KVM_MEM_USERFAULT must be specified with a bitmap");
+
+ region->region.flags = flags;
+ region->region.userfault_bitmap = (__u64)userfault_bitmap;
+
+ ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
+
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
+ " rc: %i errno: %i slot: %u flags: 0x%x",
+ ret, errno, slot, flags);
+}
+
/*
* VM Memory Region Move
*
--
2.50.0.rc2.692.g299adb8693-goog
* [PATCH v3 12/15] KVM: selftests: Inform set_memory_region_test of KVM_MEM_USERFAULT
2025-06-18 4:24 [PATCH v3 00/15] KVM: Introduce KVM Userfault James Houghton
` (10 preceding siblings ...)
2025-06-18 4:24 ` [PATCH v3 11/15] KVM: selftests: Add KVM Userfault mode to demand_paging_test James Houghton
@ 2025-06-18 4:24 ` James Houghton
2025-06-18 4:24 ` [PATCH v3 13/15] KVM: selftests: Add KVM_MEM_USERFAULT + guest_memfd toggle tests James Houghton
` (4 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: James Houghton @ 2025-06-18 4:24 UTC (permalink / raw)
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
The KVM_MEM_USERFAULT flag is supported iff KVM_CAP_USERFAULT is
available.
Signed-off-by: James Houghton <jthoughton@google.com>
---
tools/testing/selftests/kvm/set_memory_region_test.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c
index ce3ac0fd6dfb4..ba3fe8a53b33e 100644
--- a/tools/testing/selftests/kvm/set_memory_region_test.c
+++ b/tools/testing/selftests/kvm/set_memory_region_test.c
@@ -364,6 +364,9 @@ static void test_invalid_memory_region_flags(void)
if (kvm_check_cap(KVM_CAP_MEMORY_ATTRIBUTES) & KVM_MEMORY_ATTRIBUTE_PRIVATE)
supported_flags |= KVM_MEM_GUEST_MEMFD;
+ if (kvm_check_cap(KVM_CAP_USERFAULT))
+ supported_flags |= KVM_MEM_USERFAULT;
+
for (i = 0; i < 32; i++) {
if ((supported_flags & BIT(i)) && !(v2_only_flags & BIT(i)))
continue;
--
2.50.0.rc2.692.g299adb8693-goog
* [PATCH v3 13/15] KVM: selftests: Add KVM_MEM_USERFAULT + guest_memfd toggle tests
2025-06-18 4:24 [PATCH v3 00/15] KVM: Introduce KVM Userfault James Houghton
` (11 preceding siblings ...)
2025-06-18 4:24 ` [PATCH v3 12/15] KVM: selftests: Inform set_memory_region_test of KVM_MEM_USERFAULT James Houghton
@ 2025-06-18 4:24 ` James Houghton
2025-06-18 4:24 ` [PATCH v3 14/15] KVM: Documentation: Fix section number for KVM_CAP_ARM_WRITABLE_IMP_ID_REGS James Houghton
` (3 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: James Houghton @ 2025-06-18 4:24 UTC (permalink / raw)
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
Make sure KVM_MEM_USERFAULT can be toggled on and off for
KVM_MEM_GUEST_MEMFD memslots.
Signed-off-by: James Houghton <jthoughton@google.com>
---
.../selftests/kvm/set_memory_region_test.c | 30 +++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c
index ba3fe8a53b33e..3f77abb257601 100644
--- a/tools/testing/selftests/kvm/set_memory_region_test.c
+++ b/tools/testing/selftests/kvm/set_memory_region_test.c
@@ -606,6 +606,35 @@ static void test_mmio_during_vectoring(void)
kvm_vm_free(vm);
}
+
+static void test_private_memory_region_userfault(void)
+{
+ struct kvm_vm *vm;
+ int memfd;
+
+ pr_info("Testing toggling KVM_MEM_USERFAULT on KVM_MEM_GUEST_MEMFD memory regions\n");
+
+ vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM);
+
+ test_invalid_guest_memfd(vm, vm->kvm_fd, 0, "KVM fd should fail");
+ test_invalid_guest_memfd(vm, vm->fd, 0, "VM's fd should fail");
+
+ memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE, 0);
+
+ vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD,
+ MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0);
+
+ vm_set_user_memory_region2(vm, MEM_REGION_SLOT,
+ KVM_MEM_GUEST_MEMFD | KVM_MEM_USERFAULT,
+ MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0);
+
+ vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD,
+ MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0);
+
+ close(memfd);
+
+ kvm_vm_free(vm);
+}
#endif
int main(int argc, char *argv[])
@@ -633,6 +662,7 @@ int main(int argc, char *argv[])
(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM))) {
test_add_private_memory_region();
test_add_overlapping_private_memory_regions();
+ test_private_memory_region_userfault();
} else {
pr_info("Skipping tests for KVM_MEM_GUEST_MEMFD memory regions\n");
}
--
2.50.0.rc2.692.g299adb8693-goog
* [PATCH v3 14/15] KVM: Documentation: Fix section number for KVM_CAP_ARM_WRITABLE_IMP_ID_REGS
2025-06-18 4:24 [PATCH v3 00/15] KVM: Introduce KVM Userfault James Houghton
` (12 preceding siblings ...)
2025-06-18 4:24 ` [PATCH v3 13/15] KVM: selftests: Add KVM_MEM_USERFAULT + guest_memfd toggle tests James Houghton
@ 2025-06-18 4:24 ` James Houghton
2025-06-18 4:24 ` [PATCH v3 15/15] KVM: Documentation: Add KVM_CAP_USERFAULT and KVM_MEM_USERFAULT details James Houghton
` (2 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: James Houghton @ 2025-06-18 4:24 UTC (permalink / raw)
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm, Xin Li (Intel)
From: "Xin Li (Intel)" <xin@zytor.com>
The previous section is 7.41, thus this should be 7.42.
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
Signed-off-by: James Houghton <jthoughton@google.com>
---
Documentation/virt/kvm/api.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 1bd2d42e6424c..ff0aa9eb91efe 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8528,7 +8528,7 @@ ENOSYS for the others.
When enabled, KVM will exit to userspace with KVM_EXIT_SYSTEM_EVENT of
type KVM_SYSTEM_EVENT_SUSPEND to process the guest suspend request.
-7.37 KVM_CAP_ARM_WRITABLE_IMP_ID_REGS
+7.42 KVM_CAP_ARM_WRITABLE_IMP_ID_REGS
-------------------------------------
:Architectures: arm64
--
2.50.0.rc2.692.g299adb8693-goog
* [PATCH v3 15/15] KVM: Documentation: Add KVM_CAP_USERFAULT and KVM_MEM_USERFAULT details
2025-06-18 4:24 [PATCH v3 00/15] KVM: Introduce KVM Userfault James Houghton
` (13 preceding siblings ...)
2025-06-18 4:24 ` [PATCH v3 14/15] KVM: Documentation: Fix section number for KVM_CAP_ARM_WRITABLE_IMP_ID_REGS James Houghton
@ 2025-06-18 4:24 ` James Houghton
2025-06-18 23:24 ` [PATCH v3 00/15] KVM: Introduce KVM Userfault Oliver Upton
2025-09-04 16:43 ` Nikita Kalyazin
16 siblings, 0 replies; 31+ messages in thread
From: James Houghton @ 2025-06-18 4:24 UTC (permalink / raw)
To: Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, James Houghton,
Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm, Bagas Sanjaya
Include the note about memory ordering when clearing bits in
userfault_bitmap, as it may not be obvious to users.
Signed-off-by: James Houghton <jthoughton@google.com>
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
---
Documentation/virt/kvm/api.rst | 33 ++++++++++++++++++++++++++++++++-
1 file changed, 32 insertions(+), 1 deletion(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index ff0aa9eb91efe..25668206a5d80 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6308,7 +6308,8 @@ bounds checks apply (use common sense).
__u64 guest_memfd_offset;
__u32 guest_memfd;
__u32 pad1;
- __u64 pad2[14];
+ __u64 userfault_bitmap;
+ __u64 pad2[13];
};
A KVM_MEM_GUEST_MEMFD region _must_ have a valid guest_memfd (private memory) and
@@ -6324,6 +6325,25 @@ state. At VM creation time, all memory is shared, i.e. the PRIVATE attribute
is '0' for all gfns. Userspace can control whether memory is shared/private by
toggling KVM_MEMORY_ATTRIBUTE_PRIVATE via KVM_SET_MEMORY_ATTRIBUTES as needed.
+When the KVM_MEM_USERFAULT flag is set, userfault_bitmap points to the starting
+address for the bitmap that controls if vCPU memory faults should immediately
+exit to userspace. If an invalid pointer is provided, at fault time, KVM_RUN
+will return -EFAULT. KVM_MEM_USERFAULT is only supported when
+KVM_CAP_USERFAULT is supported.
+
+userfault_bitmap should point to an array of longs where each bit in the array
+linearly corresponds to a single gfn. Bit 0 in userfault_bitmap corresponds to
+guest_phys_addr, bit 1 corresponds to guest_phys_addr + PAGE_SIZE, etc. If the
+bit for a page is set, any vCPU access to that page will exit to userspace with
+KVM_MEMORY_EXIT_FLAG_USERFAULT.
+
+Setting bits in userfault_bitmap has no effect on pages that have already been
+mapped by KVM until KVM_MEM_USERFAULT is disabled and re-enabled.
+
+Clearing bits in userfault_bitmap should usually be done with a store-release
+if changes to guest memory are being made available to the guest via
+userfault_bitmap.
+
S390:
^^^^^
@@ -8557,6 +8577,17 @@ given VM.
When this capability is enabled, KVM resets the VCPU when setting
MP_STATE_INIT_RECEIVED through IOCTL. The original MP_STATE is preserved.
+7.44 KVM_CAP_USERFAULT
+----------------------
+
+:Architectures: x86, arm64
+:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP.
+
+The presence of this capability indicates that KVM_SET_USER_MEMORY_REGION2 will
+accept KVM_MEM_USERFAULT as a valid memslot flag.
+
+See KVM_SET_USER_MEMORY_REGION2 for more details.
+
8. Other capabilities.
======================
--
2.50.0.rc2.692.g299adb8693-goog
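
A minimal userspace sketch of the bitmap contract documented above, assuming
C11 atomics; the helper name and parameters are illustrative, not part of the
series:

#include <stdatomic.h>
#include <stdint.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Bit n of the bitmap covers guest_phys_addr + n * page_size. */
static void clear_userfault_bit(unsigned long *bitmap, uint64_t gpa,
                                uint64_t slot_base_gpa, uint64_t page_size)
{
        uint64_t page = (gpa - slot_base_gpa) / page_size;
        _Atomic unsigned long *chunk =
                (_Atomic unsigned long *)&bitmap[page / BITS_PER_LONG];

        /*
         * Store-release: writes that populate the page must be visible
         * before KVM can observe the cleared bit and map the page.
         */
        atomic_fetch_and_explicit(chunk, ~(1UL << (page % BITS_PER_LONG)),
                                  memory_order_release);
}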
* Re: [PATCH v3 00/15] KVM: Introduce KVM Userfault
2025-06-18 4:24 [PATCH v3 00/15] KVM: Introduce KVM Userfault James Houghton
` (14 preceding siblings ...)
2025-06-18 4:24 ` [PATCH v3 15/15] KVM: Documentation: Add KVM_CAP_USERFAULT and KVM_MEM_USERFAULT details James Houghton
@ 2025-06-18 23:24 ` Oliver Upton
2025-09-04 16:43 ` Nikita Kalyazin
16 siblings, 0 replies; 31+ messages in thread
From: Oliver Upton @ 2025-06-18 23:24 UTC (permalink / raw)
To: James Houghton
Cc: Paolo Bonzini, Sean Christopherson, Jonathan Corbet, Marc Zyngier,
Yan Zhao, Nikita Kalyazin, Anish Moorthy, Peter Gonda, Peter Xu,
David Matlack, wei.w.wang, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm
On Wed, Jun 18, 2025 at 04:24:09AM +0000, James Houghton wrote:
> Hi Sean, Paolo, Oliver, + others,
>
> Here is a v3 of KVM Userfault. Thanks for all the feedback on the v2,
> Sean. I realize it has been 6 months since the v2; I hope that isn't an
> issue.
Not one bit. The only thing I look for in patch frequency is the urgency
with which the author wants to get something in.
> I am working on the QEMU side of the changes as I get time. Let me know
> if it's important for me to send those patches out for this series to be
> merged.
It'd be good to know we have line of sight on a functional
implementation here, i.e. uffd-based handling of non-vCPU accesses. I'm
not expecting surprises here, but patches always speak louder than
words.
Don't want to block the kernel pieces if that's a time sink though. And
FWIW, besides the nitpicking I'm quite happy with the way this is
shaping up.
> Be aware that this series will have non-trivial conflicts with Fuad's
> user mapping support for guest_memfd series[1]. For example, for the
> arm64 change he is making, the newly introduced gmem_abort() would need
> to be enlightened to handle KVM Userfault exits.
Appreciate the heads up!
Thanks,
Oliver
* Re: [PATCH v3 00/15] KVM: Introduce KVM Userfault
2025-06-18 4:24 [PATCH v3 00/15] KVM: Introduce KVM Userfault James Houghton
` (15 preceding siblings ...)
2025-06-18 23:24 ` [PATCH v3 00/15] KVM: Introduce KVM Userfault Oliver Upton
@ 2025-09-04 16:43 ` Nikita Kalyazin
16 siblings, 0 replies; 31+ messages in thread
From: Nikita Kalyazin @ 2025-09-04 16:43 UTC (permalink / raw)
To: James Houghton, Paolo Bonzini, Sean Christopherson, Oliver Upton
Cc: Jonathan Corbet, Marc Zyngier, Yan Zhao, Anish Moorthy,
Peter Gonda, Peter Xu, David Matlack, wei.w.wang, kvm, linux-doc,
linux-kernel, linux-arm-kernel, kvmarm
On 18/06/2025 05:24, James Houghton wrote:
> Hi Sean, Paolo, Oliver, + others,
>
> Here is a v3 of KVM Userfault. Thanks for all the feedback on the v2,
> Sean. I realize it has been 6 months since the v2; I hope that isn't an
> issue.
>
> I am working on the QEMU side of the changes as I get time. Let me know
> if it's important for me to send those patches out for this series to be
> merged.
Hi Sean and others,
Are there any blockers for merging this series? We would like to use
the functionality in Firecracker for restoring guest_memfd-backed VMs
from snapshots via UFFD [1]. [2] is a Firecracker feature branch that
builds on top of KVM Userfault together with the guest_memfd direct map
removal [3], write syscall [4], and UFFD support [5] series (the UFFD
work is currently under discussion with MM at [6]).
Thanks,
Nikita
[1]:
https://github.com/firecracker-microvm/firecracker/blob/main/docs/snapshotting/handling-page-faults-on-snapshot-resume.md
[2]:
https://github.com/firecracker-microvm/firecracker/tree/feature/secret-hiding
[3]: https://lore.kernel.org/kvm/20250828093902.2719-1-roypat@amazon.co.uk
[4]: https://lore.kernel.org/kvm/20250902111951.58315-1-kalyazin@amazon.com
[5]: https://lore.kernel.org/kvm/20250404154352.23078-1-kalyazin@amazon.com
[6]:
https://lore.kernel.org/linux-mm/20250627154655.2085903-1-peterx@redhat.com
> Be aware that this series will have non-trivial conflicts with Fuad's
> user mapping support for guest_memfd series[1]. For example, for the
> arm64 change he is making, the newly introduced gmem_abort() would need
> to be enlightened to handle KVM Userfault exits.
>
> Changelog:
> v2[2]->v3:
> - Pull in Sean's changes to genericize struct kvm_page_fault and use it
> for arm64. Many of these patches now have Sean's SoB.
> - Pull in Sean's small rename and squashing of the main patches.
> - Add kvm_arch_userfault_enabled() in place of calling
> kvm_arch_flush_shadow_memslot() directly from generic code.
> - Pull in Xin Li's documentation section number fix for
> KVM_CAP_ARM_WRITABLE_IMP_ID_REGS[3].
> v1[4]->v2:
> - For arm64, no longer zap stage 2 when disabling KVM_MEM_USERFAULT
> (thanks Oliver).
> - Fix the userfault_bitmap validation and casts (thanks kernel test
> robot).
> - Fix _Atomic cast for the userfault bitmap in the selftest (thanks
> kernel test robot).
> - Pick up Reviewed-by on doc changes (thanks Bagas).
>
> Below is the cover letter from v1, mostly unchanged:
>
> Please see the RFC[5] for the problem description. In summary,
> guest_memfd VMs have no mechanism for doing post-copy live migration.
> KVM Userfault provides such a mechanism.
>
> There is a second problem that KVM Userfault solves: userfaultfd-based
> post-copy doesn't scale very well. KVM Userfault, when used with
> userfaultfd, can scale much better in the common case that most post-copy
> demand fetches are a result of vCPU access violations. This is a
> continuation of the solution Anish was working on[6]. This aspect of
> KVM Userfault is important for userfaultfd-based live migration when
> scaling up to hundreds of vCPUs with ~30us network latency for a
> PAGE_SIZE demand-fetch.
>
> The implementation in this series differs from the RFC[5]. It adds...
> 1. a new memslot flag is added: KVM_MEM_USERFAULT,
> 2. a new parameter, userfault_bitmap, into struct kvm_memory_slot,
> 3. a new KVM_RUN exit reason: KVM_MEMORY_EXIT_FLAG_USERFAULT,
> 4. a new KVM capability KVM_CAP_USERFAULT.
>
> KVM Userfault does not attempt to catch KVM's own accesses to guest
> memory. That is left up to userfaultfd.
>
> When enabling KVM_MEM_USERFAULT for a memslot, the second-stage mappings
> are zapped, and new faults will check `userfault_bitmap` to see if the
> fault should exit to userspace.
>
> When KVM_MEM_USERFAULT is enabled, only PAGE_SIZE mappings are
> permitted.
>
> When disabling KVM_MEM_USERFAULT, huge mappings will be reconstructed
> consistent with dirty log disabling. So on x86, huge mappings will be
> reconstructed, but on arm64, they won't be.
>
> KVM Userfault is not compatible with async page faults. Nikita has
> proposed a new implementation of async page faults that is more
> userspace-driven that *is* compatible with KVM Userfault[7].
>
> See v1 for more performance details[4]. They are unchanged in this
> version.
>
> This series is based on the latest kvm-x86/next.
>
> [1]: https://lore.kernel.org/kvm/20250611133330.1514028-1-tabba@google.com/
> [2]: https://lore.kernel.org/kvm/20250109204929.1106563-1-jthoughton@google.com/
> [3]: https://lore.kernel.org/kvm/20250414165146.2279450-1-xin@zytor.com/
> [4]: https://lore.kernel.org/kvm/20241204191349.1730936-1-jthoughton@google.com/
> [5]: https://lore.kernel.org/kvm/20240710234222.2333120-1-jthoughton@google.com/
> [6]: https://lore.kernel.org/all/20240215235405.368539-1-amoorthy@google.com/
> [7]: https://lore.kernel.org/kvm/20241118123948.4796-1-kalyazin@amazon.com/#t
>
> James Houghton (11):
> KVM: Add common infrastructure for KVM Userfaults
> KVM: x86: Add support for KVM userfault exits
> KVM: arm64: Add support for KVM userfault exits
> KVM: Enable and advertise support for KVM userfault exits
> KVM: selftests: Fix vm_mem_region_set_flags docstring
> KVM: selftests: Fix prefault_mem logic
> KVM: selftests: Add va_start/end into uffd_desc
> KVM: selftests: Add KVM Userfault mode to demand_paging_test
> KVM: selftests: Inform set_memory_region_test of KVM_MEM_USERFAULT
> KVM: selftests: Add KVM_MEM_USERFAULT + guest_memfd toggle tests
> KVM: Documentation: Add KVM_CAP_USERFAULT and KVM_MEM_USERFAULT
> details
>
> Sean Christopherson (3):
> KVM: x86/mmu: Move "struct kvm_page_fault" definition to
> asm/kvm_host.h
> KVM: arm64: Add "struct kvm_page_fault" to gather common fault
> variables
> KVM: arm64: x86: Require "struct kvm_page_fault" for memory fault
> exits
>
> Xin Li (Intel) (1):
> KVM: Documentation: Fix section number for
> KVM_CAP_ARM_WRITABLE_IMP_ID_REGS
>
> Documentation/virt/kvm/api.rst | 35 ++++-
> arch/arm64/include/asm/kvm_host.h | 9 ++
> arch/arm64/kvm/Kconfig | 1 +
> arch/arm64/kvm/mmu.c | 48 +++---
> arch/x86/include/asm/kvm_host.h | 68 +++++++-
> arch/x86/kvm/Kconfig | 1 +
> arch/x86/kvm/mmu/mmu.c | 13 +-
> arch/x86/kvm/mmu/mmu_internal.h | 77 +---------
> arch/x86/kvm/x86.c | 27 ++--
> include/linux/kvm_host.h | 49 +++++-
> include/uapi/linux/kvm.h | 6 +-
> .../selftests/kvm/demand_paging_test.c | 145 ++++++++++++++++--
> .../testing/selftests/kvm/include/kvm_util.h | 5 +
> .../selftests/kvm/include/userfaultfd_util.h | 2 +
> tools/testing/selftests/kvm/lib/kvm_util.c | 42 ++++-
> .../selftests/kvm/lib/userfaultfd_util.c | 2 +
> .../selftests/kvm/set_memory_region_test.c | 33 ++++
> virt/kvm/Kconfig | 3 +
> virt/kvm/kvm_main.c | 57 ++++++-
> 19 files changed, 489 insertions(+), 134 deletions(-)
>
>
> base-commit: 19272b37aa4f83ca52bdf9c16d5d81bdd1354494
> --
> 2.50.0.rc2.692.g299adb8693-goog
>