* [PATCH v7 1/7] KVM: VMX: Pass @vcpu, not @vmx to init_vmcs()
2026-05-18 4:59 [PATCH v7 0/7] KVM: SVM: Add Page Modification Logging (PML) support Nikunj A Dadhania
@ 2026-05-18 4:59 ` Nikunj A Dadhania
2026-05-18 11:35 ` Huang, Kai
2026-05-18 4:59 ` [PATCH v7 2/7] KVM: x86: Move PML page to common vcpu arch structure Nikunj A Dadhania
` (5 subsequent siblings)
6 siblings, 1 reply; 15+ messages in thread
From: Nikunj A Dadhania @ 2026-05-18 4:59 UTC (permalink / raw)
To: kvm, seanjc, pbonzini
Cc: thomas.lendacky, bp, joao.m.martins, nikunj, kai.huang, yosry
From: Sean Christopherson <seanjc@google.com>
Pass @vcpu instead of @vmx to init_vmcs(), and switch all of the vmx->vcpu
usage to a simple vcpu.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
arch/x86/kvm/vmx/vmx.c | 25 +++++++++++++------------
1 file changed, 13 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b02d176800f8b..db468bcdc9808 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4851,10 +4851,11 @@ int vmx_vcpu_precreate(struct kvm *kvm)
#define VMX_XSS_EXIT_BITMAP 0
-static void init_vmcs(struct vcpu_vmx *vmx)
+static void init_vmcs(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vmx->vcpu.kvm;
+ struct kvm *kvm = vcpu->kvm;
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
if (nested)
nested_vmx_set_vmcs_shadowing_bitmap();
@@ -4879,7 +4880,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (cpu_has_tertiary_exec_ctrls())
tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
- if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
+ if (enable_apicv && lapic_in_kernel(vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4891,7 +4892,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc)));
}
- if (vmx_can_use_ipiv(&vmx->vcpu)) {
+ if (vmx_can_use_ipiv(vcpu)) {
vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
}
@@ -4926,15 +4927,15 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
- vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+ vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat);
vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl());
/* 22.2.1, 20.8.1 */
vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl());
- vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
- vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
+ vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
set_cr4_guest_host_mask(vmx);
@@ -4949,7 +4950,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
- vmx_write_encls_bitmap(&vmx->vcpu, NULL);
+ vmx_write_encls_bitmap(vcpu, NULL);
if (vmx_pt_mode_is_host_guest()) {
memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
@@ -4962,13 +4963,13 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_writel(GUEST_SYSENTER_ESP, 0);
vmcs_writel(GUEST_SYSENTER_EIP, 0);
- vmx_guest_debugctl_write(&vmx->vcpu, 0);
+ vmx_guest_debugctl_write(vcpu, 0);
if (cpu_has_vmx_tpr_shadow()) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (cpu_need_tpr_shadow(&vmx->vcpu))
+ if (cpu_need_tpr_shadow(vcpu))
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- __pa(vmx->vcpu.arch.apic->regs));
+ __pa(vcpu->arch.apic->regs));
vmcs_write32(TPR_THRESHOLD, 0);
}
@@ -4979,7 +4980,7 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- init_vmcs(vmx);
+ init_vmcs(vcpu);
if (nested &&
kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
--
2.48.1
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH v7 1/7] KVM: VMX: Pass @vcpu, not @vmx to init_vmcs()
2026-05-18 4:59 ` [PATCH v7 1/7] KVM: VMX: Pass @vcpu, not @vmx to init_vmcs() Nikunj A Dadhania
@ 2026-05-18 11:35 ` Huang, Kai
0 siblings, 0 replies; 15+ messages in thread
From: Huang, Kai @ 2026-05-18 11:35 UTC (permalink / raw)
To: kvm@vger.kernel.org, pbonzini@redhat.com, seanjc@google.com,
nikunj@amd.com
Cc: thomas.lendacky@amd.com, joao.m.martins@oracle.com,
yosry@kernel.org, bp@alien8.de
On Mon, 2026-05-18 at 04:59 +0000, Nikunj A Dadhania wrote:
> From: Sean Christopherson <seanjc@google.com>
>
> Pass @vcpu instead of @vmx to init_vmcs(), and switch all of the vmx->vcpu
> usage to a simple vcpu.
>
> No functional change intended.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v7 2/7] KVM: x86: Move PML page to common vcpu arch structure
2026-05-18 4:59 [PATCH v7 0/7] KVM: SVM: Add Page Modification Logging (PML) support Nikunj A Dadhania
2026-05-18 4:59 ` [PATCH v7 1/7] KVM: VMX: Pass @vcpu, not @vmx to init_vmcs() Nikunj A Dadhania
@ 2026-05-18 4:59 ` Nikunj A Dadhania
2026-05-18 4:59 ` [PATCH v7 3/7] KVM: x86: Carve out PML flush routine Nikunj A Dadhania
` (4 subsequent siblings)
6 siblings, 0 replies; 15+ messages in thread
From: Nikunj A Dadhania @ 2026-05-18 4:59 UTC (permalink / raw)
To: kvm, seanjc, pbonzini
Cc: thomas.lendacky, bp, joao.m.martins, nikunj, kai.huang, yosry
Move the PML page pointer from VMX-specific vcpu_vmx structure to the
common kvm_vcpu_arch structure to enable sharing between VMX and SVM
implementations. Only the page pointer is moved to x86 common code while
keeping allocation logic vendor-specific, since AMD requires
snp_safe_alloc_page() for PML buffer allocation.
Update all VMX references accordingly.
No functional change, restructuring to prepare for SVM PML support.
Suggested-by: Kai Huang <kai.huang@intel.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
arch/x86/include/asm/kvm_host.h | 2 ++
arch/x86/kvm/vmx/vmx.c | 21 ++++++++++-----------
arch/x86/kvm/vmx/vmx.h | 7 -------
arch/x86/kvm/x86.h | 6 ++++++
4 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 271bdd109a98e..1f14285380691 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -893,6 +893,8 @@ struct kvm_vcpu_arch {
*/
struct kvm_mmu_memory_cache mmu_external_spt_cache;
+ struct page *pml_page;
+
/*
* QEMU userspace and the guest each have their own FPU state.
* In vcpu_run, we switch between the user and guest FPU contexts.
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index db468bcdc9808..7524eb7ddf386 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4946,7 +4946,7 @@ static void init_vmcs(struct kvm_vcpu *vcpu)
vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
if (enable_pml) {
- vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write64(PML_ADDRESS, page_to_phys(vcpu->arch.pml_page));
vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
@@ -6411,17 +6411,16 @@ void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
*error_code = 0;
}
-static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
+static void vmx_destroy_pml_buffer(struct kvm_vcpu *vcpu)
{
- if (vmx->pml_pg) {
- __free_page(vmx->pml_pg);
- vmx->pml_pg = NULL;
+ if (vcpu->arch.pml_page) {
+ __free_page(vcpu->arch.pml_page);
+ vcpu->arch.pml_page = NULL;
}
}
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
u16 pml_idx, pml_tail_index;
u64 *pml_buf;
int i;
@@ -6444,7 +6443,7 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
* Read the entries in the same order they were written, to ensure that
* the dirty ring is filled in the same order the CPU wrote them.
*/
- pml_buf = page_address(vmx->pml_pg);
+ pml_buf = page_address(vcpu->arch.pml_page);
for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
u64 gpa;
@@ -7681,7 +7680,7 @@ void vmx_vcpu_free(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (enable_pml)
- vmx_destroy_pml_buffer(vmx);
+ vmx_destroy_pml_buffer(vcpu);
free_vpid(vmx->vpid);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
@@ -7710,8 +7709,8 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
* for the guest), etc.
*/
if (enable_pml) {
- vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!vmx->pml_pg)
+ vcpu->arch.pml_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vcpu->arch.pml_page)
goto free_vpid;
}
@@ -7782,7 +7781,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
free_vmcs:
free_loaded_vmcs(vmx->loaded_vmcs);
free_pml:
- vmx_destroy_pml_buffer(vmx);
+ vmx_destroy_pml_buffer(vcpu);
free_vpid:
free_vpid(vmx->vpid);
return err;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index daedf663c0a9c..645ec09628e82 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -258,13 +258,6 @@ struct vcpu_vmx {
unsigned int ple_window;
bool ple_window_dirty;
- /* Support for PML */
-#define PML_LOG_NR_ENTRIES 512
- /* PML is written backwards: this is the first entry written by the CPU */
-#define PML_HEAD_INDEX (PML_LOG_NR_ENTRIES-1)
-
- struct page *pml_pg;
-
/* apic deadline value in host tsc */
u64 hv_deadline_tsc;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 38a905fa86de2..ca3c693bfb35e 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -797,4 +797,10 @@ static inline bool kvm_is_valid_u_s_cet(struct kvm_vcpu *vcpu, u64 data)
return true;
}
+
+/* Support for PML */
+#define PML_LOG_NR_ENTRIES 512
+/* PML is written backwards: this is the first entry written by the CPU */
+#define PML_HEAD_INDEX (PML_LOG_NR_ENTRIES-1)
+
#endif
--
2.48.1
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH v7 3/7] KVM: x86: Carve out PML flush routine
2026-05-18 4:59 [PATCH v7 0/7] KVM: SVM: Add Page Modification Logging (PML) support Nikunj A Dadhania
2026-05-18 4:59 ` [PATCH v7 1/7] KVM: VMX: Pass @vcpu, not @vmx to init_vmcs() Nikunj A Dadhania
2026-05-18 4:59 ` [PATCH v7 2/7] KVM: x86: Move PML page to common vcpu arch structure Nikunj A Dadhania
@ 2026-05-18 4:59 ` Nikunj A Dadhania
2026-05-18 4:59 ` [PATCH v7 4/7] KVM: VMX: Use cpu_dirty_log_size instead of enable_pml for PML checks Nikunj A Dadhania
` (3 subsequent siblings)
6 siblings, 0 replies; 15+ messages in thread
From: Nikunj A Dadhania @ 2026-05-18 4:59 UTC (permalink / raw)
To: kvm, seanjc, pbonzini
Cc: thomas.lendacky, bp, joao.m.martins, nikunj, kai.huang, yosry
Move the PML (Page Modification Logging) buffer flushing logic from
VMX-specific code to common x86 KVM code to enable reuse by SVM and avoid
code duplication.
The AMD SVM PML implementations share the same behavior as VMX PML:
1) The PML buffer is a 4K page with 512 entries
2) Hardware records dirty GPAs in reverse order (from index 511 to 0)
3) Hardware clears bits 11:0 when recording GPAs
No functional change intended for VMX, except tone down the WARN_ON() to
WARN_ON_ONCE() for the page alignment check. If hardware exhibits this
behavior once, it's likely to occur repeatedly, so use WARN_ON_ONCE() to
avoid log flooding while still capturing the unexpected condition.
The refactoring prepares for SVM to leverage the same PML flushing
implementation.
Reviewed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
arch/x86/kvm/vmx/vmx.c | 26 ++------------------------
arch/x86/kvm/x86.c | 31 +++++++++++++++++++++++++++++++
arch/x86/kvm/x86.h | 2 ++
3 files changed, 35 insertions(+), 24 deletions(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 7524eb7ddf386..e6fc374302f79 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6421,37 +6421,15 @@ static void vmx_destroy_pml_buffer(struct kvm_vcpu *vcpu)
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
- u16 pml_idx, pml_tail_index;
- u64 *pml_buf;
- int i;
+ u16 pml_idx;
pml_idx = vmcs_read16(GUEST_PML_INDEX);
/* Do nothing if PML buffer is empty */
if (pml_idx == PML_HEAD_INDEX)
return;
- /*
- * PML index always points to the next available PML buffer entity
- * unless PML log has just overflowed.
- */
- pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;
- /*
- * PML log is written backwards: the CPU first writes the entry 511
- * then the entry 510, and so on.
- *
- * Read the entries in the same order they were written, to ensure that
- * the dirty ring is filled in the same order the CPU wrote them.
- */
- pml_buf = page_address(vcpu->arch.pml_page);
-
- for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
- u64 gpa;
-
- gpa = pml_buf[i];
- WARN_ON(gpa & (PAGE_SIZE - 1));
- kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
- }
+ kvm_flush_pml_buffer(vcpu, pml_idx);
/* reset PML index */
vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 209eae67ab186..d1747c4e3858e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6746,6 +6746,37 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
kvm_vcpu_kick(vcpu);
}
+void kvm_flush_pml_buffer(struct kvm_vcpu *vcpu, u16 pml_idx)
+{
+ u16 pml_tail_index;
+ u64 *pml_buf;
+ int i;
+
+ /*
+ * PML index always points to the next available PML buffer entity
+ * unless PML log has just overflowed.
+ */
+ pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;
+
+ /*
+ * PML log is written backwards: the CPU first writes the entry 511
+ * then the entry 510, and so on.
+ *
+ * Read the entries in the same order they were written, to ensure that
+ * the dirty ring is filled in the same order the CPU wrote them.
+ */
+ pml_buf = page_address(vcpu->arch.pml_page);
+
+ for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
+ u64 gpa;
+
+ gpa = pml_buf[i];
+ WARN_ON_ONCE(gpa & (PAGE_SIZE - 1));
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_flush_pml_buffer);
+
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index ca3c693bfb35e..7844f7a5644d1 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -803,4 +803,6 @@ static inline bool kvm_is_valid_u_s_cet(struct kvm_vcpu *vcpu, u64 data)
/* PML is written backwards: this is the first entry written by the CPU */
#define PML_HEAD_INDEX (PML_LOG_NR_ENTRIES-1)
+void kvm_flush_pml_buffer(struct kvm_vcpu *vcpu, u16 pml_idx);
+
#endif
--
2.48.1
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH v7 4/7] KVM: VMX: Use cpu_dirty_log_size instead of enable_pml for PML checks
2026-05-18 4:59 [PATCH v7 0/7] KVM: SVM: Add Page Modification Logging (PML) support Nikunj A Dadhania
` (2 preceding siblings ...)
2026-05-18 4:59 ` [PATCH v7 3/7] KVM: x86: Carve out PML flush routine Nikunj A Dadhania
@ 2026-05-18 4:59 ` Nikunj A Dadhania
2026-05-18 4:59 ` [PATCH v7 5/7] x86/cpufeatures: Add Page modification logging Nikunj A Dadhania
` (2 subsequent siblings)
6 siblings, 0 replies; 15+ messages in thread
From: Nikunj A Dadhania @ 2026-05-18 4:59 UTC (permalink / raw)
To: kvm, seanjc, pbonzini
Cc: thomas.lendacky, bp, joao.m.martins, nikunj, kai.huang, yosry,
Pankaj Gupta
Replace the enable_pml check with cpu_dirty_log_size in VMX PML code
to determine whether PML is enabled on a per-VM basis. The enable_pml
module parameter is a global setting that doesn't reflect per-VM
capabilities, whereas cpu_dirty_log_size accurately indicates whether
a specific VM has PML enabled.
For example, TDX VMs don't yet support PML. Using cpu_dirty_log_size
ensures the check correctly reflects this, while enable_pml would
incorrectly indicate PML is available.
This also improves consistency with kvm_mmu_update_cpu_dirty_logging(),
which already uses cpu_dirty_log_size to determine PML enablement.
Suggested-by: Kai Huang <kai.huang@intel.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Reviewed-by: Pankaj Gupta <pankaj.gupta@amd.com>
Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
arch/x86/kvm/vmx/vmx.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e6fc374302f79..4df8af02e3a2a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8359,7 +8359,7 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (WARN_ON_ONCE(!enable_pml))
+ if (WARN_ON_ONCE(!vcpu->kvm->arch.cpu_dirty_log_size))
return;
guard(vmx_vmcs01)(vcpu);
--
2.48.1
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH v7 5/7] x86/cpufeatures: Add Page modification logging
2026-05-18 4:59 [PATCH v7 0/7] KVM: SVM: Add Page Modification Logging (PML) support Nikunj A Dadhania
` (3 preceding siblings ...)
2026-05-18 4:59 ` [PATCH v7 4/7] KVM: VMX: Use cpu_dirty_log_size instead of enable_pml for PML checks Nikunj A Dadhania
@ 2026-05-18 4:59 ` Nikunj A Dadhania
2026-05-18 4:59 ` [PATCH v7 6/7] KVM: SVM: Use BIT_ULL for 64-bit misc_ctl bit definitions Nikunj A Dadhania
2026-05-18 4:59 ` [PATCH v7 7/7] KVM: SVM: Add Page modification logging support Nikunj A Dadhania
6 siblings, 0 replies; 15+ messages in thread
From: Nikunj A Dadhania @ 2026-05-18 4:59 UTC (permalink / raw)
To: kvm, seanjc, pbonzini
Cc: thomas.lendacky, bp, joao.m.martins, nikunj, kai.huang, yosry
Page modification logging(PML) is a hardware feature designed to track
guest modified memory pages. PML enables the hypervisor to identify which
pages in a guest's memory have been changed since the last checkpoint or
during live migration.
The PML feature is advertised via CPUID leaf 0x8000000A ECX[4] bit.
Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/kernel/cpu/scattered.c | 1 +
2 files changed, 2 insertions(+)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 35a2a0f9ab32e..f743f4643d897 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -228,6 +228,7 @@
#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* PV unlock function */
#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* PV vcpu_is_preempted function */
#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* "tdx_guest" Intel Trust Domain Extensions Guest */
+#define X86_FEATURE_PML ( 8*32+23) /* AMD Page Modification logging */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* "fsgsbase" RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 837d6a4b0c282..ad29798cf59dc 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -54,6 +54,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 },
{ X86_FEATURE_CPPC_PERF_PRIO, CPUID_EDX, 16, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_PML, CPUID_ECX, 4, 0x8000000a, 0 },
{ X86_FEATURE_X2AVIC_EXT, CPUID_ECX, 6, 0x8000000a, 0 },
{ X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 },
{ X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
--
2.48.1
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH v7 6/7] KVM: SVM: Use BIT_ULL for 64-bit misc_ctl bit definitions
2026-05-18 4:59 [PATCH v7 0/7] KVM: SVM: Add Page Modification Logging (PML) support Nikunj A Dadhania
` (4 preceding siblings ...)
2026-05-18 4:59 ` [PATCH v7 5/7] x86/cpufeatures: Add Page modification logging Nikunj A Dadhania
@ 2026-05-18 4:59 ` Nikunj A Dadhania
2026-05-18 4:59 ` [PATCH v7 7/7] KVM: SVM: Add Page modification logging support Nikunj A Dadhania
6 siblings, 0 replies; 15+ messages in thread
From: Nikunj A Dadhania @ 2026-05-18 4:59 UTC (permalink / raw)
To: kvm, seanjc, pbonzini
Cc: thomas.lendacky, bp, joao.m.martins, nikunj, kai.huang, yosry
Replace BIT() with BIT_ULL() for SVM nested control bit definitions
since misc_ctl is a 64-bit field in the VMCB control area structure.
No functional change intended.
Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
arch/x86/include/asm/svm.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index aa63431ba92c3..f199c52709df8 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -240,10 +240,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
-#define SVM_MISC_ENABLE_NP BIT(0)
-#define SVM_MISC_ENABLE_SEV BIT(1)
-#define SVM_MISC_ENABLE_SEV_ES BIT(2)
-#define SVM_MISC_ENABLE_GMET BIT(3)
+#define SVM_MISC_ENABLE_NP BIT_ULL(0)
+#define SVM_MISC_ENABLE_SEV BIT_ULL(1)
+#define SVM_MISC_ENABLE_SEV_ES BIT_ULL(2)
+#define SVM_MISC_ENABLE_GMET BIT_ULL(3)
#define SVM_MISC2_ENABLE_V_LBR BIT_ULL(0)
#define SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE BIT_ULL(1)
--
2.48.1
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH v7 7/7] KVM: SVM: Add Page modification logging support
2026-05-18 4:59 [PATCH v7 0/7] KVM: SVM: Add Page Modification Logging (PML) support Nikunj A Dadhania
` (5 preceding siblings ...)
2026-05-18 4:59 ` [PATCH v7 6/7] KVM: SVM: Use BIT_ULL for 64-bit misc_ctl bit definitions Nikunj A Dadhania
@ 2026-05-18 4:59 ` Nikunj A Dadhania
2026-05-18 17:12 ` Yosry Ahmed
6 siblings, 1 reply; 15+ messages in thread
From: Nikunj A Dadhania @ 2026-05-18 4:59 UTC (permalink / raw)
To: kvm, seanjc, pbonzini
Cc: thomas.lendacky, bp, joao.m.martins, nikunj, kai.huang, yosry
Currently, dirty logging relies on write protecting guest memory and
marking dirty GFNs during subsequent write faults. This method works but
incurs overhead due to additional write faults for each dirty GFN.
Implement support for the Page Modification Logging (PML) feature, a
hardware-assisted method for efficient dirty logging. PML automatically
logs dirty GPA[51:12] to a 4K buffer when the CPU sets NPT D-bits. Two new
VMCB fields are utilized: PML_ADDR and PML_INDEX. The PML_INDEX is
initialized to 511 (8 bytes per GPA entry), and the CPU decreases the
PML_INDEX after logging each GPA. When the PML buffer is full, a
VMEXIT(PML_FULL) with exit code 0x407 is generated.
Since PML_INDEX in the VMCB control area remains valid after an intercepted
SHUTDOWN, only initialize it on reset and leave it unchanged on INIT to
avoid discarding already-logged entries that haven't been flushed.
PML operates on guest physical addresses at the NPT level, tracking D-bit
updates in page tables rather than memory content. This allows it to work
identically for normal and confidential computing guests
(SEV/SEV-ES/SEV-SNP), enabling cpu_dirty_log_size to be set uniformly for
all AMD VMs without special-casing encrypted guests
Use vmcb01 directly when updating PML controls to ensure L1's state
remains correct, as svm->vmcb points to vmcb02 when L2 is active.
Clear PML fields to avoid stale data in vmcb02 for nested guests.
Add a new module parameter to enable/disable PML, and enable it by default
when supported
Acked-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
arch/x86/include/asm/svm.h | 6 +-
arch/x86/include/uapi/asm/svm.h | 2 +
arch/x86/kvm/svm/nested.c | 6 ++
arch/x86/kvm/svm/sev.c | 2 +-
arch/x86/kvm/svm/svm.c | 108 +++++++++++++++++++++++++++++++-
arch/x86/kvm/svm/svm.h | 3 +
6 files changed, 123 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index f199c52709df8..e0a7549a7c727 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -165,7 +165,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u8 reserved_9[22];
u64 allowed_sev_features; /* Offset 0x138 */
u64 guest_sev_features; /* Offset 0x140 */
- u8 reserved_10[664];
+ u8 reserved_10[128];
+ u64 pml_addr; /* Offset 0x1c8 */
+ u16 pml_index; /* Offset 0x1d0 */
+ u8 reserved_11[526];
/*
* Offset 0x3e0, 32 bytes reserved
* for use by hypervisor/software.
@@ -244,6 +247,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_MISC_ENABLE_SEV BIT_ULL(1)
#define SVM_MISC_ENABLE_SEV_ES BIT_ULL(2)
#define SVM_MISC_ENABLE_GMET BIT_ULL(3)
+#define SVM_MISC_ENABLE_PML BIT_ULL(11)
#define SVM_MISC2_ENABLE_V_LBR BIT_ULL(0)
#define SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE BIT_ULL(1)
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 010a45c9f6147..e806761850921 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -101,6 +101,7 @@
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
#define SVM_EXIT_VMGEXIT 0x403
+#define SVM_EXIT_PML_FULL 0x407
/* SEV-ES software-defined VMGEXIT events */
#define SVM_VMGEXIT_MMIO_READ 0x80000001ull
@@ -236,6 +237,7 @@
{ SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
{ SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \
{ SVM_EXIT_VMGEXIT, "vmgexit" }, \
+ { SVM_EXIT_PML_FULL, "pml_full" }, \
{ SVM_VMGEXIT_MMIO_READ, "vmgexit_mmio_read" }, \
{ SVM_VMGEXIT_MMIO_WRITE, "vmgexit_mmio_write" }, \
{ SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 4ef9bc6a553f3..dd30aef9fc497 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -882,6 +882,12 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
+ /* Clear PML fields to avoid stale data in vmcb02. */
+ if (pml) {
+ vmcb02->control.pml_addr = 0;
+ vmcb02->control.pml_index = -1;
+ }
+
/*
* Stash vmcb02's counter if the guest hasn't moved past the guilty
* instruction; otherwise, reset the counter to '0'.
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 37d4cfa5d980b..893145a691191 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -4861,7 +4861,7 @@ struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
* Allocate an SNP-safe page to workaround the SNP erratum where
* the CPU will incorrectly signal an RMP violation #PF if a
* hugepage (2MB or 1GB) collides with the RMP entry of a
- * 2MB-aligned VMCB, VMSA, or AVIC backing page.
+ * 2MB-aligned VMCB, VMSA, PML or AVIC backing page.
*
* Allocate one extra page, choose a page which is not
* 2MB-aligned, and free the other.
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e74fcde6155ec..0abe6d3f06209 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -179,6 +179,9 @@ module_param(vnmi, bool, 0444);
module_param(enable_mediated_pmu, bool, 0444);
+bool __ro_after_init pml = true;
+module_param(pml, bool, 0444);
+
static bool __ro_after_init svm_gp_erratum_intercept = true;
static u8 rsm_ins_bytes[] = "\x0f\xaa";
@@ -1260,6 +1263,24 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
if (vcpu->kvm->arch.bus_lock_detection_enabled)
svm_set_intercept(svm, INTERCEPT_BUSLOCK);
+ if (pml) {
+ /*
+ * Populate the page address and index here, PML is enabled
+ * when dirty logging is enabled on the memslot through
+ * svm_update_cpu_dirty_logging()
+ */
+ control->pml_addr = (u64)__sme_set(page_to_phys(vcpu->arch.pml_page));
+
+ /*
+ * PML index in the VMCB control area remains valid after an
+ * intercepted SHUTDOWN, so only initialize PML index on reset
+ * to avoid discarding already-logged entries that haven't been
+ * flushed.
+ */
+ if (!init_event)
+ control->pml_index = PML_HEAD_INDEX;
+ }
+
if (is_sev_guest(vcpu))
sev_init_vmcb(svm, init_event);
@@ -1324,9 +1345,15 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
if (!vmcb01_page)
goto out;
+ if (pml) {
+ vcpu->arch.pml_page = snp_safe_alloc_page();
+ if (!vcpu->arch.pml_page)
+ goto error_free_vmcb_page;
+ }
+
err = sev_vcpu_create(vcpu);
if (err)
- goto error_free_vmcb_page;
+ goto error_free_pml_page;
err = avic_init_vcpu(svm);
if (err)
@@ -1351,6 +1378,9 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
error_free_sev:
sev_free_vcpu(vcpu);
+error_free_pml_page:
+ if (vcpu->arch.pml_page)
+ __free_page(vcpu->arch.pml_page);
error_free_vmcb_page:
__free_page(vmcb01_page);
out:
@@ -1368,6 +1398,9 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
sev_free_vcpu(vcpu);
+ if (pml && vcpu->arch.pml_page)
+ __free_page(vcpu->arch.pml_page);
+
__free_page(__sme_pa_to_page(svm->vmcb01.pa));
svm_vcpu_free_msrpm(svm->msrpm);
}
@@ -3331,6 +3364,53 @@ static int vmmcall_interception(struct kvm_vcpu *vcpu)
return kvm_emulate_hypercall(vcpu);
}
+void svm_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+
+ if (WARN_ON_ONCE(!vcpu->kvm->arch.cpu_dirty_log_size))
+ return;
+
+ /*
+ * Note, nr_memslots_dirty_logging can be changed concurrent with this
+ * code, but in that case another update request will be made and so
+ * the guest will never run with a stale PML value.
+ */
+ if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
+ vmcb01->control.misc_ctl |= SVM_MISC_ENABLE_PML;
+ else
+ vmcb01->control.misc_ctl &= ~SVM_MISC_ENABLE_PML;
+
+ vmcb_mark_dirty(vmcb01, VMCB_NPT);
+}
+
+static void svm_flush_pml_buffer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+
+ /* Do nothing if PML buffer is empty */
+ if (control->pml_index == PML_HEAD_INDEX)
+ return;
+
+ kvm_flush_pml_buffer(vcpu, control->pml_index);
+
+ /* Reset the PML index */
+ control->pml_index = PML_HEAD_INDEX;
+}
+
+static int pml_full_interception(struct kvm_vcpu *vcpu)
+{
+ trace_kvm_pml_full(vcpu->vcpu_id);
+
+ /*
+ * PML buffer is already flushed at the beginning of svm_handle_exit().
+ * Nothing to do here.
+ */
+ return 1;
+}
+
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@@ -3407,6 +3487,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
#ifdef CONFIG_KVM_AMD_SEV
[SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
#endif
+ [SVM_EXIT_PML_FULL] = pml_full_interception,
};
static void dump_vmcb(struct kvm_vcpu *vcpu)
@@ -3456,8 +3537,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
- pr_err("%-20s%lld\n", "misc_ctl:", control->misc_ctl);
+ pr_err("%-20s%llx\n", "misc_ctl:", control->misc_ctl);
pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+
+ if (pml) {
+ pr_err("%-20s%016llx\n", "pml_addr:", control->pml_addr);
+ pr_err("%-20s%04x\n", "pml_index:", control->pml_index);
+ }
+
pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
@@ -3703,6 +3790,14 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
+ /*
+ * Opportunistically flush the PML buffer on VM exit. This keeps the
+ * dirty bitmap current by processing logged GPAs rather than waiting for
+ * PML_FULL exit.
+ */
+ if (vcpu->kvm->arch.cpu_dirty_log_size && !is_guest_mode(vcpu))
+ svm_flush_pml_buffer(vcpu);
+
if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE))
return 0;
@@ -5310,6 +5405,9 @@ static int svm_vm_init(struct kvm *kvm)
return ret;
}
+ if (pml)
+ kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
+
svm_srso_vm_init();
return 0;
}
@@ -5465,6 +5563,8 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
.gmem_prepare = sev_gmem_prepare,
.gmem_invalidate = sev_gmem_invalidate,
.gmem_max_mapping_level = sev_gmem_max_mapping_level,
+
+ .update_cpu_dirty_logging = svm_update_cpu_dirty_logging,
};
/*
@@ -5690,6 +5790,10 @@ static __init int svm_hardware_setup(void)
nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
+ pml = pml && npt_enabled && cpu_feature_enabled(X86_FEATURE_PML);
+ if (pml)
+ pr_info("Page modification logging supported\n");
+
if (lbrv) {
if (!boot_cpu_has(X86_FEATURE_LBRV))
lbrv = false;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 2b6733dffd76f..7cd8207359d7c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -52,6 +52,7 @@ extern int vgif;
extern bool intercept_smi;
extern bool vnmi;
extern int lbrv;
+extern bool pml;
extern int tsc_aux_uret_slot __ro_after_init;
@@ -832,6 +833,8 @@ static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
svm_set_intercept_for_msr(vcpu, msr, type, true);
}
+void svm_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+
/* nested.c */
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
--
2.48.1
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH v7 7/7] KVM: SVM: Add Page modification logging support
2026-05-18 4:59 ` [PATCH v7 7/7] KVM: SVM: Add Page modification logging support Nikunj A Dadhania
@ 2026-05-18 17:12 ` Yosry Ahmed
2026-05-18 18:55 ` Sean Christopherson
0 siblings, 1 reply; 15+ messages in thread
From: Yosry Ahmed @ 2026-05-18 17:12 UTC (permalink / raw)
To: Nikunj A Dadhania
Cc: kvm, seanjc, pbonzini, thomas.lendacky, bp, joao.m.martins,
kai.huang
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index 4ef9bc6a553f3..dd30aef9fc497 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -882,6 +882,12 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
> vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
> vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
>
> + /* Clear PML fields to avoid stale data in vmcb02. */
> + if (pml) {
> + vmcb02->control.pml_addr = 0;
> + vmcb02->control.pml_index = -1;
> + }
I think the comment here is misleading. vmcb02 is allocated with
__GFP_ZERO, and IIUC pml_index should not matter when pml_addr is zero.
This is strictly for hardening as far as I can tell, especially looking
at commit c3bb9a20834f ("KVM: nVMX: Disable PML in hardware when running
L2"), which introduced something similar for VMX.
So maybe something like:
/*
* PML is never enabled in hardware for L2. Make sure that an
* unexpected PML write would trigger a PML_FULL VM-Exit.
*/
Also, the above commit also hanlded a PML_FULL VM-Exit as unexpected,
maybe we want to do that here as well? Or is that too paranoid?
Annoyingly, the unexpected exit reason handling is in
svm_invoke_exit_handler(), but the guest mode check is in
svm_handle_exit(), so if we do that we may need to move some code
around.
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [PATCH v7 7/7] KVM: SVM: Add Page modification logging support
2026-05-18 17:12 ` Yosry Ahmed
@ 2026-05-18 18:55 ` Sean Christopherson
2026-05-18 19:14 ` Yosry Ahmed
` (2 more replies)
0 siblings, 3 replies; 15+ messages in thread
From: Sean Christopherson @ 2026-05-18 18:55 UTC (permalink / raw)
To: Yosry Ahmed
Cc: Nikunj A Dadhania, kvm, pbonzini, thomas.lendacky, bp,
joao.m.martins, kai.huang
On Mon, May 18, 2026, Yosry Ahmed wrote:
> > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > index 4ef9bc6a553f3..dd30aef9fc497 100644
> > --- a/arch/x86/kvm/svm/nested.c
> > +++ b/arch/x86/kvm/svm/nested.c
> > @@ -882,6 +882,12 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
> > vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
> > vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
> >
> > + /* Clear PML fields to avoid stale data in vmcb02. */
> > + if (pml) {
> > + vmcb02->control.pml_addr = 0;
> > + vmcb02->control.pml_index = -1;
> > + }
>
> I think the comment here is misleading.
+1000
> vmcb02 is allocated with
> __GFP_ZERO, and IIUC pml_index should not matter when pml_addr is zero.
> This is strictly for hardening as far as I can tell, especially looking
> at commit c3bb9a20834f ("KVM: nVMX: Disable PML in hardware when running
> L2"), which introduced something similar for VMX.
>
> So maybe something like:
> /*
> * PML is never enabled in hardware for L2. Make sure that an
> * unexpected PML write would trigger a PML_FULL VM-Exit.
> */
Hmm, I was going to say we should drop the code entirely, but I do think it makes
sense to set PML_INDEX to -1 to ensure a VM-Exit occurs.
Actually, even better idea, at least for nVMX. Rather than write '0' for the
address, which is what I really dislike (bad past-Sean!), we should write -1ull
so that unexpectedly enabling PML in vmcs02 results in VM-Fail, i.e. don't even
rely on the WARN for safety.
If SVM has a consistency check on the address, then do the same, otherwise I
guess just go with setting pml_index to -1 and WARNing on PML_FULL?
> Also, the above commit also hanlded a PML_FULL VM-Exit as unexpected,
> maybe we want to do that here as well? Or is that too paranoid?
Not too paranoid.
> Annoyingly, the unexpected exit reason handling is in
> svm_invoke_exit_handler(), but the guest mode check is in
> svm_handle_exit(), so if we do that we may need to move some code
> around.
Why's that? Oh, if we want to reuse the unexpected exit code. What about this?
We don't even have to document the same thing in two different places, because
the reasoning for nested_svm_exit_special() can and is different. E.g. even if
we decided to utilize PML while running L2 (which would be possible, we'd "just"
have to translate the GPAs), the exits would still need to be routed to KVM.
Ditto for if we did actually enable PML in vmcb02 on behalf of L1: we'd need to
remove the check in nested_svm_exit_special(), but not the check in
svm_invoke_exit_handler() (though the latter's comment would need to be updated).
diff --git arch/x86/kvm/svm/nested.c arch/x86/kvm/svm/nested.c
index 4ef9bc6a553f..401a0ac44018 100644
--- arch/x86/kvm/svm/nested.c
+++ arch/x86/kvm/svm/nested.c
@@ -1782,6 +1782,13 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
if (nested_svm_is_l2_tlb_flush_hcall(vcpu))
return NESTED_EXIT_HOST;
break;
+ case SVM_EXIT_PML_FULL:
+ /*
+ * All PML full exits are handled by KVM. KVM emulates PML in
+ * software for L1, but never enables PML in hardware on behalf
+ * of L1.
+ */
+ return NESTED_EXIT_HOST;
default:
break;
}
diff --git arch/x86/kvm/svm/svm.c arch/x86/kvm/svm/svm.c
index e74fcde6155e..6bcb191d725b 100644
--- arch/x86/kvm/svm/svm.c
+++ arch/x86/kvm/svm/svm.c
@@ -3635,6 +3635,13 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code)
(u64)exit_code != __exit_code)
goto unexpected_vmexit;
+ /*
+ * PML is never enabled when running L2, bail immediately if a PML full
+ * exit occurs as something is horribly wrong.
+ */
+ if (unlikely(is_guest_mode(vcpu) && exit_code == SVM_EXIT_PML_FULL))
+ goto unexpected_vmexit;
+
#ifdef CONFIG_MITIGATION_RETPOLINE
if (exit_code == SVM_EXIT_MSR)
return msr_interception(vcpu);
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH v7 7/7] KVM: SVM: Add Page modification logging support
2026-05-18 18:55 ` Sean Christopherson
@ 2026-05-18 19:14 ` Yosry Ahmed
2026-05-18 19:25 ` Yosry Ahmed
2026-05-19 14:46 ` Nikunj A. Dadhania
2 siblings, 0 replies; 15+ messages in thread
From: Yosry Ahmed @ 2026-05-18 19:14 UTC (permalink / raw)
To: Sean Christopherson
Cc: Nikunj A Dadhania, kvm, pbonzini, thomas.lendacky, bp,
joao.m.martins, kai.huang
> > So maybe something like:
> > /*
> > * PML is never enabled in hardware for L2. Make sure that an
> > * unexpected PML write would trigger a PML_FULL VM-Exit.
> > */
>
> Hmm, I was going to say we should drop the code entirely, but I do think it makes
> sense to set PML_INDEX to -1 to ensure a VM-Exit occurs.
>
> Actually, even better idea, at least for nVMX. Rather than write '0' for the
> address, which is what I really dislike (bad past-Sean!), we should write -1ull
> so that unexpectedly enabling PML in vmcs02 results in VM-Fail, i.e. don't even
> rely on the WARN for safety.
>
> If SVM has a consistency check on the address, then do the same, otherwise I
> guess just go with setting pml_index to -1 and WARNing on PML_FULL?
Yeah this makes sense to me.
>
> > Also, the above commit also hanlded a PML_FULL VM-Exit as unexpected,
> > maybe we want to do that here as well? Or is that too paranoid?
>
> Not too paranoid.
>
> > Annoyingly, the unexpected exit reason handling is in
> > svm_invoke_exit_handler(), but the guest mode check is in
> > svm_handle_exit(), so if we do that we may need to move some code
> > around.
>
> Why's that? Oh, if we want to reuse the unexpected exit code. What about this?
I actually had something different in mind, I was going to suggest
that we move the unexpected exit logic to svm_handle_exit(), and key
off a specific return value from svm_invoke_exit_handler(). Then we
can bypass the whole nested_svm_exit_special() logic in
svm_handle_exit() and jump to unexpected_vmexit.
However, as I was writing a diff, I realized this will get gross
quickly. So I think your suggestion below works.
>
> We don't even have to document the same thing in two different places, because
> the reasoning for nested_svm_exit_special() can and is different. E.g. even if
> we decided to utilize PML while running L2 (which would be possible, we'd "just"
> have to translate the GPAs), the exits would still need to be routed to KVM.
> Ditto for if we did actually enable PML in vmcb02 on behalf of L1: we'd need to
> remove the check in nested_svm_exit_special(), but not the check in
> svm_invoke_exit_handler() (though the latter's comment would need to be updated).
>
> diff --git arch/x86/kvm/svm/nested.c arch/x86/kvm/svm/nested.c
> index 4ef9bc6a553f..401a0ac44018 100644
> --- arch/x86/kvm/svm/nested.c
> +++ arch/x86/kvm/svm/nested.c
> @@ -1782,6 +1782,13 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
> if (nested_svm_is_l2_tlb_flush_hcall(vcpu))
> return NESTED_EXIT_HOST;
> break;
> + case SVM_EXIT_PML_FULL:
> + /*
> + * All PML full exits are handled by KVM. KVM emulates PML in
> + * software for L1, but never enables PML in hardware on behalf
> + * of L1.
> + */
> + return NESTED_EXIT_HOST;
> default:
> break;
> }
> diff --git arch/x86/kvm/svm/svm.c arch/x86/kvm/svm/svm.c
> index e74fcde6155e..6bcb191d725b 100644
> --- arch/x86/kvm/svm/svm.c
> +++ arch/x86/kvm/svm/svm.c
> @@ -3635,6 +3635,13 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code)
> (u64)exit_code != __exit_code)
> goto unexpected_vmexit;
>
> + /*
> + * PML is never enabled when running L2, bail immediately if a PML full
> + * exit occurs as something is horribly wrong.
> + */
> + if (unlikely(is_guest_mode(vcpu) && exit_code == SVM_EXIT_PML_FULL))
> + goto unexpected_vmexit;
> +
> #ifdef CONFIG_MITIGATION_RETPOLINE
> if (exit_code == SVM_EXIT_MSR)
> return msr_interception(vcpu);
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v7 7/7] KVM: SVM: Add Page modification logging support
2026-05-18 18:55 ` Sean Christopherson
2026-05-18 19:14 ` Yosry Ahmed
@ 2026-05-18 19:25 ` Yosry Ahmed
2026-05-19 14:46 ` Nikunj A. Dadhania
2 siblings, 0 replies; 15+ messages in thread
From: Yosry Ahmed @ 2026-05-18 19:25 UTC (permalink / raw)
To: Sean Christopherson
Cc: Nikunj A Dadhania, kvm, pbonzini, thomas.lendacky, bp,
joao.m.martins, kai.huang
On Mon, May 18, 2026 at 11:55 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Mon, May 18, 2026, Yosry Ahmed wrote:
> > > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > > index 4ef9bc6a553f3..dd30aef9fc497 100644
> > > --- a/arch/x86/kvm/svm/nested.c
> > > +++ b/arch/x86/kvm/svm/nested.c
> > > @@ -882,6 +882,12 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
> > > vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
> > > vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
> > >
> > > + /* Clear PML fields to avoid stale data in vmcb02. */
> > > + if (pml) {
> > > + vmcb02->control.pml_addr = 0;
> > > + vmcb02->control.pml_index = -1;
> > > + }
> >
> > I think the comment here is misleading.
>
> +1000
>
> > vmcb02 is allocated with
> > __GFP_ZERO, and IIUC pml_index should not matter when pml_addr is zero.
> > This is strictly for hardening as far as I can tell, especially looking
> > at commit c3bb9a20834f ("KVM: nVMX: Disable PML in hardware when running
> > L2"), which introduced something similar for VMX.
> >
> > So maybe something like:
> > /*
> > * PML is never enabled in hardware for L2. Make sure that an
> > * unexpected PML write would trigger a PML_FULL VM-Exit.
> > */
>
> Hmm, I was going to say we should drop the code entirely, but I do think it makes
> sense to set PML_INDEX to -1 to ensure a VM-Exit occurs.
>
> Actually, even better idea, at least for nVMX. Rather than write '0' for the
> address, which is what I really dislike (bad past-Sean!), we should write -1ull
> so that unexpectedly enabling PML in vmcs02 results in VM-Fail, i.e. don't even
> rely on the WARN for safety.
>
> If SVM has a consistency check on the address, then do the same, otherwise I
> guess just go with setting pml_index to -1 and WARNing on PML_FULL?
Wouldn't it be hilarious if there is a consistency check, but it fires
even if SVM_MISC_ENABLE_PML is not set? I honestly wouldn't be
surprised at all :)
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [PATCH v7 7/7] KVM: SVM: Add Page modification logging support
2026-05-18 18:55 ` Sean Christopherson
2026-05-18 19:14 ` Yosry Ahmed
2026-05-18 19:25 ` Yosry Ahmed
@ 2026-05-19 14:46 ` Nikunj A. Dadhania
2026-05-29 6:38 ` [PATCH v7.1] " Nikunj A Dadhania
2 siblings, 1 reply; 15+ messages in thread
From: Nikunj A. Dadhania @ 2026-05-19 14:46 UTC (permalink / raw)
To: Sean Christopherson, Yosry Ahmed
Cc: kvm, pbonzini, thomas.lendacky, bp, joao.m.martins, kai.huang
On 5/19/2026 12:25 AM, Sean Christopherson wrote:
> On Mon, May 18, 2026, Yosry Ahmed wrote:
>>> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
>> So maybe something like:
>> /*
>> * PML is never enabled in hardware for L2. Make sure that an
>> * unexpected PML write would trigger a PML_FULL VM-Exit.
>> */
>
> Hmm, I was going to say we should drop the code entirely, but I do think it makes
> sense to set PML_INDEX to -1 to ensure a VM-Exit occurs.
>
> Actually, even better idea, at least for nVMX. Rather than write '0' for the
> address, which is what I really dislike (bad past-Sean!), we should write -1ull
> so that unexpectedly enabling PML in vmcs02 results in VM-Fail, i.e. don't even
> rely on the WARN for safety.
>
> If SVM has a consistency check on the address, then do the same, otherwise I
> guess just go with setting pml_index to -1 and WARNing on PML_FULL?
At least the PML spec [1] doesn't document a consistency check on PML_ADDR
during VMRUN. I'll confirm internally whether one exists in hardware.
Till then, will go with the below:
/*
* PML is never enabled in hardware for L2. Make sure that an
* unexpected PML write would trigger a PML_FULL VM-Exit.
*/
if (pml)
vmcb02->control.pml_index = -1;
If it turns out a MAXPHYADDR check does exist, we can switch to -1ull for
pml_addr in a follow-up.
>> Annoyingly, the unexpected exit reason handling is in
>> svm_invoke_exit_handler(), but the guest mode check is in
>> svm_handle_exit(), so if we do that we may need to move some code
>> around.
>
> Why's that? Oh, if we want to reuse the unexpected exit code. What about this?
>
> We don't even have to document the same thing in two different places, because
> the reasoning for nested_svm_exit_special() can and is different. E.g. even if
> we decided to utilize PML while running L2 (which would be possible, we'd "just"
> have to translate the GPAs), the exits would still need to be routed to KVM.
> Ditto for if we did actually enable PML in vmcb02 on behalf of L1: we'd need to
> remove the check in nested_svm_exit_special(), but not the check in
> svm_invoke_exit_handler() (though the latter's comment would need to be updated).
Will add both changes and add in next version.
> diff --git arch/x86/kvm/svm/nested.c arch/x86/kvm/svm/nested.c
> index 4ef9bc6a553f..401a0ac44018 100644
> --- arch/x86/kvm/svm/nested.c
> +++ arch/x86/kvm/svm/nested.c
> @@ -1782,6 +1782,13 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
> if (nested_svm_is_l2_tlb_flush_hcall(vcpu))
> return NESTED_EXIT_HOST;
> break;
> + case SVM_EXIT_PML_FULL:
> + /*
> + * All PML full exits are handled by KVM. KVM emulates PML in
> + * software for L1, but never enables PML in hardware on behalf
> + * of L1.
> + */
> + return NESTED_EXIT_HOST;
> default:
> break;
> }
> diff --git arch/x86/kvm/svm/svm.c arch/x86/kvm/svm/svm.c
> index e74fcde6155e..6bcb191d725b 100644
> --- arch/x86/kvm/svm/svm.c
> +++ arch/x86/kvm/svm/svm.c
> @@ -3635,6 +3635,13 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code)
> (u64)exit_code != __exit_code)
> goto unexpected_vmexit;
>
> + /*
> + * PML is never enabled when running L2, bail immediately if a PML full
> + * exit occurs as something is horribly wrong.
> + */
> + if (unlikely(is_guest_mode(vcpu) && exit_code == SVM_EXIT_PML_FULL))
> + goto unexpected_vmexit;
> +
> #ifdef CONFIG_MITIGATION_RETPOLINE
> if (exit_code == SVM_EXIT_MSR)
> return msr_interception(vcpu);
>
[1] https://docs.amd.com/v/u/en-US/69208_1.00_AMD64_PML_PUB
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v7.1] KVM: SVM: Add Page modification logging support
2026-05-19 14:46 ` Nikunj A. Dadhania
@ 2026-05-29 6:38 ` Nikunj A Dadhania
0 siblings, 0 replies; 15+ messages in thread
From: Nikunj A Dadhania @ 2026-05-29 6:38 UTC (permalink / raw)
To: nikunj, pbonzini, seanjc
Cc: bp, joao.m.martins, kai.huang, kvm, thomas.lendacky, yosry
Currently, dirty logging relies on write protecting guest memory and
marking dirty GFNs during subsequent write faults. This method works but
incurs overhead due to additional write faults for each dirty GFN.
Implement support for the Page Modification Logging (PML) feature, a
hardware-assisted method for efficient dirty logging. PML automatically
logs dirty GPA[51:12] to a 4K buffer when the CPU sets NPT D-bits. Two new
VMCB fields are utilized: PML_ADDR and PML_INDEX. The PML_INDEX is
initialized to 511 (8 bytes per GPA entry), and the CPU decreases the
PML_INDEX after logging each GPA. When the PML buffer is full, a
VMEXIT(PML_FULL) with exit code 0x407 is generated.
Since PML_INDEX in the VMCB control area remains valid after an intercepted
SHUTDOWN, only initialize it on reset and leave it unchanged on INIT to
avoid discarding already-logged entries that haven't been flushed.
PML operates on guest physical addresses at the NPT level, tracking D-bit
updates in page tables rather than memory content. This allows it to work
identically for normal and confidential computing guests
(SEV/SEV-ES/SEV-SNP), enabling cpu_dirty_log_size to be set uniformly for
all AMD VMs without special-casing encrypted guests.
Use vmcb01 directly when updating PML controls to ensure L1's state
remains correct, as svm->vmcb points to vmcb02 when L2 is active.
PML is not enabled in hardware for nested guests; treat PML_FULL as
unexpected exits.
Add a new module parameter to enable/disable PML, and enable it by default
when supported.
Acked-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
Sending updated 7/7 patch here, rather than reposting the full series.
Sean, any remaining comments? Happy to send v8 if needed.
v7 -> v7.1:
* Drop redundant pml_addr=0 in nested_vmcb02_prepare_control() (Yosry)
* Fix misleading comment, pml_index=-1 is hardening not stale-data cleanup (Yosry, Sean)
* Route nested SVM_EXIT_PML_FULL to KVM and treat it as unexpected exit (Sean)
---
arch/x86/include/asm/svm.h | 6 +-
arch/x86/include/uapi/asm/svm.h | 2 +
arch/x86/kvm/svm/nested.c | 14 ++++
arch/x86/kvm/svm/sev.c | 2 +-
arch/x86/kvm/svm/svm.c | 115 +++++++++++++++++++++++++++++++-
arch/x86/kvm/svm/svm.h | 3 +
6 files changed, 138 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index f199c52709df8..e0a7549a7c727 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -165,7 +165,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u8 reserved_9[22];
u64 allowed_sev_features; /* Offset 0x138 */
u64 guest_sev_features; /* Offset 0x140 */
- u8 reserved_10[664];
+ u8 reserved_10[128];
+ u64 pml_addr; /* Offset 0x1c8 */
+ u16 pml_index; /* Offset 0x1d0 */
+ u8 reserved_11[526];
/*
* Offset 0x3e0, 32 bytes reserved
* for use by hypervisor/software.
@@ -244,6 +247,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_MISC_ENABLE_SEV BIT_ULL(1)
#define SVM_MISC_ENABLE_SEV_ES BIT_ULL(2)
#define SVM_MISC_ENABLE_GMET BIT_ULL(3)
+#define SVM_MISC_ENABLE_PML BIT_ULL(11)
#define SVM_MISC2_ENABLE_V_LBR BIT_ULL(0)
#define SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE BIT_ULL(1)
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 010a45c9f6147..e806761850921 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -101,6 +101,7 @@
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
#define SVM_EXIT_VMGEXIT 0x403
+#define SVM_EXIT_PML_FULL 0x407
/* SEV-ES software-defined VMGEXIT events */
#define SVM_VMGEXIT_MMIO_READ 0x80000001ull
@@ -236,6 +237,7 @@
{ SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
{ SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \
{ SVM_EXIT_VMGEXIT, "vmgexit" }, \
+ { SVM_EXIT_PML_FULL, "pml_full" }, \
{ SVM_VMGEXIT_MMIO_READ, "vmgexit_mmio_read" }, \
{ SVM_VMGEXIT_MMIO_WRITE, "vmgexit_mmio_write" }, \
{ SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 4ef9bc6a553f3..8533a7610a5fa 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -882,6 +882,13 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
+ /*
+ * PML is never enabled in hardware for L2. Make sure that an
+ * unexpected PML write would trigger a PML_FULL VM-Exit.
+ */
+ if (pml)
+ vmcb02->control.pml_index = -1;
+
/*
* Stash vmcb02's counter if the guest hasn't moved past the guilty
* instruction; otherwise, reset the counter to '0'.
@@ -1782,6 +1789,13 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
if (nested_svm_is_l2_tlb_flush_hcall(vcpu))
return NESTED_EXIT_HOST;
break;
+ case SVM_EXIT_PML_FULL:
+ /*
+ * All PML full exits are handled by KVM. KVM emulates PML in
+ * software for L1, but never enables PML in hardware on behalf
+ * of L1.
+ */
+ return NESTED_EXIT_HOST;
default:
break;
}
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 37d4cfa5d980b..893145a691191 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -4861,7 +4861,7 @@ struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
* Allocate an SNP-safe page to workaround the SNP erratum where
* the CPU will incorrectly signal an RMP violation #PF if a
* hugepage (2MB or 1GB) collides with the RMP entry of a
- * 2MB-aligned VMCB, VMSA, or AVIC backing page.
+ * 2MB-aligned VMCB, VMSA, PML or AVIC backing page.
*
* Allocate one extra page, choose a page which is not
* 2MB-aligned, and free the other.
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e74fcde6155ec..bde900311191b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -179,6 +179,9 @@ module_param(vnmi, bool, 0444);
module_param(enable_mediated_pmu, bool, 0444);
+bool __ro_after_init pml = true;
+module_param(pml, bool, 0444);
+
static bool __ro_after_init svm_gp_erratum_intercept = true;
static u8 rsm_ins_bytes[] = "\x0f\xaa";
@@ -1260,6 +1263,24 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
if (vcpu->kvm->arch.bus_lock_detection_enabled)
svm_set_intercept(svm, INTERCEPT_BUSLOCK);
+ if (pml) {
+ /*
+ * Populate the page address and index here, PML is enabled
+ * when dirty logging is enabled on the memslot through
+ * svm_update_cpu_dirty_logging()
+ */
+ control->pml_addr = (u64)__sme_set(page_to_phys(vcpu->arch.pml_page));
+
+ /*
+ * PML index in the VMCB control area remains valid after an
+ * intercepted SHUTDOWN, so only initialize PML index on reset
+ * to avoid discarding already-logged entries that haven't been
+ * flushed.
+ */
+ if (!init_event)
+ control->pml_index = PML_HEAD_INDEX;
+ }
+
if (is_sev_guest(vcpu))
sev_init_vmcb(svm, init_event);
@@ -1324,9 +1345,15 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
if (!vmcb01_page)
goto out;
+ if (pml) {
+ vcpu->arch.pml_page = snp_safe_alloc_page();
+ if (!vcpu->arch.pml_page)
+ goto error_free_vmcb_page;
+ }
+
err = sev_vcpu_create(vcpu);
if (err)
- goto error_free_vmcb_page;
+ goto error_free_pml_page;
err = avic_init_vcpu(svm);
if (err)
@@ -1351,6 +1378,9 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
error_free_sev:
sev_free_vcpu(vcpu);
+error_free_pml_page:
+ if (vcpu->arch.pml_page)
+ __free_page(vcpu->arch.pml_page);
error_free_vmcb_page:
__free_page(vmcb01_page);
out:
@@ -1368,6 +1398,9 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
sev_free_vcpu(vcpu);
+ if (pml && vcpu->arch.pml_page)
+ __free_page(vcpu->arch.pml_page);
+
__free_page(__sme_pa_to_page(svm->vmcb01.pa));
svm_vcpu_free_msrpm(svm->msrpm);
}
@@ -3331,6 +3364,53 @@ static int vmmcall_interception(struct kvm_vcpu *vcpu)
return kvm_emulate_hypercall(vcpu);
}
+void svm_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+
+ if (WARN_ON_ONCE(!vcpu->kvm->arch.cpu_dirty_log_size))
+ return;
+
+ /*
+ * Note, nr_memslots_dirty_logging can be changed concurrent with this
+ * code, but in that case another update request will be made and so
+ * the guest will never run with a stale PML value.
+ */
+ if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
+ vmcb01->control.misc_ctl |= SVM_MISC_ENABLE_PML;
+ else
+ vmcb01->control.misc_ctl &= ~SVM_MISC_ENABLE_PML;
+
+ vmcb_mark_dirty(vmcb01, VMCB_NPT);
+}
+
+static void svm_flush_pml_buffer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+
+ /* Do nothing if PML buffer is empty */
+ if (control->pml_index == PML_HEAD_INDEX)
+ return;
+
+ kvm_flush_pml_buffer(vcpu, control->pml_index);
+
+ /* Reset the PML index */
+ control->pml_index = PML_HEAD_INDEX;
+}
+
+static int pml_full_interception(struct kvm_vcpu *vcpu)
+{
+ trace_kvm_pml_full(vcpu->vcpu_id);
+
+ /*
+ * PML buffer is already flushed at the beginning of svm_handle_exit().
+ * Nothing to do here.
+ */
+ return 1;
+}
+
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@@ -3407,6 +3487,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
#ifdef CONFIG_KVM_AMD_SEV
[SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
#endif
+ [SVM_EXIT_PML_FULL] = pml_full_interception,
};
static void dump_vmcb(struct kvm_vcpu *vcpu)
@@ -3456,8 +3537,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
- pr_err("%-20s%lld\n", "misc_ctl:", control->misc_ctl);
+ pr_err("%-20s%llx\n", "misc_ctl:", control->misc_ctl);
pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+
+ if (pml) {
+ pr_err("%-20s%016llx\n", "pml_addr:", control->pml_addr);
+ pr_err("%-20s%04x\n", "pml_index:", control->pml_index);
+ }
+
pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
@@ -3635,6 +3722,13 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code)
(u64)exit_code != __exit_code)
goto unexpected_vmexit;
+ /*
+ * PML is never enabled when running L2, bail immediately if a PML full
+ * exit occurs as something is horribly wrong.
+ */
+ if (unlikely(is_guest_mode(vcpu) && exit_code == SVM_EXIT_PML_FULL))
+ goto unexpected_vmexit;
+
#ifdef CONFIG_MITIGATION_RETPOLINE
if (exit_code == SVM_EXIT_MSR)
return msr_interception(vcpu);
@@ -3703,6 +3797,14 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
+ /*
+ * Opportunistically flush the PML buffer on VM exit. This keeps the
+ * dirty bitmap current by processing logged GPAs rather than waiting for
+ * PML_FULL exit.
+ */
+ if (vcpu->kvm->arch.cpu_dirty_log_size && !is_guest_mode(vcpu))
+ svm_flush_pml_buffer(vcpu);
+
if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE))
return 0;
@@ -5310,6 +5412,9 @@ static int svm_vm_init(struct kvm *kvm)
return ret;
}
+ if (pml)
+ kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
+
svm_srso_vm_init();
return 0;
}
@@ -5465,6 +5570,8 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
.gmem_prepare = sev_gmem_prepare,
.gmem_invalidate = sev_gmem_invalidate,
.gmem_max_mapping_level = sev_gmem_max_mapping_level,
+
+ .update_cpu_dirty_logging = svm_update_cpu_dirty_logging,
};
/*
@@ -5690,6 +5797,10 @@ static __init int svm_hardware_setup(void)
nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
+ pml = pml && npt_enabled && cpu_feature_enabled(X86_FEATURE_PML);
+ if (pml)
+ pr_info("Page modification logging supported\n");
+
if (lbrv) {
if (!boot_cpu_has(X86_FEATURE_LBRV))
lbrv = false;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 2b6733dffd76f..7cd8207359d7c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -52,6 +52,7 @@ extern int vgif;
extern bool intercept_smi;
extern bool vnmi;
extern int lbrv;
+extern bool pml;
extern int tsc_aux_uret_slot __ro_after_init;
@@ -832,6 +833,8 @@ static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
svm_set_intercept_for_msr(vcpu, msr, type, true);
}
+void svm_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+
/* nested.c */
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
--
2.48.1
^ permalink raw reply related [flat|nested] 15+ messages in thread