From: Nikunj A Dadhania <nikunj@amd.com>
To: <seanjc@google.com>, <pbonzini@redhat.com>
Cc: <kvm@vger.kernel.org>, <thomas.lendacky@amd.com>,
<santosh.shukla@amd.com>, <bp@alien8.de>,
<joao.m.martins@oracle.com>, <nikunj@amd.com>,
<kai.huang@intel.com>
Subject: [PATCH v4 7/7] KVM: SVM: Add Page modification logging support
Date: Mon, 13 Oct 2025 06:25:15 +0000 [thread overview]
Message-ID: <20251013062515.3712430-8-nikunj@amd.com> (raw)
In-Reply-To: <20251013062515.3712430-1-nikunj@amd.com>
Currently, dirty logging relies on write-protecting guest memory and
marking dirty GFNs during subsequent write faults. This method works but
incurs overhead due to additional write faults for each dirty GFN.
Implement support for the Page Modification Logging (PML) feature, a
hardware-assisted method for efficient dirty logging. PML automatically
logs dirty GPA[51:12] to a 4K buffer when the CPU sets NPT D-bits. Two new
VMCB fields are utilized: PML_ADDR and PML_INDEX. PML_INDEX is initialized
to 511, the last slot of the buffer (4K / 8 bytes per GPA entry = 512
entries), and the CPU decrements PML_INDEX after logging each GPA. When the
PML buffer is full, a
VMEXIT(PML_FULL) with exit code 0x407 is generated.
Disable PML for nested guests, as the A/D updates for a nested guest are
emulated by the MMU.
PML is enabled by default when supported and can be disabled via the 'pml'
module parameter.
Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
arch/x86/include/asm/svm.h | 6 ++-
arch/x86/include/uapi/asm/svm.h | 2 +
arch/x86/kvm/svm/nested.c | 9 +++-
arch/x86/kvm/svm/sev.c | 2 +-
arch/x86/kvm/svm/svm.c | 84 ++++++++++++++++++++++++++++++++-
arch/x86/kvm/svm/svm.h | 2 +
6 files changed, 100 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index d2f1a495691c..caf6cb09f983 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -165,7 +165,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u8 reserved_9[22];
u64 allowed_sev_features; /* Offset 0x138 */
u64 guest_sev_features; /* Offset 0x140 */
- u8 reserved_10[664];
+ u8 reserved_10[128];
+ u64 pml_addr; /* Offset 0x1c8 */
+ u16 pml_index; /* Offset 0x1d0 */
+ u8 reserved_11[526];
/*
* Offset 0x3e0, 32 bytes reserved
* for use by hypervisor/software.
@@ -239,6 +242,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_NESTED_CTL_NP_ENABLE BIT_ULL(0)
#define SVM_NESTED_CTL_SEV_ENABLE BIT_ULL(1)
#define SVM_NESTED_CTL_SEV_ES_ENABLE BIT_ULL(2)
+#define SVM_NESTED_CTL_PML_ENABLE BIT_ULL(11)
#define SVM_TSC_RATIO_RSVD 0xffffff0000000000ULL
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 9c640a521a67..f329dca167de 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -101,6 +101,7 @@
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
#define SVM_EXIT_VMGEXIT 0x403
+#define SVM_EXIT_PML_FULL 0x407
/* SEV-ES software-defined VMGEXIT events */
#define SVM_VMGEXIT_MMIO_READ 0x80000001
@@ -232,6 +233,7 @@
{ SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
{ SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \
{ SVM_EXIT_VMGEXIT, "vmgexit" }, \
+ { SVM_EXIT_PML_FULL, "pml_full" }, \
{ SVM_VMGEXIT_MMIO_READ, "vmgexit_mmio_read" }, \
{ SVM_VMGEXIT_MMIO_WRITE, "vmgexit_mmio_write" }, \
{ SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index a6443feab252..1f6cc5a6da63 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -748,11 +748,18 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
V_NMI_BLOCKING_MASK);
}
- /* Copied from vmcb01. msrpm_base can be overwritten later. */
+ /* Copied from vmcb01. msrpm_base/nested_ctl can be overwritten later. */
vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
+ /* Disable PML for nested guest as the A/D update is emulated by MMU */
+ if (enable_pml) {
+ vmcb02->control.nested_ctl &= ~SVM_NESTED_CTL_PML_ENABLE;
+ vmcb02->control.pml_addr = 0;
+ vmcb02->control.pml_index = -1;
+ }
+
/*
* Stash vmcb02's counter if the guest hasn't moved past the guilty
* instruction; otherwise, reset the counter to '0'.
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0835c664fbfd..080a9a72545e 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -4774,7 +4774,7 @@ struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
* Allocate an SNP-safe page to workaround the SNP erratum where
* the CPU will incorrectly signal an RMP violation #PF if a
* hugepage (2MB or 1GB) collides with the RMP entry of a
- * 2MB-aligned VMCB, VMSA, or AVIC backing page.
+ * 2MB-aligned VMCB, VMSA, PML or AVIC backing page.
*
* Allocate one extra page, choose a page which is not
* 2MB-aligned, and free the other.
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 153c12dbf3eb..fc7147024123 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -170,6 +170,8 @@ module_param(intercept_smi, bool, 0444);
bool vnmi = true;
module_param(vnmi, bool, 0444);
+module_param_named(pml, enable_pml, bool, 0444);
+
static bool svm_gp_erratum_intercept = true;
static u8 rsm_ins_bytes[] = "\x0f\xaa";
@@ -1162,6 +1164,16 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
if (vcpu->kvm->arch.bus_lock_detection_enabled)
svm_set_intercept(svm, INTERCEPT_BUSLOCK);
+ if (enable_pml) {
+ /*
+ * Populate the page address and index here, PML is enabled
+ * when dirty logging is enabled on the memslot through
+ * svm_update_cpu_dirty_logging()
+ */
+ control->pml_addr = (u64)__sme_set(page_to_phys(vcpu->arch.pml_page));
+ control->pml_index = PML_HEAD_INDEX;
+ }
+
if (sev_guest(vcpu->kvm))
sev_init_vmcb(svm, init_event);
@@ -1221,9 +1233,15 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
if (!vmcb01_page)
goto out;
+ if (enable_pml) {
+ vcpu->arch.pml_page = snp_safe_alloc_page();
+ if (!vcpu->arch.pml_page)
+ goto error_free_vmcb_page;
+ }
+
err = sev_vcpu_create(vcpu);
if (err)
- goto error_free_vmcb_page;
+ goto error_free_pml_page;
err = avic_init_vcpu(svm);
if (err)
@@ -1247,6 +1265,9 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
error_free_sev:
sev_free_vcpu(vcpu);
+error_free_pml_page:
+ if (vcpu->arch.pml_page)
+ __free_page(vcpu->arch.pml_page);
error_free_vmcb_page:
__free_page(vmcb01_page);
out:
@@ -1264,6 +1285,9 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
sev_free_vcpu(vcpu);
+ if (enable_pml)
+ __free_page(vcpu->arch.pml_page);
+
__free_page(__sme_pa_to_page(svm->vmcb01.pa));
svm_vcpu_free_msrpm(svm->msrpm);
}
@@ -3151,6 +3175,42 @@ static int bus_lock_exit(struct kvm_vcpu *vcpu)
return 0;
}
+void svm_update_cpu_dirty_logging(struct kvm_vcpu *vcpu, bool enable)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (enable)
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_PML_ENABLE;
+ else
+ svm->vmcb->control.nested_ctl &= ~SVM_NESTED_CTL_PML_ENABLE;
+}
+
+static void svm_flush_pml_buffer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+
+ /* Do nothing if PML buffer is empty */
+ if (control->pml_index == PML_HEAD_INDEX)
+ return;
+
+ kvm_flush_pml_buffer(vcpu, control->pml_index);
+
+ /* Reset the PML index */
+ control->pml_index = PML_HEAD_INDEX;
+}
+
+static int pml_full_interception(struct kvm_vcpu *vcpu)
+{
+ trace_kvm_pml_full(vcpu->vcpu_id);
+
+ /*
+ * PML buffer is already flushed at the beginning of svm_handle_exit().
+ * Nothing to do here.
+ */
+ return 1;
+}
+
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@@ -3227,6 +3287,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
#ifdef CONFIG_KVM_AMD_SEV
[SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
#endif
+ [SVM_EXIT_PML_FULL] = pml_full_interception,
};
static void dump_vmcb(struct kvm_vcpu *vcpu)
@@ -3275,8 +3336,10 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
- pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
+ pr_err("%-20s%llx\n", "nested_ctl:", control->nested_ctl);
pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+ pr_err("%-20s%016llx\n", "pml_addr:", control->pml_addr);
+ pr_err("%-20s%04x\n", "pml_index:", control->pml_index);
pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
@@ -3518,6 +3581,14 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
+ /*
+ * Opportunistically flush the PML buffer on VM exit. This keeps the
+ * dirty bitmap current by processing logged GPAs rather than waiting for
+ * PML_FULL exit.
+ */
+ if (enable_pml && !is_guest_mode(vcpu))
+ svm_flush_pml_buffer(vcpu);
+
/* SEV-ES guests must use the CR write traps to track CR registers. */
if (!sev_es_guest(vcpu->kvm)) {
if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
@@ -4991,6 +5062,9 @@ static int svm_vm_init(struct kvm *kvm)
return ret;
}
+ if (enable_pml)
+ kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
+
svm_srso_vm_init();
return 0;
}
@@ -5144,6 +5218,8 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
.gmem_prepare = sev_gmem_prepare,
.gmem_invalidate = sev_gmem_invalidate,
.gmem_max_mapping_level = sev_gmem_max_mapping_level,
+
+ .update_cpu_dirty_logging = svm_update_cpu_dirty_logging,
};
/*
@@ -5365,6 +5441,10 @@ static __init int svm_hardware_setup(void)
nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
+ enable_pml = enable_pml && npt_enabled && cpu_feature_enabled(X86_FEATURE_PML);
+ if (enable_pml)
+ pr_info("Page modification logging supported\n");
+
if (lbrv) {
if (!boot_cpu_has(X86_FEATURE_LBRV))
lbrv = false;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e4b04f435b3d..522b557106cb 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -720,6 +720,8 @@ static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
svm_set_intercept_for_msr(vcpu, msr, type, true);
}
+void svm_update_cpu_dirty_logging(struct kvm_vcpu *vcpu, bool enable);
+
/* nested.c */
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
--
2.48.1
next prev parent reply other threads:[~2025-10-13 6:26 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-13 6:25 [PATCH v4 0/7] KVM: SVM: Add Page Modification Logging (PML) support Nikunj A Dadhania
2025-10-13 6:25 ` [PATCH v4 1/7] KVM: x86: Carve out PML flush routine Nikunj A Dadhania
2025-10-14 22:04 ` Huang, Kai
2025-10-15 4:32 ` Nikunj A. Dadhania
2025-10-13 6:25 ` [PATCH v4 2/7] KVM: x86: Move PML page to common vcpu arch structure Nikunj A Dadhania
2025-10-13 6:25 ` [PATCH v4 3/7] KVM: x86: Move enable_pml variable to common x86 code Nikunj A Dadhania
2025-10-14 11:24 ` Huang, Kai
2025-10-14 19:22 ` Sean Christopherson
2025-10-14 20:47 ` Huang, Kai
2025-10-15 4:39 ` Nikunj A. Dadhania
2025-10-13 6:25 ` [PATCH v4 4/7] KVM: x86: Move nested CPU dirty logging logic to common code Nikunj A Dadhania
2025-10-14 11:34 ` Huang, Kai
2025-10-14 20:40 ` Huang, Kai
2025-10-14 21:24 ` Sean Christopherson
2025-10-14 21:37 ` Huang, Kai
2025-10-15 4:43 ` Nikunj A. Dadhania
2025-10-15 5:27 ` Huang, Kai
2025-10-15 9:06 ` Nikunj A. Dadhania
2025-10-15 21:37 ` Huang, Kai
2025-10-16 9:23 ` Nikunj A. Dadhania
2025-10-13 6:25 ` [PATCH v4 5/7] x86/cpufeatures: Add Page modification logging Nikunj A Dadhania
2025-10-13 6:25 ` [PATCH v4 6/7] KVM: SVM: Use BIT_ULL for 64-bit nested_ctl bit definitions Nikunj A Dadhania
2025-10-13 6:25 ` Nikunj A Dadhania [this message]
2025-10-17 5:13 ` [PATCH v4 7/7] KVM: SVM: Add Page modification logging support Huang, Kai
2025-11-06 9:28 ` Nikunj A. Dadhania
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251013062515.3712430-8-nikunj@amd.com \
--to=nikunj@amd.com \
--cc=bp@alien8.de \
--cc=joao.m.martins@oracle.com \
--cc=kai.huang@intel.com \
--cc=kvm@vger.kernel.org \
--cc=pbonzini@redhat.com \
--cc=santosh.shukla@amd.com \
--cc=seanjc@google.com \
--cc=thomas.lendacky@amd.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox