* [PATCH v2 01/15] KVM: TDX: Drop kvm_x86_ops.link_external_spt()
From: Yan Zhao @ 2026-05-09 7:53 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Sean Christopherson <seanjc@google.com>
Drop the dedicated .link_external_spt() for linking S-EPT pages, and
instead funnel everything through .set_external_spte() for mapping S-EPT
entries. Using separate hooks doesn't help prevent TDP MMU details from
bleeding into TDX, and vice versa; to the contrary, dedicated callbacks
will result in _more_ pollution when hugepage support is added, e.g. will
require the TDP MMU to know details about the splitting rules for TDX that
aren't all that relevant to the TDP MMU.
Ideally, KVM would provide a single pair of hooks to set S-EPT entries,
one hook for setting SPTEs under write-lock and another for setting SPTEs
under read-lock (e.g. to ensure the entire operation is "atomic", to allow
for failure, etc.). Sadly, TDX's requirement that all child S-EPT entries
are removed before the parent makes that impractical: the TDP MMU
deliberately prunes a non-leaf SPTE and _then_ processes its children, which
makes it quite important for the TDP MMU to differentiate between zapping
leaf and non-leaf S-EPT entries.
However, that's the _only_ case that's truly special, and even that case
could be shoehorned into a single hook; it just wouldn't be a net positive.
Signed-off-by: Sean Christopherson <seanjc@google.com>
[Rick: add in trivial feedback]
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- Moved this patch to the very beginning of the series so that when
warnings like "KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm)" are
removed from __tdp_mmu_set_spte_atomic() in TDP MMU in a later patch, the
atomic zap change can be propagated via the .set_external_spte() op.
(Yan).
MMU_refactors v1:
- rename external_spt->sept_pt (Rick, Yan)
---
arch/x86/include/asm/kvm-x86-ops.h | 1 -
arch/x86/include/asm/kvm_host.h | 3 --
arch/x86/kvm/mmu/tdp_mmu.c | 29 +-------------
arch/x86/kvm/vmx/tdx.c | 63 ++++++++++++++++++++----------
4 files changed, 44 insertions(+), 52 deletions(-)
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 31d5c5d58ae6..bced6d938702 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -95,7 +95,6 @@ KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
KVM_X86_OP(load_mmu_pgd)
-KVM_X86_OP_OPTIONAL_RET0(link_external_spt)
KVM_X86_OP_OPTIONAL_RET0(set_external_spte)
KVM_X86_OP_OPTIONAL_RET0(free_external_spt)
KVM_X86_OP_OPTIONAL(remove_external_spte)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c470e40a00aa..832323c4bc27 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1891,9 +1891,6 @@ struct kvm_x86_ops {
void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
int root_level);
- /* Update external mapping with page table link. */
- int (*link_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- void *external_spt);
/* Update the external page table from spte getting set. */
int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
u64 mirror_spte);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index dbaeb80f2b64..e6c3b739d1fe 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -495,27 +495,12 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
-static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
-{
- if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
- struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
-
- WARN_ON_ONCE(sp->role.level + 1 != level);
- WARN_ON_ONCE(sp->gfn != gfn);
- return sp->external_spt;
- }
-
- return NULL;
-}
-
static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
gfn_t gfn, u64 *old_spte,
u64 new_spte, int level)
{
bool was_present = is_shadow_present_pte(*old_spte);
- bool is_present = is_shadow_present_pte(new_spte);
- bool is_leaf = is_present && is_last_spte(new_spte, level);
- int ret = 0;
+ int ret;
KVM_BUG_ON(was_present, kvm);
@@ -528,18 +513,8 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
if (!try_cmpxchg64(rcu_dereference(sptep), old_spte, FROZEN_SPTE))
return -EBUSY;
- /*
- * Use different call to either set up middle level
- * external page table, or leaf.
- */
- if (is_leaf) {
- ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
- } else {
- void *external_spt = get_external_spt(gfn, new_spte, level);
+ ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
- KVM_BUG_ON(!external_spt, kvm);
- ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
- }
if (ret)
__kvm_tdp_mmu_write_spte(sptep, *old_spte);
else
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 9bd4fd748e2a..48c836ec6063 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1653,18 +1653,58 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
return 0;
}
+static struct page *tdx_spte_to_sept_pt(struct kvm *kvm, gfn_t gfn,
+ u64 new_spte, enum pg_level level)
+{
+ struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
+
+ if (KVM_BUG_ON(!sp->external_spt, kvm) ||
+ KVM_BUG_ON(sp->role.level + 1 != level, kvm) ||
+ KVM_BUG_ON(sp->gfn != gfn, kvm))
+ return NULL;
+
+ return virt_to_page(sp->external_spt);
+}
+
+static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, u64 mirror_spte)
+{
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 err, entry, level_state;
+ struct page *sept_pt;
+
+ sept_pt = tdx_spte_to_sept_pt(kvm, gfn, mirror_spte, level);
+ if (!sept_pt)
+ return -EIO;
+
+ err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, level, sept_pt,
+ &entry, &level_state);
+ if (unlikely(tdx_operand_busy(err)))
+ return -EBUSY;
+
+ if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
+ return -EIO;
+
+ return 0;
+}
+
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
enum pg_level level, u64 mirror_spte)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
+ if (KVM_BUG_ON(!is_shadow_present_pte(mirror_spte), kvm))
+ return -EIO;
+
+ if (!is_last_spte(mirror_spte, level))
+ return tdx_sept_link_private_spt(kvm, gfn, level, mirror_spte);
+
/* TODO: handle large pages. */
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
return -EIO;
- WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
- (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
+ WARN_ON_ONCE((mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
/*
* Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
@@ -1684,24 +1724,6 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
return tdx_mem_page_aug(kvm, gfn, level, pfn);
}
-static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt)
-{
- gpa_t gpa = gfn_to_gpa(gfn);
- struct page *page = virt_to_page(private_spt);
- u64 err, entry, level_state;
-
- err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, level, page, &entry,
- &level_state);
- if (unlikely(tdx_operand_busy(err)))
- return -EBUSY;
-
- if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
- return -EIO;
-
- return 0;
-}
-
/*
* Ensure shared and private EPTs to be flushed on all vCPUs.
* tdh_mem_track() is the only caller that increases TD epoch. An increase in
@@ -3413,7 +3435,6 @@ int __init tdx_hardware_setup(void)
vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
- vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
--
2.43.2
* [PATCH v2 02/15] KVM: TDX: Wrap mapping of leaf and non-leaf S-EPT entries into helpers
From: Yan Zhao @ 2026-05-09 7:55 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Sean Christopherson <seanjc@google.com>
Add a helper, tdx_sept_map_leaf_spte(), to wrap and isolate PAGE.ADD and
PAGE.AUG operations. Rename tdx_sept_link_private_spt() to
tdx_sept_map_nonleaf_spte() to wrap SEPT.ADD for symmetry.
Thus, transition tdx_sept_set_private_spte() into a "dispatch" routine for
setting/writing S-EPT entries.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- Renamed tdx_sept_link_private_spt() to tdx_sept_map_nonleaf_spte(). (Yan)
---
arch/x86/kvm/vmx/tdx.c | 24 +++++++++++++++---------
1 file changed, 15 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 48c836ec6063..886e1eac23fa 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1666,7 +1666,7 @@ static struct page *tdx_spte_to_sept_pt(struct kvm *kvm, gfn_t gfn,
return virt_to_page(sp->external_spt);
}
-static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+static int tdx_sept_map_nonleaf_spte(struct kvm *kvm, gfn_t gfn,
enum pg_level level, u64 mirror_spte)
{
gpa_t gpa = gfn_to_gpa(gfn);
@@ -1688,18 +1688,12 @@ static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
return 0;
}
-static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, u64 mirror_spte)
+static int tdx_sept_map_leaf_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ u64 mirror_spte)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
- if (KVM_BUG_ON(!is_shadow_present_pte(mirror_spte), kvm))
- return -EIO;
-
- if (!is_last_spte(mirror_spte, level))
- return tdx_sept_link_private_spt(kvm, gfn, level, mirror_spte);
-
/* TODO: handle large pages. */
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
return -EIO;
@@ -1724,6 +1718,18 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
return tdx_mem_page_aug(kvm, gfn, level, pfn);
}
+static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, u64 mirror_spte)
+{
+ if (KVM_BUG_ON(!is_shadow_present_pte(mirror_spte), kvm))
+ return -EIO;
+
+ if (!is_last_spte(mirror_spte, level))
+ return tdx_sept_map_nonleaf_spte(kvm, gfn, level, mirror_spte);
+
+ return tdx_sept_map_leaf_spte(kvm, gfn, level, mirror_spte);
+}
+
/*
* Ensure shared and private EPTs to be flushed on all vCPUs.
* tdh_mem_track() is the only caller that increases TD epoch. An increase in
--
2.43.2
* [PATCH v2 03/15] KVM: x86/mmu: Fold set_external_spte_present() into its sole caller
From: Yan Zhao @ 2026-05-09 7:55 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Sean Christopherson <seanjc@google.com>
Fold set_external_spte_present() into __tdp_mmu_set_spte_atomic() in
anticipation of propagating all changes (like atomic zap) triggered by
tdp_mmu_set_spte_atomic() to the external PTEs.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- Moved to the front of the series and updated the patch log to indicate
the propagation of changes for atomic zap. (Yan)
---
arch/x86/kvm/mmu/tdp_mmu.c | 72 ++++++++++++++++----------------------
1 file changed, 31 insertions(+), 41 deletions(-)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index e6c3b739d1fe..aa6b629a9799 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -495,33 +495,6 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
-static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
- gfn_t gfn, u64 *old_spte,
- u64 new_spte, int level)
-{
- bool was_present = is_shadow_present_pte(*old_spte);
- int ret;
-
- KVM_BUG_ON(was_present, kvm);
-
- lockdep_assert_held(&kvm->mmu_lock);
- /*
- * We need to lock out other updates to the SPTE until the external
- * page table has been modified. Use FROZEN_SPTE similar to
- * the zapping case.
- */
- if (!try_cmpxchg64(rcu_dereference(sptep), old_spte, FROZEN_SPTE))
- return -EBUSY;
-
- ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
-
- if (ret)
- __kvm_tdp_mmu_write_spte(sptep, *old_spte);
- else
- __kvm_tdp_mmu_write_spte(sptep, new_spte);
- return ret;
-}
-
/**
* handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
@@ -626,6 +599,8 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
+ u64 *raw_sptep = rcu_dereference(iter->sptep);
+
/*
* The caller is responsible for ensuring the old SPTE is not a FROZEN
* SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
@@ -635,8 +610,13 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
+ bool was_present = is_shadow_present_pte(iter->old_spte);
int ret;
+ KVM_BUG_ON(was_present, kvm);
+
+ lockdep_assert_held(&kvm->mmu_lock);
+
/*
* Users of atomic zapping don't operate on mirror roots,
* so don't handle it and bug the VM if it's seen.
@@ -644,25 +624,35 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
return -EBUSY;
- ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
- &iter->old_spte, new_spte, iter->level);
- if (ret)
- return ret;
- } else {
- u64 *sptep = rcu_dereference(iter->sptep);
-
/*
- * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
- * and does not hold the mmu_lock. On failure, i.e. if a
- * different logical CPU modified the SPTE, try_cmpxchg64()
- * updates iter->old_spte with the current value, so the caller
- * operates on fresh data, e.g. if it retries
- * tdp_mmu_set_spte_atomic()
+ * We need to lock out other updates to the SPTE until the external
+ * page table has been modified. Use FROZEN_SPTE similar to
+ * the zapping case.
*/
- if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
+ if (!try_cmpxchg64(raw_sptep, &iter->old_spte, FROZEN_SPTE))
return -EBUSY;
+
+ ret = kvm_x86_call(set_external_spte)(kvm, iter->gfn, iter->level,
+ new_spte);
+
+ if (ret)
+ __kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte);
+ else
+ __kvm_tdp_mmu_write_spte(iter->sptep, new_spte);
+
+ return ret;
}
+ /*
+ * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
+ * does not hold the mmu_lock. On failure, i.e. if a different logical
+ * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
+ * the current value, so the caller operates on fresh data, e.g. if it
+ * retries tdp_mmu_set_spte_atomic().
+ */
+ if (!try_cmpxchg64(raw_sptep, &iter->old_spte, new_spte))
+ return -EBUSY;
+
return 0;
}
--
2.43.2
* [PATCH v2 04/15] KVM: x86/mmu: Plumb param "old_spte" into kvm_x86_ops.set_external_spte()
From: Yan Zhao @ 2026-05-09 7:55 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Sean Christopherson <seanjc@google.com>
If tdp_mmu_set_spte_atomic() triggers an atomic zap on a mirror SPTE
(though currently no paths trigger it), the change is propagated via the
set_external_spte() op. Plumb the old SPTE into the set_external_spte() op,
so TDX code rather than TDP MMU core can warn if the atomic zap isn't
allowed.
Rename mirror_spte to new_spte to follow the TDP MMU's naming, and to make
it more obvious what value the parameter holds.
Opportunistically tweak the ordering of parameters to match the pattern of
most TDP MMU functions, which do "old, new, level".
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- Moved this patch to before dropping the warning of
"KVM_BUG_ON(was_present, kvm)" in __tdp_mmu_set_spte_atomic(), so that
TDX's tdx_sept_set_private_spte() can later warn instead if an atomic
zap is propagated via the set_external_spte() op (as allowed by
tdp_mmu_set_spte_atomic(), should it occur). (Yan)
---
arch/x86/include/asm/kvm_host.h | 4 ++--
arch/x86/kvm/mmu/tdp_mmu.c | 4 ++--
arch/x86/kvm/vmx/tdx.c | 22 +++++++++++-----------
3 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 832323c4bc27..9b55973f194c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1892,8 +1892,8 @@ struct kvm_x86_ops {
int root_level);
/* Update the external page table from spte getting set. */
- int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- u64 mirror_spte);
+ int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, u64 old_spte,
+ u64 new_spte, enum pg_level level);
/* Update external page tables for page table about to be freed. */
int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index aa6b629a9799..ceb27769bcf6 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -632,8 +632,8 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
if (!try_cmpxchg64(raw_sptep, &iter->old_spte, FROZEN_SPTE))
return -EBUSY;
- ret = kvm_x86_call(set_external_spte)(kvm, iter->gfn, iter->level,
- new_spte);
+ ret = kvm_x86_call(set_external_spte)(kvm, iter->gfn, iter->old_spte,
+ new_spte, iter->level);
if (ret)
__kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte);
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 886e1eac23fa..219da92fe8ea 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1667,13 +1667,13 @@ static struct page *tdx_spte_to_sept_pt(struct kvm *kvm, gfn_t gfn,
}
static int tdx_sept_map_nonleaf_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, u64 mirror_spte)
+ enum pg_level level, u64 new_spte)
{
gpa_t gpa = gfn_to_gpa(gfn);
u64 err, entry, level_state;
struct page *sept_pt;
- sept_pt = tdx_spte_to_sept_pt(kvm, gfn, mirror_spte, level);
+ sept_pt = tdx_spte_to_sept_pt(kvm, gfn, new_spte, level);
if (!sept_pt)
return -EIO;
@@ -1689,16 +1689,16 @@ static int tdx_sept_map_nonleaf_spte(struct kvm *kvm, gfn_t gfn,
}
static int tdx_sept_map_leaf_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- u64 mirror_spte)
+ u64 new_spte)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
+ kvm_pfn_t pfn = spte_to_pfn(new_spte);
/* TODO: handle large pages. */
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
return -EIO;
- WARN_ON_ONCE((mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
+ WARN_ON_ONCE((new_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
/*
* Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
@@ -1718,16 +1718,16 @@ static int tdx_sept_map_leaf_spte(struct kvm *kvm, gfn_t gfn, enum pg_level leve
return tdx_mem_page_aug(kvm, gfn, level, pfn);
}
-static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, u64 mirror_spte)
+static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
+ u64 new_spte, enum pg_level level)
{
- if (KVM_BUG_ON(!is_shadow_present_pte(mirror_spte), kvm))
+ if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
return -EIO;
- if (!is_last_spte(mirror_spte, level))
- return tdx_sept_map_nonleaf_spte(kvm, gfn, level, mirror_spte);
+ if (!is_last_spte(new_spte, level))
+ return tdx_sept_map_nonleaf_spte(kvm, gfn, level, new_spte);
- return tdx_sept_map_leaf_spte(kvm, gfn, level, mirror_spte);
+ return tdx_sept_map_leaf_spte(kvm, gfn, level, new_spte);
}
/*
--
2.43.2
* [PATCH v2 05/15] KVM: TDX: Move KVM_BUG_ON()s in __tdp_mmu_set_spte_atomic() to TDX code
From: Yan Zhao @ 2026-05-09 7:55 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Rick Edgecombe <rick.p.edgecombe@intel.com>
Drop some KVM_BUG_ON()s that guard against the TDP MMU attempting to
propagate unsupported changes to the external page table through
__tdp_mmu_set_spte_atomic(), and have TDX code trigger them instead.
Now that the TDP MMU logically allows propagating an atomic zap to the
external page table through the set_external_spte() op in
__tdp_mmu_set_spte_atomic(), TDX code will trigger the KVM_BUG_ON() on the
atomic zap request instead. (Note: non-atomic zaps are not yet propagated
via the set_external_spte() op.)
Despite the generic naming, external page table ops are designed completely
around TDX. They hook the bare minimum of what is needed, and exclude the
operations that are not supported by TDX. To help wrangle which operations
are handleable by various operations, warnings and KVM_BUG_ON()s exist in
the code. These warnings and KVM_BUG_ON()s put the burden of understanding
which operations should be forwarded to TDX code on TDP MMU developers, who
often read the code without TDX context.
Future changes will transition the encapsulation of this domain knowledge
to TDX code by funneling the external page table updates through a central
update mechanism. In this paradigm, the central update mechanism can
encapsulate the special knowledge, but will not have as much knowledge
about what operation is in progress.
Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- Moved this patch after "KVM: TDX: Drop kvm_x86_ops.link_external_spt()"
and "KVM: x86/mmu: Plumb param "old_spte" into
kvm_x86_ops.set_external_spte()". (Yan)
- Added a replacement KVM_BUG_ON() in TDX for the dropped
KVM_BUG_ON(was_present, kvm) in __tdp_mmu_set_spte_atomic(). (Yan).
---
arch/x86/kvm/mmu/tdp_mmu.c | 10 ----------
arch/x86/kvm/vmx/tdx.c | 3 +++
2 files changed, 3 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index ceb27769bcf6..f55967f8d74a 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -610,20 +610,10 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
- bool was_present = is_shadow_present_pte(iter->old_spte);
int ret;
- KVM_BUG_ON(was_present, kvm);
-
lockdep_assert_held(&kvm->mmu_lock);
- /*
- * Users of atomic zapping don't operate on mirror roots,
- * so don't handle it and bug the VM if it's seen.
- */
- if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
- return -EBUSY;
-
/*
* We need to lock out other updates to the SPTE until the external
* page table has been modified. Use FROZEN_SPTE similar to
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 219da92fe8ea..0ded336fbf70 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1721,6 +1721,9 @@ static int tdx_sept_map_leaf_spte(struct kvm *kvm, gfn_t gfn, enum pg_level leve
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
u64 new_spte, enum pg_level level)
{
+ if (KVM_BUG_ON(is_shadow_present_pte(old_spte), kvm))
+ return -EIO;
+
if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
return -EIO;
--
2.43.2
* [PATCH v2 06/15] KVM: TDX: Move lockdep assert in __tdp_mmu_set_spte_atomic() to TDX code
From: Yan Zhao @ 2026-05-09 7:55 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Rick Edgecombe <rick.p.edgecombe@intel.com>
Move the MMU lockdep assert in __tdp_mmu_set_spte_atomic() into the
TDX-specific op because the assert is TDX-specific in intent.
The TDP MMU has many lockdep asserts for various scenarios, and in fact
the callchains that are used for TDX already have a lockdep assert which
covers the case in __tdp_mmu_set_spte_atomic(). However, those asserts
exist to protect management of the TDP roots owned by KVM. The assert in
__tdp_mmu_set_spte_atomic(), by contrast, helps with a scheme to avoid
contention in the TDX module during zap operations, which is very
TDX-specific.
One option would be to just remove the assert in
__tdp_mmu_set_spte_atomic() and rely on the other asserts in the TDP MMU.
But those asserts serve a different intent, and are too far away from the
SEAMCALL that needs the protection. So just move the assert to TDX code.
Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
arch/x86/kvm/mmu/tdp_mmu.c | 2 --
arch/x86/kvm/vmx/tdx.c | 2 ++
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index f55967f8d74a..401bb49a91ee 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -612,8 +612,6 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
int ret;
- lockdep_assert_held(&kvm->mmu_lock);
-
/*
* We need to lock out other updates to the SPTE until the external
* page table has been modified. Use FROZEN_SPTE similar to
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0ded336fbf70..48aa7936a7f7 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1721,6 +1721,8 @@ static int tdx_sept_map_leaf_spte(struct kvm *kvm, gfn_t gfn, enum pg_level leve
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
u64 new_spte, enum pg_level level)
{
+ lockdep_assert_held(&kvm->mmu_lock);
+
if (KVM_BUG_ON(is_shadow_present_pte(old_spte), kvm))
return -EIO;
--
2.43.2
* [PATCH v2 07/15] KVM: x86/tdp_mmu: Morph !is_frozen_spte() check into a KVM_MMU_WARN_ON()
From: Yan Zhao @ 2026-05-09 7:56 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Rick Edgecombe <rick.p.edgecombe@intel.com>
Remove the conditional logic in __tdp_mmu_set_spte_atomic() that
special-cases setting a mirror SPTE to FROZEN_SPTE, and instead add a
warning against installing FROZEN_SPTE that covers both the mirror and
direct cases.
The mirror page table needs to propagate PTE changes to the external page
table. This presents a problem for atomic updates which can't update both
page tables at once. So a special value, FROZEN_SPTE, is used as a
temporary state during these updates to prevent concurrent operations on
the PTE. If the TDP MMU tried to install FROZEN_SPTE as a long-term value,
it would confuse these updates.
On the other hand, it would also confuse other threads if FROZEN_SPTE were
installed as a long-term value for direct page tables (e.g., causing
another thread working on an atomic zap to wait endlessly for a
!FROZEN_SPTE value).
Therefore, add the warning for installing FROZEN_SPTE as a long-term value
in __tdp_mmu_set_spte_atomic() without differentiating whether it's a
mirror or direct page table.
Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- Updated the comment for "KVM_MMU_WARN_ON(is_frozen_spte(new_spte))".
(Yan)
- Explained why the warning also applies to direct page tables. (Yan)
---
arch/x86/kvm/mmu/tdp_mmu.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 401bb49a91ee..345fdb0a89fb 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -609,7 +609,10 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
*/
WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
- if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
+ /* Should not set FROZEN_SPTE as a long-term value. */
+ KVM_MMU_WARN_ON(is_frozen_spte(new_spte));
+
+ if (is_mirror_sptep(iter->sptep)) {
int ret;
/*
--
2.43.2
* [PATCH v2 08/15] KVM: x86/mmu: Plumb "sp" _pointer_ into the TDP MMU's handle_changed_spte()
From: Yan Zhao @ 2026-05-09 7:56 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Sean Christopherson <seanjc@google.com>
Plumb the "sp" pointer into handle_changed_spte() so that the next patch
can check is_mirror_sp(sp) there. This is preparation for consolidating
all S-EPT updates into a single kvm_x86_ops hook.
[Yan: Remove unused "as_id" param in tdp_mmu_set_spte() ]
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- Split out and added back this patch.
(The patch was in Sean's original series, and had "SPTE" instead of "sp"
in the title.) (Yan)
- Remove unused "as_id" param in tdp_mmu_set_spte(). (Yan)
---
arch/x86/kvm/mmu/tdp_mmu.c | 35 ++++++++++++++++++-----------------
1 file changed, 18 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 345fdb0a89fb..05dc8bdc1ea5 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -320,9 +320,9 @@ void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
}
}
-static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level,
- bool shared);
+static void handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ gfn_t gfn, u64 old_spte, u64 new_spte,
+ int level, bool shared);
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
@@ -471,8 +471,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
FROZEN_SPTE, level);
}
- handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
- old_spte, FROZEN_SPTE, level, shared);
+ handle_changed_spte(kvm, sp, gfn, old_spte, FROZEN_SPTE, level, shared);
if (is_mirror_sp(sp)) {
KVM_BUG_ON(shared, kvm);
@@ -498,7 +497,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
/**
* handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
- * @as_id: the address space of the paging structure the SPTE was a part of
+ * @sp: the page table in which the SPTE resides
* @gfn: the base GFN that was mapped by the SPTE
* @old_spte: The value of the SPTE before the change
* @new_spte: The value of the SPTE after the change
@@ -511,15 +510,16 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* dirty logging updates are handled in common code, not here (see make_spte()
* and fast_pf_fix_direct_spte()).
*/
-static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level,
- bool shared)
+static void handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ gfn_t gfn, u64 old_spte, u64 new_spte,
+ int level, bool shared)
{
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
bool was_leaf = was_present && is_last_spte(old_spte, level);
bool is_leaf = is_present && is_last_spte(new_spte, level);
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
+ int as_id = kvm_mmu_page_as_id(sp);
WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
WARN_ON_ONCE(level < PG_LEVEL_4K);
@@ -668,6 +668,7 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
int ret;
lockdep_assert_held_read(&kvm->mmu_lock);
@@ -676,7 +677,7 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
if (ret)
return ret;
- handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ handle_changed_spte(kvm, sp, iter->gfn, iter->old_spte,
new_spte, iter->level, true);
return 0;
@@ -685,7 +686,6 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
/*
* tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
* @kvm: KVM instance
- * @as_id: Address space ID, i.e. regular vs. SMM
* @sptep: Pointer to the SPTE
* @old_spte: The current value of the SPTE
* @new_spte: The new value that will be set for the SPTE
@@ -695,9 +695,11 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
* Returns the old SPTE value, which _may_ be different than @old_spte if the
* SPTE had voldatile bits.
*/
-static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
- u64 old_spte, u64 new_spte, gfn_t gfn, int level)
+static u64 tdp_mmu_set_spte(struct kvm *kvm, tdp_ptep_t sptep, u64 old_spte,
+ u64 new_spte, gfn_t gfn, int level)
{
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(sptep));
+
lockdep_assert_held_write(&kvm->mmu_lock);
/*
@@ -711,7 +713,7 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
- handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
+ handle_changed_spte(kvm, sp, gfn, old_spte, new_spte, level, false);
/*
* Users that do non-atomic setting of PTEs don't operate on mirror
@@ -729,9 +731,8 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte)
{
WARN_ON_ONCE(iter->yielded);
- iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
- iter->old_spte, new_spte,
- iter->gfn, iter->level);
+ iter->old_spte = tdp_mmu_set_spte(kvm, iter->sptep, iter->old_spte,
+ new_spte, iter->gfn, iter->level);
}
#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
--
2.43.2
* [PATCH v2 09/15] KVM: x86/tdp_mmu: Centrally propagate to-present/atomic zap updates to external PTEs
From: Yan Zhao @ 2026-05-09 7:56 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Sean Christopherson <seanjc@google.com>
Move the propagation of to-present and atomic-zap changes to external
PTEs from __tdp_mmu_set_spte_atomic() to __handle_changed_spte(), which
centrally handles SPTE changes.
When setting a PTE to present in the mirror page tables, the update needs
to be propagated to the external page tables (in TDX parlance the S-EPT).
Today this is handled by special mirror page table branching in
__tdp_mmu_set_spte_atomic(), which is the only place where present PTEs are
set for TDX.
This keeps things running, but is a bit hacked on. The hook for setting
present leaf PTEs is added only where TDX happens to need it. For example,
TDX does not support any of the operations that use the non-atomic variant,
tdp_mmu_set_spte(), to set present PTEs. Since the hook is missing there,
it is very hard to understand the code through a non-TDX lens. If the
reader doesn't know the TDX specifics, it could look like the external
update is missing.
In addition to being confusing, it also litters the TDP MMU with "external"
update callbacks. This is especially unfortunate because there is already a
central place to react to TDP updates, handle_changed_spte().
Begin the process of moving towards a model where all mirror page table
updates are forwarded to TDX code, so that the TDX-specific logic can live
there with a proper separation of concerns. Do this by adding a helper
__handle_changed_spte() and teaching it how to return error codes, such
that it can propagate the failures that may come from TDX external page
table updates. Make the original handle_changed_spte() a no-fail version of
__handle_changed_spte(), so it handles no-fail changes which are under
exclusive mmu_lock or under the no-fail path handle_removed_pt(),
triggering KVM_BUG_ON() on error returns.
Instead of having __tdp_mmu_set_spte_atomic() do the frozen mirror SPTE
dance and trigger propagation to external PTEs, make
__tdp_mmu_set_spte_atomic() a simple helper of try_cmpxchg64() and hoist
the frozen mirror SPTE dance up a level to tdp_mmu_set_spte_atomic(). Then,
the propagation of changes to present to the external PTEs can be
centralized to __handle_changed_spte(). Aging external SPTEs is not yet
supported for the mirror page table, so just warn on mirror usage in
kvm_tdp_mmu_age_spte() and invoke __tdp_mmu_set_spte_atomic() directly
without the frozen dance. There is no need to warn on installing
FROZEN_SPTE as a long-term value in kvm_tdp_mmu_age_spte(), since removing
the accessed bit is mutually exclusive with installing FROZEN_SPTE
(FROZEN_SPTE has the accessed bit set on all x86 platforms).
Since tdp_mmu_set_spte_atomic() can also be invoked to atomically zap SPTEs
(though no path currently triggers an atomic zap on the mirror page
table), also leverage the set_external_spte() op to propagate atomic zaps
when tdp_mmu_set_spte_atomic() zaps leaf SPTEs directly. (When
tdp_mmu_set_spte_atomic() zaps a non-leaf SPTE, zaps of the child leaf
SPTEs are propagated via the remove_external_spte() op).
Note: tdp_mmu_set_spte_atomic() invokes __handle_changed_spte() to handle
changes to new_spte while the mirror SPTE is frozen, so
(1) the update of the external PTEs and statistics, or
(2) the update of child mirror SPTEs, child external PTEs and corresponding
statistics,
now occur before the mirror SPTE is actually set to new_spte.
(1) is ok since if it fails, the mirror SPTE will be restored to its
original value. (2) is also ok since handle_removed_pt() is no-fail.
Link: https://lore.kernel.org/lkml/aYYn0nf2cayYu8e7@google.com/
Not-yet-Signed-off-by: Sean Christopherson <seanjc@google.com>
[Rick: Based on a diff by Sean Christopherson]
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
[Yan: added atomic zap case ]
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- Updated comments and patch log (Yan, Binbin).
- Split the "KVM: x86/mmu: Plumb "sp" _pointer_ into the TDP MMU's
handle_changed_spte()" out (which was in Sean's original series but
had SPTE instead of "sp" in patch title). (Yan)
- Also invoke set_external_spte() op to propagate changes for atomic zap of
leaf SPTEs. (Yan).
- Kept the "Not-yet-Signed-off-by" in v1
https://lore.kernel.org/all/20260327201421.2824383-8-rick.p.edgecombe@intel.com.
---
arch/x86/kvm/mmu/tdp_mmu.c | 124 +++++++++++++++++++++++--------------
1 file changed, 79 insertions(+), 45 deletions(-)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 05dc8bdc1ea5..ada4a0837298 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -495,7 +495,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
}
/**
- * handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * __handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
* @sp: the page table in which the SPTE resides
* @gfn: the base GFN that was mapped by the SPTE
@@ -510,9 +510,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* dirty logging updates are handled in common code, not here (see make_spte()
* and fast_pf_fix_direct_spte()).
*/
-static void handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp,
- gfn_t gfn, u64 old_spte, u64 new_spte,
- int level, bool shared)
+static int __handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ gfn_t gfn, u64 old_spte, u64 new_spte,
+ int level, bool shared)
{
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
@@ -549,9 +549,7 @@ static void handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp,
}
if (old_spte == new_spte)
- return;
-
- trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+ return 0;
if (is_leaf)
check_spte_writable_invariants(new_spte);
@@ -578,29 +576,49 @@ static void handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp,
"a temporary frozen SPTE.\n"
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
as_id, gfn, old_spte, new_spte, level);
- return;
+ return 0;
}
- if (is_leaf != was_leaf)
- kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
-
/*
* Recursively handle child PTs if the change removed a subtree from
* the paging structure. Note the WARN on the PFN changing without the
* SPTE being converted to a hugepage (leaf) or being zapped. Shadow
* pages are kernel allocations and should never be migrated.
+ *
+ * For the mirror page table, propagate changes to present or changes of
+ * leaf SPTEs to !present under shared mmu_lock to the external SPTE via
+ * set_external_spte() op.
*/
if (was_present && !was_leaf &&
- (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
+ (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) {
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
+ } else if (is_mirror_sp(sp) && (is_present || shared)) {
+ int r;
+
+ r = kvm_x86_call(set_external_spte)(kvm, gfn, old_spte, new_spte, level);
+ if (r)
+ return r;
+ }
+ trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+
+ if (is_leaf != was_leaf)
+ kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
+
+ return 0;
+}
+
+static void handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ gfn_t gfn, u64 old_spte, u64 new_spte,
+ int level, bool shared)
+{
+ KVM_BUG_ON(__handle_changed_spte(kvm, sp, gfn, old_spte, new_spte,
+ level, shared), kvm);
}
static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
- u64 *raw_sptep = rcu_dereference(iter->sptep);
-
/*
* The caller is responsible for ensuring the old SPTE is not a FROZEN
* SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
@@ -609,31 +627,6 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
*/
WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
- /* Should not set FROZEN_SPTE as a long-term value. */
- KVM_MMU_WARN_ON(is_frozen_spte(new_spte));
-
- if (is_mirror_sptep(iter->sptep)) {
- int ret;
-
- /*
- * We need to lock out other updates to the SPTE until the external
- * page table has been modified. Use FROZEN_SPTE similar to
- * the zapping case.
- */
- if (!try_cmpxchg64(raw_sptep, &iter->old_spte, FROZEN_SPTE))
- return -EBUSY;
-
- ret = kvm_x86_call(set_external_spte)(kvm, iter->gfn, iter->old_spte,
- new_spte, iter->level);
-
- if (ret)
- __kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte);
- else
- __kvm_tdp_mmu_write_spte(iter->sptep, new_spte);
-
- return ret;
- }
-
/*
* Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
* does not hold the mmu_lock. On failure, i.e. if a different logical
@@ -641,7 +634,7 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
* the current value, so the caller operates on fresh data, e.g. if it
* retries tdp_mmu_set_spte_atomic().
*/
- if (!try_cmpxchg64(raw_sptep, &iter->old_spte, new_spte))
+ if (!try_cmpxchg64(rcu_dereference(iter->sptep), &iter->old_spte, new_spte))
return -EBUSY;
return 0;
@@ -673,14 +666,51 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
+ /* Should not set FROZEN_SPTE as a long-term value. */
+ KVM_MMU_WARN_ON(is_frozen_spte(new_spte));
+
+ /*
+ * Temporarily freeze the SPTE until the external PTE operation has
+ * completed, e.g. so that concurrent faults don't attempt to install a
+ * child PTE in the external page table before the parent PTE has been
+ * written.
+ */
+ if (is_mirror_sptep(iter->sptep))
+ ret = __tdp_mmu_set_spte_atomic(kvm, iter, FROZEN_SPTE);
+ else
+ ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
+
if (ret)
return ret;
- handle_changed_spte(kvm, sp, iter->gfn, iter->old_spte,
- new_spte, iter->level, true);
-
- return 0;
+ /*
+ * Handle the change from iter->old_spte to new_spte.
+ *
+ * Note: for mirror page table, this means the updates of the external
+ * PTE, statistics, or updates of child SPTEs, child external PTEs and
+ * corresponding statistics are performed while the mirror SPTE is in
+ * frozen state (i.e., before the mirror SPTE is set to new_spte).
+ */
+ ret = __handle_changed_spte(kvm, sp, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
+ /*
+ * Unfreeze the mirror SPTE. If updating the external SPTE failed,
+ * restore the old value so that the mirror SPTE isn't frozen in
+ * perpetuity, otherwise set the mirror SPTE to the new desired value.
+ */
+ if (is_mirror_sptep(iter->sptep)) {
+ if (ret)
+ __kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte);
+ else
+ __kvm_tdp_mmu_write_spte(iter->sptep, new_spte);
+ } else {
+ /*
+ * Bug the VM if handling the change failed, as failure is only
+ * allowed if KVM couldn't update the external SPTE.
+ */
+ KVM_BUG_ON(ret, kvm);
+ }
+ return ret;
}
/*
@@ -1334,6 +1364,10 @@ static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
{
u64 new_spte;
+ /* TODO: Add support for aging external SPTEs, if necessary. */
+ if (WARN_ON_ONCE(is_mirror_sptep(iter->sptep)))
+ return;
+
if (spte_ad_enabled(iter->old_spte)) {
iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
shadow_accessed_mask);
--
2.43.2
* [PATCH v2 10/15] KVM: x86/mmu: Drop KVM_BUG_ON() on shared lock to zap child external PTEs
From: Yan Zhao @ 2026-05-09 7:56 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Rick Edgecombe <rick.p.edgecombe@intel.com>
Drop the KVM_BUG_ON() in the KVM MMU core before zapping child external
PTEs, since requiring that zapping of PTEs be protected by the exclusive
mmu_lock is a TDX-specific requirement.
No need to plumb the shared/exclusive info into the remove_external_spte()
op or move the KVM_BUG_ON() to TDX, because
- There's already an assertion of exclusive mmu_lock protection in TDX.
- The KVM_BUG_ON() is a bit redundant: if any bug caused leaf PTEs in the
S-EPT to be zapped under the shared mmu_lock, SEAMCALL failures due to
contention would result in TDX_BUG_ON() in TDX.
Link: https://lore.kernel.org/kvm/aYUarHf3KEwHGuJe@google.com/
Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- Updated commit log and title. (Yan)
---
arch/x86/kvm/mmu/tdp_mmu.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index ada4a0837298..553a30628960 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -473,10 +473,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
}
handle_changed_spte(kvm, sp, gfn, old_spte, FROZEN_SPTE, level, shared);
- if (is_mirror_sp(sp)) {
- KVM_BUG_ON(shared, kvm);
+ if (is_mirror_sp(sp))
remove_external_spte(kvm, gfn, old_spte, level);
- }
}
if (is_mirror_sp(sp) &&
--
2.43.2
* [PATCH v2 11/15] KVM: TDX: Hoist tdx_sept_remove_private_spte() above set_private_spte()
From: Yan Zhao @ 2026-05-09 7:56 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Sean Christopherson <seanjc@google.com>
Arrange tdx_sept_remove_private_spte() (and its tdx_track() helper) to be
above tdx_sept_set_private_spte() in anticipation of routing all S-EPT
writes (with the exception of reclaiming non-leaf pages) through the "set"
API.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- Made the patch log description more generic to match the diff. (Kai, Rick)
- Kept tdx_sept_free_private_spt() below tdx_sept_set_private_spte(). (Yan)
---
arch/x86/kvm/vmx/tdx.c | 80 +++++++++++++++++++++---------------------
1 file changed, 40 insertions(+), 40 deletions(-)
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 48aa7936a7f7..e40a999b0fb8 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1718,23 +1718,6 @@ static int tdx_sept_map_leaf_spte(struct kvm *kvm, gfn_t gfn, enum pg_level leve
return tdx_mem_page_aug(kvm, gfn, level, pfn);
}
-static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
- u64 new_spte, enum pg_level level)
-{
- lockdep_assert_held(&kvm->mmu_lock);
-
- if (KVM_BUG_ON(is_shadow_present_pte(old_spte), kvm))
- return -EIO;
-
- if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
- return -EIO;
-
- if (!is_last_spte(new_spte, level))
- return tdx_sept_map_nonleaf_spte(kvm, gfn, level, new_spte);
-
- return tdx_sept_map_leaf_spte(kvm, gfn, level, new_spte);
-}
-
/*
* Ensure shared and private EPTs to be flushed on all vCPUs.
* tdh_mem_track() is the only caller that increases TD epoch. An increase in
@@ -1781,29 +1764,6 @@ static void tdx_track(struct kvm *kvm)
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
-static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- /*
- * free_external_spt() is only called after hkid is freed when TD is
- * tearing down.
- * KVM doesn't (yet) zap page table pages in mirror page table while
- * TD is active, though guest pages mapped in mirror page table could be
- * zapped during TD is active, e.g. for shared <-> private conversion
- * and slot move/deletion.
- */
- if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
- return -EIO;
-
- /*
- * The HKID assigned to this TD was already freed and cache was
- * already flushed. We don't have to flush again.
- */
- return tdx_reclaim_page(virt_to_page(private_spt));
-}
-
static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
enum pg_level level, u64 mirror_spte)
{
@@ -1854,6 +1814,46 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
tdx_quirk_reset_paddr(PFN_PHYS(pfn), PAGE_SIZE);
}
+static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
+ u64 new_spte, enum pg_level level)
+{
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ if (KVM_BUG_ON(is_shadow_present_pte(old_spte), kvm))
+ return -EIO;
+
+ if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
+ return -EIO;
+
+ if (!is_last_spte(new_spte, level))
+ return tdx_sept_map_nonleaf_spte(kvm, gfn, level, new_spte);
+
+ return tdx_sept_map_leaf_spte(kvm, gfn, level, new_spte);
+}
+
+static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ /*
+ * free_external_spt() is only called after hkid is freed when TD is
+ * tearing down.
+ * KVM doesn't (yet) zap page table pages in mirror page table while
+ * TD is active, though guest pages mapped in mirror page table could be
+ * zapped during TD is active, e.g. for shared <-> private conversion
+ * and slot move/deletion.
+ */
+ if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
+ return -EIO;
+
+ /*
+ * The HKID assigned to this TD was already freed and cache was
+ * already flushed. We don't have to flush again.
+ */
+ return tdx_reclaim_page(virt_to_page(private_spt));
+}
+
void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
int trig_mode, int vector)
{
--
2.43.2
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v2 12/15] KVM: TDX: Drop kvm_x86_ops.remove_external_spte()
2026-05-09 7:52 [PATCH v2 00/15] TDX MMU refactors Yan Zhao
` (10 preceding siblings ...)
2026-05-09 7:56 ` [PATCH v2 11/15] KVM: TDX: Hoist tdx_sept_remove_private_spte() above set_private_spte() Yan Zhao
@ 2026-05-09 7:57 ` Yan Zhao
2026-05-09 7:57 ` [PATCH v2 13/15] KVM: TDX: Rename tdx_sept_remove_private_spte() to show it's for leaf SPTEs Yan Zhao
` (2 subsequent siblings)
14 siblings, 0 replies; 16+ messages in thread
From: Yan Zhao @ 2026-05-09 7:57 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Sean Christopherson <seanjc@google.com>
Drop kvm_x86_ops.remove_external_spte(), and instead handle the removal of
leaf SPTEs in the S-EPT (a.k.a. external page table) in
kvm_x86_ops.set_external_spte(). This will also allow extending
tdx_sept_set_private_spte() to support splitting a huge S-EPT entry without
needing yet another kvm_x86_ops hook.
Now all changes for removing leaf mirror SPTEs are propagated through
kvm_x86_ops.set_external_spte().
- When removing leaf mirror SPTEs under shared mmu_lock (though currently
no path can trigger this and TDX does not support it),
tdx_sept_remove_private_spte() may produce a warning due to
lockdep_assert_held_write() or may return -EIO and trigger TDX_BUG_ON()
due to concurrent BLOCK, TRACK, REMOVE.
- When removing leaf mirror SPTEs under exclusive mmu_lock, all errors are
unexpected. If any error occurs in this scenario,
tdx_sept_remove_private_spte() will return -EIO and trigger KVM_BUG_ON().
A redundant KVM_BUG_ON() call will also be triggered in TDP MMU core in
handle_changed_spte(), which is benign (the WARN will fire if and only if
the VM isn't already bugged).
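As a rough sketch condensed from the tdp_mmu.c hunk below (context trimmed,
names as in the diff), the TDP MMU side now funnels every mirror-SPTE change
through the single remaining hook:

	/* __handle_changed_spte(), mirror page tables only (sketch). */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) {
		/* Non-leaf zap/promotion: recurse into the removed child page table. */
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
	} else if (is_mirror_sp(sp)) {
		int r;

		/* All other changes, including leaf removal, go through "set". */
		r = kvm_x86_call(set_external_spte)(kvm, gfn, old_spte, new_spte, level);
		/* ... error handling as described above ... */
	}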
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- Added expected lock and valid scenarios in function comment of
tdx_sept_set_private_spte(). (Yan/Rick)
- Updated patch title (was "Handle removal of leaf SPTEs in
.set_private_spte()"), since atomic zaps of leaf SPTEs are already
handled in kvm_x86_ops.set_external_spte() before this patch. (Yan).
---
arch/x86/include/asm/kvm-x86-ops.h | 1 -
arch/x86/include/asm/kvm_host.h | 3 ---
arch/x86/kvm/mmu/tdp_mmu.c | 37 +++-------------------------
arch/x86/kvm/vmx/tdx.c | 39 +++++++++++++++++++++---------
4 files changed, 31 insertions(+), 49 deletions(-)
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index bced6d938702..ed348c6dd445 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -97,7 +97,6 @@ KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
KVM_X86_OP(load_mmu_pgd)
KVM_X86_OP_OPTIONAL_RET0(set_external_spte)
KVM_X86_OP_OPTIONAL_RET0(free_external_spt)
-KVM_X86_OP_OPTIONAL(remove_external_spte)
KVM_X86_OP(has_wbinvd_exit)
KVM_X86_OP(get_l2_tsc_offset)
KVM_X86_OP(get_l2_tsc_multiplier)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9b55973f194c..c62a14623dcc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1899,9 +1899,6 @@ struct kvm_x86_ops {
int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
void *external_spt);
- /* Update external page table from spte getting removed, and flush TLB. */
- void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- u64 mirror_spte);
bool (*has_wbinvd_exit)(void);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 553a30628960..5cc2e948610b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -359,25 +359,6 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
-static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
- int level)
-{
- /*
- * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
- * PTs are removed in a special order, involving free_external_spt().
- * But remove_external_spte() will be called on non-leaf PTEs via
- * __tdp_mmu_zap_root(), so avoid the error the former would return
- * in this case.
- */
- if (!is_last_spte(old_spte, level))
- return;
-
- /* Zapping leaf spte is allowed only when write lock is held. */
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
-}
-
/**
* handle_removed_pt() - handle a page table removed from the TDP structure
*
@@ -472,9 +453,6 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
FROZEN_SPTE, level);
}
handle_changed_spte(kvm, sp, gfn, old_spte, FROZEN_SPTE, level, shared);
-
- if (is_mirror_sp(sp))
- remove_external_spte(kvm, gfn, old_spte, level);
}
if (is_mirror_sp(sp) &&
@@ -583,14 +561,14 @@ static int __handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp,
* SPTE being converted to a hugepage (leaf) or being zapped. Shadow
* pages are kernel allocations and should never be migrated.
*
- * For the mirror page table, propagate changes to present or changes of
- * leaf SPTEs to !present under shared mmu_lock to the external SPTE via
+ * For the mirror page table, propagate all changes to the external SPTE
+ * (except zapping/promotion of non-leaf SPTEs) via the
* set_external_spte() op.
*/
if (was_present && !was_leaf &&
(is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) {
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
- } else if (is_mirror_sp(sp) && (is_present || shared)) {
+ } else if (is_mirror_sp(sp)) {
int r;
r = kvm_x86_call(set_external_spte)(kvm, gfn, old_spte, new_spte, level);
@@ -743,15 +721,6 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, tdp_ptep_t sptep, u64 old_spte,
handle_changed_spte(kvm, sp, gfn, old_spte, new_spte, level, false);
- /*
- * Users that do non-atomic setting of PTEs don't operate on mirror
- * roots, so don't handle it and bug the VM if it's seen.
- */
- if (is_mirror_sptep(sptep)) {
- KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
- remove_external_spte(kvm, gfn, old_spte, level);
- }
-
return old_spte;
}
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index e40a999b0fb8..749883fb8f11 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1764,11 +1764,11 @@ static void tdx_track(struct kvm *kvm)
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
-static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, u64 mirror_spte)
+static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, u64 old_spte)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
+ kvm_pfn_t pfn = spte_to_pfn(old_spte);
gpa_t gpa = gfn_to_gpa(gfn);
u64 err, entry, level_state;
@@ -1780,16 +1780,16 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
* there can't be anything populated in the private EPT.
*/
if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
- return;
+ return -EIO;
/* TODO: handle large pages. */
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
- return;
+ return -EIO;
err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
level, &entry, &level_state);
if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
- return;
+ return -EIO;
/*
* TDX requires TLB tracking before dropping private page. Do
@@ -1805,22 +1805,40 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa,
level, &entry, &level_state);
if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
- return;
+ return -EIO;
err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, pfn);
if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
- return;
+ return -EIO;
tdx_quirk_reset_paddr(PFN_PHYS(pfn), PAGE_SIZE);
+ return 0;
}
+/*
+ * Handle changes for
+ * (1) leaf SPTEs from non-present to present
+ * (2) non-leaf SPTEs from non-present to present
+ * (3) leaf SPTEs from present to non-present
+ *
+ * - (1) and (2) must be under shared mmu_lock. If (1) and (2) are under
+ * exclusive mmu_lock (currently impossible), contention errors may lead to
+ * KVM_BUG_ON() in handle_changed_spte(), e.g., due to tdx_mem_page_aug(),
+ * tdx_mem_page_add(), or tdh_mem_sept_add() contending with tdh_vp_enter()
+ * due to zero-step mitigation or contending with TDCALLs.
+ * - (3) must be under write mmu_lock. If (3) is under shared mmu_lock
+ * (currently impossible), warnings will be generated due to
+ * lockdep_assert_held_write() or TDX_BUG_ON() caused by concurrent BLOCK,
+ * TRACK, REMOVE.
+ * - Promotion/demotion is not yet supported.
+ */
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
u64 new_spte, enum pg_level level)
{
lockdep_assert_held(&kvm->mmu_lock);
- if (KVM_BUG_ON(is_shadow_present_pte(old_spte), kvm))
- return -EIO;
+ if (is_shadow_present_pte(old_spte))
+ return tdx_sept_remove_private_spte(kvm, gfn, level, old_spte);
if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
return -EIO;
@@ -3448,7 +3466,6 @@ int __init tdx_hardware_setup(void)
vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
- vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
return 0;
--
2.43.2
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v2 13/15] KVM: TDX: Rename tdx_sept_remove_private_spte() to show it's for leaf SPTEs
2026-05-09 7:52 [PATCH v2 00/15] TDX MMU refactors Yan Zhao
` (11 preceding siblings ...)
2026-05-09 7:57 ` [PATCH v2 12/15] KVM: TDX: Drop kvm_x86_ops.remove_external_spte() Yan Zhao
@ 2026-05-09 7:57 ` Yan Zhao
2026-05-09 7:57 ` [PATCH v2 14/15] KVM: x86: Move error handling inside free_external_spt() Yan Zhao
2026-05-09 7:57 ` [PATCH v2 15/15] KVM: TDX: Move external page table freeing to TDX code Yan Zhao
14 siblings, 0 replies; 16+ messages in thread
From: Yan Zhao @ 2026-05-09 7:57 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
Rename tdx_sept_remove_private_spte() to tdx_sept_remove_leaf_spte() to
clearly show that this function is for removal of leaf SPTEs.
No functional change intended.
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- New patch.
---
arch/x86/kvm/vmx/tdx.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 749883fb8f11..5a7f304e14af 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1764,8 +1764,8 @@ static void tdx_track(struct kvm *kvm)
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
-static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, u64 old_spte)
+static int tdx_sept_remove_leaf_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, u64 old_spte)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
kvm_pfn_t pfn = spte_to_pfn(old_spte);
@@ -1838,7 +1838,7 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
lockdep_assert_held(&kvm->mmu_lock);
if (is_shadow_present_pte(old_spte))
- return tdx_sept_remove_private_spte(kvm, gfn, level, old_spte);
+ return tdx_sept_remove_leaf_spte(kvm, gfn, level, old_spte);
if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
return -EIO;
@@ -2834,7 +2834,7 @@ void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
/*
- * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
+ * TDX has called tdx_track() in tdx_sept_remove_leaf_spte() to
* ensure that private EPT will be flushed on the next TD enter. No need
* to call tdx_track() here again even when this callback is a result of
* zapping private EPT.
--
2.43.2
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v2 14/15] KVM: x86: Move error handling inside free_external_spt()
2026-05-09 7:52 [PATCH v2 00/15] TDX MMU refactors Yan Zhao
` (12 preceding siblings ...)
2026-05-09 7:57 ` [PATCH v2 13/15] KVM: TDX: Rename tdx_sept_remove_private_spte() to show it's for leaf SPTEs Yan Zhao
@ 2026-05-09 7:57 ` Yan Zhao
2026-05-09 7:57 ` [PATCH v2 15/15] KVM: TDX: Move external page table freeing to TDX code Yan Zhao
14 siblings, 0 replies; 16+ messages in thread
From: Yan Zhao @ 2026-05-09 7:57 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Sean Christopherson <seanjc@google.com>
Move the logic for TDX's specific need to leak pages when reclaim fails
into the free_external_spt() op, so it is handled in TDX-specific code
rather than in the generic MMU.
Do this by passing in the "sp" instead of the external page table pointer,
so that TDX code can set sp->external_spt to NULL itself. Since errors are
now handled internally in TDX code (by triggering KVM_BUG_ON() or
TDX_BUG_ON_3(), which warn and stop the VM on any error), change the op to
return void. The op then also behaves like a normal free in that success
is guaranteed from the caller's perspective.
Opportunistically, drop the unused level and gfn args while adjusting the
sp arg.
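Condensed from the hunks below (a sketch, context trimmed), the op's new
shape and its now error-free call site look like:

	/* kvm_host.h: the op takes the shadow page and can no longer fail. */
	void (*free_external_spt)(struct kvm *kvm, struct kvm_mmu_page *sp);

	/* tdp_mmu.c, handle_removed_pt(): no more error/leak handling in the caller. */
	if (is_mirror_sp(sp))
		kvm_x86_call(free_external_spt)(kvm, sp);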
[ Rick: Re-wrote log and massaged op name ]
[ Yan: Updated patch log/function comment, dropped unused param in op ]
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- Fixed typo in the patch log. (Binbin)
- Dropped unused param gfn. (Binbin)
- Mentioned that failure is not handled silently in the patch log. (Binbin)
- Added expected lock and valid scenarios in function comment of
tdx_sept_free_private_spt(). (Yan/Rick)
---
arch/x86/include/asm/kvm-x86-ops.h | 2 +-
arch/x86/include/asm/kvm_host.h | 3 +--
arch/x86/kvm/mmu/tdp_mmu.c | 13 ++-----------
arch/x86/kvm/vmx/tdx.c | 28 ++++++++++++++--------------
4 files changed, 18 insertions(+), 28 deletions(-)
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index ed348c6dd445..10ccf6ea9d9a 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -96,7 +96,7 @@ KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
KVM_X86_OP(load_mmu_pgd)
KVM_X86_OP_OPTIONAL_RET0(set_external_spte)
-KVM_X86_OP_OPTIONAL_RET0(free_external_spt)
+KVM_X86_OP_OPTIONAL(free_external_spt)
KVM_X86_OP(has_wbinvd_exit)
KVM_X86_OP(get_l2_tsc_offset)
KVM_X86_OP(get_l2_tsc_multiplier)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c62a14623dcc..6b28dd387bc6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1896,8 +1896,7 @@ struct kvm_x86_ops {
u64 new_spte, enum pg_level level);
/* Update external page tables for page table about to be freed. */
- int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- void *external_spt);
+ void (*free_external_spt)(struct kvm *kvm, struct kvm_mmu_page *sp);
bool (*has_wbinvd_exit)(void);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 5cc2e948610b..a847a8f09bc6 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -455,17 +455,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
handle_changed_spte(kvm, sp, gfn, old_spte, FROZEN_SPTE, level, shared);
}
- if (is_mirror_sp(sp) &&
- WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
- sp->external_spt))) {
- /*
- * Failed to free page table page in mirror page table and
- * there is nothing to do further.
- * Intentionally leak the page to prevent the kernel from
- * accessing the encrypted page.
- */
- sp->external_spt = NULL;
- }
+ if (is_mirror_sp(sp))
+ kvm_x86_call(free_external_spt)(kvm, sp);
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 5a7f304e14af..9431bc443d50 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1849,27 +1849,27 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
return tdx_sept_map_leaf_spte(kvm, gfn, level, new_spte);
}
-static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt)
+/*
+ * Handle changes for non-leaf SPTEs from present to non-present.
+ * Must be under exclusive mmu_lock and cannot fail.
+ */
+static void tdx_sept_free_private_spt(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
/*
- * free_external_spt() is only called after hkid is freed when TD is
- * tearing down.
* KVM doesn't (yet) zap page table pages in mirror page table while
* TD is active, though guest pages mapped in mirror page table could be
* zapped during TD is active, e.g. for shared <-> private conversion
* and slot move/deletion.
+ *
+ * In other words, KVM should only free mirror page tables after the
+ * TD's hkid is freed, when the TD is being torn down.
+ *
+ * If the S-EPT PTE can't be removed for any reason, intentionally leak
+ * the page to prevent the kernel from accessing the encrypted page.
*/
- if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
- return -EIO;
-
- /*
- * The HKID assigned to this TD was already freed and cache was
- * already flushed. We don't have to flush again.
- */
- return tdx_reclaim_page(virt_to_page(private_spt));
+ if (KVM_BUG_ON(is_hkid_assigned(to_kvm_tdx(kvm)), kvm) ||
+ tdx_reclaim_page(virt_to_page(sp->external_spt)))
+ sp->external_spt = NULL;
}
void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
--
2.43.2
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v2 15/15] KVM: TDX: Move external page table freeing to TDX code
2026-05-09 7:52 [PATCH v2 00/15] TDX MMU refactors Yan Zhao
` (13 preceding siblings ...)
2026-05-09 7:57 ` [PATCH v2 14/15] KVM: x86: Move error handling inside free_external_spt() Yan Zhao
@ 2026-05-09 7:57 ` Yan Zhao
14 siblings, 0 replies; 16+ messages in thread
From: Yan Zhao @ 2026-05-09 7:57 UTC (permalink / raw)
To: seanjc, pbonzini, kvm, rick.p.edgecombe, kas
Cc: linux-kernel, x86, dave.hansen, kai.huang, binbin.wu, xiaoyao.li,
yan.y.zhao
From: Sean Christopherson <seanjc@google.com>
Move the freeing of external page tables into the reclaim operation that
lives in TDX code.
The TDP MMU supports traversing the TDP without holding locks. Page tables
need to be freed via RCU to prevent walking one that gets freed.
While none of these lockless walk operations actually happen for the mirror
page table, the TDP MMU nonetheless frees the mirror page table in the same
way, and (because it's a handy place to plug it in) the external page table
as well.
However, the external page table definitely can't be walked once the page
table pages are reclaimed from the TDX module. The TDX module releases the
page for the host VMM to use, so this RCU-time free is unnecessary for the
external page table.
So move the free_page() call to TDX code. Create a
tdp_mmu_free_unused_sp() helper for freeing external page tables that
have never left the TDP MMU code (i.e. don't need to be freed in a special
way).
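The resulting split of the freeing helpers, condensed from the tdp_mmu.c
hunks below (sketch, context trimmed):

	static void __tdp_mmu_free_sp(struct kvm_mmu_page *sp)
	{
		/* Only the KVM-owned page table page is freed from the RCU callback. */
		free_page((unsigned long)sp->spt);
		kmem_cache_free(mmu_page_header_cache, sp);
	}

	static void tdp_mmu_free_unused_sp(struct kvm_mmu_page *sp)
	{
		/* The sp was never installed, so its external (S-EPT) page is freed here. */
		free_page((unsigned long)sp->external_spt);
		__tdp_mmu_free_sp(sp);
	}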
Link: https://lore.kernel.org/kvm/aYpjNrtGmogNzqwT@google.com/
Not-yet-Signed-off-by: Sean Christopherson <seanjc@google.com>
[Based on a diff by Sean, added log]
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
MMU_refactors v2:
- Fixed typos in the patch log. (Yan, Kai)
- Still kept "Not-yet-Signed-off-by" tag. Sean, please change it to SoB if
the patch looks good to you.
- Updated the code comment in tdx_sept_free_private_spt(): invoking
free_page() to free the S-EPT page in tdx_sept_free_private_spt() is only
because RCU-time free is unnecessary, not because it can't be performed
from RCU callbacks. (Yan)
---
arch/x86/kvm/mmu/tdp_mmu.c | 16 +++++++++++-----
arch/x86/kvm/vmx/tdx.c | 11 ++++++++++-
2 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index a847a8f09bc6..bb18e9e61542 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -53,13 +53,18 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
rcu_barrier();
}
-static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
+static void __tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
- free_page((unsigned long)sp->external_spt);
free_page((unsigned long)sp->spt);
kmem_cache_free(mmu_page_header_cache, sp);
}
+static void tdp_mmu_free_unused_sp(struct kvm_mmu_page *sp)
+{
+ free_page((unsigned long)sp->external_spt);
+ __tdp_mmu_free_sp(sp);
+}
+
/*
* This is called through call_rcu in order to free TDP page table memory
* safely with respect to other kernel threads that may be operating on
@@ -73,7 +78,8 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
rcu_head);
- tdp_mmu_free_sp(sp);
+ WARN_ON_ONCE(sp->external_spt);
+ __tdp_mmu_free_sp(sp);
}
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
@@ -1266,7 +1272,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* failed, e.g. because a different task modified the SPTE.
*/
if (r) {
- tdp_mmu_free_sp(sp);
+ tdp_mmu_free_unused_sp(sp);
goto retry;
}
@@ -1577,7 +1583,7 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
* installs its own sp in place of the last sp we tried to split.
*/
if (sp)
- tdp_mmu_free_sp(sp);
+ tdp_mmu_free_unused_sp(sp);
return 0;
}
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 9431bc443d50..2539107e0ad3 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1869,7 +1869,16 @@ static void tdx_sept_free_private_spt(struct kvm *kvm, struct kvm_mmu_page *sp)
*/
if (KVM_BUG_ON(is_hkid_assigned(to_kvm_tdx(kvm)), kvm) ||
tdx_reclaim_page(virt_to_page(sp->external_spt)))
- sp->external_spt = NULL;
+ goto out;
+
+ /*
+ * Immediately free the S-EPT page because RCU-time free is unnecessary
+ * after TDH.PHYMEM.PAGE.RECLAIM ensures there are no outstanding
+ * readers.
+ */
+ free_page((unsigned long)sp->external_spt);
+out:
+ sp->external_spt = NULL;
}
void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
--
2.43.2
^ permalink raw reply related [flat|nested] 16+ messages in thread