From: Sean Christopherson <sean.j.christopherson@intel.com>
To: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>,
Vitaly Kuznetsov <vkuznets@redhat.com>,
Wanpeng Li <wanpengli@tencent.com>,
Jim Mattson <jmattson@google.com>, Joerg Roedel <joro@8bytes.org>,
kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
eric van tassell <Eric.VanTassell@amd.com>,
Tom Lendacky <thomas.lendacky@amd.com>
Subject: [RFC PATCH 4/8] KVM: x86/mmu: Add infrastructure for pinning PFNs on demand
Date: Fri, 31 Jul 2020 14:23:19 -0700 [thread overview]
Message-ID: <20200731212323.21746-5-sean.j.christopherson@intel.com> (raw)
In-Reply-To: <20200731212323.21746-1-sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
---
arch/x86/include/asm/kvm_host.h | 7 ++
arch/x86/kvm/mmu/mmu.c | 111 ++++++++++++++++++++++++++------
2 files changed, 99 insertions(+), 19 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1bab87a444d78..b14864f3e8e74 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1138,6 +1138,13 @@ struct kvm_x86_ops {
void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3);
+ bool (*pin_spte)(struct kvm_vcpu *vcpu, gfn_t gfn, int level,
+ kvm_pfn_t pfn);
+ void (*drop_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level,
+ kvm_pfn_t pfn);
+ void (*zap_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level);
+ void (*unzap_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level);
+
bool (*has_wbinvd_exit)(void);
/* Returns actual tsc_offset set in active VMCS */
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 182f398036248..cab3b2f2f49c3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -133,6 +133,9 @@ module_param(dbg, bool, 0644);
#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
#define SPTE_MMIO_MASK (3ULL << 52)
+/* Special SPTE flags that can only be used for non-MMIO SPTEs. */
+#define SPTE_PINNED_MASK BIT_ULL(62)
+
#define PT64_LEVEL_BITS 9
#define PT64_LEVEL_SHIFT(level) \
@@ -211,6 +214,7 @@ enum {
RET_PF_EMULATE = 1,
RET_PF_INVALID = 2,
RET_PF_FIXED = 3,
+ RET_PF_UNZAPPED = 4,
};
struct pte_list_desc {
@@ -635,6 +639,11 @@ static bool is_shadow_present_pte(u64 pte)
return __is_shadow_present_pte(pte) && !is_mmio_spte(pte);
}
+static bool is_pinned_pte(u64 pte)
+{
+ return !!(pte & SPTE_PINNED_MASK);
+}
+
static int is_large_pte(u64 pte)
{
return pte & PT_PAGE_SIZE_MASK;
@@ -937,15 +946,15 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* state bits, it is used to clear the last level sptep.
* Returns the old PTE.
*/
-static u64 mmu_spte_clear_track_bits(u64 *sptep)
+static u64 __mmu_spte_clear_track_bits(u64 *sptep, u64 clear_value)
{
kvm_pfn_t pfn;
u64 old_spte = *sptep;
if (!spte_has_volatile_bits(old_spte))
- __update_clear_spte_fast(sptep, 0ull);
+ __update_clear_spte_fast(sptep, clear_value);
else
- old_spte = __update_clear_spte_slow(sptep, 0ull);
+ old_spte = __update_clear_spte_slow(sptep, clear_value);
if (!is_shadow_present_pte(old_spte))
return old_spte;
@@ -968,6 +977,11 @@ static u64 mmu_spte_clear_track_bits(u64 *sptep)
return old_spte;
}
+static inline u64 mmu_spte_clear_track_bits(u64 *sptep)
+{
+ return __mmu_spte_clear_track_bits(sptep, 0ull);
+}
+
/*
* Rules for using mmu_spte_clear_no_track:
* Directly clear spte without caring the state bits of sptep,
@@ -1399,7 +1413,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
return pte_list_add(vcpu, spte, rmap_head);
}
-static void rmap_remove(struct kvm *kvm, u64 *spte)
+static void rmap_remove(struct kvm *kvm, u64 *spte, u64 old_spte)
{
struct kvm_mmu_page *sp;
gfn_t gfn;
@@ -1409,6 +1423,10 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
rmap_head = gfn_to_rmap(kvm, gfn, sp);
__pte_list_remove(spte, rmap_head);
+
+ if (is_pinned_pte(old_spte))
+ kvm_x86_ops.drop_pinned_spte(kvm, gfn, sp->role.level - 1,
+ spte_to_pfn(old_spte));
}
/*
@@ -1446,7 +1464,7 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
iter->pos = 0;
sptep = iter->desc->sptes[iter->pos];
out:
- BUG_ON(!is_shadow_present_pte(*sptep));
+ BUG_ON(!is_shadow_present_pte(*sptep) && !is_pinned_pte(*sptep));
return sptep;
}
@@ -1491,8 +1509,8 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
{
u64 old_spte = mmu_spte_clear_track_bits(sptep);
- if (is_shadow_present_pte(old_spte))
- rmap_remove(kvm, sptep);
+ if (is_shadow_present_pte(old_spte) || is_pinned_pte(old_spte))
+ rmap_remove(kvm, sptep, old_spte);
}
@@ -1730,17 +1748,49 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
}
+static bool kvm_mmu_zap_pinned_spte(struct kvm *kvm, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+
+ if (!(*sptep & SPTE_PINNED_MASK))
+ return false;
+
+ sp = sptep_to_sp(sptep);
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ pfn = spte_to_pfn(*sptep);
+
+ if (kvm_x86_ops.zap_pinned_spte)
+ kvm_x86_ops.zap_pinned_spte(kvm, gfn, sp->role.level - 1);
+
+ __mmu_spte_clear_track_bits(sptep, SPTE_PINNED_MASK | pfn << PAGE_SHIFT);
+ return true;
+}
+
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
- while ((sptep = rmap_get_first(rmap_head, &iter))) {
+restart:
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
+ if (!is_shadow_present_pte(*sptep)) {
+ WARN_ON_ONCE(!is_pinned_pte(*sptep));
+ continue;
+ }
+
+ flush = true;
+
+ /* Keep the rmap if the SPTE is pinned. */
+ if (kvm_mmu_zap_pinned_spte(kvm, sptep))
+ continue;
+
pte_list_remove(rmap_head, sptep);
- flush = true;
+ goto restart;
}
return flush;
@@ -1774,6 +1824,10 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
need_flush = 1;
+ /* Pinned pages should not be relocated (obviously). */
+ if (WARN_ON_ONCE(is_pinned_pte(*sptep)))
+ continue;
+
if (pte_write(*ptep)) {
pte_list_remove(rmap_head, sptep);
goto restart;
@@ -2630,7 +2684,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
struct kvm_mmu_page *child;
pte = *spte;
- if (is_shadow_present_pte(pte)) {
+ if (is_shadow_present_pte(pte) || is_pinned_pte(pte)) {
if (is_last_spte(pte, sp->role.level)) {
drop_spte(kvm, spte);
if (is_large_pte(pte))
@@ -2639,7 +2693,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
}
- return true;
+ return is_shadow_present_pte(pte);
}
if (is_mmio_spte(pte))
@@ -2987,10 +3041,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
u64 spte = 0;
int ret = 0;
struct kvm_mmu_page *sp;
+ bool is_mmio_pfn;
if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
return 0;
+ is_mmio_pfn = kvm_is_mmio_pfn(pfn);
+
sp = sptep_to_sp(sptep);
if (sp_ad_disabled(sp))
spte |= SPTE_AD_DISABLED_MASK;
@@ -3023,15 +3080,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
- spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
+ spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, is_mmio_pfn);
if (host_writable)
spte |= SPTE_HOST_WRITEABLE;
else
pte_access &= ~ACC_WRITE_MASK;
- if (!kvm_is_mmio_pfn(pfn))
+ if (!is_mmio_pfn)
spte |= shadow_me_mask;
spte |= (u64)pfn << PAGE_SHIFT;
@@ -3065,6 +3121,12 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (speculative)
spte = mark_spte_for_access_track(spte);
+ if (is_pinned_pte(*sptep) ||
+ (vcpu->arch.mmu->direct_map && !is_mmio_pfn &&
+ kvm_x86_ops.pin_spte &&
+ kvm_x86_ops.pin_spte(vcpu, gfn, level, pfn)))
+ spte |= SPTE_PINNED_MASK;
+
set_pte:
if (mmu_spte_update(sptep, spte))
ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
@@ -3081,29 +3143,33 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
int set_spte_ret;
int ret = RET_PF_FIXED;
bool flush = false;
+ u64 pte = *sptep;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
- if (is_shadow_present_pte(*sptep)) {
+ if (is_shadow_present_pte(pte)) {
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
* the parent of the now unreachable PTE.
*/
- if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
+ if (level > PG_LEVEL_4K && !is_large_pte(pte)) {
struct kvm_mmu_page *child;
- u64 pte = *sptep;
child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, sptep);
flush = true;
- } else if (pfn != spte_to_pfn(*sptep)) {
+ } else if (pfn != spte_to_pfn(pte)) {
pgprintk("hfn old %llx new %llx\n",
- spte_to_pfn(*sptep), pfn);
+ spte_to_pfn(pte), pfn);
drop_spte(vcpu->kvm, sptep);
flush = true;
} else
was_rmapped = 1;
+ } else if (is_pinned_pte(pte)) {
+ WARN_ON_ONCE(pfn != spte_to_pfn(pte));
+ ret = RET_PF_UNZAPPED;
+ was_rmapped = 1;
}
set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
@@ -3136,6 +3202,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
rmap_recycle(vcpu, sptep, gfn);
}
+ if (ret == RET_PF_UNZAPPED && kvm_x86_ops.unzap_pinned_spte)
+ kvm_x86_ops.unzap_pinned_spte(vcpu->kvm, gfn, level - 1);
+
return ret;
}
@@ -5921,6 +5990,10 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
sp = sptep_to_sp(sptep);
pfn = spte_to_pfn(*sptep);
+ /* Dirty logging of pinned pages is not supported. */
+ if (WARN_ON_ONCE(is_pinned_pte(*sptep)))
+ continue;
+
/*
* We cannot do huge page mapping for indirect shadow pages,
* which are found on the last rmap (level = 1) when not using
--
2.28.0
next prev parent reply other threads:[~2020-07-31 21:23 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-07-31 21:23 [RFC PATCH 0/8] KVM: x86/mmu: Introduce pinned SPTEs framework Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 1/8] KVM: x86/mmu: Return old SPTE from mmu_spte_clear_track_bits() Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 2/8] KVM: x86/mmu: Use bits 2:0 to check for present SPTEs Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 3/8] KVM: x86/mmu: Refactor handling of not-present SPTEs in mmu_set_spte() Sean Christopherson
2020-07-31 21:23 ` Sean Christopherson [this message]
2020-07-31 21:23 ` [RFC PATCH 5/8] KVM: SVM: Use the KVM MMU SPTE pinning hooks to pin pages on demand Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 6/8] KVM: x86/mmu: Move 'pfn' variable to caller of direct_page_fault() Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 7/8] KVM: x86/mmu: Introduce kvm_mmu_map_tdp_page() for use by SEV Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 8/8] KVM: SVM: Pin SEV pages in MMU during sev_launch_update_data() Sean Christopherson
2020-08-03 3:00 ` [RFC PATCH 0/8] KVM: x86/mmu: Introduce pinned SPTEs framework Eric van Tassell
2020-08-03 15:00 ` Sean Christopherson
2020-08-03 15:52 ` Brijesh Singh
2020-08-03 17:16 ` Sean Christopherson
2020-08-04 19:40 ` Brijesh Singh
2020-10-27 3:22 ` Brijesh Singh
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200731212323.21746-5-sean.j.christopherson@intel.com \
--to=sean.j.christopherson@intel.com \
--cc=Eric.VanTassell@amd.com \
--cc=jmattson@google.com \
--cc=joro@8bytes.org \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=pbonzini@redhat.com \
--cc=thomas.lendacky@amd.com \
--cc=vkuznets@redhat.com \
--cc=wanpengli@tencent.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.